master - rgmanager: Implement enforcement of timeouts on a per-resource basis

public inbox for cluster-cvs@sourceware.org
help / color / mirror / Atom feed

* master - rgmanager: Implement enforcement of timeouts on a per-resource basis
@ 2008-09-24 18:07 Lon Hohberger
  0 siblings, 0 replies; only message in thread
From: Lon Hohberger @ 2008-09-24 18:07 UTC (permalink / raw)
  To: cluster-cvs-relay

Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=28aa1c508a4d8fe8dfec7824966642c31acb4349
Commit:        28aa1c508a4d8fe8dfec7824966642c31acb4349
Parent:        e646399f8b0b7d7f233bef3a678971d04f12bf6f
Author:        Lon Hohberger <lhh@redhat.com>
AuthorDate:    Mon Sep 8 15:43:39 2008 -0400
Committer:     Lon Hohberger <lhh@redhat.com>
CommitterDate: Wed Sep 24 13:41:41 2008 -0400

rgmanager: Implement enforcement of timeouts on a per-resource basis

Set "__enforce_timeouts" to "1" on a resource in the resource tree
(i.e. not in the global resources list) in order to enable this behavior.

rhbz #455326
---
 rgmanager/include/reslist.h     |    1 +
 rgmanager/src/daemons/restree.c |   98 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 94 insertions(+), 5 deletions(-)

diff --git a/rgmanager/include/reslist.h b/rgmanager/include/reslist.h
index 7b0934a..206567d 100644
--- a/rgmanager/include/reslist.h
+++ b/rgmanager/include/reslist.h
@@ -27,6 +27,7 @@
 #define RF_DESTROY	(1<<8)	/** Resource rule flag: Destroy this
 				  resource class if you delete it from
 				  the configuration */
+#define RF_ENFORCE_TIMEOUTS (1<<9) /** Enforce timeouts for this node */
 
 
 
diff --git a/rgmanager/src/daemons/restree.c b/rgmanager/src/daemons/restree.c
index ec5a684..94e8b2f 100644
--- a/rgmanager/src/daemons/restree.c
+++ b/rgmanager/src/daemons/restree.c
@@ -313,6 +313,26 @@ restore_signals(void)
 }
 
 
+/** Return the index in node->rn_actions of the action matching op_str (case-insensitive) at the given depth, or -1 if none */
+int
+res_act_index(resource_node_t *node, const char *op_str, int depth)
+{
+	int x = 0;
+	resource_act_t *act;
+
+	for (x = 0; node->rn_actions[x].ra_name; x++) { /* a NULL ra_name ends the scan */
+		act = &node->rn_actions[x];
+		if (depth != act->ra_depth) /* depth differs: not this entry */
+			continue;
+		if (strcasecmp(act->ra_name, op_str)) /* nonzero result: names differ */
+			continue;
+		return x; /* first entry matching both depth and name */
+	}
+
+	return -1; /* no matching action found */
+}
+
+
 /**
    Execute a resource-specific agent for a resource node in the tree.
 
@@ -327,6 +347,8 @@ res_exec(resource_node_t *node, int op, const char *arg, int depth)
 {
 	int childpid, pid;
 	int ret = 0;
+	int act_index;
+	time_t sleeptime = 0;
 	char **env = NULL;
 	resource_t *res = node->rn_resource;
 	const char *op_str = agent_op_str(op);
@@ -335,6 +357,17 @@ res_exec(resource_node_t *node, int op, const char *arg, int depth)
 	if (!res->r_rule->rr_agent)
 		return 0;
 
+	/* Get the action index for later */
+	act_index = res_act_index(node, op_str, depth);
+
+	/* This shouldn't happen, but execing an action for which 
+	   we have an incorrect depth or no status action does not
+	   indicate a problem.  This allows people writing resource
+	   agents to write agents which have no status/monitor function
+	   at their option, in violation of the OCF RA API specification. */
+	if (act_index < 0)
+		return 0;
+
 #ifdef DEBUG
 	env = build_env(node, depth, node->rn_resource->r_incarnations);
 	if (!env)
@@ -390,11 +423,53 @@ res_exec(resource_node_t *node, int op, const char *arg, int depth)
 	kill_env(env);
 #endif
 
-	do {
-		pid = waitpid(childpid, &ret, 0);
-		if ((pid < 0) && (errno == EINTR))
-			continue;
-	} while (0);
+	if (node->rn_flags & RF_ENFORCE_TIMEOUTS)
+		sleeptime = node->rn_actions[act_index].ra_timeout;
+
+	if (sleeptime > 0) {
+
+		/* There's a better way to do this, but this is easy and
+		   doesn't introduce signal woes */
+		while (sleeptime) {
+			pid = waitpid(childpid, &ret, WNOHANG);
+
+			if (pid == childpid)
+				break;
+			sleep(1);
+			--sleeptime;
+		}
+
+		if (pid != childpid && sleeptime == 0) {
+
+			clulog(LOG_ERR,
+			       "%s on %s:%s timed out after %d seconds\n",
+			       op_str, res->r_rule->rr_type,
+			       res->r_attrs->ra_value,
+			       node->rn_actions[act_index].ra_timeout,
+			       ocf_strerror(ret));
+			
+			/* This can't be guaranteed to kill even the child
+			   process if the child is in disk-wait :( */
+			kill(childpid, SIGKILL);
+			sleep(1);
+			pid = waitpid(childpid, &ret, WNOHANG);
+			if (pid == 0) {
+				clulog(LOG_ERR,
+				       "Task %s PID %d did not exit "
+				       "after SIGKILL\n",
+				       op_str, childpid);
+			}
+
+			/* Always an error if we time out */
+			return 1;
+		}
+	} else {
+		do {
+			pid = waitpid(childpid, &ret, 0);
+			if ((pid < 0) && (errno == EINTR))
+				continue;
+		} while (0);
+	}
 
 	if (WIFEXITED(ret)) {
 
@@ -552,6 +627,17 @@ do_load_resource(int ccsfd, char *base,
 		free(ref);
 	}
 
+	snprintf(tok, sizeof(tok), "%s/@__enforce_timeouts", base);
+#ifndef NO_CCS
+	if (ccs_get(ccsfd, tok, &ref) == 0) {
+#else
+	if (conf_get(tok, &ref) == 0) {
+#endif
+		if (atoi(ref) > 0 || strcasecmp(ref, "yes") == 0)
+			node->rn_flags |= RF_ENFORCE_TIMEOUTS;
+		free(ref);
+	}
+
 	curres->r_refs++;
 
 	*newnode = node;
@@ -828,6 +914,8 @@ _print_resource_tree(resource_node_t **tree, int level)
 				printf("COMMON ");
 			if (node->rn_flags & RF_INDEPENDENT)
 				printf("INDEPENDENT ");
+			if (node->rn_flags & RF_ENFORCE_TIMEOUTS)
+				printf("ENFORCE-TIMEOUTS ");
 			printf("]");
 		}
 		printf(" {\n");


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2008-09-24 18:06 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-09-24 18:07 master - rgmanager: Implement enforcement of timeouts on a per-resource basis Lon Hohberger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).