public inbox for cluster-cvs@sourceware.org
help / color / mirror / Atom feed
* master - rgmanager: Implement enforcement of timeouts on a per-resource basis
@ 2008-09-24 18:07 Lon Hohberger
0 siblings, 0 replies; only message in thread
From: Lon Hohberger @ 2008-09-24 18:07 UTC (permalink / raw)
To: cluster-cvs-relay
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=28aa1c508a4d8fe8dfec7824966642c31acb4349
Commit: 28aa1c508a4d8fe8dfec7824966642c31acb4349
Parent: e646399f8b0b7d7f233bef3a678971d04f12bf6f
Author: Lon Hohberger <lhh@redhat.com>
AuthorDate: Mon Sep 8 15:43:39 2008 -0400
Committer: Lon Hohberger <lhh@redhat.com>
CommitterDate: Wed Sep 24 13:41:41 2008 -0400
rgmanager: Implement enforcement of timeouts on a per-resource basis
Set "__enforce_timeouts" to "1" in the resource tree in order to
enable this behavior (i.e. not in the global resources list).
rhbz #455326
---
rgmanager/include/reslist.h | 1 +
rgmanager/src/daemons/restree.c | 98 +++++++++++++++++++++++++++++++++++++--
2 files changed, 94 insertions(+), 5 deletions(-)
diff --git a/rgmanager/include/reslist.h b/rgmanager/include/reslist.h
index 7b0934a..206567d 100644
--- a/rgmanager/include/reslist.h
+++ b/rgmanager/include/reslist.h
@@ -27,6 +27,7 @@
#define RF_DESTROY (1<<8) /** Resource rule flag: Destroy this
resource class if you delete it from
the configuration */
+#define RF_ENFORCE_TIMEOUTS (1<<9) /** Enforce timeouts for this node */
diff --git a/rgmanager/src/daemons/restree.c b/rgmanager/src/daemons/restree.c
index ec5a684..94e8b2f 100644
--- a/rgmanager/src/daemons/restree.c
+++ b/rgmanager/src/daemons/restree.c
@@ -313,6 +313,26 @@ restore_signals(void)
}
+/**
+  Find the index for a given operation / depth in a resource node.
+
+  Scans node->rn_actions from index 0 until an entry with a NULL ra_name
+  is reached, and returns the first entry whose ra_depth equals depth and
+  whose ra_name compares equal to op_str ignoring case.
+
+  @param node		Resource node whose rn_actions array is searched.
+  @param op_str	Operation name to look for.
+  @param depth		Depth the matching action must have (exact match).
+  @return		Index into node->rn_actions of the first match,
+			or -1 if no entry matches.
+ */
+int
+res_act_index(resource_node_t *node, const char *op_str, int depth)
+{
+ int x = 0;
+ resource_act_t *act;
+
+ for (x = 0; node->rn_actions[x].ra_name; x++) { /* NULL ra_name ends the scan */
+ act = &node->rn_actions[x];
+ if (depth != act->ra_depth)
+ continue;
+ if (strcasecmp(act->ra_name, op_str)) /* nonzero: names differ */
+ continue;
+ return x;
+ }
+
+ return -1;
+}
+
+
/**
Execute a resource-specific agent for a resource node in the tree.
@@ -327,6 +347,8 @@ res_exec(resource_node_t *node, int op, const char *arg, int depth)
{
int childpid, pid;
int ret = 0;
+ int act_index;
+ time_t sleeptime = 0;
char **env = NULL;
resource_t *res = node->rn_resource;
const char *op_str = agent_op_str(op);
@@ -335,6 +357,17 @@ res_exec(resource_node_t *node, int op, const char *arg, int depth)
if (!res->r_rule->rr_agent)
return 0;
+ /* Get the action index for later */
+ act_index = res_act_index(node, op_str, depth);
+
+ /* This shouldn't happen, but execing an action for which
+ we have an incorrect depth or no status action does not
+ indicate a problem. This allows people writing resource
+ agents to write agents which have no status/monitor function
+ at their option, in violation of the OCF RA API specification. */
+ if (act_index < 0)
+ return 0;
+
#ifdef DEBUG
env = build_env(node, depth, node->rn_resource->r_incarnations);
if (!env)
@@ -390,11 +423,53 @@ res_exec(resource_node_t *node, int op, const char *arg, int depth)
kill_env(env);
#endif
- do {
- pid = waitpid(childpid, &ret, 0);
- if ((pid < 0) && (errno == EINTR))
- continue;
- } while (0);
+ if (node->rn_flags & RF_ENFORCE_TIMEOUTS)
+ sleeptime = node->rn_actions[act_index].ra_timeout;
+
+ if (sleeptime > 0) {
+
+ /* There's a better way to do this, but this is easy and
+ doesn't introduce signal woes */
+ while (sleeptime) {
+ pid = waitpid(childpid, &ret, WNOHANG);
+
+ if (pid == childpid)
+ break;
+ sleep(1);
+ --sleeptime;
+ }
+
+ if (pid != childpid && sleeptime == 0) {
+
+ clulog(LOG_ERR,
+ "%s on %s:%s timed out after %d seconds\n",
+ op_str, res->r_rule->rr_type,
+ res->r_attrs->ra_value,
+ node->rn_actions[act_index].ra_timeout,
+ ocf_strerror(ret));
+
+ /* This can't be guaranteed to kill even the child
+ process if the child is in disk-wait :( */
+ kill(childpid, SIGKILL);
+ sleep(1);
+ pid = waitpid(childpid, &ret, WNOHANG);
+ if (pid == 0) {
+ clulog(LOG_ERR,
+ "Task %s PID %d did not exit "
+ "after SIGKILL\n",
+ op_str, childpid);
+ }
+
+ /* Always an error if we time out */
+ return 1;
+ }
+ } else {
+ do {
+ pid = waitpid(childpid, &ret, 0);
+ if ((pid < 0) && (errno == EINTR))
+ continue;
+ } while (0);
+ }
if (WIFEXITED(ret)) {
@@ -552,6 +627,17 @@ do_load_resource(int ccsfd, char *base,
free(ref);
}
+ snprintf(tok, sizeof(tok), "%s/@__enforce_timeouts", base);
+#ifndef NO_CCS
+ if (ccs_get(ccsfd, tok, &ref) == 0) {
+#else
+ if (conf_get(tok, &ref) == 0) {
+#endif
+ if (atoi(ref) > 0 || strcasecmp(ref, "yes") == 0)
+ node->rn_flags |= RF_ENFORCE_TIMEOUTS;
+ free(ref);
+ }
+
curres->r_refs++;
*newnode = node;
@@ -828,6 +914,8 @@ _print_resource_tree(resource_node_t **tree, int level)
printf("COMMON ");
if (node->rn_flags & RF_INDEPENDENT)
printf("INDEPENDENT ");
+ if (node->rn_flags & RF_ENFORCE_TIMEOUTS)
+ printf("ENFORCE-TIMEOUTS ");
printf("]");
}
printf(" {\n");
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2008-09-24 18:06 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-09-24 18:07 master - rgmanager: Implement enforcement of timeouts on a per-resource basis Lon Hohberger
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).