From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 13037 invoked by alias); 24 Sep 2008 18:06:22 -0000 Received: (qmail 13029 invoked by alias); 24 Sep 2008 18:06:22 -0000 X-Spam-Status: No, hits=-0.7 required=5.0 tests=AWL,BAYES_00,J_CHICKENPOX_48,J_CHICKENPOX_83,KAM_MX,SPF_HELO_PASS X-Spam-Check-By: sourceware.org X-Spam-Checker-Version: SpamAssassin 3.2.4 (2008-01-01) on bastion.fedora.phx.redhat.com X-Spam-Level: Subject: master - rgmanager: Implement enforcement of timeouts on a per-resource basis To: cluster-cvs-relay@redhat.com X-Project: Cluster Project X-Git-Module: cluster.git X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: e646399f8b0b7d7f233bef3a678971d04f12bf6f X-Git-Newrev: 28aa1c508a4d8fe8dfec7824966642c31acb4349 From: Lon Hohberger Message-Id: <20080924175655.86A8C120433@lists.fedorahosted.org> Date: Wed, 24 Sep 2008 18:07:00 -0000 X-Scanned-By: MIMEDefang 2.58 on 172.16.52.254 Mailing-List: contact cluster-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: cluster-cvs-owner@sourceware.org X-SW-Source: 2008-q3/txt/msg00521.txt.bz2 Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=28aa1c508a4d8fe8dfec7824966642c31acb4349 Commit: 28aa1c508a4d8fe8dfec7824966642c31acb4349 Parent: e646399f8b0b7d7f233bef3a678971d04f12bf6f Author: Lon Hohberger AuthorDate: Mon Sep 8 15:43:39 2008 -0400 Committer: Lon Hohberger CommitterDate: Wed Sep 24 13:41:41 2008 -0400 rgmanager: Implement enforcement of timeouts on a per-resource basis Set "__enforce_timeouts" to "1" in the resource tree in order to enable this behavior (i.e., not in the global resources list). 
rhbz #455326 --- rgmanager/include/reslist.h | 1 + rgmanager/src/daemons/restree.c | 98 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 94 insertions(+), 5 deletions(-) diff --git a/rgmanager/include/reslist.h b/rgmanager/include/reslist.h index 7b0934a..206567d 100644 --- a/rgmanager/include/reslist.h +++ b/rgmanager/include/reslist.h @@ -27,6 +27,7 @@ #define RF_DESTROY (1<<8) /** Resource rule flag: Destroy this resource class if you delete it from the configuration */ +#define RF_ENFORCE_TIMEOUTS (1<<9) /** Enforce timeouts for this node */ diff --git a/rgmanager/src/daemons/restree.c b/rgmanager/src/daemons/restree.c index ec5a684..94e8b2f 100644 --- a/rgmanager/src/daemons/restree.c +++ b/rgmanager/src/daemons/restree.c @@ -313,6 +313,26 @@ restore_signals(void) } +/** Find the index for a given operation / depth in a resource node */ +int +res_act_index(resource_node_t *node, const char *op_str, int depth) +{ + int x = 0; + resource_act_t *act; + + for (x = 0; node->rn_actions[x].ra_name; x++) { + act = &node->rn_actions[x]; + if (depth != act->ra_depth) + continue; + if (strcasecmp(act->ra_name, op_str)) + continue; + return x; + } + + return -1; +} + + /** Execute a resource-specific agent for a resource node in the tree. @@ -327,6 +347,8 @@ res_exec(resource_node_t *node, int op, const char *arg, int depth) { int childpid, pid; int ret = 0; + int act_index; + time_t sleeptime = 0; char **env = NULL; resource_t *res = node->rn_resource; const char *op_str = agent_op_str(op); @@ -335,6 +357,17 @@ res_exec(resource_node_t *node, int op, const char *arg, int depth) if (!res->r_rule->rr_agent) return 0; + /* Get the action index for later */ + act_index = res_act_index(node, op_str, depth); + + /* This shouldn't happen, but execing an action for which + we have an incorrect depth or no status action does not + indicate a problem. 
This allows people writing resource + agents to write agents which have no status/monitor function + at their option, in violation of the OCF RA API specification. */ + if (act_index < 0) + return 0; + #ifdef DEBUG env = build_env(node, depth, node->rn_resource->r_incarnations); if (!env) @@ -390,11 +423,53 @@ res_exec(resource_node_t *node, int op, const char *arg, int depth) kill_env(env); #endif - do { - pid = waitpid(childpid, &ret, 0); - if ((pid < 0) && (errno == EINTR)) - continue; - } while (0); + if (node->rn_flags & RF_ENFORCE_TIMEOUTS) + sleeptime = node->rn_actions[act_index].ra_timeout; + + if (sleeptime > 0) { + + /* There's a better way to do this, but this is easy and + doesn't introduce signal woes */ + while (sleeptime) { + pid = waitpid(childpid, &ret, WNOHANG); + + if (pid == childpid) + break; + sleep(1); + --sleeptime; + } + + if (pid != childpid && sleeptime == 0) { + + clulog(LOG_ERR, + "%s on %s:%s timed out after %d seconds\n", + op_str, res->r_rule->rr_type, + res->r_attrs->ra_value, + node->rn_actions[act_index].ra_timeout, + ocf_strerror(ret)); + + /* This can't be guaranteed to kill even the child + process if the child is in disk-wait :( */ + kill(childpid, SIGKILL); + sleep(1); + pid = waitpid(childpid, &ret, WNOHANG); + if (pid == 0) { + clulog(LOG_ERR, + "Task %s PID %d did not exit " + "after SIGKILL\n", + op_str, childpid); + } + + /* Always an error if we time out */ + return 1; + } + } else { + do { + pid = waitpid(childpid, &ret, 0); + if ((pid < 0) && (errno == EINTR)) + continue; + } while (0); + } if (WIFEXITED(ret)) { @@ -552,6 +627,17 @@ do_load_resource(int ccsfd, char *base, free(ref); } + snprintf(tok, sizeof(tok), "%s/@__enforce_timeouts", base); +#ifndef NO_CCS + if (ccs_get(ccsfd, tok, &ref) == 0) { +#else + if (conf_get(tok, &ref) == 0) { +#endif + if (atoi(ref) > 0 || strcasecmp(ref, "yes") == 0) + node->rn_flags |= RF_ENFORCE_TIMEOUTS; + free(ref); + } + curres->r_refs++; *newnode = node; @@ -828,6 +914,8 @@ 
_print_resource_tree(resource_node_t **tree, int level) printf("COMMON "); if (node->rn_flags & RF_INDEPENDENT) printf("INDEPENDENT "); + if (node->rn_flags & RF_ENFORCE_TIMEOUTS) + printf("ENFORCE-TIMEOUTS "); printf("]"); } printf(" {\n");