From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 5550 invoked by alias); 5 Mar 2009 14:08:25 -0000 Received: (qmail 5538 invoked by alias); 5 Mar 2009 14:08:24 -0000 X-SWARE-Spam-Status: No, hits=-0.3 required=5.0 tests=AWL,BAYES_40,J_CHICKENPOX_83,SPF_HELO_PASS X-Spam-Status: No, hits=-0.3 required=5.0 tests=AWL,BAYES_40,J_CHICKENPOX_83,SPF_HELO_PASS X-Spam-Check-By: sourceware.org X-Spam-Checker-Version: SpamAssassin 3.2.5 (2008-06-10) on bastion.fedora.phx.redhat.com Subject: cluster: STABLE3 - rgmanager: Allow restart counters to work with central_processing To: cluster-cvs-relay@redhat.com X-Project: Cluster Project X-Git-Module: cluster.git X-Git-Refname: refs/heads/STABLE3 X-Git-Reftype: branch X-Git-Oldrev: 7981232fc6862eba25c45cd7f5a36df7bfcf96a1 X-Git-Newrev: 2c724185bb57b9998c555edafc326badf5facd60 From: Lon Hohberger Message-Id: <20090305140802.73CEE1201F2@lists.fedorahosted.org> Date: Thu, 05 Mar 2009 14:08:00 -0000 X-Scanned-By: MIMEDefang 2.58 on 172.16.52.254 Mailing-List: contact cluster-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: cluster-cvs-owner@sourceware.org X-SW-Source: 2009-q1/txt/msg00687.txt.bz2 Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=2c724185bb57b9998c555edafc326badf5facd60 Commit: 2c724185bb57b9998c555edafc326badf5facd60 Parent: 7981232fc6862eba25c45cd7f5a36df7bfcf96a1 Author: Lon Hohberger AuthorDate: Thu Mar 5 09:03:23 2009 -0500 Committer: Lon Hohberger CommitterDate: Thu Mar 5 09:03:23 2009 -0500 rgmanager: Allow restart counters to work with central_processing rhbz #400211 / #431130 --- rgmanager/ChangeLog | 5 +++++ rgmanager/include/resgroup.h | 1 + rgmanager/include/restart_counter.h | 1 + rgmanager/src/daemons/groups.c | 25 +++++++++++++++++++------ rgmanager/src/daemons/restart_counter.c | 22 +++++++++++++++++++++- rgmanager/src/daemons/rg_state.c | 24 +++++++++++++++++++----- 
rgmanager/src/daemons/slang_event.c | 15 +++++++++++++++ 7 files changed, 81 insertions(+), 12 deletions(-) diff --git a/rgmanager/ChangeLog b/rgmanager/ChangeLog index 2387815..917500f 100644 --- a/rgmanager/ChangeLog +++ b/rgmanager/ChangeLog @@ -8,6 +8,11 @@ 2008-02-26 Lon Hohberger * src/resources/ip.sh: Fix netmask handling in ip.sh * src/utils/clustat.c: Don't show estranged nodes if they're down +2008-02-01 Lon Hohberger + * src/daemons/rg_state.c, slang_event.c, groups.c, restart_counter.c, + include/restart_counter.h, resgroup.h, + src/resources/default_event_script.sl: Allow restart counters to + correctly work with central_processing. (#400211 / #431130) 2008-01-25 Lon Hohberger * src/daemons/rg_thread.c: Fix case that broke 'clusvcadm -e diff --git a/rgmanager/include/resgroup.h b/rgmanager/include/resgroup.h index 67acb6d..84442fd 100644 --- a/rgmanager/include/resgroup.h +++ b/rgmanager/include/resgroup.h @@ -162,6 +162,7 @@ int svc_freeze(char *svcName); int svc_unfreeze(char *svcName); int svc_migrate(char *svcName, int target); int check_restart(char *svcName); +int add_restart(char *svcName); int rt_enqueue_request(const char *resgroupname, int request, msgctx_t *resp_ctx, diff --git a/rgmanager/include/restart_counter.h b/rgmanager/include/restart_counter.h index 399680c..04714e4 100644 --- a/rgmanager/include/restart_counter.h +++ b/rgmanager/include/restart_counter.h @@ -8,6 +8,7 @@ typedef void *restart_counter_t; int restart_add(restart_counter_t arg); int restart_clear(restart_counter_t arg); int restart_count(restart_counter_t arg); +int restart_threshold_exceeded(restart_counter_t arg); restart_counter_t restart_init(time_t expire_timeout, int max_restarts); int restart_cleanup(restart_counter_t arg); diff --git a/rgmanager/src/daemons/groups.c b/rgmanager/src/daemons/groups.c index 4047be1..d91d6e6 100644 --- a/rgmanager/src/daemons/groups.c +++ b/rgmanager/src/daemons/groups.c @@ -1797,7 +1797,7 @@ get_service_property(char *rg_name, char 
*prop, char *buf, size_t buflen) int -check_restart(char *rg_name) +add_restart(char *rg_name) { resource_node_t *node; int ret = 1; @@ -1806,11 +1806,24 @@ check_restart(char *rg_name) node = node_by_ref(&_tree, rg_name); if (node) { ret = restart_add(node->rn_restart_counter); - if (ret) { - /* Clear it out - caller is about - to relocate the service anyway */ - restart_clear(node->rn_restart_counter); - } + } + pthread_rwlock_unlock(&resource_lock); + + return ret; +} + + +int +check_restart(char *rg_name) +{ + resource_node_t *node; + int ret = 0; + + pthread_rwlock_rdlock(&resource_lock); + node = node_by_ref(&_tree, rg_name); + if (node) { + printf("%s %p\n", rg_name, node->rn_restart_counter); + ret = restart_threshold_exceeded(node->rn_restart_counter); } pthread_rwlock_unlock(&resource_lock); diff --git a/rgmanager/src/daemons/restart_counter.c b/rgmanager/src/daemons/restart_counter.c index 9b2e3c6..8789987 100644 --- a/rgmanager/src/daemons/restart_counter.c +++ b/rgmanager/src/daemons/restart_counter.c @@ -29,6 +29,10 @@ typedef struct { #define VALIDATE(arg, ret) \ do { \ + if (!arg) {\ + errno = EINVAL; \ + return ret; \ + } \ if (((restart_info_t *)arg)->magic != RESTART_INFO_MAGIC) {\ errno = EINVAL; \ return ret; \ @@ -80,6 +84,21 @@ restart_count(restart_counter_t arg) } +int +restart_threshold_exceeded(restart_counter_t arg) +{ + restart_info_t *restarts = (restart_info_t *)arg; + time_t now; + + VALIDATE(arg, -1); + now = time(NULL); + restart_timer_purge(arg, now); + if (restarts->restart_count >= restarts->max_restarts) + return 1; + return 0; +} + + /* Add a restart entry to the list. 
Returns 1 if restart count is exceeded */ int @@ -110,7 +129,7 @@ restart_add(restart_counter_t arg) /* Check and remove old entries */ restart_timer_purge(restarts, t); - if (restarts->restart_count > restarts->max_restarts) + if (restarts->restart_count >= restarts->max_restarts) return 1; return 0; @@ -153,6 +172,7 @@ restart_init(time_t expire_timeout, int max_restarts) info->expire_timeout = expire_timeout; info->max_restarts = max_restarts; info->restart_count = 0; + info->restart_nodes = NULL; return (void *)info; } diff --git a/rgmanager/src/daemons/rg_state.c b/rgmanager/src/daemons/rg_state.c index ad6ba1f..7937c0d 100644 --- a/rgmanager/src/daemons/rg_state.c +++ b/rgmanager/src/daemons/rg_state.c @@ -684,7 +684,6 @@ svc_advise_start(rg_state_t *svcStatus, char *svcName, int req) logt_print(LOG_NOTICE, "Recovering failed service %s\n", svcName); - svcStatus->rs_state = RG_STATE_STOPPED; /* Start! */ ret = 1; break; @@ -798,13 +797,16 @@ svc_start(char *svcName, int req) /* LOCK HELD if we get here */ svcStatus.rs_owner = my_id(); - svcStatus.rs_state = RG_STATE_STARTING; svcStatus.rs_transition = (uint64_t)time(NULL); - if (req == RG_START_RECOVER) + if (svcStatus.rs_state == RG_STATE_RECOVER) { + add_restart(svcName); svcStatus.rs_restarts++; - else + } else { svcStatus.rs_restarts = 0; + } + + svcStatus.rs_state = RG_STATE_STARTING; if (set_rg_state(svcName, &svcStatus) < 0) { logt_print(LOG_ERR, @@ -1273,7 +1275,7 @@ _svc_stop(char *svcName, int req, int recover, uint32_t newstate) { struct dlm_lksb lockp; rg_state_t svcStatus; - int ret; + int ret = 0; int old_state; if (!rg_quorate()) { @@ -1329,6 +1331,18 @@ _svc_stop(char *svcName, int req, int recover, uint32_t newstate) old_state = svcStatus.rs_state; + if (old_state == RG_STATE_RECOVER) { + logt_print(LOG_DEBUG, "%s is clean; skipping double-stop\n", + svcName); + svcStatus.rs_state = newstate; + + if (set_rg_state(svcName, &svcStatus) != 0) { + rg_unlock(&lockp); + logt_print(LOG_ERR, "#52: 
Failed changing RG status\n"); + return RG_EFAIL; + } + } + logt_print(LOG_NOTICE, "Stopping service %s\n", svcName); if (recover) diff --git a/rgmanager/src/daemons/slang_event.c b/rgmanager/src/daemons/slang_event.c index b215ef9..e019fbb 100644 --- a/rgmanager/src/daemons/slang_event.c +++ b/rgmanager/src/daemons/slang_event.c @@ -63,6 +63,7 @@ static int _node_clean = 0, _service_owner = 0, _service_last_owner = 0, + _service_restarts_exceeded = 0, _user_request = 0, _user_arg1 = 0, _user_arg2 = 0, @@ -108,6 +109,8 @@ SLang_Intrin_Var_Type rgmanager_vars[] = MAKE_VARIABLE("service_owner", &_service_owner,SLANG_INT_TYPE, 1), MAKE_VARIABLE("service_last_owner", &_service_last_owner, SLANG_INT_TYPE, 1), + MAKE_VARIABLE("service_restarts_exceeded", &_service_restarts_exceeded, + SLANG_INT_TYPE, 1), /* User event information */ MAKE_VARIABLE("user_request", &_user_request, SLANG_INT_TYPE,1), @@ -204,6 +207,7 @@ void sl_service_status(char *svcName) { rg_state_t svcStatus; + int restarts_exceeded = 0; char *state_str; if (get_service_state_internal(svcName, &svcStatus) < 0) { @@ -214,6 +218,15 @@ sl_service_status(char *svcName) return; } + restarts_exceeded = check_restart(svcName); + if (SLang_push_integer(restarts_exceeded) < 0) { + SLang_verror(SL_RunTime_Error, + "%s: Failed to push restarts_exceeded %s", + __FUNCTION__, + svcName); + return; + } + if (SLang_push_integer(svcStatus.rs_restarts) < 0) { SLang_verror(SL_RunTime_Error, "%s: Failed to push restarts for %s", @@ -1077,6 +1090,7 @@ S_service_event(const char *file, const char *script, char *name, _service_state = (char *)rg_state_str(state); _service_owner = owner; _service_last_owner = last_owner; + _service_restarts_exceeded = check_restart(name); switch(state) { case RG_STATE_DISABLED: @@ -1094,6 +1108,7 @@ S_service_event(const char *file, const char *script, char *name, _service_state = 0; _service_owner = 0; _service_last_owner = 0; + _service_restarts_exceeded = 0; return ret; }