From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 20072 invoked by alias); 3 Apr 2009 14:13:04 -0000 Received: (qmail 20066 invoked by alias); 3 Apr 2009 14:13:03 -0000 X-SWARE-Spam-Status: No, hits=-1.7 required=5.0 tests=AWL,BAYES_00,J_CHICKENPOX_57,SPF_HELO_PASS X-Spam-Status: No, hits=-1.7 required=5.0 tests=AWL,BAYES_00,J_CHICKENPOX_57,SPF_HELO_PASS X-Spam-Check-By: sourceware.org X-Spam-Checker-Version: SpamAssassin 3.2.5 (2008-06-10) on bastion.fedora.phx.redhat.com Subject: cluster: RHEL48 - rgmanager: Detect restricted failover domain crash To: cluster-cvs-relay@redhat.com X-Project: Cluster Project X-Git-Module: cluster.git X-Git-Refname: refs/heads/RHEL48 X-Git-Reftype: branch X-Git-Oldrev: da9f72456bdda7833f8360de92807b0f66cb334a X-Git-Newrev: 4a62d37ed15229356fde8945fa3b5798fb85b663 From: Lon Hohberger Message-Id: <20090403141230.BC09D1201D6@lists.fedorahosted.org> Date: Fri, 03 Apr 2009 14:13:00 -0000 X-Scanned-By: MIMEDefang 2.58 on 172.16.52.254 Mailing-List: contact cluster-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: cluster-cvs-owner@sourceware.org X-SW-Source: 2009-q2/txt/msg00022.txt.bz2 Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=4a62d37ed15229356fde8945fa3b5798fb85b663 Commit: 4a62d37ed15229356fde8945fa3b5798fb85b663 Parent: da9f72456bdda7833f8360de92807b0f66cb334a Author: Lon Hohberger AuthorDate: Mon Aug 25 15:54:44 2008 -0400 Committer: Lon Hohberger CommitterDate: Fri Apr 3 10:12:12 2009 -0400 rgmanager: Detect restricted failover domain crash Mark service as 'stopped' when it is 'running' but the node is down. rhbz #428108 --- rgmanager/include/reslist.h | 5 +- rgmanager/src/daemons/fo_domain.c | 17 ++++- rgmanager/src/daemons/groups.c | 80 +++++++++++++++++------ rgmanager/src/daemons/members.c | 30 +++++++++ rgmanager/src/daemons/rg_state.c | 27 ++++++-- rgmanager/src/daemons/service_op.c | 15 ++++- rgmanager/src/daemons/slang_event.c | 23 ++----- rgmanager/src/resources/default_event_script.sl | 3 +- 8 files changed, 151 insertions(+), 49 deletions(-) diff --git a/rgmanager/include/reslist.h b/rgmanager/include/reslist.h index f78288f..4d3feea 100644 --- a/rgmanager/include/reslist.h +++ b/rgmanager/include/reslist.h @@ -23,6 +23,7 @@ #include #include #include +#include #define RA_PRIMARY (1<<0) /** Primary key */ @@ -205,8 +206,8 @@ void deconstruct_domains(fod_t **domains); void print_domains(fod_t **domains); int node_should_start(uint64_t nodeid, cluster_member_list_t *membership, char *rg_name, fod_t **domains); -int node_domain_set(fod_t *domain, uint64_t **ret, int *retlen); -int node_domain_set_safe(char *domainname, uint64_t **ret, int *retlen, int *flags); +int node_domain_set(fod_t **domains, char *name, set_type_t **ret, int *retlen, int *flags); +int node_domain_set_safe(char *domainname, set_type_t **ret, int *retlen, int *flags); /* diff --git a/rgmanager/src/daemons/fo_domain.c b/rgmanager/src/daemons/fo_domain.c index 9019a10..be1918d 100644 --- a/rgmanager/src/daemons/fo_domain.c +++ b/rgmanager/src/daemons/fo_domain.c @@ -349,13 +349,24 @@ node_in_domain(char *nodename, fod_t *domain, int -node_domain_set(fod_t *domain, uint64_t **ret, int *retlen) +node_domain_set(fod_t **domains, char *name, set_type_t **ret, int *retlen, int *flags) { int x, i, j; set_type_t *tmpset; int ts_count; - fod_node_t *fodn; + fod_t *domain; + int found = 0; + + list_for(domains, domain, x) { + if (!strcasecmp(domain->fd_name, name)) { + found = 1; + break; + } + } // while (!list_done(&_domains, fod)); + + if (!found) + return -1; /* Count domain length */ list_for(&domain->fd_nodes, fodn, x) { } @@ -368,6 +379,8 @@ node_domain_set(fod_t *domain, uint64_t **ret, int *retlen) if (!(*tmpset)) return -1; + *flags = domain->fd_flags; + if (domain->fd_flags & FOD_ORDERED) { for (i = 1; i <= 100; i++) { diff --git a/rgmanager/src/daemons/groups.c b/rgmanager/src/daemons/groups.c index d1fe3db..a0816d5 100644 --- a/rgmanager/src/daemons/groups.c +++ b/rgmanager/src/daemons/groups.c @@ -18,9 +18,10 @@ MA 02139, USA. */ //#define DEBUG +#include #include -#include #include +#include #include #include #include @@ -60,6 +61,8 @@ pthread_rwlock_t resource_lock = PTHREAD_RWLOCK_INITIALIZER; void res_build_name(char *, size_t, resource_t *); int group_migratory(char *groupname, int lock); +int group_property(char *groupname, char *property, char *ret, size_t len); +int member_online_set(set_type_t **nodes, int *nodecount); struct status_arg { @@ -88,25 +91,11 @@ node_should_start_safe(uint64_t nodeid, cluster_member_list_t *membership, int -node_domain_set_safe(char *domainname, uint64_t **ret, int *retlen, int *flags) +node_domain_set_safe(char *domainname, set_type_t **ret, int *retlen, int *flags) { - fod_t *fod; - int rv = -1, found = 0, x = 0; - + int rv = 0; pthread_rwlock_rdlock(&resource_lock); - - list_for(&_domains, fod, x) { - if (!strcasecmp(fod->fd_name, domainname)) { - found = 1; - break; - } - } // while (!list_done(&_domains, fod)); - - if (found) { - rv = node_domain_set(fod, ret, retlen); - *flags = fod->fd_flags; - } - + rv = node_domain_set(&_domains, domainname, ret, retlen, flags); pthread_rwlock_unlock(&resource_lock); return rv; @@ -440,6 +429,52 @@ check_depend_safe(char *rg_name) } +int +check_rdomain_crash(char *svcName) +{ + set_type_t *nodes = NULL; + set_type_t *fd_nodes = NULL; + set_type_t *isect = NULL; + int nodecount; + int fd_nodecount, fl; + int icount; + char fd_name[256]; + + if (group_property(svcName, "domain", fd_name, sizeof(fd_name)) != 0) + goto out_free; + + member_online_set(&nodes, &nodecount); + + if (node_domain_set(&_domains, fd_name, &fd_nodes, + &fd_nodecount, &fl) != 0) + goto out_free; + + if (!(fl & FOD_RESTRICTED)) + goto out_free; + + if (s_intersection(fd_nodes, fd_nodecount, nodes, nodecount, + &isect, &icount) < 0) + goto out_free; + + if (icount == 0) { + clulog(LOG_DEBUG, "Marking %s as stopped: " + "Restricted domain unavailable\n", svcName); + rt_enqueue_request(svcName, RG_STOP, -1, 0, 0, + 0, 0); + } + +out_free: + if (fd_nodes) + free(fd_nodes); + if (nodes) + free(nodes); + if (isect) + free(isect); + + return 0; +} + + /** Start or failback a resource group: if it's not running, start it. If it is running and we're a better member to run it, then ask for @@ -453,6 +488,7 @@ consider_start(resource_node_t *node, char *svcName, rg_state_t *svcStatus, cluster_member_t *mp; int autostart, exclusive; void *lockp = NULL; + int fod_ret; mp = memb_id_to_p(membership, my_id()); assert(mp); @@ -545,10 +581,13 @@ consider_start(resource_node_t *node, char *svcName, rg_state_t *svcStatus, * Start any stopped services, or started services * that are owned by a down node. */ - if (node_should_start(mp->cm_id, membership, svcName, &_domains) == - FOD_BEST) + fod_ret = node_should_start(mp->cm_id, membership, + svcName, &_domains); + if (fod_ret == FOD_BEST) rt_enqueue_request(svcName, RG_START, -1, 0, mp->cm_id, 0, 0); + else if (fod_ret == FOD_ILLEGAL) + check_rdomain_crash(svcName); } @@ -979,7 +1018,6 @@ group_property_unlocked(char *groupname, char *property, char *ret, } - /** Send the state of a resource group to a given file descriptor. diff --git a/rgmanager/src/daemons/members.c b/rgmanager/src/daemons/members.c index 910d174..6fc1327 100644 --- a/rgmanager/src/daemons/members.c +++ b/rgmanager/src/daemons/members.c @@ -16,6 +16,7 @@ Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include #include #include #include @@ -94,6 +95,35 @@ member_list(void) } +int +member_online_set(set_type_t **nodes, int *nodecount) +{ + int ret = 1, i; + + pthread_rwlock_rdlock(&memblock); + if (!membership) + goto out_unlock; + + *nodes = malloc(sizeof(set_type_t) * membership->cml_count); + if (!*nodes) + goto out_unlock; + + *nodecount = 0; + for (i = 0; i < membership->cml_count; i++) { + if (membership->cml_members[i].cm_state && + membership->cml_members[i].cm_id > 0) { + (*nodes)[*nodecount] = membership->cml_members[i].cm_id; + ++(*nodecount); + } + } + + ret = 0; +out_unlock: + pthread_rwlock_unlock(&memblock); + return ret; +} + + char * member_name(uint64_t id, char *buf, int buflen) { diff --git a/rgmanager/src/daemons/rg_state.c b/rgmanager/src/daemons/rg_state.c index c717ecc..386d569 100644 --- a/rgmanager/src/daemons/rg_state.c +++ b/rgmanager/src/daemons/rg_state.c @@ -17,6 +17,7 @@ MA 02139, USA. */ //#define DEBUG +#include #include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include #define cm_svccount cm_pad[0] /* Theses are uint8_t size */ #define cm_svcexcl cm_pad[1] @@ -46,6 +48,7 @@ int get_rg_state(char *servicename, rg_state_t *svcblk); void get_recovery_policy(char *rg_name, char *buf, size_t buflen); int have_exclusive_resources(void); int check_exclusive_resources(cluster_member_list_t *membership, char *svcName); +int count_resource_groups_local(cluster_member_t *mp); pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -433,9 +436,12 @@ get_rg_state_local(char *rgname, rg_state_t *svcblk) * @param req Specify request to perform * @return 0 = DO RG_NOT stop service, return RG_EFAIL * 1 = STOP service - return whatever it returns. - * 2 = DO RG_NOT stop service, return 0 (success) - * 3 = DO RG_NOT stop service, return RG_EFORWARD - * 4 = DO RG_NOT stop service, return RG_EAGAIN + * 2 = DO NOT stop service, return 0 (success) + * 3 = DO NOT stop service, return RG_EFORWARD + * 4 = DO NOT stop service, return RG_EAGAIN + * 5 = DO NOT stop service, return RG_EFROZEN + * 6 = DO NOT stop service, mark stopped and return + * RG_SUCCESS (0) */ int svc_advise_stop(rg_state_t *svcStatus, char *svcName, int req) @@ -494,9 +500,10 @@ svc_advise_stop(rg_state_t *svcStatus, char *svcName, int req) /* Service is marked as running but node is down. - Doesn't make much sense to stop it. + Doesn't make much sense to stop it - but we need + to mark it stopped */ - ret = 2; + ret = 6; break; case RG_STATE_ERROR: @@ -929,6 +936,16 @@ _svc_stop(char *svcName, int req, int recover, uint32_t newstate) clulog(LOG_DEBUG, "Unable to stop %s in %s state\n", svcName, rg_state_str(svcStatus.rs_state)); return RG_EFAIL; + case 6: + /* Mark stopped, but do not do anything */ + svcStatus.rs_last_owner = svcStatus.rs_owner; + svcStatus.rs_owner = 0; + svcStatus.rs_state = RG_STATE_STOPPED; + if (set_rg_state(svcName, &svcStatus) != 0) { + rg_unlock(svcName, lockp); + return RG_EFAIL; + } + /* FALLTHROUGH */ case 2: rg_unlock(svcName, lockp); return RG_ESUCCESS; diff --git a/rgmanager/src/daemons/service_op.c b/rgmanager/src/daemons/service_op.c index 3c02688..48fbd2d 100644 --- a/rgmanager/src/daemons/service_op.c +++ b/rgmanager/src/daemons/service_op.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -153,8 +154,18 @@ service_op_stop(char *svcName, int do_disable, int event_type) if (get_service_state_internal(svcName, &svcStatus) < 0) return RG_EFAIL; - if (svcStatus.rs_owner != NODE_ID_NONE) - msgtarget = svcStatus.rs_owner; + + if (svcStatus.rs_owner != NODE_ID_NONE) { + if (member_online(svcStatus.rs_owner)) { + msgtarget = svcStatus.rs_owner; + } else { + /* If the owner is not online, + mark the service as 'stopped' but + otherwise, do nothing. + */ + return svc_stop(svcName, RG_STOP); + } + } if ((fd = msg_open(msgtarget, RG_PORT, RG_PURPOSE, 2)) < 0) { clulog(LOG_ERR, diff --git a/rgmanager/src/daemons/slang_event.c b/rgmanager/src/daemons/slang_event.c index d3a522b..6e17db0 100644 --- a/rgmanager/src/daemons/slang_event.c +++ b/rgmanager/src/daemons/slang_event.c @@ -19,10 +19,12 @@ @file S/Lang event handling & intrinsic functions + vars */ #include +#include #include #include #include #include +#include #include #include #include @@ -35,7 +37,6 @@ #include #include #include -#include #include static int __sl_initialized = 0; @@ -46,6 +47,8 @@ static int _service_list_len = 0; char **get_service_names(int *len); /* from groups.c */ int get_service_property(char *rg_name, char *prop, char *buf, size_t buflen); void push_int_array(set_type_t *stuff, int len); +int member_online_set(set_type_t **nodes, int *nodecount); + /* ================================================================ @@ -604,25 +607,13 @@ push_int_array(set_type_t *stuff, int len) void sl_nodes_online(void) { - int i, nodecount = 0; set_type_t *nodes; + int nodecount = 0, x = 0; - cluster_member_list_t *membership = member_list(); - if (!membership) - return; - nodes = malloc(sizeof(set_type_t) * membership->cml_count); - if (!nodes) + x = member_online_set(&nodes, &nodecount); + if (x < 0 || !nodes || !nodecount) return; - nodecount = 0; - for (i = 0; i < membership->cml_count; i++) { - if (membership->cml_members[i].cm_state && - membership->cml_members[i].cm_id != 0) { - nodes[nodecount] = membership->cml_members[i].cm_id; - ++nodecount; - } - } - cml_free(membership); push_int_array(nodes, nodecount); free(nodes); } diff --git a/rgmanager/src/resources/default_event_script.sl b/rgmanager/src/resources/default_event_script.sl index e961266..cecc1f6 100644 --- a/rgmanager/src/resources/default_event_script.sl +++ b/rgmanager/src/resources/default_event_script.sl @@ -31,7 +31,8 @@ define move_or_start(service, node_list) len = length(node_list); if (len == 0) { - debug(service, " is not runnable"); + notice(service, " is not runnable - restricted domain offline"); + ()=service_stop(service); return ERR_DOMAIN; }