public inbox for cluster-cvs@sourceware.org help / color / mirror / Atom feed
From: Lon Hohberger <lon@fedoraproject.org> To: cluster-cvs-relay@redhat.com Subject: cluster: RHEL48 - rgmanager: Detect restricted failover domain crash Date: Fri, 03 Apr 2009 14:13:00 -0000 [thread overview] Message-ID: <20090403141230.BC09D1201D6@lists.fedorahosted.org> (raw) Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=4a62d37ed15229356fde8945fa3b5798fb85b663 Commit: 4a62d37ed15229356fde8945fa3b5798fb85b663 Parent: da9f72456bdda7833f8360de92807b0f66cb334a Author: Lon Hohberger <lhh@redhat.com> AuthorDate: Mon Aug 25 15:54:44 2008 -0400 Committer: Lon Hohberger <lhh@redhat.com> CommitterDate: Fri Apr 3 10:12:12 2009 -0400 rgmanager: Detect restricted failover domain crash Mark service as 'stopped' when it is 'running' but the node is down. rhbz #428108 --- rgmanager/include/reslist.h | 5 +- rgmanager/src/daemons/fo_domain.c | 17 ++++- rgmanager/src/daemons/groups.c | 80 +++++++++++++++++------ rgmanager/src/daemons/members.c | 30 +++++++++ rgmanager/src/daemons/rg_state.c | 27 ++++++-- rgmanager/src/daemons/service_op.c | 15 ++++- rgmanager/src/daemons/slang_event.c | 23 ++----- rgmanager/src/resources/default_event_script.sl | 3 +- 8 files changed, 151 insertions(+), 49 deletions(-) diff --git a/rgmanager/include/reslist.h b/rgmanager/include/reslist.h index f78288f..4d3feea 100644 --- a/rgmanager/include/reslist.h +++ b/rgmanager/include/reslist.h @@ -23,6 +23,7 @@ #include <libxml/parser.h> #include <libxml/xmlmemory.h> #include <libxml/xpath.h> +#include <sets.h> #define RA_PRIMARY (1<<0) /** Primary key */ @@ -205,8 +206,8 @@ void deconstruct_domains(fod_t **domains); void print_domains(fod_t **domains); int node_should_start(uint64_t nodeid, cluster_member_list_t *membership, char *rg_name, fod_t **domains); -int node_domain_set(fod_t *domain, uint64_t **ret, int *retlen); -int node_domain_set_safe(char *domainname, uint64_t **ret, int *retlen, int *flags); +int node_domain_set(fod_t **domains, char *name, set_type_t 
**ret, int *retlen, int *flags); +int node_domain_set_safe(char *domainname, set_type_t **ret, int *retlen, int *flags); /* diff --git a/rgmanager/src/daemons/fo_domain.c b/rgmanager/src/daemons/fo_domain.c index 9019a10..be1918d 100644 --- a/rgmanager/src/daemons/fo_domain.c +++ b/rgmanager/src/daemons/fo_domain.c @@ -349,13 +349,24 @@ node_in_domain(char *nodename, fod_t *domain, int -node_domain_set(fod_t *domain, uint64_t **ret, int *retlen) +node_domain_set(fod_t **domains, char *name, set_type_t **ret, int *retlen, int *flags) { int x, i, j; set_type_t *tmpset; int ts_count; - fod_node_t *fodn; + fod_t *domain; + int found = 0; + + list_for(domains, domain, x) { + if (!strcasecmp(domain->fd_name, name)) { + found = 1; + break; + } + } // while (!list_done(&_domains, fod)); + + if (!found) + return -1; /* Count domain length */ list_for(&domain->fd_nodes, fodn, x) { } @@ -368,6 +379,8 @@ node_domain_set(fod_t *domain, uint64_t **ret, int *retlen) if (!(*tmpset)) return -1; + *flags = domain->fd_flags; + if (domain->fd_flags & FOD_ORDERED) { for (i = 1; i <= 100; i++) { diff --git a/rgmanager/src/daemons/groups.c b/rgmanager/src/daemons/groups.c index d1fe3db..a0816d5 100644 --- a/rgmanager/src/daemons/groups.c +++ b/rgmanager/src/daemons/groups.c @@ -18,9 +18,10 @@ MA 02139, USA. 
*/ //#define DEBUG +#include <sets.h> #include <platform.h> -#include <resgroup.h> #include <restart_counter.h> +#include <resgroup.h> #include <reslist.h> #include <vf.h> #include <magma.h> @@ -60,6 +61,8 @@ pthread_rwlock_t resource_lock = PTHREAD_RWLOCK_INITIALIZER; void res_build_name(char *, size_t, resource_t *); int group_migratory(char *groupname, int lock); +int group_property(char *groupname, char *property, char *ret, size_t len); +int member_online_set(set_type_t **nodes, int *nodecount); struct status_arg { @@ -88,25 +91,11 @@ node_should_start_safe(uint64_t nodeid, cluster_member_list_t *membership, int -node_domain_set_safe(char *domainname, uint64_t **ret, int *retlen, int *flags) +node_domain_set_safe(char *domainname, set_type_t **ret, int *retlen, int *flags) { - fod_t *fod; - int rv = -1, found = 0, x = 0; - + int rv = 0; pthread_rwlock_rdlock(&resource_lock); - - list_for(&_domains, fod, x) { - if (!strcasecmp(fod->fd_name, domainname)) { - found = 1; - break; - } - } // while (!list_done(&_domains, fod)); - - if (found) { - rv = node_domain_set(fod, ret, retlen); - *flags = fod->fd_flags; - } - + rv = node_domain_set(&_domains, domainname, ret, retlen, flags); pthread_rwlock_unlock(&resource_lock); return rv; @@ -440,6 +429,52 @@ check_depend_safe(char *rg_name) } +int +check_rdomain_crash(char *svcName) +{ + set_type_t *nodes = NULL; + set_type_t *fd_nodes = NULL; + set_type_t *isect = NULL; + int nodecount; + int fd_nodecount, fl; + int icount; + char fd_name[256]; + + if (group_property(svcName, "domain", fd_name, sizeof(fd_name)) != 0) + goto out_free; + + member_online_set(&nodes, &nodecount); + + if (node_domain_set(&_domains, fd_name, &fd_nodes, + &fd_nodecount, &fl) != 0) + goto out_free; + + if (!(fl & FOD_RESTRICTED)) + goto out_free; + + if (s_intersection(fd_nodes, fd_nodecount, nodes, nodecount, + &isect, &icount) < 0) + goto out_free; + + if (icount == 0) { + clulog(LOG_DEBUG, "Marking %s as stopped: " + "Restricted domain 
unavailable\n", svcName); + rt_enqueue_request(svcName, RG_STOP, -1, 0, 0, + 0, 0); + } + +out_free: + if (fd_nodes) + free(fd_nodes); + if (nodes) + free(nodes); + if (isect) + free(isect); + + return 0; +} + + /** Start or failback a resource group: if it's not running, start it. If it is running and we're a better member to run it, then ask for @@ -453,6 +488,7 @@ consider_start(resource_node_t *node, char *svcName, rg_state_t *svcStatus, cluster_member_t *mp; int autostart, exclusive; void *lockp = NULL; + int fod_ret; mp = memb_id_to_p(membership, my_id()); assert(mp); @@ -545,10 +581,13 @@ consider_start(resource_node_t *node, char *svcName, rg_state_t *svcStatus, * Start any stopped services, or started services * that are owned by a down node. */ - if (node_should_start(mp->cm_id, membership, svcName, &_domains) == - FOD_BEST) + fod_ret = node_should_start(mp->cm_id, membership, + svcName, &_domains); + if (fod_ret == FOD_BEST) rt_enqueue_request(svcName, RG_START, -1, 0, mp->cm_id, 0, 0); + else if (fod_ret == FOD_ILLEGAL) + check_rdomain_crash(svcName); } @@ -979,7 +1018,6 @@ group_property_unlocked(char *groupname, char *property, char *ret, } - /** Send the state of a resource group to a given file descriptor. diff --git a/rgmanager/src/daemons/members.c b/rgmanager/src/daemons/members.c index 910d174..6fc1327 100644 --- a/rgmanager/src/daemons/members.c +++ b/rgmanager/src/daemons/members.c @@ -16,6 +16,7 @@ Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ +#include <sets.h> #include <pthread.h> #include <magma.h> #include <magmamsg.h> @@ -94,6 +95,35 @@ member_list(void) } +int +member_online_set(set_type_t **nodes, int *nodecount) +{ + int ret = 1, i; + + pthread_rwlock_rdlock(&memblock); + if (!membership) + goto out_unlock; + + *nodes = malloc(sizeof(set_type_t) * membership->cml_count); + if (!*nodes) + goto out_unlock; + + *nodecount = 0; + for (i = 0; i < membership->cml_count; i++) { + if (membership->cml_members[i].cm_state && + membership->cml_members[i].cm_id > 0) { + (*nodes)[*nodecount] = membership->cml_members[i].cm_id; + ++(*nodecount); + } + } + + ret = 0; +out_unlock: + pthread_rwlock_unlock(&memblock); + return ret; +} + + char * member_name(uint64_t id, char *buf, int buflen) { diff --git a/rgmanager/src/daemons/rg_state.c b/rgmanager/src/daemons/rg_state.c index c717ecc..386d569 100644 --- a/rgmanager/src/daemons/rg_state.c +++ b/rgmanager/src/daemons/rg_state.c @@ -17,6 +17,7 @@ MA 02139, USA. */ //#define DEBUG +#include <sets.h> #include <assert.h> #include <platform.h> #include <magma.h> @@ -30,6 +31,7 @@ #include <ccs.h> #include <rg_queue.h> #include <msgsimple.h> +#include <event.h> #define cm_svccount cm_pad[0] /* Theses are uint8_t size */ #define cm_svcexcl cm_pad[1] @@ -46,6 +48,7 @@ int get_rg_state(char *servicename, rg_state_t *svcblk); void get_recovery_policy(char *rg_name, char *buf, size_t buflen); int have_exclusive_resources(void); int check_exclusive_resources(cluster_member_list_t *membership, char *svcName); +int count_resource_groups_local(cluster_member_t *mp); pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -433,9 +436,12 @@ get_rg_state_local(char *rgname, rg_state_t *svcblk) * @param req Specify request to perform * @return 0 = DO RG_NOT stop service, return RG_EFAIL * 1 = STOP service - return whatever it returns. 
- * 2 = DO RG_NOT stop service, return 0 (success) - * 3 = DO RG_NOT stop service, return RG_EFORWARD - * 4 = DO RG_NOT stop service, return RG_EAGAIN + * 2 = DO NOT stop service, return 0 (success) + * 3 = DO NOT stop service, return RG_EFORWARD + * 4 = DO NOT stop service, return RG_EAGAIN + * 5 = DO NOT stop service, return RG_EFROZEN + * 6 = DO NOT stop service, mark stopped and return + * RG_SUCCESS (0) */ int svc_advise_stop(rg_state_t *svcStatus, char *svcName, int req) @@ -494,9 +500,10 @@ svc_advise_stop(rg_state_t *svcStatus, char *svcName, int req) /* Service is marked as running but node is down. - Doesn't make much sense to stop it. + Doesn't make much sense to stop it - but we need + to mark it stopped */ - ret = 2; + ret = 6; break; case RG_STATE_ERROR: @@ -929,6 +936,16 @@ _svc_stop(char *svcName, int req, int recover, uint32_t newstate) clulog(LOG_DEBUG, "Unable to stop %s in %s state\n", svcName, rg_state_str(svcStatus.rs_state)); return RG_EFAIL; + case 6: + /* Mark stopped, but do not do anything */ + svcStatus.rs_last_owner = svcStatus.rs_owner; + svcStatus.rs_owner = 0; + svcStatus.rs_state = RG_STATE_STOPPED; + if (set_rg_state(svcName, &svcStatus) != 0) { + rg_unlock(svcName, lockp); + return RG_EFAIL; + } + /* FALLTHROUGH */ case 2: rg_unlock(svcName, lockp); return RG_ESUCCESS; diff --git a/rgmanager/src/daemons/service_op.c b/rgmanager/src/daemons/service_op.c index 3c02688..48fbd2d 100644 --- a/rgmanager/src/daemons/service_op.c +++ b/rgmanager/src/daemons/service_op.c @@ -21,6 +21,7 @@ #include <magmamsg.h> #include <stdio.h> #include <string.h> +#include <sets.h> #include <resgroup.h> #include <clulog.h> #include <rg_locks.h> @@ -153,8 +154,18 @@ service_op_stop(char *svcName, int do_disable, int event_type) if (get_service_state_internal(svcName, &svcStatus) < 0) return RG_EFAIL; - if (svcStatus.rs_owner != NODE_ID_NONE) - msgtarget = svcStatus.rs_owner; + + if (svcStatus.rs_owner != NODE_ID_NONE) { + if 
(member_online(svcStatus.rs_owner)) { + msgtarget = svcStatus.rs_owner; + } else { + /* If the owner is not online, + mark the service as 'stopped' but + otherwise, do nothing. + */ + return svc_stop(svcName, RG_STOP); + } + } if ((fd = msg_open(msgtarget, RG_PORT, RG_PURPOSE, 2)) < 0) { clulog(LOG_ERR, diff --git a/rgmanager/src/daemons/slang_event.c b/rgmanager/src/daemons/slang_event.c index d3a522b..6e17db0 100644 --- a/rgmanager/src/daemons/slang_event.c +++ b/rgmanager/src/daemons/slang_event.c @@ -19,10 +19,12 @@ @file S/Lang event handling & intrinsic functions + vars */ #include <platform.h> +#include <sets.h> #include <resgroup.h> #include <list.h> #include <restart_counter.h> #include <reslist.h> +#include <resgroup.h> #include <clulog.h> #include <magma.h> #include <magmamsg.h> @@ -35,7 +37,6 @@ #include <sys/syslog.h> #include <malloc.h> #include <clulog.h> -#include <sets.h> #include <signal.h> static int __sl_initialized = 0; @@ -46,6 +47,8 @@ static int _service_list_len = 0; char **get_service_names(int *len); /* from groups.c */ int get_service_property(char *rg_name, char *prop, char *buf, size_t buflen); void push_int_array(set_type_t *stuff, int len); +int member_online_set(set_type_t **nodes, int *nodecount); + /* ================================================================ @@ -604,25 +607,13 @@ push_int_array(set_type_t *stuff, int len) void sl_nodes_online(void) { - int i, nodecount = 0; set_type_t *nodes; + int nodecount = 0, x = 0; - cluster_member_list_t *membership = member_list(); - if (!membership) - return; - nodes = malloc(sizeof(set_type_t) * membership->cml_count); - if (!nodes) + x = member_online_set(&nodes, &nodecount); + if (x < 0 || !nodes || !nodecount) return; - nodecount = 0; - for (i = 0; i < membership->cml_count; i++) { - if (membership->cml_members[i].cm_state && - membership->cml_members[i].cm_id != 0) { - nodes[nodecount] = membership->cml_members[i].cm_id; - ++nodecount; - } - } - cml_free(membership); 
push_int_array(nodes, nodecount); free(nodes); } diff --git a/rgmanager/src/resources/default_event_script.sl b/rgmanager/src/resources/default_event_script.sl index e961266..cecc1f6 100644 --- a/rgmanager/src/resources/default_event_script.sl +++ b/rgmanager/src/resources/default_event_script.sl @@ -31,7 +31,8 @@ define move_or_start(service, node_list) len = length(node_list); if (len == 0) { - debug(service, " is not runnable"); + notice(service, " is not runnable - restricted domain offline"); + ()=service_stop(service); return ERR_DOMAIN; }
reply other threads:[~2009-04-03 14:13 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20090403141230.BC09D1201D6@lists.fedorahosted.org \ --to=lon@fedoraproject.org \ --cc=cluster-cvs-relay@redhat.com \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).