public inbox for cluster-cvs@sourceware.org
help / color / mirror / Atom feed
* cluster: RHEL48 - rgmanager: Detect restricted failover domain crash
@ 2009-04-03 14:13 Lon Hohberger
0 siblings, 0 replies; only message in thread
From: Lon Hohberger @ 2009-04-03 14:13 UTC (permalink / raw)
To: cluster-cvs-relay
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=4a62d37ed15229356fde8945fa3b5798fb85b663
Commit: 4a62d37ed15229356fde8945fa3b5798fb85b663
Parent: da9f72456bdda7833f8360de92807b0f66cb334a
Author: Lon Hohberger <lhh@redhat.com>
AuthorDate: Mon Aug 25 15:54:44 2008 -0400
Committer: Lon Hohberger <lhh@redhat.com>
CommitterDate: Fri Apr 3 10:12:12 2009 -0400
rgmanager: Detect restricted failover domain crash
Mark service as 'stopped' when it is 'running' but the
node is down. rhbz #428108
---
rgmanager/include/reslist.h | 5 +-
rgmanager/src/daemons/fo_domain.c | 17 ++++-
rgmanager/src/daemons/groups.c | 80 +++++++++++++++++------
rgmanager/src/daemons/members.c | 30 +++++++++
rgmanager/src/daemons/rg_state.c | 27 ++++++--
rgmanager/src/daemons/service_op.c | 15 ++++-
rgmanager/src/daemons/slang_event.c | 23 ++-----
rgmanager/src/resources/default_event_script.sl | 3 +-
8 files changed, 151 insertions(+), 49 deletions(-)
diff --git a/rgmanager/include/reslist.h b/rgmanager/include/reslist.h
index f78288f..4d3feea 100644
--- a/rgmanager/include/reslist.h
+++ b/rgmanager/include/reslist.h
@@ -23,6 +23,7 @@
#include <libxml/parser.h>
#include <libxml/xmlmemory.h>
#include <libxml/xpath.h>
+#include <sets.h>
#define RA_PRIMARY (1<<0) /** Primary key */
@@ -205,8 +206,8 @@ void deconstruct_domains(fod_t **domains);
void print_domains(fod_t **domains);
int node_should_start(uint64_t nodeid, cluster_member_list_t *membership,
char *rg_name, fod_t **domains);
-int node_domain_set(fod_t *domain, uint64_t **ret, int *retlen);
-int node_domain_set_safe(char *domainname, uint64_t **ret, int *retlen, int *flags);
+int node_domain_set(fod_t **domains, char *name, set_type_t **ret, int *retlen, int *flags);
+int node_domain_set_safe(char *domainname, set_type_t **ret, int *retlen, int *flags);
/*
diff --git a/rgmanager/src/daemons/fo_domain.c b/rgmanager/src/daemons/fo_domain.c
index 9019a10..be1918d 100644
--- a/rgmanager/src/daemons/fo_domain.c
+++ b/rgmanager/src/daemons/fo_domain.c
@@ -349,13 +349,24 @@ node_in_domain(char *nodename, fod_t *domain,
int
-node_domain_set(fod_t *domain, uint64_t **ret, int *retlen)
+node_domain_set(fod_t **domains, char *name, set_type_t **ret, int *retlen, int *flags)
{
int x, i, j;
set_type_t *tmpset;
int ts_count;
-
fod_node_t *fodn;
+ fod_t *domain;
+ int found = 0;
+
+ list_for(domains, domain, x) {
+ if (!strcasecmp(domain->fd_name, name)) {
+ found = 1;
+ break;
+ }
+ } // while (!list_done(&_domains, fod));
+
+ if (!found)
+ return -1;
/* Count domain length */
list_for(&domain->fd_nodes, fodn, x) { }
@@ -368,6 +379,8 @@ node_domain_set(fod_t *domain, uint64_t **ret, int *retlen)
if (!(*tmpset))
return -1;
+ *flags = domain->fd_flags;
+
if (domain->fd_flags & FOD_ORDERED) {
for (i = 1; i <= 100; i++) {
diff --git a/rgmanager/src/daemons/groups.c b/rgmanager/src/daemons/groups.c
index d1fe3db..a0816d5 100644
--- a/rgmanager/src/daemons/groups.c
+++ b/rgmanager/src/daemons/groups.c
@@ -18,9 +18,10 @@
MA 02139, USA.
*/
//#define DEBUG
+#include <sets.h>
#include <platform.h>
-#include <resgroup.h>
#include <restart_counter.h>
+#include <resgroup.h>
#include <reslist.h>
#include <vf.h>
#include <magma.h>
@@ -60,6 +61,8 @@ pthread_rwlock_t resource_lock = PTHREAD_RWLOCK_INITIALIZER;
void res_build_name(char *, size_t, resource_t *);
int group_migratory(char *groupname, int lock);
+int group_property(char *groupname, char *property, char *ret, size_t len);
+int member_online_set(set_type_t **nodes, int *nodecount);
struct status_arg {
@@ -88,25 +91,11 @@ node_should_start_safe(uint64_t nodeid, cluster_member_list_t *membership,
int
-node_domain_set_safe(char *domainname, uint64_t **ret, int *retlen, int *flags)
+node_domain_set_safe(char *domainname, set_type_t **ret, int *retlen, int *flags)
{
- fod_t *fod;
- int rv = -1, found = 0, x = 0;
-
+ int rv = 0;
pthread_rwlock_rdlock(&resource_lock);
-
- list_for(&_domains, fod, x) {
- if (!strcasecmp(fod->fd_name, domainname)) {
- found = 1;
- break;
- }
- } // while (!list_done(&_domains, fod));
-
- if (found) {
- rv = node_domain_set(fod, ret, retlen);
- *flags = fod->fd_flags;
- }
-
+ rv = node_domain_set(&_domains, domainname, ret, retlen, flags);
pthread_rwlock_unlock(&resource_lock);
return rv;
@@ -440,6 +429,52 @@ check_depend_safe(char *rg_name)
}
+int
+check_rdomain_crash(char *svcName)
+{
+ set_type_t *nodes = NULL;
+ set_type_t *fd_nodes = NULL;
+ set_type_t *isect = NULL;
+ int nodecount;
+ int fd_nodecount, fl;
+ int icount;
+ char fd_name[256];
+
+ if (group_property(svcName, "domain", fd_name, sizeof(fd_name)) != 0)
+ goto out_free;
+
+ member_online_set(&nodes, &nodecount);
+
+ if (node_domain_set(&_domains, fd_name, &fd_nodes,
+ &fd_nodecount, &fl) != 0)
+ goto out_free;
+
+ if (!(fl & FOD_RESTRICTED))
+ goto out_free;
+
+ if (s_intersection(fd_nodes, fd_nodecount, nodes, nodecount,
+ &isect, &icount) < 0)
+ goto out_free;
+
+ if (icount == 0) {
+ clulog(LOG_DEBUG, "Marking %s as stopped: "
+ "Restricted domain unavailable\n", svcName);
+ rt_enqueue_request(svcName, RG_STOP, -1, 0, 0,
+ 0, 0);
+ }
+
+out_free:
+ if (fd_nodes)
+ free(fd_nodes);
+ if (nodes)
+ free(nodes);
+ if (isect)
+ free(isect);
+
+ return 0;
+}
+
+
/**
Start or failback a resource group: if it's not running, start it.
If it is running and we're a better member to run it, then ask for
@@ -453,6 +488,7 @@ consider_start(resource_node_t *node, char *svcName, rg_state_t *svcStatus,
cluster_member_t *mp;
int autostart, exclusive;
void *lockp = NULL;
+ int fod_ret;
mp = memb_id_to_p(membership, my_id());
assert(mp);
@@ -545,10 +581,13 @@ consider_start(resource_node_t *node, char *svcName, rg_state_t *svcStatus,
* Start any stopped services, or started services
* that are owned by a down node.
*/
- if (node_should_start(mp->cm_id, membership, svcName, &_domains) ==
- FOD_BEST)
+ fod_ret = node_should_start(mp->cm_id, membership,
+ svcName, &_domains);
+ if (fod_ret == FOD_BEST)
rt_enqueue_request(svcName, RG_START, -1, 0, mp->cm_id,
0, 0);
+ else if (fod_ret == FOD_ILLEGAL)
+ check_rdomain_crash(svcName);
}
@@ -979,7 +1018,6 @@ group_property_unlocked(char *groupname, char *property, char *ret,
}
-
/**
Send the state of a resource group to a given file descriptor.
diff --git a/rgmanager/src/daemons/members.c b/rgmanager/src/daemons/members.c
index 910d174..6fc1327 100644
--- a/rgmanager/src/daemons/members.c
+++ b/rgmanager/src/daemons/members.c
@@ -16,6 +16,7 @@
Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
MA 02139, USA.
*/
+#include <sets.h>
#include <pthread.h>
#include <magma.h>
#include <magmamsg.h>
@@ -94,6 +95,35 @@ member_list(void)
}
+int
+member_online_set(set_type_t **nodes, int *nodecount)
+{
+ int ret = 1, i;
+
+ pthread_rwlock_rdlock(&memblock);
+ if (!membership)
+ goto out_unlock;
+
+ *nodes = malloc(sizeof(set_type_t) * membership->cml_count);
+ if (!*nodes)
+ goto out_unlock;
+
+ *nodecount = 0;
+ for (i = 0; i < membership->cml_count; i++) {
+ if (membership->cml_members[i].cm_state &&
+ membership->cml_members[i].cm_id > 0) {
+ (*nodes)[*nodecount] = membership->cml_members[i].cm_id;
+ ++(*nodecount);
+ }
+ }
+
+ ret = 0;
+out_unlock:
+ pthread_rwlock_unlock(&memblock);
+ return ret;
+}
+
+
char *
member_name(uint64_t id, char *buf, int buflen)
{
diff --git a/rgmanager/src/daemons/rg_state.c b/rgmanager/src/daemons/rg_state.c
index c717ecc..386d569 100644
--- a/rgmanager/src/daemons/rg_state.c
+++ b/rgmanager/src/daemons/rg_state.c
@@ -17,6 +17,7 @@
MA 02139, USA.
*/
//#define DEBUG
+#include <sets.h>
#include <assert.h>
#include <platform.h>
#include <magma.h>
@@ -30,6 +31,7 @@
#include <ccs.h>
#include <rg_queue.h>
#include <msgsimple.h>
+#include <event.h>
#define cm_svccount cm_pad[0] /* Theses are uint8_t size */
#define cm_svcexcl cm_pad[1]
@@ -46,6 +48,7 @@ int get_rg_state(char *servicename, rg_state_t *svcblk);
void get_recovery_policy(char *rg_name, char *buf, size_t buflen);
int have_exclusive_resources(void);
int check_exclusive_resources(cluster_member_list_t *membership, char *svcName);
+int count_resource_groups_local(cluster_member_t *mp);
pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER;
@@ -433,9 +436,12 @@ get_rg_state_local(char *rgname, rg_state_t *svcblk)
* @param req Specify request to perform
* @return 0 = DO RG_NOT stop service, return RG_EFAIL
* 1 = STOP service - return whatever it returns.
- * 2 = DO RG_NOT stop service, return 0 (success)
- * 3 = DO RG_NOT stop service, return RG_EFORWARD
- * 4 = DO RG_NOT stop service, return RG_EAGAIN
+ * 2 = DO NOT stop service, return 0 (success)
+ * 3 = DO NOT stop service, return RG_EFORWARD
+ * 4 = DO NOT stop service, return RG_EAGAIN
+ * 5 = DO NOT stop service, return RG_EFROZEN
+ * 6 = DO NOT stop service, mark stopped and return
+ * RG_SUCCESS (0)
*/
int
svc_advise_stop(rg_state_t *svcStatus, char *svcName, int req)
@@ -494,9 +500,10 @@ svc_advise_stop(rg_state_t *svcStatus, char *svcName, int req)
/*
Service is marked as running but node is down.
- Doesn't make much sense to stop it.
+ Doesn't make much sense to stop it - but we need
+ to mark it stopped
*/
- ret = 2;
+ ret = 6;
break;
case RG_STATE_ERROR:
@@ -929,6 +936,16 @@ _svc_stop(char *svcName, int req, int recover, uint32_t newstate)
clulog(LOG_DEBUG, "Unable to stop %s in %s state\n",
svcName, rg_state_str(svcStatus.rs_state));
return RG_EFAIL;
+ case 6:
+ /* Mark stopped, but do not do anything */
+ svcStatus.rs_last_owner = svcStatus.rs_owner;
+ svcStatus.rs_owner = 0;
+ svcStatus.rs_state = RG_STATE_STOPPED;
+ if (set_rg_state(svcName, &svcStatus) != 0) {
+ rg_unlock(svcName, lockp);
+ return RG_EFAIL;
+ }
+ /* FALLTHROUGH */
case 2:
rg_unlock(svcName, lockp);
return RG_ESUCCESS;
diff --git a/rgmanager/src/daemons/service_op.c b/rgmanager/src/daemons/service_op.c
index 3c02688..48fbd2d 100644
--- a/rgmanager/src/daemons/service_op.c
+++ b/rgmanager/src/daemons/service_op.c
@@ -21,6 +21,7 @@
#include <magmamsg.h>
#include <stdio.h>
#include <string.h>
+#include <sets.h>
#include <resgroup.h>
#include <clulog.h>
#include <rg_locks.h>
@@ -153,8 +154,18 @@ service_op_stop(char *svcName, int do_disable, int event_type)
if (get_service_state_internal(svcName, &svcStatus) < 0)
return RG_EFAIL;
- if (svcStatus.rs_owner != NODE_ID_NONE)
- msgtarget = svcStatus.rs_owner;
+
+ if (svcStatus.rs_owner != NODE_ID_NONE) {
+ if (member_online(svcStatus.rs_owner)) {
+ msgtarget = svcStatus.rs_owner;
+ } else {
+ /* If the owner is not online,
+ mark the service as 'stopped' but
+ otherwise, do nothing.
+ */
+ return svc_stop(svcName, RG_STOP);
+ }
+ }
if ((fd = msg_open(msgtarget, RG_PORT, RG_PURPOSE, 2)) < 0) {
clulog(LOG_ERR,
diff --git a/rgmanager/src/daemons/slang_event.c b/rgmanager/src/daemons/slang_event.c
index d3a522b..6e17db0 100644
--- a/rgmanager/src/daemons/slang_event.c
+++ b/rgmanager/src/daemons/slang_event.c
@@ -19,10 +19,12 @@
@file S/Lang event handling & intrinsic functions + vars
*/
#include <platform.h>
+#include <sets.h>
#include <resgroup.h>
#include <list.h>
#include <restart_counter.h>
#include <reslist.h>
+#include <resgroup.h>
#include <clulog.h>
#include <magma.h>
#include <magmamsg.h>
@@ -35,7 +37,6 @@
#include <sys/syslog.h>
#include <malloc.h>
#include <clulog.h>
-#include <sets.h>
#include <signal.h>
static int __sl_initialized = 0;
@@ -46,6 +47,8 @@ static int _service_list_len = 0;
char **get_service_names(int *len); /* from groups.c */
int get_service_property(char *rg_name, char *prop, char *buf, size_t buflen);
void push_int_array(set_type_t *stuff, int len);
+int member_online_set(set_type_t **nodes, int *nodecount);
+
/* ================================================================
@@ -604,25 +607,13 @@ push_int_array(set_type_t *stuff, int len)
void
sl_nodes_online(void)
{
- int i, nodecount = 0;
set_type_t *nodes;
+ int nodecount = 0, x = 0;
- cluster_member_list_t *membership = member_list();
- if (!membership)
- return;
- nodes = malloc(sizeof(set_type_t) * membership->cml_count);
- if (!nodes)
+ x = member_online_set(&nodes, &nodecount);
+ if (x < 0 || !nodes || !nodecount)
return;
- nodecount = 0;
- for (i = 0; i < membership->cml_count; i++) {
- if (membership->cml_members[i].cm_state &&
- membership->cml_members[i].cm_id != 0) {
- nodes[nodecount] = membership->cml_members[i].cm_id;
- ++nodecount;
- }
- }
- cml_free(membership);
push_int_array(nodes, nodecount);
free(nodes);
}
diff --git a/rgmanager/src/resources/default_event_script.sl b/rgmanager/src/resources/default_event_script.sl
index e961266..cecc1f6 100644
--- a/rgmanager/src/resources/default_event_script.sl
+++ b/rgmanager/src/resources/default_event_script.sl
@@ -31,7 +31,8 @@ define move_or_start(service, node_list)
len = length(node_list);
if (len == 0) {
- debug(service, " is not runnable");
+ notice(service, " is not runnable - restricted domain offline");
+ ()=service_stop(service);
return ERR_DOMAIN;
}
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2009-04-03 14:13 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-03 14:13 cluster: RHEL48 - rgmanager: Detect restricted failover domain crash Lon Hohberger
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).