From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 16588 invoked by alias); 29 Sep 2008 13:30:43 -0000 Received: (qmail 16574 invoked by alias); 29 Sep 2008 13:30:42 -0000 X-Spam-Status: No, hits=-0.7 required=5.0 tests=AWL,BAYES_20,KAM_MX,SPF_HELO_PASS X-Spam-Check-By: sourceware.org X-Spam-Checker-Version: SpamAssassin 3.2.4 (2008-01-01) on bastion.fedora.phx.redhat.com X-Spam-Level: Subject: STABLE2 - groupd: detect dead daemons and remove node from cluster To: cluster-cvs-relay@redhat.com X-Project: Cluster Project X-Git-Module: cluster.git X-Git-Refname: refs/heads/STABLE2 X-Git-Reftype: branch X-Git-Oldrev: 2468574b5056b45b26787db1e89664abefab043c X-Git-Newrev: 07aaff32a7abf3533bf560e5d2641b893c4bbd80 From: "Ryan O'Hara" Message-Id: <20080929132936.52918120468@lists.fedorahosted.org> Date: Mon, 29 Sep 2008 13:31:00 -0000 X-Scanned-By: MIMEDefang 2.58 on 172.16.52.254 Mailing-List: contact cluster-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: cluster-cvs-owner@sourceware.org X-SW-Source: 2008-q3/txt/msg00547.txt.bz2 Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=07aaff32a7abf3533bf560e5d2641b893c4bbd80 Commit: 07aaff32a7abf3533bf560e5d2641b893c4bbd80 Parent: 2468574b5056b45b26787db1e89664abefab043c Author: Ryan O'Hara AuthorDate: Tue Sep 9 09:57:17 2008 -0500 Committer: Ryan O'Hara CommitterDate: Mon Sep 29 08:29:25 2008 -0500 groupd: detect dead daemons and remove node from cluster If any of the daemons that run within groupd fail unexpectedly, we detect this failure and remove the node from the cluster. These daemons include fenced, dlm_controld, and gfs_controld. If any of these daemons die unexpectedly (or are killed), the cluster is in an invalid state, so the proper thing to do is remove the node from the cluster (cman_leave_cluster). This behavior is enabled by default, but can be turned off with the -s option for groupd. 
For example, 'groupd -s0' will disable this "shutdown mode". (BZ #318571) --- group/daemon/cman.c | 4 ++++ group/daemon/gd_internal.h | 2 ++ group/daemon/main.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 49 insertions(+), 1 deletions(-) diff --git a/group/daemon/cman.c b/group/daemon/cman.c index 1b44269..998197b 100644 --- a/group/daemon/cman.c +++ b/group/daemon/cman.c @@ -13,6 +13,10 @@ static cman_node_t cman_nodes[MAX_NODES]; static int cman_node_count; static char name_buf[CMAN_MAX_NODENAME_LEN+1]; +int shutdown_cman(void) +{ + cman_leave_cluster(ch_admin, CMAN_LEAVEFLAG_FORCE); +} int kill_cman(int nodeid) { diff --git a/group/daemon/gd_internal.h b/group/daemon/gd_internal.h index b56dd92..bb6e284 100644 --- a/group/daemon/gd_internal.h +++ b/group/daemon/gd_internal.h @@ -39,6 +39,7 @@ extern char *prog_name; extern int groupd_debug_opt; extern int groupd_debug_verbose; +extern int groupd_shutdown_opt; extern char groupd_debug_buf[256]; extern char dump_buf[DUMP_SIZE]; extern int dump_point; @@ -266,6 +267,7 @@ void client_dead(int ci); /* cman.c */ int setup_cman(void); int kill_cman(int nodeid); +int shutdown_cman(void); int set_cman_dirty(void); /* cpg.c */ diff --git a/group/daemon/main.c b/group/daemon/main.c index 8e278c5..18b1e50 100644 --- a/group/daemon/main.c +++ b/group/daemon/main.c @@ -3,7 +3,7 @@ #include "gd_internal.h" -#define OPTION_STRING "DhVv" +#define OPTION_STRING "Dhs:Vv" #define LOCKFILE_NAME "/var/run/groupd.pid" #define LOG_FILE "/var/log/groupd.log" @@ -147,6 +147,37 @@ static int kernel_instance_count(char *sysfs_dir) return rv; } +int check_dead_daemons(int ci) +{ + group_t *g; + + if (strncmp(client[ci].type, "fence", 5) == 0) { + list_for_each_entry(g, &gd_groups, list) { + if (client[ci].level == g->level) { + return 1; + } + } + } + + if (strncmp(client[ci].type, "dlm", 3) == 0) { + list_for_each_entry(g, &gd_groups, list) { + if (client[ci].level == g->level) { + return 1; + } + } + } + + if 
(strncmp(client[ci].type, "gfs", 3) == 0) { + list_for_each_entry(g, &gd_groups, list) { + if (client[ci].level == g->level) { + return 1; + } + } + } + + return 0; +} + int check_uncontrolled_groups(void) { pid_t pid; @@ -421,6 +452,11 @@ static void client_alloc(void) void client_dead(int ci) { + if (groupd_shutdown_opt && check_dead_daemons(ci)) { + log_print("%s daemon appears to be dead", client[ci].type); + shutdown_cman(); + } + close(client[ci].fd); client[ci].workfn = NULL; client[ci].fd = -1; @@ -870,6 +906,7 @@ static void print_usage(void) printf("\n"); printf(" -D Enable debugging code and don't fork\n"); printf(" -h Print this help, then exit\n"); + printf(" -s [0|1] Enable (or disable) shutdown mode\n"); printf(" -V Print program version information, then exit\n"); } @@ -892,6 +929,10 @@ static void decode_arguments(int argc, char **argv) exit(EXIT_SUCCESS); break; + case 's': + groupd_shutdown_opt = atoi(optarg); + break; + case 'v': groupd_debug_verbose++; break; @@ -1021,6 +1062,7 @@ void groupd_dump_save(void) char *prog_name; int groupd_debug_opt; int groupd_debug_verbose; +int groupd_shutdown_opt = 1; char groupd_debug_buf[256]; char dump_buf[DUMP_SIZE]; int dump_point;