From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 13195 invoked by alias); 27 Aug 2008 19:20:32 -0000 Received: (qmail 13189 invoked by alias); 27 Aug 2008 19:20:32 -0000 X-Spam-Status: No, hits=-0.8 required=5.0 tests=AWL,BAYES_00,J_CHICKENPOX_33,J_CHICKENPOX_64,J_CHICKENPOX_74,KAM_MX,SPF_HELO_PASS X-Spam-Check-By: sourceware.org X-Spam-Checker-Version: SpamAssassin 3.2.4 (2008-01-01) on bastion.fedora.phx.redhat.com X-Spam-Level: Subject: master - fence_tool: new option to delay before join To: cluster-cvs-relay@redhat.com X-Project: Cluster Project X-Git-Module: cluster.git X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: 32849ba0f7e022ca5de30d043de5fe8c8c7ab982 X-Git-Newrev: 809e1e9fa79b4bf003fc137b2a8291e709d03b89 From: David Teigland Message-Id: <20080827191924.82CDA12036B@lists.fedorahosted.org> Date: Wed, 27 Aug 2008 19:27:00 -0000 X-Scanned-By: MIMEDefang 2.58 on 172.16.52.254 Mailing-List: contact cluster-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: cluster-cvs-owner@sourceware.org X-SW-Source: 2008-q3/txt/msg00338.txt.bz2 Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=809e1e9fa79b4bf003fc137b2a8291e709d03b89 Commit: 809e1e9fa79b4bf003fc137b2a8291e709d03b89 Parent: 32849ba0f7e022ca5de30d043de5fe8c8c7ab982 Author: David Teigland AuthorDate: Wed Aug 27 14:08:07 2008 -0500 Committer: David Teigland CommitterDate: Wed Aug 27 14:08:07 2008 -0500 fence_tool: new option to delay before join bz 460190 Certain network/switch settings cause nodes to form partitioned clusters when they start up. Add code to better cope with these initial partitions. The network partitions are a particular problem for two_node clusters where a node has quorum when it starts up on its own. This adds a new fence_tool option -m, e.g. fence_tool join -m <seconds>. 
It causes fence_tool to delay the join by up to <seconds> to allow all nodes in cluster.conf to become cluster members. This allows openais on the nodes to all see each other before starting the fence domain. So we join the domain *after* the nodes merge into a single cluster. If we joined the domain *before* the cluster partition merged, then nodes end up being fenced unnecessarily. (This is a similar idea to post_join_delay; a delay that gives us time to determine that a node in an unknown state is actually ok and doesn't require fencing.) Signed-off-by: David Teigland --- fence/fence_tool/fence_tool.c | 169 +++++++++++++++++++++++++++++++++------- fence/man/fence_tool.8 | 7 +- 2 files changed, 144 insertions(+), 32 deletions(-) diff --git a/fence/fence_tool/fence_tool.c b/fence/fence_tool/fence_tool.c index 95f4ba1..8e4040b 100644 --- a/fence/fence_tool/fence_tool.c +++ b/fence/fence_tool/fence_tool.c @@ -27,20 +27,28 @@ #define DEFAULT_WAIT_TIMEOUT 300 /* five minutes */ -#define die(fmt, args...) \ -do { \ - fprintf(stderr, "%s: ", prog_name); \ - fprintf(stderr, fmt "\n", ##args); \ - exit(EXIT_FAILURE); \ -} while (0) +#define MAX_NODES 128 +int all_nodeids[MAX_NODES]; +int all_nodeids_count; +cman_node_t cman_nodes[MAX_NODES]; +int cman_nodes_count; +struct fenced_node nodes[MAX_NODES]; char *prog_name; int operation; int verbose = 0; int inquorate_fail = 0; int wait_join = 0; /* default: don't wait for join */ int wait_leave = 0; /* default: don't wait for leave */ -int wait_timeout = DEFAULT_WAIT_TIMEOUT; /* applies to all waits */ +int wait_members = 0; /* default: don't wait for members */ +int wait_timeout = DEFAULT_WAIT_TIMEOUT; + +#define die(fmt, args...) 
\ +do { \ + fprintf(stderr, "%s: ", prog_name); \ + fprintf(stderr, fmt "\n", ##args); \ + exit(EXIT_FAILURE); \ +} while (0) static int do_write(int fd, void *buf, size_t count) { @@ -116,7 +124,7 @@ static int we_are_in_fence_domain(void) return 0; } -static void do_wait(int joining) +static void wait_domain(int joining) { int in, tries = 0; @@ -144,10 +152,65 @@ static void do_wait(int joining) printf("Error %s the fence group.\n", joining ? "joining" : "leaving"); } -static void wait_quorum(void) +static void read_ccs_nodeids(int cd) +{ + char path[PATH_MAX]; + char *nodeid_str; + int i, error; + + memset(all_nodeids, 0, sizeof(all_nodeids)); + all_nodeids_count = 0; + + for (i = 1; ; i++) { + nodeid_str = NULL; + memset(path, 0, sizeof(path)); + sprintf(path, "/cluster/clusternodes/clusternode[%d]/@nodeid", i); + + error = ccs_get(cd, path, &nodeid_str); + if (error || !nodeid_str) + break; + + all_nodeids[all_nodeids_count++] = atoi(nodeid_str); + free(nodeid_str); + } +} + +static int all_nodeids_are_members(cman_handle_t ch) +{ + int i, j, rv, found; + + memset(&cman_nodes, 0, sizeof(cman_nodes)); + cman_nodes_count = 0; + + rv = cman_get_nodes(ch, MAX_NODES, &cman_nodes_count, cman_nodes); + if (rv < 0) { + printf("cman_get_nodes error %d %d\n", rv, errno); + return 0; + } + + for (i = 0; i < all_nodeids_count; i++) { + found = 0; + + for (j = 0; j < cman_nodes_count; j++) { + if (cman_nodes[j].cn_nodeid == all_nodeids[i] && + cman_nodes[j].cn_member) { + found = 1; + break; + } + } + + if (!found) + return 0; + } + return 1; +} + +static void wait_cman(void) { cman_handle_t ch; - int rv, try_init = 0, try_active = 0, try_quorate = 0; + int try_init = 0, try_active = 0, try_quorate = 0; + int try_ccs = 0, try_members = 0; + int rv, cd; while (1) { ch = cman_init(NULL); @@ -157,8 +220,11 @@ static void wait_quorum(void) if (inquorate_fail) goto fail; - if (try_init++ >= wait_timeout) - goto fail_err; + if (try_init++ >= wait_timeout) { + printf("%s: timed 
out waiting for cman init\n", + prog_name); + goto fail; + } if (!(try_init % 10)) printf("%s: waiting for cman to start\n", prog_name); @@ -174,12 +240,14 @@ static void wait_quorum(void) if (inquorate_fail) goto fail; - if (try_active++ >= wait_timeout) - goto fail_err; + if (try_active++ >= wait_timeout) { + printf("%s: timed out waiting for cman active\n", + prog_name); + goto fail; + } if (!(try_active % 10)) - printf("%s: waiting for cman to be active\n",prog_name); - + printf("%s: waiting for cman active\n", prog_name); sleep(1); } @@ -191,22 +259,61 @@ static void wait_quorum(void) if (inquorate_fail) goto fail; - if (try_quorate++ >= wait_timeout) - goto fail_err; + if (try_quorate++ >= wait_timeout) { + printf("%s: timed out waiting for cman quorum\n", + prog_name); + goto fail; + } if (!(try_quorate % 10)) - printf("%s: waiting for cluster quorum\n", prog_name); + printf("%s: waiting for cman quorum\n", prog_name); + + sleep(1); + } + + while (1) { + cd = ccs_connect(); + if (cd > 0) + break; + + if (try_ccs++ >= wait_timeout) { + printf("%s: timed out waiting for ccs connect\n", + prog_name); + goto fail; + } + + if (!(try_ccs % 10)) + printf("%s: waiting for ccs connect\n", prog_name); sleep(1); } + if (!wait_members) + goto out; + read_ccs_nodeids(cd); + + while (1) { + rv = all_nodeids_are_members(ch); + if (rv) + break; + + if (try_members++ >= wait_members) + break; + + if (!(try_members % 10)) + printf("%s: waiting for all %d nodes to be members\n", + prog_name, all_nodeids_count); + sleep(1); + } + + out: + ccs_disconnect(cd); cman_finish(ch); return; - fail_err: - printf("%s: Timed out waiting for cluster quorum to form.\n", - prog_name); fail: + if (ch) + cman_finish(ch); exit(EXIT_FAILURE); } @@ -214,14 +321,14 @@ static void do_join(int argc, char *argv[]) { int rv; - wait_quorum(); + wait_cman(); rv = fenced_join(); if (rv < 0) die("can't communicate with fenced"); if (wait_join) - do_wait(1); + wait_domain(1); exit(EXIT_SUCCESS); } @@ 
-237,7 +344,7 @@ static void do_leave(void) die("can't communicate with fenced"); if (wait_leave) - do_wait(0); + wait_domain(0); exit(EXIT_SUCCESS); } @@ -264,10 +371,6 @@ static int node_compare(const void *va, const void *vb) return a->nodeid - b->nodeid; } -#define MAX_NODES 128 - -struct fenced_node nodes[MAX_NODES]; - static int do_list(void) { struct fenced_domain d; @@ -346,6 +449,8 @@ static void print_usage(void) printf(" dump Dump debug buffer from fenced\n"); printf("\n"); printf("Options:\n"); + printf(" -m Delay join up to for all nodes in cluster.conf\n"); + printf(" to be cluster members\n"); printf(" -w Wait for join or leave to complete\n"); printf(" -t Maximum time in seconds to wait (default %d)\n", DEFAULT_WAIT_TIMEOUT); printf(" -Q Fail if cluster is not quorate, don't wait\n"); @@ -354,7 +459,7 @@ static void print_usage(void) printf("\n"); } -#define OPTION_STRING "vVht:wQ" +#define OPTION_STRING "vVht:wQm:" static void decode_arguments(int argc, char *argv[]) { @@ -391,6 +496,10 @@ static void decode_arguments(int argc, char *argv[]) wait_leave = 1; break; + case 'm': + wait_members = atoi(optarg); + break; + case 't': wait_timeout = get_int_arg(optchar, optarg); break; diff --git a/fence/man/fence_tool.8 b/fence/man/fence_tool.8 index a83da94..625fbe0 100644 --- a/fence/man/fence_tool.8 +++ b/fence/man/fence_tool.8 @@ -20,6 +20,9 @@ it to stdout. .SH OPTIONS .TP +\fB-m\fP +Delay join up to n seconds for all nodes in cluster.conf to be cluster members. +.TP \fB-w\fP Wait until the join or leave is completed. .TP @@ -29,8 +32,8 @@ Help. Print out the usage syntax. \fB-V\fP Print version information. .TP -\fB-t\fP -Maximum time in seconds to wait (default: 300 seconds) +\fB-t\fP +Maximum time in seconds to wait for quorum or -w (default: 300 seconds) .TP \fB-Q\fP Fail command immediately if the cluster is not quorate, don't wait.