From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 13005 invoked by alias); 31 Jul 2008 22:23:04 -0000 Received: (qmail 12999 invoked by alias); 31 Jul 2008 22:23:04 -0000 X-Spam-Status: No, hits=2.6 required=5.0 tests=BAYES_50,J_CHICKENPOX_33,J_CHICKENPOX_42,J_CHICKENPOX_43,J_CHICKENPOX_66,KAM_MX,SPF_HELO_PASS X-Spam-Check-By: sourceware.org X-Spam-Checker-Version: SpamAssassin 3.2.4 (2008-01-01) on bastion.fedora.phx.redhat.com X-Spam-Level: Subject: The tag: gfs_0_1_18 has been created To: cluster-cvs-relay@redhat.com X-Project: Cluster Project X-Git-Module: cluster.git X-Git-Refname: refs/tags/gfs_0_1_18 X-Git-Reftype: tag X-Git-Oldrev: 86753221e88e910d57429e06c4aa3e6295d8d4f2 X-Git-Newrev: c91f6ca13c5b67b9b111c50d4455fe0110b52274 From: Chris Feist Message-Id: <20080731212156.54633A8259@lists.fedorahosted.org> Date: Thu, 31 Jul 2008 22:23:00 -0000 X-Scanned-By: MIMEDefang 2.58 on 172.16.52.254 Mailing-List: contact cluster-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: cluster-cvs-owner@sourceware.org X-SW-Source: 2008-q3/txt/msg00192.txt.bz2 Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=c91f6ca13c5b67b9b111c50d4455fe0110b52274 Commit: c91f6ca13c5b67b9b111c50d4455fe0110b52274 Parent: 86753221e88e910d57429e06c4aa3e6295d8d4f2 Author: David Teigland AuthorDate: Thu Jul 31 15:55:54 2008 -0500 Committer: David Teigland CommitterDate: Thu Jul 31 15:55:54 2008 -0500 fence_tool: add domain member checks using libfenced library. Also clean up the code and logic for waiting and timeouts. Signed-off-by: David Teigland --- fence/fence_tool/fence_tool.c | 260 ++++++++++++++++++++--------------------- fence/fenced/main.c | 9 +- fence/libfenced/libfenced.h | 3 + 3 files changed, 138 insertions(+), 134 deletions(-) diff --git a/fence/fence_tool/fence_tool.c b/fence/fence_tool/fence_tool.c index ded7eda..95f4ba1 100644 --- a/fence/fence_tool/fence_tool.c +++ b/fence/fence_tool/fence_tool.c @@ -25,23 +25,22 @@ #define OP_LIST 3 #define OP_DUMP 4 +#define DEFAULT_WAIT_TIMEOUT 300 /* five minutes */ + #define die(fmt, args...) \ -do \ -{ \ - fprintf(stderr, "%s: ", prog_name); \ - fprintf(stderr, fmt "\n", ##args); \ - exit(EXIT_FAILURE); \ -} \ -while (0) +do { \ + fprintf(stderr, "%s: ", prog_name); \ + fprintf(stderr, fmt "\n", ##args); \ + exit(EXIT_FAILURE); \ +} while (0) char *prog_name; int operation; int verbose = 0; -int child_wait = 0; -int quorum_wait = 1; -int fenced_start_timeout = 300; /* five minutes */ -int signalled = 0; -cman_handle_t ch; +int inquorate_fail = 0; +int wait_join = 0; /* default: don't wait for join */ +int wait_leave = 0; /* default: don't wait for leave */ +int wait_timeout = DEFAULT_WAIT_TIMEOUT; /* applies to all waits */ static int do_write(int fd, void *buf, size_t count) { @@ -62,25 +61,6 @@ static int do_write(int fd, void *buf, size_t count) return 0; } -#if 0 -static int do_read(int fd, void *buf, size_t count) -{ - int rv, off = 0; - - while (off < count) { - rv = read(fd, buf + off, count - off); - if (rv == 0) - return -1; - if (rv == -1 && errno == EINTR) - continue; - if (rv == -1) - return -1; - off += rv; - } - return 0; -} -#endif - static int get_int_arg(char argopt, char *arg) { char *tmp; @@ -120,119 +100,133 @@ static int check_mounted(void) return 0; } -static void sigalarm_handler(int sig) -{ - signalled = 1; -} - static int we_are_in_fence_domain(void) { -#if 0 - group_data_t gdata; + struct fenced_node nodeinfo; int rv; - memset(&gdata, 0, sizeof(gdata)); - rv = group_get_group(0, "default", &gdata); + memset(&nodeinfo, 0, sizeof(nodeinfo)); - if (rv || strcmp(gdata.client_name, "fence")) + rv = fenced_node_info(FENCED_NODEID_US, &nodeinfo); + if (rv < 0) return 0; - return gdata.member; -#endif - printf("FIXME: use libfenced:fenced_domain_members()\n"); - return 1; + if (nodeinfo.member) + return 1; + return 0; } -/* - * We wait for the cluster to be quorate in this program because it's easy to - * kill this program if we want to quit waiting. If we just started fenced - * without waiting for quorum, fenced's join would then wait for quorum in SM - * but we can't kill/cancel it at that point -- we have to wait for it to - * complete. - * - * A second reason to wait for quorum is that the unfencing step involves - * cluster.conf lookups through ccs, but ccsd may wait for the cluster to be - * quorate before responding to the lookups. There wouldn't be a problem - * blocking there per se, but it's cleaner I think to just wait here first. - * - * In the case where we're leaving, we want to wait for quorum because if we go - * ahead and shut down fenced, the fence domain leave will block in SM where it - * will wait for quorum before the leave can be processed. We can't - * kill/cancel the leave at that point, but we can if we're waiting here. - * - * Waiting here doesn't guarantee we won't end up blocking in SM on the join or - * leave, but it avoids it in some common cases which can be helpful. (Quorum - * could easily be lost between the time we wait for it here and then begin the - * join/leave process.) - */ - -static int check_quorum(void) +static void do_wait(int joining) { - int rv = 0, i = 0; + int in, tries = 0; - while (!signalled) { - rv = cman_is_quorate(ch); - if (rv) - return 1; - else if (!quorum_wait) - return 0; + while (1) { + in = we_are_in_fence_domain(); - sleep(1); + if (joining && in) + break; - if (!signalled && ++i > 9 && !(i % 10)) - printf("%s: waiting for cluster quorum\n", prog_name); + if (!joining && !in) + break; + + if (tries++ >= wait_timeout) + goto fail; + + if (!(tries % 5)) + printf("Waiting for fenced to %s the fence group.\n", + joining ? "join" : "leave"); + + sleep(1); } - errno = ETIMEDOUT; - return 0; + return; + fail: + printf("Error %s the fence group.\n", joining ? "joining" : "leaving"); } -static int do_wait(int joining) +static void wait_quorum(void) { - int i; + cman_handle_t ch; + int rv, try_init = 0, try_active = 0, try_quorate = 0; + + while (1) { + ch = cman_init(NULL); + if (ch) + break; + + if (inquorate_fail) + goto fail; + + if (try_init++ >= wait_timeout) + goto fail_err; + + if (!(try_init % 10)) + printf("%s: waiting for cman to start\n", prog_name); - for (i=0; !fenced_start_timeout || i < fenced_start_timeout; i++) { - if (we_are_in_fence_domain() == joining) - return 0; - if (i && !(i % 5)) - printf("Waiting for fenced to %s the fence group.\n", - (joining?"join":"leave")); sleep(1); } - printf("Error joining the fence group.\n"); - return -1; -} -static int do_join(int argc, char *argv[]) -{ - int rv; + while (1) { + rv = cman_is_active(ch); + if (rv) + break; + + if (inquorate_fail) + goto fail; + + if (try_active++ >= wait_timeout) + goto fail_err; - ch = cman_init(NULL); + if (!(try_active % 10)) + printf("%s: waiting for cman to be active\n",prog_name); - if (fenced_start_timeout) { - signal(SIGALRM, sigalarm_handler); - alarm(fenced_start_timeout); + sleep(1); } - if (!check_quorum()) { - if (errno == ETIMEDOUT) - printf("%s: Timed out waiting for cluster " - "quorum to form.\n", prog_name); - cman_finish(ch); - return EXIT_FAILURE; + while (1) { + rv = cman_is_quorate(ch); + if (rv) + break; + + if (inquorate_fail) + goto fail; + + if (try_quorate++ >= wait_timeout) + goto fail_err; + + if (!(try_quorate % 10)) + printf("%s: waiting for cluster quorum\n", prog_name); + + sleep(1); } + cman_finish(ch); + return; + + fail_err: + printf("%s: Timed out waiting for cluster quorum to form.\n", + prog_name); + fail: + exit(EXIT_FAILURE); +} + +static void do_join(int argc, char *argv[]) +{ + int rv; + + wait_quorum(); rv = fenced_join(); if (rv < 0) die("can't communicate with fenced"); - if (child_wait) + if (wait_join) do_wait(1); - return EXIT_SUCCESS; + + exit(EXIT_SUCCESS); } -static int do_leave(void) +static void do_leave(void) { int rv; @@ -242,12 +236,13 @@ static int do_leave(void) if (rv < 0) die("can't communicate with fenced"); - if (child_wait) + if (wait_leave) do_wait(0); - return EXIT_SUCCESS; + + exit(EXIT_SUCCESS); } -static int do_dump(void) +static void do_dump(void) { char buf[FENCED_DUMP_SIZE]; int rv; @@ -258,7 +253,7 @@ static int do_dump(void) do_write(STDOUT_FILENO, buf, sizeof(buf)); - return 0; + exit(EXIT_SUCCESS); } static int node_compare(const void *va, const void *vb) @@ -282,7 +277,7 @@ static int do_list(void) rv = fenced_domain_info(&d); if (rv < 0) - goto out; + goto fail; printf("fence domain \"default\"\n"); printf("member_count %d master_nodeid %d victim_count %d current_victim %d state %d\n", @@ -294,7 +289,7 @@ static int do_list(void) rv = fenced_domain_nodes(FENCED_NODES_MEMBERS, MAX_NODES, &node_count, nodes); if (rv < 0) - goto out; + goto fail; qsort(&nodes, node_count, sizeof(struct fenced_node), node_compare); @@ -308,7 +303,7 @@ static int do_list(void) printf("\n"); if (!verbose) - return 0; + exit(EXIT_SUCCESS); node_count = 0; memset(&nodes, 0, sizeof(nodes)); @@ -316,7 +311,7 @@ static int do_list(void) rv = fenced_domain_nodes(FENCED_NODES_ALL, MAX_NODES, &node_count, nodes); if (rv < 0) - goto out; + goto fail; qsort(&nodes, node_count, sizeof(struct fenced_node), node_compare); @@ -332,10 +327,10 @@ static int do_list(void) np->last_fenced_how); np++; } - return 0; - out: + exit(EXIT_SUCCESS); + fail: fprintf(stderr, "fenced query error %d\n", rv); - return rv; + exit(EXIT_FAILURE); } static void print_usage(void) @@ -351,11 +346,11 @@ static void print_usage(void) printf(" dump Dump debug buffer from fenced\n"); printf("\n"); printf("Options:\n"); - printf(" -w Wait for join to complete\n"); + printf(" -w Wait for join or leave to complete\n"); + printf(" -t Maximum time in seconds to wait (default %d)\n", DEFAULT_WAIT_TIMEOUT); + printf(" -Q Fail if cluster is not quorate, don't wait\n"); printf(" -V Print program version information, then exit\n"); printf(" -h Print this help, then exit\n"); - printf(" -t Maximum time in seconds to wait\n"); - printf(" -Q Fail if cluster is not quorate, don't wait\n"); printf("\n"); } @@ -388,11 +383,16 @@ static void decode_arguments(int argc, char *argv[]) break; case 'Q': - quorum_wait = 0; + inquorate_fail = 1; break; case 'w': - child_wait = 1; + wait_join = 1; + wait_leave = 1; + break; + + case 't': + wait_timeout = get_int_arg(optchar, optarg); break; case ':': @@ -405,10 +405,6 @@ static void decode_arguments(int argc, char *argv[]) cont = 0; break; - case 't': - fenced_start_timeout = get_int_arg(optchar, optarg); - break; - default: die("unknown option: %c\n", optchar); break; @@ -441,13 +437,13 @@ int main(int argc, char *argv[]) switch (operation) { case OP_JOIN: - return do_join(argc, argv); + do_join(argc, argv); case OP_LEAVE: - return do_leave(); + do_leave(); case OP_DUMP: - return do_dump(); + do_dump(); case OP_LIST: - return do_list(); + do_list(); } return EXIT_FAILURE; diff --git a/fence/fenced/main.c b/fence/fenced/main.c index 3c88197..6bfbb0e 100644 --- a/fence/fenced/main.c +++ b/fence/fenced/main.c @@ -316,11 +316,11 @@ static void query_dump_debug(int f) do_write(f, dump_buf, len); } -static void query_node_info(int f, int nodeid) +static void query_node_info(int f, int data_nodeid) { struct fd *fd; struct fenced_node node; - int rv; + int nodeid, rv; fd = find_fd("default"); if (!fd) { @@ -328,6 +328,11 @@ static void query_node_info(int f, int nodeid) goto out; } + if (data_nodeid == FENCED_NODEID_US) + nodeid = our_nodeid; + else + nodeid = data_nodeid; + if (group_mode == GROUP_LIBGROUP) rv = set_node_info_group(fd, nodeid, &node); else diff --git a/fence/libfenced/libfenced.h b/fence/libfenced/libfenced.h index 056e4f2..30ea210 100644 --- a/fence/libfenced/libfenced.h +++ b/fence/libfenced/libfenced.h @@ -3,6 +3,9 @@ #define FENCED_DUMP_SIZE (1024 * 1024) +/* for querying local node info */ +#define FENCED_NODEID_US 0 + struct fenced_node { int nodeid; int member;