public inbox for cluster-cvs@sourceware.org
help / color / mirror / Atom feed
* The tag: rgmanager_2_0_39 has been created
@ 2008-07-31 22:23 Chris Feist
0 siblings, 0 replies; only message in thread
From: Chris Feist @ 2008-07-31 22:23 UTC (permalink / raw)
To: cluster-cvs-relay
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=c91f6ca13c5b67b9b111c50d4455fe0110b52274
Commit: c91f6ca13c5b67b9b111c50d4455fe0110b52274
Parent: 86753221e88e910d57429e06c4aa3e6295d8d4f2
Author: David Teigland <teigland@redhat.com>
AuthorDate: Thu Jul 31 15:55:54 2008 -0500
Committer: David Teigland <teigland@redhat.com>
CommitterDate: Thu Jul 31 15:55:54 2008 -0500
fence_tool: add domain member checks
using libfenced library. Also clean up the code and logic for waiting
and timeouts.
Signed-off-by: David Teigland <teigland@redhat.com>
---
fence/fence_tool/fence_tool.c | 260 ++++++++++++++++++++---------------------
fence/fenced/main.c | 9 +-
fence/libfenced/libfenced.h | 3 +
3 files changed, 138 insertions(+), 134 deletions(-)
diff --git a/fence/fence_tool/fence_tool.c b/fence/fence_tool/fence_tool.c
index ded7eda..95f4ba1 100644
--- a/fence/fence_tool/fence_tool.c
+++ b/fence/fence_tool/fence_tool.c
@@ -25,23 +25,22 @@
#define OP_LIST 3
#define OP_DUMP 4
+#define DEFAULT_WAIT_TIMEOUT 300 /* five minutes */
+
#define die(fmt, args...) \
-do \
-{ \
- fprintf(stderr, "%s: ", prog_name); \
- fprintf(stderr, fmt "\n", ##args); \
- exit(EXIT_FAILURE); \
-} \
-while (0)
+do { \
+ fprintf(stderr, "%s: ", prog_name); \
+ fprintf(stderr, fmt "\n", ##args); \
+ exit(EXIT_FAILURE); \
+} while (0)
char *prog_name;
int operation;
int verbose = 0;
-int child_wait = 0;
-int quorum_wait = 1;
-int fenced_start_timeout = 300; /* five minutes */
-int signalled = 0;
-cman_handle_t ch;
+int inquorate_fail = 0;
+int wait_join = 0; /* default: don't wait for join */
+int wait_leave = 0; /* default: don't wait for leave */
+int wait_timeout = DEFAULT_WAIT_TIMEOUT; /* applies to all waits */
static int do_write(int fd, void *buf, size_t count)
{
@@ -62,25 +61,6 @@ static int do_write(int fd, void *buf, size_t count)
return 0;
}
-#if 0
-static int do_read(int fd, void *buf, size_t count)
-{
- int rv, off = 0;
-
- while (off < count) {
- rv = read(fd, buf + off, count - off);
- if (rv == 0)
- return -1;
- if (rv == -1 && errno == EINTR)
- continue;
- if (rv == -1)
- return -1;
- off += rv;
- }
- return 0;
-}
-#endif
-
static int get_int_arg(char argopt, char *arg)
{
char *tmp;
@@ -120,119 +100,133 @@ static int check_mounted(void)
return 0;
}
-static void sigalarm_handler(int sig)
-{
- signalled = 1;
-}
-
static int we_are_in_fence_domain(void)
{
-#if 0
- group_data_t gdata;
+ struct fenced_node nodeinfo;
int rv;
- memset(&gdata, 0, sizeof(gdata));
- rv = group_get_group(0, "default", &gdata);
+ memset(&nodeinfo, 0, sizeof(nodeinfo));
- if (rv || strcmp(gdata.client_name, "fence"))
+ rv = fenced_node_info(FENCED_NODEID_US, &nodeinfo);
+ if (rv < 0)
return 0;
- return gdata.member;
-#endif
- printf("FIXME: use libfenced:fenced_domain_members()\n");
- return 1;
+ if (nodeinfo.member)
+ return 1;
+ return 0;
}
-/*
- * We wait for the cluster to be quorate in this program because it's easy to
- * kill this program if we want to quit waiting. If we just started fenced
- * without waiting for quorum, fenced's join would then wait for quorum in SM
- * but we can't kill/cancel it at that point -- we have to wait for it to
- * complete.
- *
- * A second reason to wait for quorum is that the unfencing step involves
- * cluster.conf lookups through ccs, but ccsd may wait for the cluster to be
- * quorate before responding to the lookups. There wouldn't be a problem
- * blocking there per se, but it's cleaner I think to just wait here first.
- *
- * In the case where we're leaving, we want to wait for quorum because if we go
- * ahead and shut down fenced, the fence domain leave will block in SM where it
- * will wait for quorum before the leave can be processed. We can't
- * kill/cancel the leave at that point, but we can if we're waiting here.
- *
- * Waiting here doesn't guarantee we won't end up blocking in SM on the join or
- * leave, but it avoids it in some common cases which can be helpful. (Quorum
- * could easily be lost between the time we wait for it here and then begin the
- * join/leave process.)
- */
-
-static int check_quorum(void)
+static void do_wait(int joining)
{
- int rv = 0, i = 0;
+ int in, tries = 0;
- while (!signalled) {
- rv = cman_is_quorate(ch);
- if (rv)
- return 1;
- else if (!quorum_wait)
- return 0;
+ while (1) {
+ in = we_are_in_fence_domain();
- sleep(1);
+ if (joining && in)
+ break;
- if (!signalled && ++i > 9 && !(i % 10))
- printf("%s: waiting for cluster quorum\n", prog_name);
+ if (!joining && !in)
+ break;
+
+ if (tries++ >= wait_timeout)
+ goto fail;
+
+ if (!(tries % 5))
+ printf("Waiting for fenced to %s the fence group.\n",
+ joining ? "join" : "leave");
+
+ sleep(1);
}
- errno = ETIMEDOUT;
- return 0;
+ return;
+ fail:
+ printf("Error %s the fence group.\n", joining ? "joining" : "leaving");
}
-static int do_wait(int joining)
+static void wait_quorum(void)
{
- int i;
+ cman_handle_t ch;
+ int rv, try_init = 0, try_active = 0, try_quorate = 0;
+
+ while (1) {
+ ch = cman_init(NULL);
+ if (ch)
+ break;
+
+ if (inquorate_fail)
+ goto fail;
+
+ if (try_init++ >= wait_timeout)
+ goto fail_err;
+
+ if (!(try_init % 10))
+ printf("%s: waiting for cman to start\n", prog_name);
- for (i=0; !fenced_start_timeout || i < fenced_start_timeout; i++) {
- if (we_are_in_fence_domain() == joining)
- return 0;
- if (i && !(i % 5))
- printf("Waiting for fenced to %s the fence group.\n",
- (joining?"join":"leave"));
sleep(1);
}
- printf("Error joining the fence group.\n");
- return -1;
-}
-static int do_join(int argc, char *argv[])
-{
- int rv;
+ while (1) {
+ rv = cman_is_active(ch);
+ if (rv)
+ break;
+
+ if (inquorate_fail)
+ goto fail;
+
+ if (try_active++ >= wait_timeout)
+ goto fail_err;
- ch = cman_init(NULL);
+ if (!(try_active % 10))
+ printf("%s: waiting for cman to be active\n",prog_name);
- if (fenced_start_timeout) {
- signal(SIGALRM, sigalarm_handler);
- alarm(fenced_start_timeout);
+ sleep(1);
}
- if (!check_quorum()) {
- if (errno == ETIMEDOUT)
- printf("%s: Timed out waiting for cluster "
- "quorum to form.\n", prog_name);
- cman_finish(ch);
- return EXIT_FAILURE;
+ while (1) {
+ rv = cman_is_quorate(ch);
+ if (rv)
+ break;
+
+ if (inquorate_fail)
+ goto fail;
+
+ if (try_quorate++ >= wait_timeout)
+ goto fail_err;
+
+ if (!(try_quorate % 10))
+ printf("%s: waiting for cluster quorum\n", prog_name);
+
+ sleep(1);
}
+
cman_finish(ch);
+ return;
+
+ fail_err:
+ printf("%s: Timed out waiting for cluster quorum to form.\n",
+ prog_name);
+ fail:
+ exit(EXIT_FAILURE);
+}
+
+static void do_join(int argc, char *argv[])
+{
+ int rv;
+
+ wait_quorum();
rv = fenced_join();
if (rv < 0)
die("can't communicate with fenced");
- if (child_wait)
+ if (wait_join)
do_wait(1);
- return EXIT_SUCCESS;
+
+ exit(EXIT_SUCCESS);
}
-static int do_leave(void)
+static void do_leave(void)
{
int rv;
@@ -242,12 +236,13 @@ static int do_leave(void)
if (rv < 0)
die("can't communicate with fenced");
- if (child_wait)
+ if (wait_leave)
do_wait(0);
- return EXIT_SUCCESS;
+
+ exit(EXIT_SUCCESS);
}
-static int do_dump(void)
+static void do_dump(void)
{
char buf[FENCED_DUMP_SIZE];
int rv;
@@ -258,7 +253,7 @@ static int do_dump(void)
do_write(STDOUT_FILENO, buf, sizeof(buf));
- return 0;
+ exit(EXIT_SUCCESS);
}
static int node_compare(const void *va, const void *vb)
@@ -282,7 +277,7 @@ static int do_list(void)
rv = fenced_domain_info(&d);
if (rv < 0)
- goto out;
+ goto fail;
printf("fence domain \"default\"\n");
printf("member_count %d master_nodeid %d victim_count %d current_victim %d state %d\n",
@@ -294,7 +289,7 @@ static int do_list(void)
rv = fenced_domain_nodes(FENCED_NODES_MEMBERS, MAX_NODES,
&node_count, nodes);
if (rv < 0)
- goto out;
+ goto fail;
qsort(&nodes, node_count, sizeof(struct fenced_node), node_compare);
@@ -308,7 +303,7 @@ static int do_list(void)
printf("\n");
if (!verbose)
- return 0;
+ exit(EXIT_SUCCESS);
node_count = 0;
memset(&nodes, 0, sizeof(nodes));
@@ -316,7 +311,7 @@ static int do_list(void)
rv = fenced_domain_nodes(FENCED_NODES_ALL, MAX_NODES,
&node_count, nodes);
if (rv < 0)
- goto out;
+ goto fail;
qsort(&nodes, node_count, sizeof(struct fenced_node), node_compare);
@@ -332,10 +327,10 @@ static int do_list(void)
np->last_fenced_how);
np++;
}
- return 0;
- out:
+ exit(EXIT_SUCCESS);
+ fail:
fprintf(stderr, "fenced query error %d\n", rv);
- return rv;
+ exit(EXIT_FAILURE);
}
static void print_usage(void)
@@ -351,11 +346,11 @@ static void print_usage(void)
printf(" dump Dump debug buffer from fenced\n");
printf("\n");
printf("Options:\n");
- printf(" -w Wait for join to complete\n");
+ printf(" -w Wait for join or leave to complete\n");
+ printf(" -t <seconds> Maximum time in seconds to wait (default %d)\n", DEFAULT_WAIT_TIMEOUT);
+ printf(" -Q Fail if cluster is not quorate, don't wait\n");
printf(" -V Print program version information, then exit\n");
printf(" -h Print this help, then exit\n");
- printf(" -t Maximum time in seconds to wait\n");
- printf(" -Q Fail if cluster is not quorate, don't wait\n");
printf("\n");
}
@@ -388,11 +383,16 @@ static void decode_arguments(int argc, char *argv[])
break;
case 'Q':
- quorum_wait = 0;
+ inquorate_fail = 1;
break;
case 'w':
- child_wait = 1;
+ wait_join = 1;
+ wait_leave = 1;
+ break;
+
+ case 't':
+ wait_timeout = get_int_arg(optchar, optarg);
break;
case ':':
@@ -405,10 +405,6 @@ static void decode_arguments(int argc, char *argv[])
cont = 0;
break;
- case 't':
- fenced_start_timeout = get_int_arg(optchar, optarg);
- break;
-
default:
die("unknown option: %c\n", optchar);
break;
@@ -441,13 +437,13 @@ int main(int argc, char *argv[])
switch (operation) {
case OP_JOIN:
- return do_join(argc, argv);
+ do_join(argc, argv);
case OP_LEAVE:
- return do_leave();
+ do_leave();
case OP_DUMP:
- return do_dump();
+ do_dump();
case OP_LIST:
- return do_list();
+ do_list();
}
return EXIT_FAILURE;
diff --git a/fence/fenced/main.c b/fence/fenced/main.c
index 3c88197..6bfbb0e 100644
--- a/fence/fenced/main.c
+++ b/fence/fenced/main.c
@@ -316,11 +316,11 @@ static void query_dump_debug(int f)
do_write(f, dump_buf, len);
}
-static void query_node_info(int f, int nodeid)
+static void query_node_info(int f, int data_nodeid)
{
struct fd *fd;
struct fenced_node node;
- int rv;
+ int nodeid, rv;
fd = find_fd("default");
if (!fd) {
@@ -328,6 +328,11 @@ static void query_node_info(int f, int nodeid)
goto out;
}
+ if (data_nodeid == FENCED_NODEID_US)
+ nodeid = our_nodeid;
+ else
+ nodeid = data_nodeid;
+
if (group_mode == GROUP_LIBGROUP)
rv = set_node_info_group(fd, nodeid, &node);
else
diff --git a/fence/libfenced/libfenced.h b/fence/libfenced/libfenced.h
index 056e4f2..30ea210 100644
--- a/fence/libfenced/libfenced.h
+++ b/fence/libfenced/libfenced.h
@@ -3,6 +3,9 @@
#define FENCED_DUMP_SIZE (1024 * 1024)
+/* for querying local node info */
+#define FENCED_NODEID_US 0
+
struct fenced_node {
int nodeid;
int member;
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2008-07-31 22:23 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-07-31 22:23 The tag: rgmanager_2_0_39 has been created Chris Feist
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).