public inbox for cluster-cvs@sourceware.org
help / color / mirror / Atom feed
* fence: master - fence_tool, init.d/cman: fix wait/retry options
@ 2009-03-17 20:02 David Teigland
0 siblings, 0 replies; only message in thread
From: David Teigland @ 2009-03-17 20:02 UTC (permalink / raw)
To: cluster-cvs-relay
Gitweb: http://git.fedorahosted.org/git/fence.git?p=fence.git;a=commitdiff;h=fe1dacafe59f2512ca5eac9069951c285013dbfa
Commit: fe1dacafe59f2512ca5eac9069951c285013dbfa
Parent: d3b33ac0f2afa8c44dc3c7fd5dee6a4f769eb409
Author: David Teigland <teigland@redhat.com>
AuthorDate: Tue Mar 17 14:57:58 2009 -0500
Committer: David Teigland <teigland@redhat.com>
CommitterDate: Tue Mar 17 14:57:58 2009 -0500
fence_tool, init.d/cman: fix wait/retry options
Bring some sanity to the fence_tool ad hoc wait/retry options, which
are used by init.d/cman. At a high level we want:
. fence_tool join to fail right away, with an error, if cman or
  fenced fails or isn't running
. fence_tool join to exit with 0 if the join succeeds and
  with 1 if it fails
. do these things properly even when fenced is slow starting up,
  or in processing the join
Signed-off-by: David Teigland <teigland@redhat.com>
---
fence/fence_tool/fence_tool.c | 317 +++++++++++++++++++++++++----------------
1 files changed, 195 insertions(+), 122 deletions(-)
diff --git a/fence/fence_tool/fence_tool.c b/fence/fence_tool/fence_tool.c
index 8ee5544..7a900c1 100644
--- a/fence/fence_tool/fence_tool.c
+++ b/fence/fence_tool/fence_tool.c
@@ -27,23 +27,27 @@
#define OP_LIST 3
#define OP_DUMP 4
-#define DEFAULT_WAIT_TIMEOUT 300 /* five minutes */
-
#define MAX_NODES 128
int all_nodeids[MAX_NODES];
int all_nodeids_count;
+static quorum_handle_t qh;
static uint32_t quorum_nodes[MAX_NODES];
static int quorum_node_count;
struct fenced_node nodes[MAX_NODES];
char *prog_name;
int operation;
-int ls_all_nodes = 0;
-int inquorate_fail = 0;
-int wait_join = 0; /* default: don't wait for join */
-int wait_leave = 0; /* default: don't wait for leave */
-int wait_members = 0; /* default: don't wait for members */
-int wait_timeout = DEFAULT_WAIT_TIMEOUT;
+
+#define DEFAULT_RETRY_QUORUM 0 /* fail immediately if we can't connect to quorum */
+#define DEFAULT_DELAY_QUORUM 0
+#define DEFAULT_DELAY_MEMBERS 0
+#define DEFAULT_WAIT_JOINLEAVE 0
+
+int opt_all_nodes = 0;
+int opt_retry_quorum = DEFAULT_RETRY_QUORUM;
+int opt_delay_quorum = DEFAULT_DELAY_QUORUM;
+int opt_delay_members = DEFAULT_DELAY_MEMBERS;
+int opt_wait_joinleave = DEFAULT_WAIT_JOINLEAVE;
#define die(fmt, args...) \
do { \
@@ -71,19 +75,30 @@ static int do_write(int fd, void *buf, size_t count)
return 0;
}
-static int get_int_arg(char argopt, char *arg)
+#define LOCKFILE_NAME "/var/run/fenced.pid"
+
+static void check_fenced_running(void)
{
- char *tmp;
- int val;
-
- val = strtol(arg, &tmp, 10);
- if (tmp == arg || tmp != arg + strlen(arg))
- die("argument to %c (%s) is not an integer", argopt, arg);
-
- if (val < 0)
- die("argument to %c cannot be negative", argopt);
-
- return val;
+ struct flock lock;
+ int fd, rv;
+
+ fd = open(LOCKFILE_NAME, O_RDONLY);
+ if (fd < 0)
+ die("fenced not running, no lockfile");
+
+ lock.l_type = F_RDLCK;
+ lock.l_start = 0;
+ lock.l_whence= SEEK_SET;
+ lock.l_len = 0;
+
+ rv = fcntl(fd, F_GETLK, &lock);
+ if (rv < 0)
+ die("fenced not running, get lockfile");
+
+ if (lock.l_type == F_UNLCK)
+ die("fenced not running, unlocked lockfile");
+
+ close(fd);
}
static int check_gfs(void)
@@ -103,7 +118,7 @@ static int check_gfs(void)
if (sscanf(line, "%s %s %s", device, path, type) != 3)
continue;
if (!strcmp(type, "gfs") || !strcmp(type, "gfs2")) {
- printf("found %s file system mounted from %s on %s\n",
+ fprintf(stderr, "found %s file system mounted from %s on %s\n",
type, device, path);
count++;
}
@@ -132,7 +147,7 @@ static int check_controlled_dir(char *path)
continue;
#endif
- printf("found dlm lockspace %s/%s\n", path, de->d_name);
+ fprintf(stderr, "found dlm lockspace %s/%s\n", path, de->d_name);
count++;
}
@@ -178,6 +193,9 @@ static void wait_domain(int joining)
int in, tries = 0;
while (1) {
+ if (joining)
+ check_fenced_running();
+
in = we_are_in_fence_domain();
if (joining && in)
@@ -186,19 +204,25 @@ static void wait_domain(int joining)
if (!joining && !in)
break;
- if (tries++ >= wait_timeout)
- goto fail;
+ tries++;
- if (!(tries % 5))
- printf("Waiting for fenced to %s the fence group.\n",
- joining ? "join" : "leave");
+ if (opt_wait_joinleave < 0)
+ goto retry_domain;
+
+ if (!opt_wait_joinleave || tries >= opt_wait_joinleave) {
+ fprintf(stderr, "%s: %s not complete\n",
+ prog_name, joining ? "join" : "leave");
+ break;
+ }
+ retry_domain:
+ if (!(tries % 10))
+ fprintf(stderr, "%s: waiting for fenced to %s the fence group.\n",
+ prog_name, joining ? "join" : "leave");
sleep(1);
}
return;
- fail:
- printf("Error %s the fence group.\n", joining ? "joining" : "leaving");
}
static void read_ccs_nodeids(int cd)
@@ -237,7 +261,7 @@ static void quorum_callback(quorum_handle_t h, uint32_t quorate,
quorum_nodes[quorum_node_count++] = node_list[i];
}
-static int all_nodeids_are_members(quorum_handle_t qh)
+static int all_nodeids_are_members(void)
{
cs_error_t err;
int i, j, found;
@@ -281,116 +305,149 @@ static quorum_callbacks_t quorum_callbacks =
.quorum_notify_fn = quorum_callback,
};
-static void wait_quorum(void)
+static int connect_quorum(void)
{
- quorum_handle_t qh;
cs_error_t err;
- int try_init = 0, try_quorate = 0;
- int try_ccs = 0, try_members = 0;
- int rv, cd, quorate;
+ int tries = 0;
while (1) {
err = quorum_initialize(&qh, &quorum_callbacks);
if (err == CS_OK)
break;
- if (inquorate_fail)
- goto fail;
-
- if (try_init++ >= wait_timeout) {
- printf("%s: timed out waiting for quorum init\n",
- prog_name);
- goto fail;
- }
+ tries++;
- if (!(try_init % 10))
- printf("%s: waiting for quorum init\n", prog_name);
+ if (opt_retry_quorum < 0)
+ goto retry_init;
+ if (!opt_retry_quorum || tries >= opt_retry_quorum)
+ return -1;
+ retry_init:
+ if (!(tries % 10))
+ fprintf(stderr, "%s: retrying quorum connection\n", prog_name);
sleep(1);
}
+ return 0;
+}
+
+static void delay_quorum(void)
+{
+ cs_error_t err;
+ int tries = 0;
+ int quorate = 0;
+
while (1) {
err = quorum_getquorate(qh, &quorate);
- if (err != CS_OK)
- goto fail;
+ if (err != CS_OK) {
+ quorum_finalize(qh);
+ die("lost quorum connection");
+ }
if (quorate)
break;
- if (inquorate_fail)
- goto fail;
+ tries++;
- if (try_quorate++ >= wait_timeout) {
- printf("%s: timed out waiting for quorum\n",
- prog_name);
- goto fail;
- }
+ if (opt_delay_quorum < 0)
+ goto retry_quorum;
- if (!(try_quorate % 10))
- printf("%s: waiting for quorum\n", prog_name);
+ if (!opt_delay_quorum || tries >= opt_delay_quorum) {
+ fprintf(stderr, "%s: continuing without quorum\n", prog_name);
+ break;
+ }
+ retry_quorum:
+ if (!(tries % 10))
+ fprintf(stderr, "%s: delaying for quorum\n", prog_name);
sleep(1);
}
- while (1) {
- cd = ccs_connect();
- if (cd > 0)
- break;
-
- if (try_ccs++ >= wait_timeout) {
- printf("%s: timed out waiting for ccs connect\n",
- prog_name);
- goto fail;
- }
+ return;
+}
- if (!(try_ccs % 10))
- printf("%s: waiting for ccs connect\n", prog_name);
+static void delay_members(void)
+{
+ int rv, tries = 0;
+ int cd;
- sleep(1);
+ cd = ccs_connect();
+ if (cd < 0) {
+ quorum_finalize(qh);
+ die("failed ccs connection");
}
- if (!wait_members)
- goto out;
read_ccs_nodeids(cd);
while (1) {
- rv = all_nodeids_are_members(qh);
- if (rv < 0)
- goto fail;
- if (rv > 0)
+ rv = all_nodeids_are_members();
+ if (rv < 0) {
+ ccs_disconnect(cd);
+ quorum_finalize(qh);
+ die("lost quorum connection");
+ }
+ if (rv)
break;
- if (try_members++ >= wait_members)
+ tries++;
+
+ if (opt_delay_members < 0)
+ goto retry_members;
+
+ if (!opt_delay_members || tries > opt_delay_members) {
+ fprintf(stderr, "%s: continuing without all members\n", prog_name);
break;
+ }
+ retry_members:
+ if (!(tries % 10))
+ fprintf(stderr, "%s: delaying for members\n", prog_name);
- if (!(try_members % 10))
- printf("%s: waiting for all %d nodes to be members\n",
- prog_name, all_nodeids_count);
sleep(1);
}
- out:
ccs_disconnect(cd);
- quorum_finalize(qh);
return;
-
- fail:
- if (qh)
- quorum_finalize(qh);
- exit(EXIT_FAILURE);
}
static void do_join(int argc, char *argv[])
{
- int rv;
-
- wait_quorum();
+ int rv, tries = 0;
- rv = fenced_join();
+ rv = connect_quorum();
if (rv < 0)
- die("can't communicate with fenced");
+ die("can't connect to quorum");
+
+ /* if delay_quorum() or delay_members() fail on any quorum/ccs
+ connection or operation, they call quorum_finalize() and exit
+ with failure */
+
+ if (opt_delay_quorum)
+ delay_quorum();
+
+ if (opt_delay_members)
+ delay_members();
+
+ quorum_finalize(qh);
+
+ /* This loop deals with the case where fenced is slow enough starting
+ up that fenced_join fails. Do we also want to add a delay here to
+ deal with the case where fenced is so slow starting up that it hasn't
+ locked its lockfile yet, causing check_fenced_running to fail? */
+
+ while (1) {
+ rv = fenced_join();
+ if (!rv)
+ break;
+
+ check_fenced_running();
+
+ tries++;
+ if (!(tries % 10))
+ fprintf(stderr, "%s: retrying join\n", prog_name);
+ sleep(1);
+ }
- if (wait_join)
+ if (opt_wait_joinleave)
wait_domain(1);
exit(EXIT_SUCCESS);
@@ -404,9 +461,9 @@ static void do_leave(void)
rv = fenced_leave();
if (rv < 0)
- die("can't communicate with fenced");
+ die("leave: can't communicate with fenced");
- if (wait_leave)
+ if (opt_wait_joinleave)
wait_domain(0);
exit(EXIT_SUCCESS);
@@ -419,7 +476,7 @@ static void do_dump(void)
rv = fenced_dump_debug(buf);
if (rv < 0)
- die("can't communicate with fenced");
+ die("dump: can't communicate with fenced");
do_write(STDOUT_FILENO, buf, strlen(buf));
@@ -515,7 +572,7 @@ static int do_list(void)
}
printf("\n");
- if (!ls_all_nodes) {
+ if (!opt_all_nodes) {
printf("\n");
exit(EXIT_SUCCESS);
}
@@ -554,27 +611,40 @@ static void print_usage(void)
{
printf("Usage:\n");
printf("\n");
- printf("%s <join|leave|dump> [options]\n", prog_name);
+ printf("%s <ls|join|leave|dump> [options]\n", prog_name);
printf("\n");
printf("Actions:\n");
+ printf(" ls List nodes status\n");
printf(" join Join the default fence domain\n");
printf(" leave Leave default fence domain\n");
- printf(" ls List nodes status\n");
- printf(" dump Dump debug buffer from fenced\n");
+ printf(" dump Dump debug buffer from fenced\n");
printf("\n");
printf("Options:\n");
printf(" -n Show all node information in ls\n");
+
+ printf(" -t <seconds> Retry quorum connection for <seconds>.\n");
+ printf(" Default %d. 0 no retry, -1 indefinite retry.\n",
+ DEFAULT_RETRY_QUORUM);
+
+ printf(" -q <seconds> Delay join up to <seconds> for the cluster to have quorum.\n");
+ printf(" Default %d. 0 no delay, -1 indefinite delay.\n",
+ DEFAULT_DELAY_QUORUM);
+
printf(" -m <seconds> Delay join up to <seconds> for all nodes in cluster.conf\n");
- printf(" to be cluster members\n");
- printf(" -w Wait for join or leave to complete\n");
- printf(" -t <seconds> Maximum time in seconds to wait (default %d)\n", DEFAULT_WAIT_TIMEOUT);
- printf(" -Q Fail if cluster is not quorate, don't wait\n");
+ printf(" to be cluster members.\n");
+ printf(" Default %d. 0 no delay, -1 indefinite delay\n",
+ DEFAULT_DELAY_MEMBERS);
+
+ printf(" -w <seconds> Wait up to <seconds> for join or leave result.\n");
+ printf(" Default %d. 0 no wait, -1 indefinite wait.\n",
+ DEFAULT_WAIT_JOINLEAVE);
+
printf(" -V Print program version information, then exit\n");
printf(" -h Print this help, then exit\n");
printf("\n");
}
-#define OPTION_STRING "Vht:wQm:n"
+#define OPTION_STRING "nt:q:m:w:Vh"
static void decode_arguments(int argc, char *argv[])
{
@@ -586,37 +656,36 @@ static void decode_arguments(int argc, char *argv[])
switch (optchar) {
- case 'V':
- printf("fence_tool %s (built %s %s)\n",
- RELEASE_VERSION, __DATE__, __TIME__);
- printf("%s\n", REDHAT_COPYRIGHT);
- exit(EXIT_SUCCESS);
+ case 'n':
+ opt_all_nodes = 1;
break;
- case 'n':
- ls_all_nodes = 1;
+ case 't':
+ opt_retry_quorum = atoi(optarg);
break;
- case 'h':
- print_usage();
- exit(EXIT_SUCCESS);
+ case 'q':
+ opt_delay_quorum = atoi(optarg);
break;
- case 'Q':
- inquorate_fail = 1;
+ case 'm':
+ opt_delay_members = atoi(optarg);
break;
case 'w':
- wait_join = 1;
- wait_leave = 1;
+ opt_wait_joinleave = atoi(optarg);
break;
- case 'm':
- wait_members = atoi(optarg);
+ case 'V':
+ printf("fence_tool %s (built %s %s)\n",
+ RELEASE_VERSION, __DATE__, __TIME__);
+ printf("%s\n", REDHAT_COPYRIGHT);
+ exit(EXIT_SUCCESS);
break;
- case 't':
- wait_timeout = get_int_arg(optchar, optarg);
+ case 'h':
+ print_usage();
+ exit(EXIT_SUCCESS);
break;
case ':':
@@ -662,12 +731,16 @@ int main(int argc, char *argv[])
switch (operation) {
case OP_JOIN:
do_join(argc, argv);
+ break;
case OP_LEAVE:
do_leave();
+ break;
case OP_DUMP:
do_dump();
+ break;
case OP_LIST:
do_list();
+ break;
}
return EXIT_FAILURE;
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2009-03-17 20:02 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-03-17 20:02 fence: master - fence_tool, init.d/cman: fix wait/retry options David Teigland
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).