public inbox for cluster-cvs@sourceware.org

master - fenced: joining daemon cpg to bypass fencing
From: David Teigland @ 2008-09-04 21:27 UTC
  To: cluster-cvs-relay

Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=5043d28a1a37bc51122246f9631dc8c4441011a8
Commit:        5043d28a1a37bc51122246f9631dc8c4441011a8
Parent:        f00e7639798652daf6cce0dc7f9d8d3be0dae194
Author:        David Teigland <teigland@redhat.com>
AuthorDate:    Wed Sep 3 12:56:24 2008 -0500
Committer:     David Teigland <teigland@redhat.com>
CommitterDate: Wed Sep 3 14:07:53 2008 -0500

fenced: joining daemon cpg to bypass fencing

When the fenced daemon starts, it checks for uncontrolled instances
of gfs/dlm, and if none are found, it joins a special "daemon" cpg (not
the fence domain cpg).  This join simply tells fenced on other nodes
that the new node is in a cleanly reset state and they can skip fencing
it if it's currently a victim.

Currently, fencing is skipped as soon as the victim rejoins the cluster, but
that is not sufficient: a node can rejoin the cluster while still holding
uncontrolled gfs/dlm instances, in which case it still needs to be fenced.

In cluster2, the groupd cpg filled the role of this new fenced cpg in
advertising the clean/reset state of a node.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fence/fenced/cpg.c     |  120 ++++++++++++++++++++++++++++++++++++++++++-
 fence/fenced/fd.h      |    5 ++
 fence/fenced/main.c    |  134 ++++++++++++++++++++++++++++++++++++++++++++----
 fence/fenced/recover.c |   22 ++++----
 4 files changed, 259 insertions(+), 22 deletions(-)
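
For background: the daemon cpg added below is plain libcpg usage -- initialize
a handle with deliver/confchg callbacks, join a named group, then dispatch
callbacks from a poll loop.  A minimal, self-contained sketch of that pattern
follows.  Assumptions: the group name "cpgdemo:clean" is purely illustrative
(it is deliberately not "fenced:daemon", so running this does not make a node
look cleanly reset to fenced), and the <openais/cpg.h> header is a guess -- it
may be <corosync/cpg.h> depending on which stack the tree builds against.

/* build (roughly): cc -o cpgdemo cpgdemo.c -lcpg */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <poll.h>
#include <openais/cpg.h>	/* or <corosync/cpg.h>, see note above */

static void deliver_cb(cpg_handle_t handle, struct cpg_name *group_name,
		uint32_t nodeid, uint32_t pid, void *data, int len)
{
	/* like fenced's daemon cpg, this group carries no messages */
}

static void confchg_cb(cpg_handle_t handle, struct cpg_name *group_name,
		struct cpg_address *member_list, int member_list_entries,
		struct cpg_address *left_list, int left_list_entries,
		struct cpg_address *joined_list, int joined_list_entries)
{
	int i;

	/* print the current members each time the group changes */
	printf("%s members:", group_name->value);
	for (i = 0; i < member_list_entries; i++)
		printf(" %u", member_list[i].nodeid);
	printf("\n");
}

static cpg_callbacks_t callbacks = {
	.cpg_deliver_fn = deliver_cb,
	.cpg_confchg_fn = confchg_cb,
};

int main(void)
{
	cpg_handle_t h;
	struct cpg_name name;
	cpg_error_t error;
	struct pollfd pfd;
	int fd;

	if (cpg_initialize(&h, &callbacks) != CPG_OK)
		return 1;
	cpg_fd_get(h, &fd);

	memset(&name, 0, sizeof(name));
	sprintf(name.value, "cpgdemo:clean");	/* illustrative, not "fenced:daemon" */
	name.length = strlen(name.value) + 1;

	while ((error = cpg_join(h, &name)) == CPG_ERR_TRY_AGAIN)
		sleep(1);
	if (error != CPG_OK) {
		cpg_finalize(h);
		return 1;
	}

	/* poll the cpg fd and dispatch callbacks, as fenced's main loop does */
	for (;;) {
		pfd.fd = fd;
		pfd.events = POLLIN;
		if (poll(&pfd, 1, -1) < 0)
			break;
		if (cpg_dispatch(h, CPG_DISPATCH_ALL) != CPG_OK)
			break;
	}

	cpg_leave(h, &name);
	cpg_finalize(h);
	return 0;
}

setup_cpg() and close_cpg() in cpg.c below use the same retry loop on
CPG_ERR_TRY_AGAIN around cpg_join() and cpg_leave().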

diff --git a/fence/fenced/cpg.c b/fence/fenced/cpg.c
index ccebbd9..5b6c826 100644
--- a/fence/fenced/cpg.c
+++ b/fence/fenced/cpg.c
@@ -2,6 +2,9 @@
 #include "config.h"
 
 static unsigned int protocol_active[3] = {1, 0, 0};
+static cpg_handle_t cpg_handle_daemon;
+static struct cpg_address daemon_member_list[MAX_NODES];
+static int daemon_member_list_entries;
 
 struct member {
 	struct list_head list;
@@ -1308,7 +1311,6 @@ int fd_join(struct fd *fd)
 	}
 	if (error != CPG_OK) {
 		log_error("cpg_join error %d", error);
-		cpg_finalize(h);
 		goto fail;
 	}
 
@@ -1349,6 +1351,122 @@ int fd_leave(struct fd *fd)
 	return 0;
 }
 
+/* process_cpg(), setup_cpg(), close_cpg() are for the "daemon" cpg which
+   tracks the presence of other daemons; it's not the fenced domain cpg.
+   Joining this cpg tells others that we don't have uncontrolled dlm/gfs
+   kernel state and they can skip fencing us if we're a victim.  (We have
+   to check for that uncontrolled state before calling setup_cpg, obviously.) */
+
+static void deliver_cb_daemon(cpg_handle_t handle, struct cpg_name *group_name,
+		uint32_t nodeid, uint32_t pid, void *data, int len)
+{
+}
+
+static void confchg_cb_daemon(cpg_handle_t handle, struct cpg_name *group_name,
+		struct cpg_address *member_list, int member_list_entries,
+		struct cpg_address *left_list, int left_list_entries,
+		struct cpg_address *joined_list, int joined_list_entries)
+{
+	memset(&daemon_member_list, 0, sizeof(daemon_member_list));
+	memcpy(&daemon_member_list, member_list,
+	       member_list_entries * sizeof(struct cpg_address));
+	daemon_member_list_entries = member_list_entries;
+}
+
+static cpg_callbacks_t cpg_callbacks_daemon = {
+	.cpg_deliver_fn = deliver_cb_daemon,
+	.cpg_confchg_fn = confchg_cb_daemon,
+};
+
+void process_cpg(int ci)
+{
+	cpg_error_t error;
+
+	error = cpg_dispatch(cpg_handle_daemon, CPG_DISPATCH_ALL);
+	if (error != CPG_OK)
+		log_error("daemon cpg_dispatch error %d", error);
+}
+
+int in_daemon_member_list(int nodeid)
+{
+	int i;
+
+	cpg_dispatch(cpg_handle_daemon, CPG_DISPATCH_ALL);
+
+	for (i = 0; i < daemon_member_list_entries; i++) {
+		if (daemon_member_list[i].nodeid == nodeid)
+			return 1;
+	}
+	return 0;
+}
+
+int setup_cpg(void)
+{
+	cpg_error_t error;
+	cpg_handle_t h;
+	struct cpg_name name;
+	int i = 0, f;
+
+	error = cpg_initialize(&h, &cpg_callbacks_daemon);
+	if (error != CPG_OK) {
+		log_error("daemon cpg_initialize error %d", error);
+		goto fail;
+	}
+
+	cpg_fd_get(h, &f);
+
+	cpg_handle_daemon = h;
+
+	memset(&name, 0, sizeof(name));
+	sprintf(name.value, "fenced:daemon");
+	name.length = strlen(name.value) + 1;
+
+ retry:
+	error = cpg_join(h, &name);
+	if (error == CPG_ERR_TRY_AGAIN) {
+		sleep(1);
+		if (!(++i % 10))
+			log_error("daemon cpg_join error retrying");
+		goto retry;
+	}
+	if (error != CPG_OK) {
+		log_error("daemon cpg_join error %d", error);
+		goto fail;
+	}
+
+	log_debug("setup_cpg %d", f);
+	return f;
+
+ fail:
+	cpg_finalize(h);
+	return -1;
+}
+
+void close_cpg(void)
+{
+	cpg_error_t error;
+	struct cpg_name name;
+	int i = 0;
+
+	if (!cpg_handle_daemon)
+		return;
+
+	memset(&name, 0, sizeof(name));
+	sprintf(name.value, "fenced:daemon");
+	name.length = strlen(name.value) + 1;
+
+ retry:
+	error = cpg_leave(cpg_handle_daemon, &name);
+	if (error == CPG_ERR_TRY_AGAIN) {
+		sleep(1);
+		if (!(++i % 10))
+			log_error("daemon cpg_leave error retrying");
+		goto retry;
+	}
+	if (error != CPG_OK)
+		log_error("daemon cpg_leave error %d", error);
+}
+
 int set_node_info(struct fd *fd, int nodeid, struct fenced_node *nodeinfo)
 {
 	struct node_history *node;
diff --git a/fence/fenced/fd.h b/fence/fenced/fd.h
index 21cac39..a9dacbd 100644
--- a/fence/fenced/fd.h
+++ b/fence/fenced/fd.h
@@ -13,6 +13,7 @@
 #include <time.h>
 #include <sched.h>
 #include <limits.h>
+#include <dirent.h>
 #include <sys/ioctl.h>
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -203,6 +204,9 @@ int read_ccs(struct fd *fd);
 
 /* cpg.c */
 
+void process_cpg(int ci);
+int setup_cpg(void);
+void close_cpg(void);
 void free_cg(struct change *cg);
 void node_history_fence(struct fd *fd, int victim, int master, int how,
 			uint64_t mastertime);
@@ -216,6 +220,7 @@ int set_node_info(struct fd *fd, int nodeid, struct fenced_node *node);
 int set_domain_info(struct fd *fd, struct fenced_domain *domain);
 int set_domain_nodes(struct fd *fd, int option, int *node_count,
 		     struct fenced_node **nodes);
+int in_daemon_member_list(int nodeid);
 
 /* group.c */
 
diff --git a/fence/fenced/main.c b/fence/fenced/main.c
index ae5f662..01937f1 100644
--- a/fence/fenced/main.c
+++ b/fence/fenced/main.c
@@ -12,6 +12,7 @@ static struct client *client = NULL;
 static struct pollfd *pollfd = NULL;
 static pthread_t query_thread;
 static pthread_mutex_t query_mutex;
+static struct list_head controlled_entries;
 
 struct client {
 	int fd;
@@ -602,6 +603,108 @@ static int setup_queries(void)
 	return 0;
 }
 
+struct controlled_entry {
+	struct list_head list;
+	char path[PATH_MAX+1];
+};
+
+static void register_controlled_dir(char *path)
+{
+	struct controlled_entry *ce;
+
+	ce = malloc(sizeof(struct controlled_entry));
+	if (!ce)
+		return;
+	memset(ce, 0, sizeof(struct controlled_entry));
+	strncpy(ce->path, path, PATH_MAX);
+	list_add(&ce->list, &controlled_entries);
+}
+
+static int ignore_nolock(char *sysfs_dir, char *table)
+{
+	char path[PATH_MAX];
+	int fd;
+
+	memset(path, 0, PATH_MAX);
+
+	snprintf(path, PATH_MAX, "%s/%s/lock_module/proto_name",
+		 sysfs_dir, table);
+
+	/* lock_nolock doesn't create the "lock_module" dir at all,
+	   so we'll fail to open this */
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return 1;
+
+	close(fd);
+	return 0;
+}
+
+static int check_controlled_dir(char *path)
+{
+	DIR *d;
+	struct dirent *de;
+	int count = 0;
+
+	d = opendir(path);
+	if (!d)
+		return 0;
+
+	while ((de = readdir(d))) {
+		if (de->d_name[0] == '.')
+			continue;
+
+		if (strstr(path, "fs/gfs") && ignore_nolock(path, de->d_name))
+			continue;
+
+		log_error("found uncontrolled entry %s/%s", path, de->d_name);
+		count++;
+	}
+	closedir(d);
+
+	return count;
+}
+
+/* Joining the "fenced:daemon" cpg (in setup_cpg()) tells fenced on other
+   nodes that we are in a "clean state", and don't need fencing.  So, if
+   we're a pending fence victim on another node, they'll skip fencing us
+   once we start fenced and join the "daemon" cpg (it's not the fence domain
+   cpg which we join when fence_tool join is run).  This "daemon" cpg is just
+   to notify others that we have no uncontrolled gfs/dlm objects.
+   (Conceptually, we could use the fence domain cpg for this purpose instead,
+   but that would require processing domain membership changes during
+   fence_victims(), which would be a major change in the way the daemon works.)
+
+   So, if we (the local node) are *not* in a clean state, we don't join the
+   daemon cpg and we exit; we still need to be fenced.  If we are starting
+   up and find that instances of gfs/dlm in the kernel have been previously
+   abandoned, that's an unclean, unreset state, and we still need fencing. */
+
+static int check_uncontrolled_entries(void)
+{
+	struct controlled_entry *ce;
+	int count = 0;
+
+	list_for_each_entry(ce, &controlled_entries, list) {
+		if (strncmp(ce->path, "-", 1))
+			goto skip_default;
+	}
+
+	/* the default dirs to check */
+	register_controlled_dir("/sys/kernel/dlm");
+	register_controlled_dir("/sys/fs/gfs2");
+	register_controlled_dir("/sys/fs/gfs");
+
+ skip_default:
+	list_for_each_entry(ce, &controlled_entries, list)
+		count += check_controlled_dir(ce->path);
+
+	if (count)
+		return -1;
+	return 0;
+}
+
 void cluster_dead(int ci)
 {
 	log_error("cluster is down, exiting");
@@ -634,6 +737,15 @@ static void loop(void)
 
 	setup_logging();
 
+	rv = check_uncontrolled_entries();
+	if (rv < 0)
+		goto out;
+
+	rv = setup_cpg();
+	if (rv < 0)
+		goto out;
+	client_add(rv, process_cpg, cluster_dead);
+
 	group_mode = GROUP_LIBCPG;
 
 	if (cfgd_groupd_compat) {
@@ -648,15 +760,6 @@ static void loop(void)
 	}
 	log_debug("group_mode %d compat %d", group_mode, cfgd_groupd_compat);
 
-	if (group_mode == GROUP_LIBCPG) {
-		/*
-		rv = setup_cpg();
-		if (rv < 0)
-			goto out;
-		client_add(rv, process_cpg, cluster_dead);
-		*/
-	}
-
 	for (;;) {
 		rv = poll(pollfd, client_maxi + 1, -1);
 		if (rv == -1 && errno == EINTR) {
@@ -692,6 +795,7 @@ static void loop(void)
  out:
 	if (cfgd_groupd_compat)
 		close_groupd();
+	close_cpg();
 	close_logging();
 	close_ccs();
 	close_cman();
@@ -757,7 +861,10 @@ static void print_usage(void)
 	printf("               1: use libgroup for compat with cluster2/rhel5\n");
 	printf("               2: use groupd to detect old, or mode 1, nodes that\n"
 	       "               require compat, use libcpg if none found\n");
-	printf("  -c           All nodes are in a clean state to start\n");
+	printf("  -r <path>    Register a directory that needs to be empty for\n");
+	printf("               the daemon to start.  \"-\" to skip default directories\n");
+	printf("               /sys/fs/gfs, /sys/fs/gfs2, /sys/kernel/dlm\n");
+	printf("  -c           All nodes are in a clean state to start; do no startup fencing\n");
 	printf("  -s           Skip startup fencing of nodes with no defined fence methods\n");
 	printf("  -j <secs>    Post-join fencing delay (default %d)\n", DEFAULT_POST_JOIN_DELAY);
 	printf("  -f <secs>    Post-fail fencing delay (default %d)\n", DEFAULT_POST_FAIL_DELAY);
@@ -772,7 +879,7 @@ static void print_usage(void)
 	printf("\n");
 }
 
-#define OPTION_STRING	"L:g:cj:f:Dn:O:hVSs"
+#define OPTION_STRING	"L:g:cj:f:Dn:O:hVSse:r:"
 
 static void read_arguments(int argc, char **argv)
 {
@@ -830,6 +937,10 @@ static void read_arguments(int argc, char **argv)
 			cfgd_override_path = strdup(optarg);
 			break;
 
+		case 'r':
+			register_controlled_dir(optarg);
+			break;
+
 		case 'h':
 			print_usage();
 			exit(EXIT_SUCCESS);
@@ -879,6 +990,7 @@ static void set_oom_adj(int val)
 int main(int argc, char **argv)
 {
 	INIT_LIST_HEAD(&domains);
+	INIT_LIST_HEAD(&controlled_entries);
 
 	init_logging();
 
diff --git a/fence/fenced/recover.c b/fence/fenced/recover.c
index d70a876..ecb13d5 100644
--- a/fence/fenced/recover.c
+++ b/fence/fenced/recover.c
@@ -62,7 +62,8 @@ static int reduce_victims(struct fd *fd)
 	num_victims = list_count(&fd->victims);
 
 	list_for_each_entry_safe(node, safe, &fd->victims, list) {
-		if (is_cman_member(node->nodeid)) {
+		if (is_cman_member(node->nodeid) &&
+		    in_daemon_member_list(node->nodeid)) {
 			log_debug("reduce victim %s", node->name);
 			victim_done(fd, node->nodeid, VIC_DONE_MEMBER);
 			list_del(&node->list);
@@ -235,23 +236,24 @@ void fence_victims(struct fd *fd)
 	struct node *node;
 	int error;
 	int override = -1;
-	int member, fenced;
+	int cman_member, cpg_member, ext;
 
 	while (!list_empty(&fd->victims)) {
 		node = list_entry(fd->victims.next, struct node, list);
 
-		member = is_cman_member(node->nodeid);
+		cman_member = is_cman_member(node->nodeid);
+		cpg_member = in_daemon_member_list(node->nodeid);
 		if (group_mode == GROUP_LIBCPG)
-			fenced = is_fenced_external(fd, node->nodeid);
+			ext = is_fenced_external(fd, node->nodeid);
 		else
-			fenced = 0;
+			ext = 0;
 
-		if (member || fenced) {
+		if ((cman_member && cpg_member) || ext) {
 			log_debug("averting fence of node %s "
-				  "member %d external %d",
-				  node->name, member, fenced);
-			victim_done(fd, node->nodeid, member ? VIC_DONE_MEMBER :
-							       VIC_DONE_EXTERNAL);
+				  "cman member %d cpg member %d external %d",
+				  node->name, cman_member, cpg_member, ext);
+			victim_done(fd, node->nodeid,
+				    ext ? VIC_DONE_EXTERNAL : VIC_DONE_MEMBER);
 			list_del(&node->list);
 			free(node);
 			continue;
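
In short, the fence_victims() change above boils the skip decision down to
one predicate.  A condensed restatement using the functions from this patch
(the helper name can_skip_victim() is made up for illustration, it is not
compilable on its own, and the group_mode check around is_fenced_external()
is omitted):

/* skip fencing a victim only if it is back in the cman membership *and*
   its fenced daemon has joined "fenced:daemon" (i.e. it restarted with no
   uncontrolled gfs/dlm state), or if it was already fenced externally */
static int can_skip_victim(struct fd *fd, int nodeid)
{
	int cman_member = is_cman_member(nodeid);
	int cpg_member = in_daemon_member_list(nodeid);
	int ext = is_fenced_external(fd, nodeid);	/* GROUP_LIBCPG mode only */

	return (cman_member && cpg_member) || ext;
}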

