From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 21087 invoked by alias); 11 Apr 2008 19:50:28 -0000 Received: (qmail 21046 invoked by uid 9453); 11 Apr 2008 19:50:28 -0000 Date: Fri, 11 Apr 2008 19:50:00 -0000 Message-ID: <20080411195027.21030.qmail@sourceware.org> From: teigland@sourceware.org To: cluster-cvs@sources.redhat.com, cluster-devel@redhat.com Subject: Cluster Project branch, master, updated. gfs-kernel_0_1_22-159-gb4c3351 X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: 77bce77b5034adf8f00090b13dde7c7d481b0dd9 X-Git-Newrev: b4c3351a0850da056f879705a28aead767d78072 Mailing-List: contact cluster-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: cluster-cvs-owner@sourceware.org X-SW-Source: 2008-q2/txt/msg00070.txt.bz2 This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "Cluster Project". http://sources.redhat.com/git/gitweb.cgi?p=cluster.git;a=commitdiff;h=b4c3351a0850da056f879705a28aead767d78072 The branch, master has been updated via b4c3351a0850da056f879705a28aead767d78072 (commit) from 77bce77b5034adf8f00090b13dde7c7d481b0dd9 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit b4c3351a0850da056f879705a28aead767d78072 Author: David Teigland Date: Fri Apr 11 14:45:41 2008 -0500 dlm_controld: quorum checking Fill out the quorum dependency checking, and refine structure of the fencing and fs dependency checking which don't actually work yet. Signed-off-by: David Teigland ----------------------------------------------------------------------- Summary of changes: group/dlm_controld/cpg.c | 107 +++++++++++++++++++++++++++---------- group/dlm_controld/dlm_daemon.h | 2 + group/dlm_controld/main.c | 1 + group/dlm_controld/member_cman.c | 2 + group/dlm_controld/plock.c | 2 +- 5 files changed, 84 insertions(+), 30 deletions(-) diff --git a/group/dlm_controld/cpg.c b/group/dlm_controld/cpg.c index 21c1a43..afe12bd 100644 --- a/group/dlm_controld/cpg.c +++ b/group/dlm_controld/cpg.c @@ -32,7 +32,10 @@ struct member { struct node { struct list_head list; int nodeid; - int needs_fencing; + int check_fencing; + int check_quorum; + int check_fs; + int fs_notify; struct timeval add_time; }; @@ -266,8 +269,8 @@ static void free_ls(struct lockspace *ls) when we see a node not in this list, add entry for it with zero add_time record the time we get a good start message from the node, add_time clear add_time if the node leaves - if node fails with non-zero add_time, set needs_fencing - when a node is fenced, clear add_time and clear needs_fencing + if node fails with non-zero add_time, set check_fencing + when a node is fenced, clear add_time and clear check_fencing if a node remerges after this, no good start message, no new add_time set if a node fails with zero add_time, it doesn't need fencing if a node remerges before it's been fenced, no good start message, no new @@ -340,63 +343,109 @@ static void node_history_fail(struct lockspace *ls, int nodeid) } if (!timerisset(&node->add_time)) - node->needs_fencing = 1; + node->check_fencing = 1; + + node->check_quorum = 1; + node->check_fs = 1; } -static int failed_nodes_fenced(struct lockspace *ls) +static int check_fencing_done(struct lockspace *ls) { -#if 0 struct node *node; struct timeval last_fenced; int wait_count = 0; list_for_each_entry(node, &ls->node_history, list) { - if (!node->needs_fencing) + if (!node->check_fencing) continue; /* check with fenced to see if the node has been fenced since node->add_time */ - fencedomain_last_success(node->nodeid, &last_fenced); + /* fenced_last_success(node->nodeid, &last_fenced); */ + gettimeofday(&last_fenced, NULL); - if (last_fenced <= node->add_time) { + if (timercmp(&last_fenced, &node->add_time, >)) { + node->check_fencing = 0; + timerclear(&node->add_time); + } else { + log_group(ls, "check_fencing %d needs fencing", + node->nodeid); wait_count++; - continue; } - - /* node has been fenced */ - node->needs_fencing = 0; - timerclear(&node->add_time); } - if (wait_count) { + if (wait_count) return 0; - } /* now check if there are any outstanding fencing ops (for nodes we may not have seen in any lockspace), and return 0 if there are any */ - fencedomain_pending_count(&pending); + /* + fenced_pending_count(&pending); if (pending) return 0; -#endif + */ return 1; } -static int cluster_has_quorum(struct lockspace *ls) +static int check_quorum_done(struct lockspace *ls) { - /* verify cman_last_failure_time() for this node is more recent - than when we last saw the node added; then we know that the - quorum result from cman is accounting for the given failure. */ + struct node *node; + int wait_count = 0; + + if (!cman_quorate) { + log_group(ls, "check_quorum %d", cman_quorate); + return 0; + } + + list_for_each_entry(node, &ls->node_history, list) { + if (!node->check_quorum) + continue; + + if (!is_cman_member(node->nodeid)) { + node->check_quorum = 0; + } else { + log_group(ls, "check_quorum %d is_cman_member", + node->nodeid); + wait_count++; + } + } + + if (wait_count) + return 0; + + log_group(ls, "check_quorum done"); return 1; } -static int cluster_filesystem_stopped(struct lockspace *ls) +static int check_fs_done(struct lockspace *ls) { - /* communicate with fs daemon through the fscontrol:hostname - cpg to check if the fs has been notified of any node failures - in this change */ + struct node *node; + int wait_count = 0; + + /* no corresponding fs for this lockspace */ + if (!ls->fs_registered) + return 1; + + list_for_each_entry(node, &ls->node_history, list) { + if (!node->check_fs) + continue; + + if (node->fs_notify) { + node->check_fs = 0; + } else { + log_group(ls, "check_fs %d needs fs notify", + node->nodeid); + wait_count++; + } + } + + if (wait_count) + return 0; + + log_group(ls, "check_fs done"); return 1; } @@ -490,7 +539,7 @@ static int wait_conditions_done(struct lockspace *ls) that have occured since the last change applied to dlm-kernel, not just the latest change */ - if (!failed_nodes_fenced(ls)) { + if (!check_fencing_done(ls)) { poll_fencing = 1; return 0; } @@ -500,13 +549,13 @@ static int wait_conditions_done(struct lockspace *ls) sufficient because we don't want to start new lockspaces in an inquorate cluster */ - if (!cluster_has_quorum(ls)) { + if (!check_quorum_done(ls)) { poll_quorum = 1; return 0; } poll_quorum = 0; - if (!cluster_filesystem_stopped(ls)) { + if (!check_fs_done(ls)) { poll_fs = 1; return 0; } diff --git a/group/dlm_controld/dlm_daemon.h b/group/dlm_controld/dlm_daemon.h index d5657dd..b969bdc 100644 --- a/group/dlm_controld/dlm_daemon.h +++ b/group/dlm_controld/dlm_daemon.h @@ -68,6 +68,7 @@ extern int poll_ignore_plock; extern int plock_fd; extern int plock_ci; extern struct list_head lockspaces; +extern int cman_quorate; extern int our_nodeid; extern char daemon_debug_buf[256]; extern char dump_buf[DUMP_SIZE]; @@ -149,6 +150,7 @@ struct lockspace { int joining; int leaving; int kernel_stopped; + int fs_registered; uint32_t change_seq; struct change *started_change; struct list_head changes; diff --git a/group/dlm_controld/main.c b/group/dlm_controld/main.c index 0e4bc15..b954f53 100644 --- a/group/dlm_controld/main.c +++ b/group/dlm_controld/main.c @@ -881,6 +881,7 @@ int poll_ignore_plock; int plock_fd; int plock_ci; struct list_head lockspaces; +int cman_quorate; int our_nodeid; char daemon_debug_buf[256]; char dump_buf[DUMP_SIZE]; diff --git a/group/dlm_controld/member_cman.c b/group/dlm_controld/member_cman.c index 847351a..c871097 100644 --- a/group/dlm_controld/member_cman.c +++ b/group/dlm_controld/member_cman.c @@ -71,6 +71,8 @@ static void statechange(void) int num_addrs; struct cman_node_address *addrptr = addrs; + cman_quorate = cman_is_quorate(ch); + old_node_count = cman_node_count; memcpy(&old_nodes, &cman_nodes, sizeof(old_nodes)); diff --git a/group/dlm_controld/plock.c b/group/dlm_controld/plock.c index a862356..4dc38ac 100644 --- a/group/dlm_controld/plock.c +++ b/group/dlm_controld/plock.c @@ -1816,7 +1816,7 @@ static int _unlink_checkpoint(struct lockspace *ls, SaNameT *name) if (rv == SA_AIS_OK) goto out_close; - log_error("unlink ckpt error %d %s", rv, ls->name); + log_group(ls, "unlink ckpt error %d %s", rv, ls->name); ret = -1; status_retry: hooks/post-receive -- Cluster Project