Subject: gfs2-utils: master - gfs_controld: remove groupd compat
To: cluster-cvs-relay@redhat.com
From: David Teigland
Message-Id: <20090109200424.EC0B31204EA@lists.fedorahosted.org>
Date: Fri, 09 Jan 2009 20:05:00 -0000
X-Project: Cluster Project
X-Git-Module: gfs2-utils.git
X-Git-Refname: refs/heads/master
X-Git-Reftype: branch
X-Git-Oldrev: 7c2220b33a5bdb50727717d2d4c4475a49a8de6e
X-Git-Newrev: 8c4b2d6c09b5547fb3f556dc19ccf0e9ffe760f0

Gitweb:        http://git.fedorahosted.org/git/gfs2-utils.git?p=gfs2-utils.git;a=commitdiff;h=8c4b2d6c09b5547fb3f556dc19ccf0e9ffe760f0
Commit:        8c4b2d6c09b5547fb3f556dc19ccf0e9ffe760f0
Parent:        7c2220b33a5bdb50727717d2d4c4475a49a8de6e
Author:        David Teigland
AuthorDate:    Fri Jan 9 14:24:08 2009 -0600
Committer:     David Teigland
CommitterDate: Fri Jan 9 14:24:08 2009 -0600

    gfs_controld: remove groupd compat

    Signed-off-by: David Teigland
---
 group/gfs_controld/Makefile     |    5 -
 group/gfs_controld/config.c     |   72 +-
 group/gfs_controld/config.h     |   24 -
 group/gfs_controld/cpg-old.c    | 2442 ---------------------------------------
 group/gfs_controld/cpg-old.h    |   73 --
 group/gfs_controld/gfs_daemon.h |  101 --
 group/gfs_controld/group.c      |  360 ------
 group/gfs_controld/main.c       |  257 +----
 group/gfs_controld/plock.c      | 2361 -------------------------------------
 group/gfs_controld/util.c       |    9 +-
 10 files changed, 25 insertions(+), 5679 deletions(-)

diff --git a/group/gfs_controld/Makefile b/group/gfs_controld/Makefile
index e194610..0bfe9d5 100644
--- a/group/gfs_controld/Makefile
+++ b/group/gfs_controld/Makefile
@@ -15,10 +15,7 @@ OBJS=	main.o \
 	config.o \
 	crc.o \
 	cpg-new.o \
-	cpg-old.o \
-	group.o \
 	util.o \
-	plock.o \
 	logging.o
 
 CFLAGS += -I${ccsincdir} -I${cmanincdir} -I${logtincdir} -I${dlmcontrolincdir}
@@ -34,10 +31,8 @@ LDFLAGS += -L${logtlibdir} -llogthread
 LDFLAGS += -L${corosynclibdir} -lcpg -lpthread
 LDFLAGS += -L${openaislibdir} -lSaCkpt
 LDFLAGS += -L${fencedlibdir} -lfenced
-LDFLAGS += -L${grouplibdir} -lgroup
 LDFLAGS += -L${libdir}
 
-LDDEPS += ${grouplibdir}/libgroup.a
 LDDEPS += ${fencedlibdir}/libfenced.a
 
 ${TARGET}: ${OBJS} ${LDDEPS}
diff --git a/group/gfs_controld/config.c b/group/gfs_controld/config.c index b6e2ebe..896e063 100644 --- a/group/gfs_controld/config.c +++ b/group/gfs_controld/config.c @@ -29,29 +29,13 @@ int ccs_handle; /* was a config value set on command line?, 0 or 1. */ -int optd_groupd_compat; int optd_debug_logfile; int optd_enable_withdraw; -int optd_enable_plock; -int optd_plock_debug; -int optd_plock_rate_limit; -int optd_plock_ownership; -int optd_drop_resources_time; -int optd_drop_resources_count; -int optd_drop_resources_age; /* actual config value from command line, cluster.conf, or default. */ -int cfgd_groupd_compat = DEFAULT_GROUPD_COMPAT; int cfgd_debug_logfile = DEFAULT_DEBUG_LOGFILE; int cfgd_enable_withdraw = DEFAULT_ENABLE_WITHDRAW; -int cfgd_enable_plock = DEFAULT_ENABLE_PLOCK; -int cfgd_plock_debug = DEFAULT_PLOCK_DEBUG; -int cfgd_plock_rate_limit = DEFAULT_PLOCK_RATE_LIMIT; -int cfgd_plock_ownership = DEFAULT_PLOCK_OWNERSHIP; -int cfgd_drop_resources_time = DEFAULT_DROP_RESOURCES_TIME; -int cfgd_drop_resources_count = DEFAULT_DROP_RESOURCES_COUNT; -int cfgd_drop_resources_age = DEFAULT_DROP_RESOURCES_AGE; void read_ccs_name(char *path, char *name) { @@ -140,28 +124,14 @@ void read_ccs_nodir(struct mountgroup *mg, char *buf) free(str); } -#define GROUPD_COMPAT_PATH "/cluster/group/@groupd_compat" #define ENABLE_WITHDRAW_PATH "/cluster/gfs_controld/@enable_withdraw" -#define ENABLE_PLOCK_PATH "/cluster/gfs_controld/@enable_plock" -#define PLOCK_DEBUG_PATH "/cluster/gfs_controld/@plock_debug" -#define PLOCK_RATE_LIMIT_PATH "/cluster/gfs_controld/@plock_rate_limit" -#define PLOCK_OWNERSHIP_PATH "/cluster/gfs_controld/@plock_ownership" -#define DROP_RESOURCES_TIME_PATH "/cluster/gfs_controld/@drop_resources_time" -#define DROP_RESOURCES_COUNT_PATH "/cluster/gfs_controld/@drop_resources_count" -#define DROP_RESOURCES_AGE_PATH "/cluster/gfs_controld/@drop_resources_age" - -#define DLM_PLOCK_RATE_LIMIT_PATH "/cluster/dlm/@plock_rate_limit" -#define DLM_PLOCK_OWNERSHIP_PATH "/cluster/dlm/@plock_ownership" -#define DLM_DROP_RESOURCES_TIME_PATH "/cluster/dlm/@drop_resources_time" -#define DLM_DROP_RESOURCES_COUNT_PATH "/cluster/dlm/@drop_resources_count" -#define DLM_DROP_RESOURCES_AGE_PATH "/cluster/dlm/@drop_resources_age" int setup_ccs(void) { - int cd, rv; + int cd; if (ccs_handle) - goto update; + return 0; cd = ccs_connect(); if (cd < 0) { @@ -170,46 +140,8 @@ int setup_ccs(void) } ccs_handle = cd; - /* These config values are set from cluster.conf only if they haven't - already been set on the command line. 
*/ - - if (!optd_groupd_compat) - read_ccs_int(GROUPD_COMPAT_PATH, &cfgd_groupd_compat); if (!optd_enable_withdraw) read_ccs_int(ENABLE_WITHDRAW_PATH, &cfgd_enable_withdraw); - if (!optd_enable_plock) - read_ccs_int(ENABLE_PLOCK_PATH, &cfgd_enable_plock); - if (!optd_plock_ownership) { - rv = read_ccs_int(PLOCK_OWNERSHIP_PATH, &cfgd_plock_ownership); - if (rv < 0) - read_ccs_int(DLM_PLOCK_OWNERSHIP_PATH, &cfgd_plock_ownership); - } - - /* The following can be changed while running */ - update: - if (!optd_plock_debug) { - read_ccs_int(PLOCK_DEBUG_PATH, &cfgd_plock_debug); - } - if (!optd_plock_rate_limit) { - rv = read_ccs_int(PLOCK_RATE_LIMIT_PATH, &cfgd_plock_rate_limit); - if (rv < 0) - read_ccs_int(DLM_PLOCK_RATE_LIMIT_PATH, &cfgd_plock_rate_limit); - } - if (!optd_drop_resources_time) { - rv = read_ccs_int(DROP_RESOURCES_TIME_PATH, &cfgd_drop_resources_time); - if (rv < 0) - read_ccs_int(DLM_DROP_RESOURCES_TIME_PATH, &cfgd_drop_resources_time); - } - if (!optd_drop_resources_count) { - rv = read_ccs_int(DROP_RESOURCES_COUNT_PATH, &cfgd_drop_resources_count); - if (rv < 0) - read_ccs_int(DLM_DROP_RESOURCES_COUNT_PATH, &cfgd_drop_resources_count); - } - if (!optd_drop_resources_age) { - rv = read_ccs_int(DROP_RESOURCES_AGE_PATH, &cfgd_drop_resources_age); - if (rv < 0) - read_ccs_int(DLM_DROP_RESOURCES_AGE_PATH, &cfgd_drop_resources_age); - } return 0; } diff --git a/group/gfs_controld/config.h b/group/gfs_controld/config.h index ee0b3b6..bc4788d 100644 --- a/group/gfs_controld/config.h +++ b/group/gfs_controld/config.h @@ -1,38 +1,14 @@ #ifndef __CONFIG_DOT_H__ #define __CONFIG_DOT_H__ -#define DEFAULT_GROUPD_COMPAT 2 #define DEFAULT_DEBUG_LOGFILE 0 #define DEFAULT_ENABLE_WITHDRAW 1 -#define DEFAULT_ENABLE_PLOCK 1 -#define DEFAULT_PLOCK_DEBUG 0 -#define DEFAULT_PLOCK_RATE_LIMIT 100 -#define DEFAULT_PLOCK_OWNERSHIP 1 -#define DEFAULT_DROP_RESOURCES_TIME 10000 /* 10 sec */ -#define DEFAULT_DROP_RESOURCES_COUNT 10 -#define DEFAULT_DROP_RESOURCES_AGE 10000 /* 10 sec */ -extern int optd_groupd_compat; extern int optd_debug_logfile; extern int optd_enable_withdraw; -extern int optd_enable_plock; -extern int optd_plock_debug; -extern int optd_plock_rate_limit; -extern int optd_plock_ownership; -extern int optd_drop_resources_time; -extern int optd_drop_resources_count; -extern int optd_drop_resources_age; -extern int cfgd_groupd_compat; extern int cfgd_debug_logfile; extern int cfgd_enable_withdraw; -extern int cfgd_enable_plock; -extern int cfgd_plock_debug; -extern int cfgd_plock_rate_limit; -extern int cfgd_plock_ownership; -extern int cfgd_drop_resources_time; -extern int cfgd_drop_resources_count; -extern int cfgd_drop_resources_age; #endif diff --git a/group/gfs_controld/cpg-old.c b/group/gfs_controld/cpg-old.c deleted file mode 100644 index b353867..0000000 --- a/group/gfs_controld/cpg-old.c +++ /dev/null @@ -1,2442 +0,0 @@ -#include "gfs_daemon.h" -#include "config.h" -#include "cpg-old.h" -#include "libgroup.h" - -#define ASSERT(x) \ -do { \ - if (!(x)) { \ - log_error("Assertion failed on line %d of file %s\n" \ - "Assertion: \"%s\"\n", __LINE__, __FILE__, #x); \ - } \ -} while (0) - -#define JID_INIT -9 - -/* mg_member opts bit field */ - -enum { - MEMB_OPT_RW = 1, - MEMB_OPT_RO = 2, - MEMB_OPT_SPECT = 4, - MEMB_OPT_RECOVER = 8, -}; - -/* mg_member state: local_recovery_status, recovery_status */ - -enum { - RS_NEED_RECOVERY = 1, - RS_SUCCESS, - RS_GAVEUP, - RS_NOFS, - RS_READONLY, -}; - -extern group_handle_t gh; - -/* cpg message protocol - 1.0.0 is initial version - 2.0.0 is 
incompatible with 1.0.0 and allows plock ownership */ -static unsigned int protocol_v100[3] = {1, 0, 0}; -static unsigned int protocol_v200[3] = {2, 0, 0}; -static unsigned int protocol_active[3]; - - -static void send_journals(struct mountgroup *mg, int nodeid); - - -static char *msg_name(int type) -{ - switch (type) { - case MSG_JOURNAL: - return "MSG_JOURNAL"; - case MSG_OPTIONS: - return "MSG_OPTIONS"; - case MSG_REMOUNT: - return "MSG_REMOUNT"; - case MSG_PLOCK: - return "MSG_PLOCK"; - case MSG_MOUNT_STATUS: - return "MSG_MOUNT_STATUS"; - case MSG_RECOVERY_STATUS: - return "MSG_RECOVERY_STATUS"; - case MSG_RECOVERY_DONE: - return "MSG_RECOVERY_DONE"; - case MSG_WITHDRAW: - return "MSG_WITHDRAW"; - } - return "unknown"; -} - -static int _send_message(cpg_handle_t h, void *buf, int len, int type) -{ - struct iovec iov; - cpg_error_t error; - int retries = 0; - - iov.iov_base = buf; - iov.iov_len = len; - - retry: - error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1); - if (error == CPG_ERR_TRY_AGAIN) { - retries++; - usleep(1000); - if (!(retries % 100)) - log_error("cpg_mcast_joined retry %d %s", - retries, msg_name(type)); - goto retry; - } - if (error != CPG_OK) { - log_error("cpg_mcast_joined error %d handle %llx %s", - error, (unsigned long long)h, msg_name(type)); - return -1; - } - - if (retries) - log_debug("cpg_mcast_joined retried %d %s", - retries, msg_name(type)); - - return 0; -} - -int send_group_message_old(struct mountgroup *mg, int len, char *buf) -{ - struct gdlm_header *hd = (struct gdlm_header *) buf; - int type = hd->type; - - hd->version[0] = cpu_to_le16(protocol_active[0]); - hd->version[1] = cpu_to_le16(protocol_active[1]); - hd->version[2] = cpu_to_le16(protocol_active[2]); - hd->type = cpu_to_le16(hd->type); - hd->nodeid = cpu_to_le32(hd->nodeid); - hd->to_nodeid = cpu_to_le32(hd->to_nodeid); - memcpy(hd->name, mg->name, strlen(mg->name)); - - return _send_message(cpg_handle_daemon, buf, len, type); -} - -static struct mg_member *find_memb_nodeid(struct mountgroup *mg, int nodeid) -{ - struct mg_member *memb; - - list_for_each_entry(memb, &mg->members, list) { - if (memb->nodeid == nodeid) - return memb; - } - return NULL; -} - -static struct mg_member *find_memb_jid(struct mountgroup *mg, int jid) -{ - struct mg_member *memb; - - list_for_each_entry(memb, &mg->members, list) { - if (memb->jid == jid) - return memb; - } - return NULL; -} - -static void notify_mount_client(struct mountgroup *mg) -{ - struct mg_member *memb; - - if (!mg->mount_client_result && mg->mount_client_delay) { - log_group(mg, "notify_mount_client delayed"); - return; - } - - client_reply_join_full(mg, mg->mount_client_result); - - if (mg->mount_client_result) { - log_group(mg, "leaving due to mount error: %d", - mg->mount_client_result); - - memb = find_memb_nodeid(mg, our_nodeid); - if (memb->finished) - group_leave(gh, mg->name); - else { - log_group(mg, "delay leave until after join"); - mg->group_leave_on_finish = 1; - } - } else { - mg->mount_client_notified = 1; - } -} - -/* we can receive recovery_status messages from other nodes doing start before - we actually process the corresponding start callback ourselves */ - -void save_message_old(struct mountgroup *mg, char *buf, int len, int from, - int type) -{ - struct save_msg *sm; - - sm = malloc(sizeof(struct save_msg) + len); - if (!sm) - return; - memset(sm, 0, sizeof(struct save_msg) + len); - - memcpy(&sm->buf, buf, len); - sm->type = type; - sm->len = len; - sm->nodeid = from; - - log_group(mg, "save %s from %d len %d", 
msg_name(type), from, len); - - list_add_tail(&sm->list, &mg->saved_messages); -} - -static int first_mounter_recovery(struct mountgroup *mg) -{ - struct mg_member *memb; - - list_for_each_entry(memb, &mg->members, list) { - if (memb->opts & MEMB_OPT_RECOVER) - return memb->nodeid; - } - return 0; -} - -static int local_first_mounter_recovery(struct mountgroup *mg) -{ - int nodeid; - - nodeid = first_mounter_recovery(mg); - if (nodeid == our_nodeid) - return 1; - return 0; -} - -int remote_first_mounter_recovery(struct mountgroup *mg) -{ - int nodeid; - - nodeid = first_mounter_recovery(mg); - if (nodeid && (nodeid != our_nodeid)) - return 1; - return 0; -} - -static void start_done(struct mountgroup *mg) -{ - log_group(mg, "start_done %d", mg->start_event_nr); - group_start_done(gh, mg->name, mg->start_event_nr); -} - -void send_withdraw_old(struct mountgroup *mg) -{ - struct gdlm_header *hd; - int len; - char *buf; - - len = sizeof(struct gdlm_header); - - buf = malloc(len); - if (!buf) - return; - memset(buf, 0, len); - - hd = (struct gdlm_header *)buf; - hd->type = MSG_WITHDRAW; - hd->nodeid = our_nodeid; - hd->to_nodeid = 0; - - log_group(mg, "send_withdraw"); - - send_group_message_old(mg, len, buf); - - free(buf); -} - -static void receive_withdraw(struct mountgroup *mg, char *buf, int len, int from) -{ - struct mg_member *memb; - - memb = find_memb_nodeid(mg, from); - if (!memb) { - log_group(mg, "receive_withdraw no member %d", from); - return; - } - log_group(mg, "receive_withdraw from %d", from); - memb->withdrawing = 1; - - if (from == our_nodeid) - group_leave(gh, mg->name); -} - -#define SEND_RS_INTS 3 - -static void send_recovery_status(struct mountgroup *mg) -{ - struct gdlm_header *hd; - struct mg_member *memb; - int len, *p, i, n = 0; - char *buf; - - list_for_each_entry(memb, &mg->members_gone, list) { - if (memb->local_recovery_status == RS_SUCCESS) - n++; - } - - len = sizeof(struct gdlm_header) + (n * SEND_RS_INTS * sizeof(int)); - - buf = malloc(len); - if (!buf) - return; - memset(buf, 0, len); - - hd = (struct gdlm_header *)buf; - hd->type = MSG_RECOVERY_STATUS; - hd->nodeid = our_nodeid; - hd->to_nodeid = 0; - p = (int *) (buf + sizeof(struct gdlm_header)); - - i = 0; - list_for_each_entry(memb, &mg->members_gone, list) { - if (memb->local_recovery_status != RS_SUCCESS) - continue; - p[i] = cpu_to_le32(memb->nodeid); - i++; - p[i] = cpu_to_le32(memb->jid); - i++; - p[i] = cpu_to_le32(memb->local_recovery_status); - i++; - } - - log_group(mg, "send_recovery_status for %d nodes len %d", n, len); - - send_group_message_old(mg, len, buf); - - free(buf); -} - -/* Note: we can get more than one node reporting success in recovering - the journal for a failed node. The first has really recovered it, - the rest have found the fs clean and report success. 
*/ - -static void _receive_recovery_status(struct mountgroup *mg, char *buf, int len, - int from) -{ - struct mg_member *memb; - int *p, n, i, nodeid, jid, status, found = 0; - - n = (len - sizeof(struct gdlm_header)) / (SEND_RS_INTS * sizeof(int)); - - p = (int *) (buf + sizeof(struct gdlm_header)); - - for (i = 0; i < n; i++) { - nodeid = le32_to_cpu(p[i * SEND_RS_INTS]); - jid = le32_to_cpu(p[i * SEND_RS_INTS + 1]); - status = le32_to_cpu(p[i * SEND_RS_INTS + 2]); - - ASSERT(status == RS_SUCCESS); - - found = 0; - list_for_each_entry(memb, &mg->members_gone, list) { - if (memb->nodeid != nodeid) - continue; - ASSERT(memb->jid == jid); - ASSERT(memb->recovery_status == RS_NEED_RECOVERY || - memb->recovery_status == RS_SUCCESS); - memb->recovery_status = status; - found = 1; - break; - } - - log_group(mg, "receive_recovery_status from %d len %d " - "nodeid %d jid %d status %d found %d", - from, len, nodeid, jid, status, found); - } - - if (from == our_nodeid) - start_done(mg); -} - -static void process_saved_recovery_status(struct mountgroup *mg) -{ - struct save_msg *sm, *sm2; - - if (list_empty(&mg->saved_messages)) - return; - - log_group(mg, "process_saved_recovery_status"); - - list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) { - if (sm->type != MSG_RECOVERY_STATUS) - continue; - _receive_recovery_status(mg, sm->buf, sm->len, sm->nodeid); - list_del(&sm->list); - free(sm); - } -} - -static void assign_next_first_mounter(struct mountgroup *mg) -{ - struct mg_member *memb, *next = NULL; - int low = -1; - - list_for_each_entry(memb, &mg->members, list) { - if (memb->jid == -2) - continue; - if (memb->jid == -9) - continue; - if (memb->spectator || memb->readonly || memb->withdrawing || - memb->ms_kernel_mount_done) - continue; - if (low == -1 || memb->nodeid < low) { - next = memb; - low = memb->nodeid; - } - } - - if (next) { - log_group(mg, "next first mounter is %d jid %d opts %x", - next->nodeid, next->jid, next->opts); - next->opts |= MEMB_OPT_RECOVER; - ASSERT(next->jid >= 0); - } else - log_group(mg, "no next mounter available yet"); -} - -#define SEND_MS_INTS 4 - -void send_mount_status_old(struct mountgroup *mg) -{ - struct gdlm_header *hd; - int len, *p; - char *buf; - - len = sizeof(struct gdlm_header) + (SEND_MS_INTS * sizeof(int)); - - buf = malloc(len); - if (!buf) - return; - memset(buf, 0, len); - - hd = (struct gdlm_header *)buf; - hd->type = MSG_MOUNT_STATUS; - hd->nodeid = our_nodeid; - hd->to_nodeid = 0; - - p = (int *) (buf + sizeof(struct gdlm_header)); - - p[0] = cpu_to_le32(mg->first_mounter); - p[1] = cpu_to_le32(mg->kernel_mount_error); - p[2] = 0; /* unused */ - p[3] = 0; /* unused */ - - log_group(mg, "send_mount_status kernel_mount_error %d " - "first_mounter %d", - mg->kernel_mount_error, - mg->first_mounter); - - send_group_message_old(mg, len, buf); - - free(buf); -} - -static void _receive_mount_status(struct mountgroup *mg, char *buf, int len, - int from) -{ - struct mg_member *memb, *us; - int *p; - - p = (int *) (buf + sizeof(struct gdlm_header)); - - memb = find_memb_nodeid(mg, from); - if (!memb) { - log_group(mg, "_receive_mount_status no node %d", from); - return; - } - - memb->ms_kernel_mount_done = 1; - memb->ms_first_mounter = le32_to_cpu(p[0]); - memb->ms_kernel_mount_error = le32_to_cpu(p[1]); - - log_group(mg, "_receive_mount_status from %d kernel_mount_error %d " - "first_mounter %d opts %x", from, - memb->ms_kernel_mount_error, memb->ms_first_mounter, - memb->opts); - - if (memb->opts & MEMB_OPT_RECOVER) { - 
ASSERT(memb->ms_first_mounter); - } - if (memb->ms_first_mounter) { - ASSERT(memb->opts & MEMB_OPT_RECOVER); - } - - if (memb->ms_first_mounter) { - memb->opts &= ~MEMB_OPT_RECOVER; - - if (!memb->ms_kernel_mount_error) { - /* the first mounter has successfully mounted, we can - go ahead and mount now */ - - if (mg->mount_client_delay) { - mg->mount_client_delay = 0; - notify_mount_client(mg); - } - } else { - /* first mounter mount failed, next low node should be - made first mounter */ - - memb->jid = -2; - if (from == our_nodeid) - mg->our_jid = -2; - - assign_next_first_mounter(mg); - - /* if we became the next first mounter, then notify - mount client */ - - us = find_memb_nodeid(mg, our_nodeid); - if (us->opts & MEMB_OPT_RECOVER) { - log_group(mg, "we are next first mounter"); - mg->first_mounter = 1; - mg->first_mounter_done = 0; - mg->mount_client_delay = 0; - notify_mount_client(mg); - } - } - } -} - -static void receive_mount_status(struct mountgroup *mg, char *buf, int len, - int from) -{ - log_group(mg, "receive_mount_status from %d len %d last_cb %d", - from, len, mg->last_callback); - - if (!mg->got_our_options) { - log_group(mg, "ignore mount_status from %d", from); - return; - } - - if (!mg->got_our_journals) - save_message_old(mg, buf, len, from, MSG_MOUNT_STATUS); - else - _receive_mount_status(mg, buf, len, from); -} - -/* We delay processing mount_status msesages until we receive the journals - message for our own mount. Our journals message is a snapshot of the memb - list at the time our options message is received on the remote node. We - ignore any messages that would change the memb list prior to seeing our own - options message and we save any messages that would change the memb list - after seeing our own options message and before we receive the memb list - from the journals message. 
*/ - -static void process_saved_mount_status(struct mountgroup *mg) -{ - struct save_msg *sm, *sm2; - - if (list_empty(&mg->saved_messages)) - return; - - log_group(mg, "process_saved_mount_status"); - - list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) { - if (sm->type != MSG_MOUNT_STATUS) - continue; - _receive_mount_status(mg, sm->buf, sm->len, sm->nodeid); - list_del(&sm->list); - free(sm); - } -} - -static void receive_recovery_status(struct mountgroup *mg, char *buf, int len, - int from) -{ - switch (mg->last_callback) { - case DO_STOP: - save_message_old(mg, buf, len, from, MSG_RECOVERY_STATUS); - break; - case DO_START: - _receive_recovery_status(mg, buf, len, from); - break; - default: - log_group(mg, "receive_recovery_status %d last_callback %d", - from, mg->last_callback); - } -} - -/* tell others that all journals are recovered; they should clear - memb's from members_gone, clear needs_recovery and unblock locks */ - -static void send_recovery_done(struct mountgroup *mg) -{ - struct gdlm_header *hd; - int len; - char *buf; - - len = sizeof(struct gdlm_header); - - buf = malloc(len); - if (!buf) - return; - memset(buf, 0, len); - - hd = (struct gdlm_header *)buf; - hd->type = MSG_RECOVERY_DONE; - hd->nodeid = our_nodeid; - hd->to_nodeid = 0; - - send_group_message_old(mg, len, buf); - - free(buf); -} - -static void receive_recovery_done(struct mountgroup *mg, char *buf, int len, - int from) -{ - struct mg_member *memb, *safe; - - log_group(mg, "receive_recovery_done from %d needs_recovery %d", - from, mg->needs_recovery); - - list_for_each_entry_safe(memb, safe, &mg->members_gone, list) { - log_group(mg, "receive_recovery_done clear jid %d nodeid %d", - memb->jid, memb->nodeid); - list_del(&memb->list); - free(memb); - } - - mg->needs_recovery = 0; - mg->kernel_stopped = 0; /* for queries */ - set_sysfs(mg, "block", 0); -} - -void send_remount_old(struct mountgroup *mg, struct gfsc_mount_args *ma) -{ - struct gdlm_header *hd; - char *buf; - int len; - int ro = strstr(ma->options, "ro") ? 1 : 0; - - len = sizeof(struct gdlm_header) + MAX_OPTIONS_LEN; - - buf = malloc(len); - if (!buf) - return; - memset(buf, 0, len); - - hd = (struct gdlm_header *)buf; - hd->type = MSG_REMOUNT; - hd->nodeid = our_nodeid; - hd->to_nodeid = 0; - - strcpy(buf+sizeof(struct gdlm_header), ro ? "ro" : "rw"); - - log_group(mg, "send_remount_old len %d \"%s\"", len, - buf+sizeof(struct gdlm_header)); - - send_group_message_old(mg, len, buf); - - free(buf); -} - -static void receive_remount(struct mountgroup *mg, char *buf, int len, int from) -{ - struct mg_member *memb; - char *options; - int rw = 0, ro = 0; - int result = 0; - - options = (char *) (buf + sizeof(struct gdlm_header)); - - memb = find_memb_nodeid(mg, from); - if (!memb) { - log_error("receive_remount: unknown nodeid %d", from); - return; - } - - if (strstr(options, "rw")) - rw = 1; - else if (strstr(options, "ro")) - ro = 1; - else { - result = -EINVAL; - goto out; - } - - /* FIXME: check if we've even fully completed our normal mount yet - (received our own mount-status?) if not, then disallow remount */ - - /* FIXME: going ro->rw may mean we can now do journal or first-mounter - recovery that we couldn't do before. 
*/ - - memb->readonly = ro; - memb->rw = !ro; - - if (ro) { - memb->opts &= ~MEMB_OPT_RW; - memb->opts |= MEMB_OPT_RO; - } else { - memb->opts &= ~MEMB_OPT_RO; - memb->opts |= MEMB_OPT_RW; - } - out: - if (from == our_nodeid) { - if (!result) { - mg->rw = memb->rw; - mg->ro = memb->readonly; - } - client_reply_remount(mg, mg->remount_client, result); - } - - log_group(mg, "receive_remount from %d rw=%d ro=%d opts=%x", - from, memb->rw, memb->readonly, memb->opts); -} - -static void set_our_memb_options(struct mountgroup *mg) -{ - struct mg_member *memb; - memb = find_memb_nodeid(mg, our_nodeid); - ASSERT(memb); - - if (mg->ro) { - memb->readonly = 1; - memb->opts |= MEMB_OPT_RO; - } else if (mg->spectator) { - memb->spectator = 1; - memb->opts |= MEMB_OPT_SPECT; - } else if (mg->rw) { - memb->rw = 1; - memb->opts |= MEMB_OPT_RW; - } -} - -static void send_options(struct mountgroup *mg) -{ - struct gdlm_header *hd; - int len; - char *buf; - - len = sizeof(struct gdlm_header) + MAX_OPTIONS_LEN; - - buf = malloc(len); - if (!buf) - return; - memset(buf, 0, len); - - hd = (struct gdlm_header *)buf; - hd->type = MSG_OPTIONS; - hd->nodeid = our_nodeid; - hd->to_nodeid = 0; - - strncpy(buf+sizeof(struct gdlm_header), mg->mount_args.options, - MAX_OPTIONS_LEN-1); - - log_group(mg, "send_options len %d \"%s\"", len, - buf+sizeof(struct gdlm_header)); - - send_group_message_old(mg, len, buf); - - free(buf); -} - -/* We set the new member's jid to the lowest unused jid. If we're the lowest - existing member (by nodeid), then send jid info to the new node. */ - -/* Look at rw/ro/spectator status of all existing mounters and whether - we need to do recovery. Based on that, decide if the current mount - mode (ro/spectator) is permitted; if not, set jid = -2. If spectator - mount and it's ok, set jid = -1. If ro or rw mount and it's ok, set - real jid. */ - -static int assign_journal(struct mountgroup *mg, struct mg_member *new) -{ - struct mg_member *memb, *memb_recover = NULL, *memb_mounted = NULL; - int i, total, rw_count, ro_count, spect_count, invalid_count; - - total = rw_count = ro_count = spect_count = invalid_count = 0; - - list_for_each_entry(memb, &mg->members, list) { - if (memb->nodeid == new->nodeid) - continue; - total++; - if (memb->jid == -2) - invalid_count++; - else if (memb->spectator) - spect_count++; - else if (memb->rw) - rw_count++; - else if (memb->readonly) - ro_count++; - - if (memb->opts & MEMB_OPT_RECOVER) { - memb_recover = memb; - log_group(mg, "assign_journal: memb %d has OPT_RECOVER", - memb->nodeid); - } - - if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error) - memb_mounted = memb; - } - - log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d " - "needs_recovery %d", total, invalid_count, rw_count, - ro_count, spect_count, mg->needs_recovery); - - if (new->spectator) { - log_group(mg, "assign_journal: new spectator allowed"); - new->jid = -1; - goto out; - } - - for (i = 0; i < 1024; i++) { - memb = find_memb_jid(mg, i); - if (!memb) { - new->jid = i; - break; - } - } - - /* Repeat first-mounter recovery: the fs has been mounted and in-use, - but nodes have failed and none of the current mounters has been able - to do recovery (all remaining nodes may be ro/spect for example). - This puts us into the special "needs_recovery" state where new - mounters are asked to do first-mounter recovery of the fs while - the current mounters sit in a blocked state. 
*/ - - if (mg->needs_recovery) { - if (!memb_recover) { - log_group(mg, "assign_journal: needs_recovery: " - "new memb %d gets OPT_RECOVER", - new->nodeid); - new->opts |= MEMB_OPT_RECOVER; - } else { - log_group(mg, "assign_journal: needs_recovery: " - "new memb %d memb %d has OPT_RECOVER", - new->nodeid, memb_recover->nodeid); - } - goto out; - } - - /* Initial first-mounter recovery: the fs is coming online, the first - mg member assumes first-mounter role and other nodes join the mg - while the first-mounter is working. These non-first mounters wait - for the first-mounter to finish before notifying mount.gfs. If the - first-mounter fails, one of them will become the first-mounter. */ - - /* it shouldn't be possible to have someone doing first mounter - recovery and also have someone with the fs fully mounted */ - - if (memb_mounted && memb_recover) { - log_group(mg, "memb_mounted %d memb_recover %d", - memb_mounted->nodeid, memb_recover->nodeid); - ASSERT(0); - } - - /* someone has successfully mounted the fs which means the fs doesn't - need first mounter recovery */ - - if (memb_mounted) { - log_group(mg, "assign_journal: no first recovery needed %d", - memb_mounted->nodeid); - goto out; - } - - /* someone is currently doing first mounter recovery, they'll send - mount_status when they're done letting everyone know the result */ - - if (memb_recover) { - log_group(mg, "assign_journal: %d doing first recovery", - memb_recover->nodeid); - goto out; - } - - /* when we received our journals, no one was flagged with OPT_RECOVER - which means no first mounter recovery is needed or is current */ - - if (mg->global_first_recover_done) { - log_group(mg, "assign_journal: global_first_recover_done"); - goto out; - } - - /* no one has done kernel mount successfully and no one is doing first - mounter recovery, the new node gets to try first mounter recovery */ - - log_group(mg, "kernel_mount_done %d kernel_mount_error %d " - "first_mounter %d first_mounter_done %d", - mg->kernel_mount_done, mg->kernel_mount_error, - mg->first_mounter, mg->first_mounter_done); - - log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: " - "fs not mounted", new->nodeid); - new->opts |= MEMB_OPT_RECOVER; - - out: - log_group(mg, "assign_journal: new member %d got jid %d opts %x", - new->nodeid, new->jid, new->opts); - - if (mg->master_nodeid == our_nodeid) { - store_plocks(mg, new->nodeid); - send_journals(mg, new->nodeid); - } - return 0; -} - -static void _receive_options(struct mountgroup *mg, char *buf, int len, - int from) -{ - struct mg_member *memb; - struct gdlm_header *hd; - char *options; - - hd = (struct gdlm_header *)buf; - options = (char *) (buf + sizeof(struct gdlm_header)); - - memb = find_memb_nodeid(mg, from); - if (!memb) { - log_error("unknown nodeid %d for options message", from); - return; - } - - if (strstr(options, "spectator")) { - memb->spectator = 1; - memb->opts |= MEMB_OPT_SPECT; - } else if (strstr(options, "rw")) { - memb->rw = 1; - memb->opts |= MEMB_OPT_RW; - } else if (strstr(options, "ro")) { - memb->readonly = 1; - memb->opts |= MEMB_OPT_RO; - } - - log_group(mg, "_receive_options from %d rw=%d ro=%d spect=%d opts=%x", - from, memb->rw, memb->readonly, memb->spectator, memb->opts); - - assign_journal(mg, memb); -} - -static void receive_options(struct mountgroup *mg, char *buf, int len, int from) -{ - struct gdlm_header *hd = (struct gdlm_header *)buf; - struct mg_member *memb; - - log_group(mg, "receive_options from %d len %d last_cb %d", - from, len, 
mg->last_callback); - - if (hd->nodeid == our_nodeid) { - mg->got_our_options = 1; - mg->save_plocks = 1; - return; - } - - if (!mg->got_our_options) { - log_group(mg, "ignore options from %d", from); - return; - } - - /* we can receive an options message before getting the start - that adds the mounting node that sent the options, or - we can receive options messages before we get the journals - message for out own mount */ - - memb = find_memb_nodeid(mg, from); - - if (!memb || !mg->got_our_journals) - save_message_old(mg, buf, len, from, MSG_OPTIONS); - else - _receive_options(mg, buf, len, from); -} - -static void process_saved_options(struct mountgroup *mg) -{ - struct save_msg *sm, *sm2; - - if (list_empty(&mg->saved_messages)) - return; - - log_group(mg, "process_saved_options"); - - list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) { - if (sm->type != MSG_OPTIONS) - continue; - _receive_options(mg, sm->buf, sm->len, sm->nodeid); - list_del(&sm->list); - free(sm); - } -} - -#define NUM 3 - -/* send nodeid/jid/opts of every member to nodeid */ - -static void send_journals(struct mountgroup *mg, int nodeid) -{ - struct mg_member *memb; - struct gdlm_header *hd; - int i, len; - char *buf; - int *ids; - - len = sizeof(struct gdlm_header) + (mg->memb_count * NUM * sizeof(int)); - - buf = malloc(len); - if (!buf) - return; - memset(buf, 0, len); - - hd = (struct gdlm_header *)buf; - hd->type = MSG_JOURNAL; - hd->nodeid = our_nodeid; - hd->to_nodeid = nodeid; - ids = (int *) (buf + sizeof(struct gdlm_header)); - - i = 0; - list_for_each_entry(memb, &mg->members, list) { - ids[i] = cpu_to_le32(memb->nodeid); - i++; - ids[i] = cpu_to_le32(memb->jid); - i++; - ids[i] = cpu_to_le32(memb->opts); - i++; - } - - log_group(mg, "send_journals to %d len %d count %d", nodeid, len, i); - - send_group_message_old(mg, len, buf); - - free(buf); -} - -static void received_our_jid(struct mountgroup *mg) -{ - log_group(mg, "received_our_jid %d", mg->our_jid); - - /* we've been given jid of -2 which means we're not permitted - to mount the fs; probably because we're trying to mount readonly - but the next mounter is required to be rw */ - - if (mg->our_jid == -2) { - mg->mount_client_result = -EUCLEAN; - goto out; - } - - /* fs needs recovery and existing mounters can't recover it, - i.e. they're spectator/readonly or the first mounter's - mount(2) failed, so we're told to do first-mounter recovery - on the fs. 
*/ - - if (local_first_mounter_recovery(mg)) { - log_group(mg, "we're told to do first mounter recovery"); - mg->first_mounter = 1; - mg->first_mounter_done = 0; - mg->mount_client_delay = 0; - mg->save_plocks = 0; - goto out; - } else if (remote_first_mounter_recovery(mg)) { - /* delay notifying mount client until we get a successful - mount status from the first mounter */ - log_group(mg, "other node doing first mounter recovery, " - "set mount_client_delay"); - mg->mount_client_delay = 1; - mg->save_plocks = 0; - return; - } - - retrieve_plocks(mg); - mg->save_plocks = 0; - process_saved_plocks(mg); - out: - notify_mount_client(mg); -} - -static void _receive_journals(struct mountgroup *mg, char *buf, int len, - int from) -{ - struct mg_member *memb, *memb2; - struct gdlm_header *hd; - int *ids, count, i, nodeid, jid, opts; - int current_first_recover = 0; - - hd = (struct gdlm_header *)buf; - - count = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int)); - ids = (int *) (buf + sizeof(struct gdlm_header)); - - for (i = 0; i < count; i++) { - nodeid = le32_to_cpu(ids[i * NUM]); - jid = le32_to_cpu(ids[i * NUM + 1]); - opts = le32_to_cpu(ids[i * NUM + 2]); - - log_debug("receive nodeid %d jid %d opts %x", - nodeid, jid, opts); - - memb = find_memb_nodeid(mg, nodeid); - memb2 = find_memb_jid(mg, jid); - - if (!memb || memb2) { - log_error("invalid journals message " - "nodeid %d jid %d opts %x", - nodeid, jid, opts); - } - if (!memb) - continue; - - memb->jid = jid; - - if (nodeid == our_nodeid) { - mg->our_jid = jid; - /* set_our_memb_options() sets rest */ - if (opts & MEMB_OPT_RECOVER) - memb->opts |= MEMB_OPT_RECOVER; - } else { - memb->opts = opts; - if (opts & MEMB_OPT_RO) - memb->readonly = 1; - else if (opts & MEMB_OPT_RW) - memb->rw = 1; - else if (opts & MEMB_OPT_SPECT) - memb->spectator = 1; - } - - if (opts & MEMB_OPT_RECOVER) - current_first_recover = 1; - } - - /* FIXME: use global_first_recover_done more widely instead of - as a single special case */ - if (!current_first_recover) - mg->global_first_recover_done = 1; - - process_saved_mount_status(mg); - - /* we delay processing any options messages from new mounters - until after we receive the journals message for our own mount */ - - process_saved_options(mg); - - received_our_jid(mg); -} - -static void receive_journals(struct mountgroup *mg, char *buf, int len, - int from) -{ - struct gdlm_header *hd = (struct gdlm_header *)buf; - struct mg_member *memb; - int count; - - count = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int)); - - log_group(mg, "receive_journals from %d to %d len %d count %d cb %d", - from, hd->to_nodeid, len, count, mg->last_callback); - - /* just like we can receive an options msg from a newly added node - before we get the start adding it, we can receive the journals - message sent to it before we get the start adding it */ - - memb = find_memb_nodeid(mg, hd->to_nodeid); - if (!memb) { - log_group(mg, "receive_journals from %d to unknown %d", - from, hd->to_nodeid); - return; - } - memb->needs_journals = 0; - - if (hd->to_nodeid && hd->to_nodeid != our_nodeid) - return; - - if (mg->got_our_journals) { - log_group(mg, "receive_journals from %d duplicate", from); - return; - } - mg->got_our_journals = 1; - - _receive_journals(mg, buf, len, from); -} - -static void add_ordered_member(struct mountgroup *mg, struct mg_member *new) -{ - struct mg_member *memb = NULL; - struct list_head *tmp; - struct list_head *newlist = &new->list; - struct list_head *head = &mg->members; - - 
list_for_each(tmp, head) { - memb = list_entry(tmp, struct mg_member, list); - if (new->nodeid < memb->nodeid) - break; - } - - if (!memb) - list_add_tail(newlist, head); - else { - /* FIXME: can use list macro here */ - newlist->prev = tmp->prev; - newlist->next = tmp; - tmp->prev->next = newlist; - tmp->prev = newlist; - } -} - -static int add_member(struct mountgroup *mg, int nodeid) -{ - struct mg_member *memb; - - memb = malloc(sizeof(struct mg_member)); - if (!memb) - return -ENOMEM; - - memset(memb, 0, sizeof(struct mg_member)); - - memb->nodeid = nodeid; - memb->jid = JID_INIT; - add_ordered_member(mg, memb); - mg->memb_count++; - - if (!mg->init) - memb->needs_journals = 1; - - return 0; -} - -static int is_member(struct mountgroup *mg, int nodeid) -{ - struct mg_member *memb; - - list_for_each_entry(memb, &mg->members, list) { - if (memb->nodeid == nodeid) - return 1; - } - return 0; -} - -static int is_removed(struct mountgroup *mg, int nodeid) -{ - struct mg_member *memb; - - list_for_each_entry(memb, &mg->members_gone, list) { - if (memb->nodeid == nodeid) - return 1; - } - return 0; -} - -/* New mounters may be waiting for a journals message that a failed node (as - master) would have sent. If the master failed and we're the new master, - then send a journals message to any nodes for whom we've not seen a journals - message. We also need to checkpoint the plock state for the new nodes to - read after they get their journals message. */ - -static void resend_journals(struct mountgroup *mg) -{ - struct mg_member *memb; - int stored_plocks = 0; - - list_for_each_entry(memb, &mg->members, list) { - if (!memb->needs_journals) - continue; - - if (!stored_plocks) { - store_plocks(mg, memb->nodeid); - stored_plocks = 1; - } - - log_group(mg, "resend_journals to %d", memb->nodeid); - send_journals(mg, memb->nodeid); - } -} - -/* The master node is the member of the group with the lowest nodeid who - was also a member of the last "finished" group, i.e. a member of the - group the last time it got a finish callback. The job of the master - is to send state info to new nodes joining the group, and doing that - requires that the master has all the state to send -- a new joining - node that has the lowest nodeid doesn't have any state, which is why - we add the "finished" requirement. */ - -static void update_master_nodeid(struct mountgroup *mg) -{ - struct mg_member *memb; - int new = -1, low = -1; - - list_for_each_entry(memb, &mg->members, list) { - if (low == -1 || memb->nodeid < low) - low = memb->nodeid; - if (!memb->finished) - continue; - if (new == -1 || memb->nodeid < new) - new = memb->nodeid; - } - mg->master_nodeid = new; - mg->low_nodeid = low; -} - -/* This can happen before we receive a journals message for our mount. 
*/ - -static void recover_members(struct mountgroup *mg, int num_nodes, - int *nodeids, int *pos_out, int *neg_out) -{ - struct mg_member *memb, *safe, *memb_gone_recover = NULL; - int i, found, id, pos = 0, neg = 0, prev_master_nodeid; - int master_failed = 0; - - /* move departed nodes from members list to members_gone */ - - list_for_each_entry_safe(memb, safe, &mg->members, list) { - found = 0; - for (i = 0; i < num_nodes; i++) { - if (memb->nodeid == nodeids[i]) { - found = 1; - break; - } - } - - if (!found) { - neg++; - - list_move(&memb->list, &mg->members_gone); - memb->gone_event = mg->start_event_nr; - memb->gone_type = mg->start_type; - mg->memb_count--; - - memb->tell_gfs_to_recover = 0; - memb->recovery_status = 0; - memb->local_recovery_status = 0; - - /* - journal cb for failed or withdrawing nodes - - failed node was assigned a journal - - no journal cb if failed node was spectator - - no journal cb if we've already done a journl cb */ - - if ((memb->gone_type == GROUP_NODE_FAILED || - memb->withdrawing) && - memb->jid != JID_INIT && - memb->jid != -2 && - !memb->spectator && - !memb->wait_gfs_recover_done) { - memb->tell_gfs_to_recover = 1; - memb->recovery_status = RS_NEED_RECOVERY; - memb->local_recovery_status = RS_NEED_RECOVERY; - } - - log_group(mg, "remove member %d tell_gfs_to_recover %d " - "(%d,%d,%d,%d,%d,%d)", - memb->nodeid, memb->tell_gfs_to_recover, - mg->spectator, - mg->start_type, - memb->withdrawing, - memb->jid, - memb->spectator, - memb->wait_gfs_recover_done); - - if (mg->master_nodeid == memb->nodeid && - memb->gone_type == GROUP_NODE_FAILED) - master_failed = 1; - - if (memb->opts & MEMB_OPT_RECOVER) - memb_gone_recover = memb; - } - } - - /* add new nodes to members list */ - - for (i = 0; i < num_nodes; i++) { - id = nodeids[i]; - if (is_member(mg, id)) - continue; - add_member(mg, id); - pos++; - log_group(mg, "add member %d", id); - } - - prev_master_nodeid = mg->master_nodeid; - update_master_nodeid(mg); - - *pos_out = pos; - *neg_out = neg; - - log_group(mg, "total members %d master_nodeid %d prev %d", - mg->memb_count, mg->master_nodeid, prev_master_nodeid); - - - /* The master failed and we're the new master, we need to: - - - unlink the ckpt that the failed master had open so new ckpts - can be created down the road - - resend journals msg to any nodes that needed one from the - failed master - - store plocks in ckpt for the new mounters to read when they - get the journals msg from us */ - - if (neg && master_failed && - (prev_master_nodeid != -1) && - (prev_master_nodeid != mg->master_nodeid) && - (our_nodeid == mg->master_nodeid)) { - log_group(mg, "unlink ckpt for failed master %d", - prev_master_nodeid); - unlink_checkpoint(mg); - resend_journals(mg); - } - - /* Do we need a new first mounter? - - If we've not gotten a journals message yet (implies we're mounting) - and there's only one node left in the group (us, after removing the - failed node), then it's possible that the failed node was doing - first mounter recovery, so we need to become first mounter. - - If we've received a journals message, we can check if the failed - node was doing first mounter recovery (MEMB_OPT_RECOVER set) and - if so select the next first mounter. 
*/ - - if (!neg) - return; - - if (!mg->got_our_journals && mg->memb_count == 1) { - log_group(mg, "we are left alone, act as first mounter"); - unlink_checkpoint(mg); - memb = find_memb_nodeid(mg, our_nodeid); - memb->jid = 0; - memb->opts |= MEMB_OPT_RECOVER; - mg->our_jid = 0; - mg->first_mounter = 1; - mg->first_mounter_done = 0; - mg->got_our_options = 1; - mg->got_our_journals = 1; - mg->mount_client_delay = 0; - notify_mount_client(mg); - return; - } - - if (memb_gone_recover) { - log_group(mg, "failed node %d had MEMB_OPT_RECOVER", - memb_gone_recover->nodeid); - memb_gone_recover->tell_gfs_to_recover = 0; - } - - if (memb_gone_recover && mg->got_our_journals) { - assign_next_first_mounter(mg); - memb = find_memb_nodeid(mg, our_nodeid); - if (memb->opts & MEMB_OPT_RECOVER) { - log_group(mg, "first mounter failed, we get " - "MEMB_OPT_RECOVER"); - unlink_checkpoint(mg); - memb->opts |= MEMB_OPT_RECOVER; - mg->first_mounter = 1; - mg->first_mounter_done = 0; - mg->mount_client_delay = 0; - notify_mount_client(mg); - } - } -} - -int gfs_join_mountgroup_old(struct mountgroup *mg, struct gfsc_mount_args *ma) -{ - int rv; - - if (strlen(ma->options) > MAX_OPTIONS_LEN-1) { - log_error("join: options too long %zu", strlen(ma->options)); - return -EMLINK; - } - - rv = group_join(gh, mg->name); - if (rv) - return -ENOTCONN; - return 0; -} - -/* recover_members() discovers which nodes need journal recovery - and moves the memb structs for those nodes into members_gone - and sets memb->tell_gfs_to_recover on them */ - -/* we don't want to tell gfs-kernel to do journal recovery for a failed - node in a number of cases: - - we're a spectator or readonly mount - - gfs-kernel is currently withdrawing - - we're mounting and haven't received a journals message yet - - we're mounting and got a kernel mount error back from mount.gfs - - we're mounting and haven't notified mount.gfs yet (to do mount(2)) - - we're mounting and got_kernel_mount is 0, i.e. we've not seen a uevent - related to the kernel mount yet - (some of the mounting checks should be obviated by others) - - the problem we're trying to avoid here is telling gfs-kernel to do - recovery when it can't for some reason and then waiting forever for - a recovery_done signal that will never arrive. 
*/ - -static void recover_journals(struct mountgroup *mg) -{ - struct mg_member *memb; - int rv; - - if (mg->spectator || - mg->ro || - mg->withdraw_suspend || - mg->our_jid == JID_INIT || - mg->kernel_mount_error || - !mg->mount_client_notified || - !mg->got_kernel_mount || - !mg->kernel_mount_done) { - log_group(mg, "recover_journals: unable %d,%d,%d,%d,%d,%d,%d,%d", - mg->spectator, - mg->ro, - mg->withdraw_suspend, - mg->our_jid, - mg->kernel_mount_error, - mg->mount_client_notified, - mg->got_kernel_mount, - mg->kernel_mount_done); - - list_for_each_entry(memb, &mg->members_gone, list) { - log_group(mg, "member gone %d jid %d " - "tell_gfs_to_recover %d", - memb->nodeid, memb->jid, - memb->tell_gfs_to_recover); - - if (memb->tell_gfs_to_recover) { - memb->tell_gfs_to_recover = 0; - memb->local_recovery_status = RS_READONLY; - } - } - start_done(mg); - return; - } - - /* we feed one jid into the kernel for recovery instead of all - at once because we need to get the result of each independently - through the single recovery_done sysfs file */ - - list_for_each_entry(memb, &mg->members_gone, list) { - if (memb->wait_gfs_recover_done) { - log_group(mg, "delay new gfs recovery, " - "wait_gfs_recover_done for nodeid %d jid %d", - memb->nodeid, memb->jid); - return; - } - } - - list_for_each_entry(memb, &mg->members_gone, list) { - if (!memb->tell_gfs_to_recover) - continue; - - log_group(mg, "recover journal %d nodeid %d", - memb->jid, memb->nodeid); - - rv = set_sysfs(mg, "recover", memb->jid); - if (rv < 0) { - memb->local_recovery_status = RS_NOFS; - continue; - } - memb->tell_gfs_to_recover = 0; - memb->wait_gfs_recover_done = 1; - return; - } - - /* no more journals to attempt to recover, if we've been successful - recovering any then send out status, if not then start_done... - receiving no status message from us before start_done means we - didn't successfully recover any journals. If we send out status, - then delay start_done until we get our own message (so all nodes - will get the status before finish) */ - - list_for_each_entry(memb, &mg->members_gone, list) { - if (memb->local_recovery_status == RS_SUCCESS) { - send_recovery_status(mg); - log_group(mg, "delay start_done until status recvd"); - return; - } - } - - start_done(mg); -} - -/* In some cases, we may be joining a mountgroup with needs_recovery - set (there are journals that need recovery and current members can't - recover them because they're ro). In this case, we're told to act - like the first mounter to cause gfs to try to recovery all journals - when it mounts. When gfs does this, we'll get recovery_done's for - the individual journals it recovers (ignored) and finally, if all - journals are ok, an others_may_mount/first_done. */ - -/* When gfs does first-mount recovery, the mount(2) fails if it can't - recover one of the journals. If we get o_m_m, then we know it was - able to successfully recover all the journals. */ - -/* When we're the first mounter, gfs does recovery on all the journals - and does "recovery_done" callbacks when it finishes each. We ignore - these and wait for gfs to be finished with all at which point it calls - others_may_mount() and first_done is set. 
*/ - -static int kernel_recovery_done_first(struct mountgroup *mg, int first_done) -{ - int rv; - - if (first_done < 0) { - /* for back compat, sysfs file deprecated */ - rv = read_sysfs_int(mg, "first_done", &first_done); - if (rv < 0) - return rv; - } - - log_group(mg, "kernel_recovery_done_first first_done %d", first_done); - - if (mg->kernel_mount_done) - log_group(mg, "FIXME: assuming kernel_mount_done comes after " - "first_done"); - - if (first_done) { - mg->first_mounter_done = 1; - send_recovery_done(mg); - } - - return 0; -} - -static int need_kernel_recovery_done(struct mountgroup *mg) -{ - struct mg_member *memb; - - list_for_each_entry(memb, &mg->members_gone, list) { - if (memb->wait_gfs_recover_done) - return 1; - } - return 0; -} - -/* Note: when a readonly node fails we do consider its journal (and the - fs) to need recovery... not sure this is really necessary, but - the readonly node did "own" a journal so it seems proper to recover - it even if the node wasn't writing to it. So, if there are 3 ro - nodes mounting the fs and one fails, gfs on the remaining 2 will - remain blocked until an rw node mounts, and the next mounter must - be rw. */ - -int process_recovery_uevent_old(char *name, int jid_done, int status, int first) -{ - struct mountgroup *mg; - struct mg_member *memb; - char *ss; - int rv, found = 0; - - mg = find_mg(name); - if (!mg) { - log_error("recovery_done: unknown mount group %s", name); - return -1; - } - - if (mg->first_mounter && !mg->first_mounter_done) - return kernel_recovery_done_first(mg, first); - - if (jid_done < 0) { - /* for back compat, sysfs file deprecated */ - rv = read_sysfs_int(mg, "recover_done", &jid_done); - if (rv < 0) - return rv; - } - - list_for_each_entry(memb, &mg->members_gone, list) { - if (memb->jid == jid_done) { - if (memb->wait_gfs_recover_done) { - memb->wait_gfs_recover_done = 0; - found = 1; - } - break; - } - } - - /* We need to ignore recovery_done callbacks in the case where there - are a bunch of recovery_done callbacks for the first mounter, but - we detect "first_done" before we've processed all the - recovery_done's. */ - - if (!found) { - log_group(mg, "recovery_done jid %d ignored, first %d,%d", - jid_done, mg->first_mounter, mg->first_mounter_done); - return 0; - } - - if (status < 0) { - /* for back compat, sysfs file deprecated */ - rv = read_sysfs_int(mg, "recover_status", &status); - if (rv < 0) { - log_group(mg, "recovery_done jid %d nodeid %d sysfs error %d", - memb->jid, memb->nodeid, rv); - memb->local_recovery_status = RS_NOFS; - goto out; - } - } - - switch (status) { - case LM_RD_GAVEUP: - /* - * This is unfortunate; it's needed for bz 442451 where - * gfs-kernel fails to acquire the journal lock on all nodes - * because a withdrawing node has not yet called - * dlm_release_lockspace() to free it's journal lock. With - * this, all nodes should repeatedly try to to recover the - * journal of the withdrawn node until the withdrawing node - * clears its dlm locks, and gfs on each of the remaining nodes - * succeeds in doing the recovery. 
- */ - - if (memb->withdrawing) { - log_group(mg, "recovery_done jid %d nodeid %d retry " - "for withdraw", memb->jid, memb->nodeid); - memb->tell_gfs_to_recover = 1; - memb->wait_gfs_recover_done = 0; - usleep(500000); - } - - memb->local_recovery_status = RS_GAVEUP; - ss = "gaveup"; - break; - case LM_RD_SUCCESS: - memb->local_recovery_status = RS_SUCCESS; - ss = "success"; - break; - default: - log_error("recovery_done: jid %d nodeid %d unknown status %d", - memb->jid, memb->nodeid, status); - ss = "unknown"; - } - - log_group(mg, "recovery_done jid %d nodeid %d %s", - memb->jid, memb->nodeid, ss); - - /* sanity check */ - if (need_kernel_recovery_done(mg)) - log_error("recovery_done: should be no pending gfs recoveries"); - - out: - recover_journals(mg); - return 0; -} - -static void leave_mountgroup(struct mountgroup *mg, int mnterr) -{ - /* sanity check: we should already have gotten the error from - the mount.gfs mount_done; so this shouldn't happen */ - - if (mnterr && !mg->kernel_mount_error) { - log_error("leave: mount_error is new %d %d", - mg->kernel_mount_error, mnterr); - } - - mg->leaving = 1; - - /* Check to see if we're waiting for a kernel recovery_done to do a - start_done(). If so, call the start_done() here because we won't be - getting anything else from gfs-kernel which is now gone. */ - - if (need_kernel_recovery_done(mg)) { - log_group(mg, "leave: fill in start_done"); - start_done(mg); - } - - group_leave(gh, mg->name); -} - -void do_leave_old(char *name, int mnterr) -{ - struct mountgroup *mg; - - log_debug("do_leave_old %s mnterr %d", name, mnterr); - - list_for_each_entry(mg, &withdrawn_mounts, list) { - if (strcmp(mg->name, name)) - continue; - log_group(mg, "leave for withdrawn fs"); - list_del(&mg->list); - free_mg(mg); - return; - } - - mg = find_mg(name); - if (!mg) { - log_error("do_leave_old: %s not found", name); - return; - } - - leave_mountgroup(mg, mnterr); -} - -/* When mounting a fs, we first join the mountgroup, then tell mount.gfs - to procede with the kernel mount. Once we're in the mountgroup, we - can get a stop callback at any time, which requires us to block the - fs by setting a sysfs file. If the kernel mount is slow, we can get - a stop callback and try to set the sysfs file before the kernel mount - has actually created the sysfs files for the fs. This function delays - any further processing until the sysfs files exist. */ - -/* This function returns 0 when the kernel mount is successfully detected - and we know that do_stop() will be able to block the fs. - This function returns a negative error if it detects the kernel mount - has failed which means there's nothing to stop and do_stop() can assume - an implicit stop. */ - -/* wait for - - kernel mount to get to the point of creating sysfs files we - can read (and that do_stop can then use), or - - kernel mount to fail causing mount.gfs to send us a MOUNT_DONE - which we read in process_connection() */ - -static int wait_for_kernel_mount(struct mountgroup *mg) -{ - int rv, val; - - while (1) { - /* This is the standard way we leave this loop, where the - kernel mount gets to the point of creating the sysfs files - which we see by successfully reading "id". With the - sysfs files in place, do_stop() will be able to block - the kernel. */ - - rv = read_sysfs_int(mg, "block", &val); - if (!rv) - break; - usleep(100000); - - /* kernel_mount_done is set by mount_done_old() which is called - by process_connection() if mount.gfs sends MOUNT_DONE. 
*/ - - if (mg->kernel_mount_done && !mg->kernel_mount_error) { - /* mount(2) was successful and we should be able - to read "id" very shortly... */ - continue; - } - - if (mg->kernel_mount_done && mg->kernel_mount_error) { - /* mount(2) failed, stop becomes implicit */ - break; - } - - /* this should either do nothing and return immediatley, or - read a MOUNT_DONE from mount.gfs and call mount_done_old() - which will set kernel_mount_done and set kernel_mount_error */ - - process_connection(mg->mount_client); - } - - return rv; -} - -/* The processing of new mounters (send/recv options, send/recv journals, - notify mount.gfs) is not very integrated with the stop/start/finish - callbacks from libgroup. A start callback just notifies us of a new - mounter and the options/journals messages drive things from there. - Recovery for failed nodes _is_ controlled more directly by the - stop/start/finish callbacks. So, processing new mounters happens - independently of recovery and of the libgroup callbacks. One place - where they need to intersect, though, is in stopping/suspending - gfs-kernel: - - When we get a stop callback, we need to be certain that gfs-kernel - is blocked. - - When a mounter notifies mount.gfs to go ahead, gfs-kernel will - shortly begin running in an unblocked fashion as it goes through - the kernel mounting process. - Given this, we need to be sure that if gfs-kernel is supposed to be - blocked, we don't notify mount.gfs to go ahead and do the kernel mount - since that starts gfs-kernel in an unblocked state. */ - -/* - if we're unmounting, the kernel is gone, so no problem. - - if we've just mounted and notified mount.gfs, then wait for kernel - mount and then block. - - if we're mounting and have not yet notified mount.gfs, then set - a flag that delays the notification until block is set to 0. */ - -int do_stop(struct mountgroup *mg) -{ - int rv; - - if (mg->first_mounter && !mg->kernel_mount_done) { - log_group(mg, "do_stop skip during first mount recovery"); - goto out; - } - - for (;;) { - rv = set_sysfs(mg, "block", 1); - if (!rv) { - mg->kernel_stopped = 1; /* for queries */ - break; - } - - /* We get an error trying to block gfs, this could be due - to a number of things: - 1. if the kernel instance of gfs existed before but now - we can't see it, that must mean it's been unmounted, - so it's implicitly stopped - 2. we're in the process of mounting and gfs hasn't created - the sysfs files for this fs yet - 3. we're mounting and mount(2) returned an error - 4. we're mounting but haven't told mount.gfs to go ahead - with mount(2) yet - We also need to handle the situation where we get here in - case 2 but it turns into case 3 while we're in - wait_for_kernel_mount() */ - - if (mg->got_kernel_mount) { - log_group(mg, "do_stop skipped fs unmounted"); - break; - } - - if (mg->mount_client_notified) { - if (!mg->kernel_mount_error) { - log_group(mg, "do_stop wait for kernel mount"); - rv = wait_for_kernel_mount(mg); - if (rv < 0) - break; - } else { - log_group(mg, "do_stop ignore, failed mount"); - break; - } - } else { - log_group(mg, "do_stop causes mount_client_delay"); - mg->mount_client_delay = 1; - break; - } - } - out: - group_stop_done(gh, mg->name); - return 0; -} - -/* After a start that initiated a recovery, everyone will go and see if they - can do recovery and try if they can. If a node can't, it does start_done, - if it tries and fails, it does start_done, if it tries and succeeds it - sends a message and then does start_done once it receives's it back. 
So, - when we get a finish we know that we have all the results from the recovery - cycle and can judge if everything is recovered properly or not. If so, we - can unblock locks (in the finish), if not, we leave them blocked (in the - finish). - - If we leave locks blocked in the finish, then they can only be unblocked - after someone is able to do the recovery that's needed. So, leaving locks - blocked in a finish because recovery hasn't worked puts us into a special - state: the fs needs recovery, none of the current mounters has been able to - recover it, all current mounters have locks blocked in gfs, new mounters - are allowed, nodes can unmount, new mounters are asked to do first-mounter - recovery, if one of them succeeds then we can all clear this special state - and unblock locks (the unblock would happen upon recving the success - message from the new pseudo-first mounter, not as part of a finish), future - finishes would then go back to being able to unblock locks. - - While in this special state, a new node has been added and asked to do - first-mounter recovery, other nodes can also be added while the new - first-mounter is active. These other nodes don't notify mount.gfs. - They'll receive the result of the first mounter and if it succeeded they'll - notify mount.gfs, otherwise one of them will become the next first-mounter - and notify mount.gfs. */ - -int do_finish(struct mountgroup *mg) -{ - struct mg_member *memb, *safe; - - log_group(mg, "finish %d needs_recovery %d", mg->last_finish, - mg->needs_recovery); - - /* members_gone list are the members that were removed from the - members list when processing a start. members are removed - from members_gone if their journals have been recovered */ - - list_for_each_entry_safe(memb, safe, &mg->members_gone, list) { - if (!memb->recovery_status) { - list_del(&memb->list); - free(memb); - } else if (memb->recovery_status == RS_SUCCESS) { - ASSERT(memb->gone_event <= mg->last_finish); - log_group(mg, "finish: recovered jid %d nodeid %d", - memb->jid, memb->nodeid); - list_del(&memb->list); - free(memb); - } else { - log_error("%s finish: needs recovery jid %d nodeid %d " - "status %d", mg->name, memb->jid, - memb->nodeid, memb->recovery_status); - mg->needs_recovery = 1; - } - } - - list_for_each_entry(memb, &mg->members, list) - memb->finished = 1; - - if (mg->group_leave_on_finish) { - log_group(mg, "leaving group after delay for join to finish"); - group_leave(gh, mg->name); - mg->group_leave_on_finish = 0; - return 0; - } - - if (!mg->needs_recovery) { - mg->kernel_stopped = 0; /* for queries */ - set_sysfs(mg, "block", 0); - - /* we may have been holding back our local mount due to - being stopped/blocked */ - if (mg->mount_client_delay && !first_mounter_recovery(mg)) { - mg->mount_client_delay = 0; - notify_mount_client(mg); - } - } else - log_group(mg, "finish: leave locks blocked for needs_recovery"); - - return 0; -} - -/* - * - require the first mounter to be rw, not ro or spectator. - * - * - if rw mounter fails, leaving only spectator mounters, - * require the next mounter to be rw, more ro/spectator mounts should - * fail until the fs is mounted rw. - * - * - if last rw mounter fails and ro mounters are left (possibly with - * some spectators), disallow any ro->rw remounts, leave gfs blocked, - * require next mounter to be rw, have next mounter do first mount - * gfs/journal recovery. - */ - -/* called for the initial start on the node that's first to mount the fs. 
- (it should be ok to let the first mounter be a spectator, gfs should do - first recovery and bail out if there are any dirty journals) */ - -/* FIXME: if journal recovery fails on any of the journals, we should - fail the mount */ - -static void start_first_mounter(struct mountgroup *mg) -{ - struct mg_member *memb; - - log_group(mg, "start_first_mounter"); - set_our_memb_options(mg); - memb = find_memb_nodeid(mg, our_nodeid); - ASSERT(memb); - - if (mg->ro || mg->spectator) { - memb->jid = -2; - mg->our_jid = -2; - log_group(mg, "start_first_mounter not rw ro=%d spect=%d", - mg->ro , mg->spectator); - mg->mount_client_result = -EUCLEAN; - } else { - memb->opts |= MEMB_OPT_RECOVER; - memb->jid = 0; - mg->our_jid = 0; - mg->first_mounter = 1; - mg->first_mounter_done = 0; - mg->got_our_options = 1; - mg->got_our_journals = 1; - } - start_done(mg); - notify_mount_client(mg); -} - -/* called for the initial start on a rw/ro mounter; - the existing mounters are running start_participant() */ - -static void start_participant_init(struct mountgroup *mg) -{ - log_group(mg, "start_participant_init"); - set_our_memb_options(mg); - send_options(mg); - start_done(mg); -} - -/* called for a non-initial start on a normal mounter. - NB we can get here without having received a journals message for - our (recent) mount yet in which case we don't know the jid or ro/rw - status of any members, and don't know our own jid. */ - -static void start_participant(struct mountgroup *mg, int pos, int neg) -{ - log_group(mg, "start_participant pos=%d neg=%d", pos, neg); - - if (pos) { - start_done(mg); - /* we save options messages from nodes for whom we've not - received a start yet */ - process_saved_options(mg); - } else if (neg) { - recover_journals(mg); - process_saved_recovery_status(mg); - } -} - -/* called for the initial start on a spectator mounter, - after _receive_journals() */ - -static void start_spectator_init_2(struct mountgroup *mg) -{ - log_group(mg, "start_spectator_init_2 our_jid=%d", mg->our_jid); - - /* we've been given jid of -2 which means we're not permitted - to mount the fs; probably because the next mounter must be rw */ - - if (mg->our_jid == -2) { - mg->mount_client_result = -EUCLEAN; - } else - ASSERT(mg->our_jid == -1); - - notify_mount_client(mg); -} - -/* called for the initial start on a spectator mounter */ - -static void start_spectator_init(struct mountgroup *mg) -{ - log_group(mg, "start_spectator_init"); - set_our_memb_options(mg); - send_options(mg); - start_done(mg); - mg->start2_fn = start_spectator_init_2; -} - -/* called for a non-initial start on a spectator mounter */ - -static void start_spectator(struct mountgroup *mg, int pos, int neg) -{ - log_group(mg, "start_spectator pos=%d neg=%d", pos, neg); - - if (pos) { - start_done(mg); - process_saved_options(mg); - } else if (neg) { - recover_journals(mg); - process_saved_recovery_status(mg); - } -} - -/* If nodeA fails, nodeB is recovering journalA and nodeB fails before - finishing, then nodeC needs to tell gfs to recover both journalA and - journalB. We do this by setting tell_gfs_to_recover back to 1 for - any nodes that are still on the members_gone list. 
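Concretely, reset_unfinished_recoveries() below walks members_gone and, for any member whose recovery_status is set but is no longer RS_NEED_RECOVERY, puts it back to RS_NEED_RECOVERY and sets tell_gfs_to_recover again, so the surviving node retries that journal along with the journal of the node that just failed.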
*/ - -static void reset_unfinished_recoveries(struct mountgroup *mg) -{ - struct mg_member *memb; - - list_for_each_entry(memb, &mg->members_gone, list) { - if (memb->recovery_status && - memb->recovery_status != RS_NEED_RECOVERY) { - log_group(mg, "retry unfinished recovery " - "jid %d nodeid %d", - memb->jid, memb->nodeid); - memb->tell_gfs_to_recover = 1; - memb->recovery_status = RS_NEED_RECOVERY; - memb->local_recovery_status = RS_NEED_RECOVERY; - } - } -} - -/* - old method: - A is rw mount, B mounts rw - - do_start do_start - start_participant start_participant_init - send_options - receive_options - start_participant_2 - discover_journals - assign B a jid - send_journals - group_start_done - receive_journals - start_participant_init_2 - group_start_done - do_finish do_finish - - new method: decouples stop/start/finish from mount processing - A is rw mount, B mounts rw - - do_start do_start - start_participant start_participant_init - start_done send_options - start_done - do_finish do_finish - - receive_options - assign_journal - send_journals - receive_journals - start_participant_init_2 - notify_mount_client -*/ - -void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids) -{ - int pos = 0, neg = 0; - - mg->start_event_nr = mg->last_start; - mg->start_type = type; - - log_group(mg, "start %d init %d type %d member_count %d", - mg->last_start, mg->init, type, member_count); - - recover_members(mg, member_count, nodeids, &pos, &neg); - reset_unfinished_recoveries(mg); - - if (mg->init) { - if (member_count == 1) - start_first_mounter(mg); - else if (mg->spectator) - start_spectator_init(mg); - else - start_participant_init(mg); - mg->init = 0; - } else { - if (mg->spectator) - start_spectator(mg, pos, neg); - else - start_participant(mg, pos, neg); - } -} - -/* - What repurcussions are there from umount shutting down gfs in the - kernel before we leave the mountgroup? We can no longer participate - in recovery even though we're in the group -- what are the end cases - that we need to deal with where this causes a problem? i.e. there - is a period of time where the mountgroup=A,B,C but the kernel fs - is only active on A,B, not C. The mountgroup on A,B can't depend - on the mg on C to necessarily be able to do some things (recovery). - - At least in part, it means that after we do an umount and have - removed the instance of this fs in the kernel, we'll still get - stop/start/finish callbacks from groupd for which we'll attempt - and fail to: block/unblock gfs kernel activity, initiate gfs - journal recovery, get recovery-done signals fromt eh kernel. - - We don't want to hang groupd event processing by failing to send - an ack (stop_done/start_done) back to groupd when it needs one - to procede. In the case where we get a start for a failed node - that needs journal recovery, we have a problem because we wait to - call group_start_done() until gfs in the kernel to signal that - the journal recovery is done. If we've unmounted gfs isn't there - any more to give us this signal and we'll never call start_done. - - update: we should be dealing with all these issues correctly now. 
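(The specific start_done problem described here is handled in leave_mountgroup() earlier in this file: if a kernel recovery_done is still pending when we leave, it fills in the start_done itself, since gfs-kernel is gone and will never deliver the signal.)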
*/ - -int do_terminate(struct mountgroup *mg) -{ - purge_plocks(mg, 0, 1); - - if (mg->withdraw_suspend) { - log_group(mg, "termination of our withdraw leave"); - set_sysfs(mg, "withdraw", 1); - list_move(&mg->list, &withdrawn_mounts); - } else { - log_group(mg, "termination of our unmount leave"); - list_del(&mg->list); - free(mg); - } - - return 0; -} - -static void do_deliver(int nodeid, char *data, int len) -{ - struct mountgroup *mg; - struct gdlm_header *hd; - - hd = (struct gdlm_header *) data; - - mg = find_mg(hd->name); - if (!mg) { - /* - log_error("cpg message from %d len %d no group %s", - nodeid, len, hd->name); - */ - return; - } - - hd->version[0] = le16_to_cpu(hd->version[0]); - hd->version[1] = le16_to_cpu(hd->version[1]); - hd->version[2] = le16_to_cpu(hd->version[2]); - hd->type = le16_to_cpu(hd->type); - hd->nodeid = le32_to_cpu(hd->nodeid); - hd->to_nodeid = le32_to_cpu(hd->to_nodeid); - - /* FIXME: we need to look at how to gracefully fail when we end up - with mixed incompat versions */ - - if (hd->version[0] != protocol_active[0]) { - log_error("reject message from %d version %u.%u.%u vs %u.%u.%u", - nodeid, hd->version[0], hd->version[1], - hd->version[2], protocol_active[0], - protocol_active[1], protocol_active[2]); - return; - } - - /* If there are some group messages between a new node being added to - the cpg group and being added to the app group, the new node should - discard them since they're only relevant to the app group. */ - - if (!mg->last_callback) { - log_group(mg, "discard %s len %d from %d", - msg_name(hd->type), len, nodeid); - return; - } - - switch (hd->type) { - case MSG_JOURNAL: - receive_journals(mg, data, len, nodeid); - break; - - case MSG_OPTIONS: - receive_options(mg, data, len, nodeid); - break; - - case MSG_REMOUNT: - receive_remount(mg, data, len, nodeid); - break; - - case MSG_PLOCK: - receive_plock(mg, data, len, nodeid); - break; - - case MSG_MOUNT_STATUS: - receive_mount_status(mg, data, len, nodeid); - break; - - case MSG_RECOVERY_STATUS: - receive_recovery_status(mg, data, len, nodeid); - break; - - case MSG_RECOVERY_DONE: - receive_recovery_done(mg, data, len, nodeid); - break; - - case MSG_WITHDRAW: - receive_withdraw(mg, data, len, nodeid); - break; - - case MSG_PLOCK_OWN: - receive_own(mg, data, len, nodeid); - break; - - case MSG_PLOCK_DROP: - receive_drop(mg, data, len, nodeid); - break; - - case MSG_PLOCK_SYNC_LOCK: - case MSG_PLOCK_SYNC_WAITER: - receive_sync(mg, data, len, nodeid); - break; - - default: - log_error("unknown message type %d from %d", - hd->type, hd->nodeid); - } -} - -static void deliver_cb(cpg_handle_t handle, struct cpg_name *group_name, - uint32_t nodeid, uint32_t pid, void *data, int data_len) -{ - do_deliver(nodeid, data, data_len); -} - -/* Not sure if purging plocks (driven by confchg) needs to be synchronized with - the other recovery steps (driven by libgroup) for a node, don't think so. - Is it possible for a node to have been cleared from the members_gone list - before this confchg is processed? 
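Note that confchg_cb() below only looks at the left_list, and only purges plocks for mountgroups where the departing node is still known through is_member() or is_removed(), so a node that has already been cleared from both lists is simply skipped.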
*/ - -static void confchg_cb(cpg_handle_t handle, struct cpg_name *group_name, - struct cpg_address *member_list, int member_list_entries, - struct cpg_address *left_list, int left_list_entries, - struct cpg_address *joined_list, int joined_list_entries) -{ - struct mountgroup *mg; - int i, nodeid; - - for (i = 0; i < left_list_entries; i++) { - nodeid = left_list[i].nodeid; - list_for_each_entry(mg, &mountgroups, list) { - if (is_member(mg, nodeid) || is_removed(mg, nodeid)) - purge_plocks(mg, left_list[i].nodeid, 0); - } - } -} - -static cpg_callbacks_t callbacks = { - .cpg_deliver_fn = deliver_cb, - .cpg_confchg_fn = confchg_cb, -}; - -void process_cpg_old(int ci) -{ - cpg_error_t error; - - error = cpg_dispatch(cpg_handle_daemon, CPG_DISPATCH_ALL); - if (error != CPG_OK) { - log_error("cpg_dispatch error %d", error); - return; - } - - update_flow_control_status(); -} - -int setup_cpg_old(void) -{ - static struct cpg_name name; - cpg_error_t error; - int fd = 0; - - if (cfgd_plock_ownership) - memcpy(protocol_active, protocol_v200, sizeof(protocol_v200)); - else - memcpy(protocol_active, protocol_v100, sizeof(protocol_v100)); - - error = cpg_initialize(&cpg_handle_daemon, &callbacks); - if (error != CPG_OK) { - log_error("daemon cpg_initialize error %d", error); - return -1; - } - - cpg_fd_get(cpg_handle_daemon, &fd); - if (fd < 0) { - log_error("daemon cpg_fd_get error %d", error); - return -1; - } - - memset(&name, 0, sizeof(name)); - strcpy(name.value, "gfs_controld"); - name.length = 12; - - retry: - error = cpg_join(cpg_handle_daemon, &name); - if (error == CPG_ERR_TRY_AGAIN) { - log_debug("daemon cpg_join retry"); - sleep(1); - goto retry; - } - if (error != CPG_OK) { - log_error("daemon cpg_join error %d", error); - cpg_finalize(cpg_handle_daemon); - return -1; - } - - log_debug("setup_cpg_old %d", fd); - return fd; -} - -void close_cpg_old(void) -{ - static struct cpg_name name; - cpg_error_t error; - int i = 0; - - if (!cpg_handle_daemon || cluster_down) - return; - - memset(&name, 0, sizeof(name)); - strcpy(name.value, "gfs_controld"); - name.length = 12; - - retry: - error = cpg_leave(cpg_handle_daemon, &name); - if (error == CPG_ERR_TRY_AGAIN) { - sleep(1); - if (!(++i % 10)) - log_error("daemon cpg_leave error retrying"); - goto retry; - } - if (error != CPG_OK) - log_error("daemon cpg_leave error %d", error); -} - diff --git a/group/gfs_controld/cpg-old.h b/group/gfs_controld/cpg-old.h deleted file mode 100644 index 0458338..0000000 --- a/group/gfs_controld/cpg-old.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef __CPG_OLD_DOT_H__ -#define __CPG_OLD_DOT_H__ - -#define DO_STOP 1 -#define DO_START 2 -#define DO_FINISH 3 -#define DO_TERMINATE 4 -#define DO_SETID 5 - -enum { - - MSG_JOURNAL = 1, - MSG_OPTIONS, - MSG_REMOUNT, - MSG_PLOCK, - MSG_WITHDRAW, - MSG_MOUNT_STATUS, - MSG_RECOVERY_STATUS, - MSG_RECOVERY_DONE, - MSG_PLOCK_OWN, - MSG_PLOCK_DROP, - MSG_PLOCK_SYNC_LOCK, - MSG_PLOCK_SYNC_WAITER, -}; - -/* These lengths are part of the "old" wire protocol. 
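Every message in this protocol begins with the struct gdlm_header below (three 16-bit version fields, a 16-bit type, the sender and destination nodeids, and a fixed MSG_NAMELEN-byte name) followed by a type-specific payload, so a plock message, for example, is sizeof(struct gdlm_header) plus sizeof(struct dlm_plock_info) bytes on the wire.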
*/ - -#define MAX_OPTIONS_LEN 1024 -#define MSG_NAMELEN 255 - -struct gdlm_header { - uint16_t version[3]; - uint16_t type; /* MSG_ */ - uint32_t nodeid; /* sender */ - uint32_t to_nodeid; /* 0 if to all */ - char name[MSG_NAMELEN]; -}; - -struct save_msg { - struct list_head list; - int nodeid; - int len; - int type; - char buf[0]; -}; - -struct mg_member { - struct list_head list; - int nodeid; - int jid; - - int spectator; - int readonly; - int rw; - uint32_t opts; - - int tell_gfs_to_recover; - int wait_gfs_recover_done; - int gone_event; - int gone_type; - int finished; - int local_recovery_status; - int recovery_status; - int withdrawing; - int needs_journals; - - int ms_kernel_mount_done; - int ms_first_mounter; - int ms_kernel_mount_error; -}; - -#endif - diff --git a/group/gfs_controld/gfs_daemon.h b/group/gfs_controld/gfs_daemon.h index 157865a..ec9c0c6 100644 --- a/group/gfs_controld/gfs_daemon.h +++ b/group/gfs_controld/gfs_daemon.h @@ -57,18 +57,10 @@ #define MAXLINE 256 -/* group_mode */ - -#define GROUP_LIBGROUP 2 -#define GROUP_LIBCPG 3 - extern int daemon_debug_opt; extern int daemon_quit; extern int cluster_down; extern int poll_dlm; -extern int poll_ignore_plock; -extern int plock_fd; -extern int plock_ci; extern struct list_head mountgroups; extern int cman_quorate; extern int our_nodeid; @@ -77,14 +69,9 @@ extern char daemon_debug_buf[256]; extern char dump_buf[GFSC_DUMP_SIZE]; extern int dump_point; extern int dump_wrap; -extern char plock_dump_buf[GFSC_DUMP_SIZE]; -extern int plock_dump_len; extern int dmsetup_wait; extern cpg_handle_t cpg_handle_daemon; extern int libcpg_flow_control_on; -extern int group_mode; -extern uint32_t plock_minor; -extern uint32_t old_plock_minor; extern struct list_head withdrawn_mounts; void daemon_dump_save(void); @@ -124,7 +111,6 @@ struct mountgroup { uint32_t id; struct gfsc_mount_args mount_args; char name[GFS_MOUNTGROUP_LEN+1]; - int old_group_mode; int mount_client; int mount_client_result; @@ -165,44 +151,6 @@ struct mountgroup { int first_recovery_msg; int local_recovery_jid; int local_recovery_busy; - - /* cpg-old stuff for rhel5/stable2 compat */ - - struct list_head members; - struct list_head members_gone; - int memb_count; - int last_stop; - int last_start; - int last_finish; - int last_callback; - int start_event_nr; - int start_type; - int group_leave_on_finish; - int init; - int got_our_options; - int got_our_journals; - int delay_send_journals; - int first_mount_pending_stop; - int first_mounter_done; - int global_first_recover_done; - int emulate_first_mounter; - int wait_first_done; - int needs_recovery; - int low_nodeid; - int master_nodeid; - int got_kernel_mount; - struct list_head saved_messages; - void *start2_fn; - - /* cpg-old plock stuff */ - - int save_plocks; - struct list_head plock_resources; - uint32_t associated_ls_id; - uint64_t cp_handle; - time_t last_checkpoint_time; - time_t last_plock_time; - struct timeval drop_resources_last; }; /* these need to match the kernel defines of the same name in lm_interface.h */ @@ -239,39 +187,6 @@ int set_mountgroup_nodes(struct mountgroup *mg, int option, int *node_count, struct gfsc_node **nodes_out); void free_mg(struct mountgroup *mg); -/* cpg-old.c */ -int setup_cpg_old(void); -void close_cpg_old(void); -void process_cpg_old(int ci); - -int gfs_join_mountgroup_old(struct mountgroup *mg, struct gfsc_mount_args *ma); -void do_leave_old(char *name, int mnterr); -int send_group_message_old(struct mountgroup *mg, int len, char *buf); -void save_message_old(struct 
mountgroup *mg, char *buf, int len, int from, - int type); -void send_withdraw_old(struct mountgroup *mg); -int process_recovery_uevent_old(char *name, int jid, int status, int first); -void send_remount_old(struct mountgroup *mg, struct gfsc_mount_args *ma); -void send_mount_status_old(struct mountgroup *mg); -int do_stop(struct mountgroup *mg); -int do_finish(struct mountgroup *mg); -void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids); -int do_terminate(struct mountgroup *mg); -int do_withdraw_old(char *table); - -/* group.c */ -int setup_groupd(void); -void close_groupd(void); -void process_groupd(int ci); -int set_mountgroup_info_group(struct mountgroup *mg, - struct gfsc_mountgroup *out); -int set_node_info_group(struct mountgroup *mg, int nodeid, - struct gfsc_node *node); -int set_mountgroups_group(int *count, struct gfsc_mountgroup **mgs_out); -int set_mountgroup_nodes_group(struct mountgroup *mg, int option, - int *node_count, struct gfsc_node **nodes_out); -int set_group_mode(void); - /* main.c */ int do_read(int fd, void *buf, size_t count); int do_write(int fd, void *buf, size_t count); @@ -297,22 +212,6 @@ void close_cman(void); void process_cman(int ci); void kick_node_from_cluster(int nodeid); -/* plock.c */ -int setup_plocks(void); -void process_plocks(int ci); -int limit_plocks(void); -void receive_plock(struct mountgroup *mg, char *buf, int len, int from); -void receive_own(struct mountgroup *mg, char *buf, int len, int from); -void receive_sync(struct mountgroup *mg, char *buf, int len, int from); -void receive_drop(struct mountgroup *mg, char *buf, int len, int from); -void process_saved_plocks(struct mountgroup *mg); -int unlink_checkpoint(struct mountgroup *mg); -void store_plocks(struct mountgroup *mg, int nodeid); -void retrieve_plocks(struct mountgroup *mg); -void purge_plocks(struct mountgroup *mg, int nodeid, int unmount); -int fill_plock_dump_buf(struct mountgroup *mg); -int setup_misc_devices(void); - /* util.c */ int we_are_in_fence_domain(void); int set_sysfs(struct mountgroup *mg, char *field, int val); diff --git a/group/gfs_controld/group.c b/group/gfs_controld/group.c deleted file mode 100644 index e5046b2..0000000 --- a/group/gfs_controld/group.c +++ /dev/null @@ -1,360 +0,0 @@ -#include "gfs_daemon.h" -#include "config.h" -#include "cpg-old.h" -#include "libgroup.h" - -#define LOCK_DLM_GROUP_LEVEL 2 -#define LOCK_DLM_GROUP_NAME "gfs" - -/* save all the params from callback functions here because we can't - do the processing within the callback function itself */ - -group_handle_t gh; -static int cb_action; -static char cb_name[GFS_MOUNTGROUP_LEN+1]; -static int cb_event_nr; -static unsigned int cb_id; -static int cb_type; -static int cb_member_count; -static int cb_members[MAX_NODES]; - - -static void stop_cbfn(group_handle_t h, void *private, char *name) -{ - cb_action = DO_STOP; - strcpy(cb_name, name); -} - -static void start_cbfn(group_handle_t h, void *private, char *name, - int event_nr, int type, int member_count, int *members) -{ - int i; - - cb_action = DO_START; - strncpy(cb_name, name, GFS_MOUNTGROUP_LEN); - cb_event_nr = event_nr; - cb_type = type; - cb_member_count = member_count; - - for (i = 0; i < member_count; i++) - cb_members[i] = members[i]; -} - -static void finish_cbfn(group_handle_t h, void *private, char *name, - int event_nr) -{ - cb_action = DO_FINISH; - strncpy(cb_name, name, GFS_MOUNTGROUP_LEN); - cb_event_nr = event_nr; -} - -static void terminate_cbfn(group_handle_t h, void *private, char *name) 
-{ - cb_action = DO_TERMINATE; - strncpy(cb_name, name, GFS_MOUNTGROUP_LEN); -} - -static void setid_cbfn(group_handle_t h, void *private, char *name, - unsigned int id) -{ - cb_action = DO_SETID; - strncpy(cb_name, name, GFS_MOUNTGROUP_LEN); - cb_id = id; -} - -static group_callbacks_t callbacks = { - stop_cbfn, - start_cbfn, - finish_cbfn, - terminate_cbfn, - setid_cbfn, -}; - -static char *str_members(void) -{ - static char str_members_buf[MAXLINE]; - int i, ret, pos = 0, len = MAXLINE; - - memset(str_members_buf, 0, MAXLINE); - - for (i = 0; i < cb_member_count; i++) { - if (i != 0) { - ret = snprintf(str_members_buf + pos, len - pos, " "); - if (ret >= len - pos) - break; - pos += ret; - } - ret = snprintf(str_members_buf + pos, len - pos, "%d", - cb_members[i]); - if (ret >= len - pos) - break; - pos += ret; - } - return str_members_buf; -} - -void process_groupd(int ci) -{ - struct mountgroup *mg; - int error = 0; - - error = group_dispatch(gh); - if (error) { - log_error("groupd_dispatch error %d errno %d", error, errno); - goto out; - } - - if (!cb_action) - goto out; - - mg = find_mg(cb_name); - if (!mg) { - log_error("callback %d group %s not found", cb_action, cb_name); - error = -1; - goto out; - } - - switch (cb_action) { - case DO_STOP: - log_debug("groupd cb: stop %s", cb_name); - mg->last_callback = DO_STOP; - mg->last_stop = mg->last_start; - do_stop(mg); - break; - - case DO_START: - log_debug("groupd cb: start %s type %d count %d members %s", - cb_name, cb_type, cb_member_count, str_members()); - mg->last_callback = DO_START; - mg->last_start = cb_event_nr; - do_start(mg, cb_type, cb_member_count, cb_members); - break; - - case DO_FINISH: - log_debug("groupd cb: finish %s", cb_name); - mg->last_callback = DO_FINISH; - mg->last_finish = cb_event_nr; - do_finish(mg); - break; - - case DO_TERMINATE: - log_debug("groupd cb: terminate %s", cb_name); - mg->last_callback = DO_TERMINATE; - do_terminate(mg); - break; - - case DO_SETID: - log_debug("groupd cb: set_id %s %x", cb_name, cb_id); - mg->id = cb_id; - break; - - default: - error = -EINVAL; - } - - out: - cb_action = 0; -} - -int setup_groupd(void) -{ - int rv; - - gh = group_init(NULL, LOCK_DLM_GROUP_NAME, LOCK_DLM_GROUP_LEVEL, - &callbacks, 10); - if (!gh) { - log_error("group_init error %p %d", gh, errno); - return -ENOTCONN; - } - - rv = group_get_fd(gh); - if (rv < 0) - log_error("group_get_fd error %d %d", rv, errno); - - log_debug("groupd %d", rv); - - return rv; -} - -void close_groupd(void) -{ - group_exit(gh); -} - -/* most of the query info doesn't apply in the LIBGROUP mode, but we can - emulate some basic parts of it */ - -int set_mountgroup_info_group(struct mountgroup *mg, - struct gfsc_mountgroup *out) -{ - strncpy(out->name, mg->name, GFS_MOUNTGROUP_LEN); - out->global_id = mg->id; - - if (mg->joining) - out->flags |= GFSC_MF_JOINING; - if (mg->leaving) - out->flags |= GFSC_MF_LEAVING; - if (mg->kernel_stopped) - out->flags |= GFSC_MF_KERNEL_STOPPED; - - out->cg_prev.member_count = mg->memb_count; - - return 0; -} - -static int _set_node_info(struct mountgroup *mg, int nodeid, - struct gfsc_node *node) -{ - struct mg_member *memb; - int is_member = 0, is_gone = 0; - - list_for_each_entry(memb, &mg->members, list) { - if (memb->nodeid != nodeid) - continue; - is_member = 1; - goto found; - } - list_for_each_entry(memb, &mg->members_gone, list) { - if (memb->nodeid != nodeid) - continue; - is_gone = 1; - break; - } - if (!is_member && !is_gone) - goto out; - found: - node->nodeid = nodeid; - - if 
(is_member) - node->flags |= GFSC_NF_MEMBER; - if (memb->spectator) - node->flags |= GFSC_NF_SPECTATOR; - if (memb->readonly) - node->flags |= GFSC_NF_READONLY; - if (memb->ms_kernel_mount_done) - node->flags |= GFSC_NF_KERNEL_MOUNT_DONE; - if (memb->ms_kernel_mount_error) - node->flags |= GFSC_NF_KERNEL_MOUNT_ERROR; - - node->jid = memb->jid; - - if (is_gone && memb->gone_type == GROUP_NODE_FAILED) - node->failed_reason = 1; - out: - return 0; -} - -int set_node_info_group(struct mountgroup *mg, int nodeid, - struct gfsc_node *node) -{ - return _set_node_info(mg, nodeid, node); -} - -int set_mountgroups_group(int *count, struct gfsc_mountgroup **mgs_out) -{ - struct mountgroup *mg; - struct gfsc_mountgroup *mgs, *mgp; - int mg_count = 0; - - list_for_each_entry(mg, &mountgroups, list) - mg_count++; - - mgs = malloc(mg_count * sizeof(struct gfsc_mountgroup)); - if (!mgs) - return -ENOMEM; - memset(mgs, 0, mg_count * sizeof(struct gfsc_mountgroup)); - - mgp = mgs; - list_for_each_entry(mg, &mountgroups, list) { - set_mountgroup_info(mg, mgp++); - } - - *count = mg_count; - *mgs_out = mgs; - return 0; -} - -int list_count(struct list_head *head) -{ - struct list_head *tmp; - int count = 0; - - list_for_each(tmp, head) - count++; - return count; -} - -int set_mountgroup_nodes_group(struct mountgroup *mg, int option, - int *node_count, struct gfsc_node **nodes_out) -{ - struct gfsc_node *nodes = NULL, *nodep; - struct mg_member *memb; - int count = 0; - - if (option == GFSC_NODES_ALL) { - count = mg->memb_count + list_count(&mg->members_gone); - } else if (option == GFSC_NODES_MEMBERS) { - count = mg->memb_count; - } else - goto out; - - nodes = malloc(count * sizeof(struct gfsc_node)); - if (!nodes) - return -ENOMEM; - memset(nodes, 0, count * sizeof(struct gfsc_node)); - nodep = nodes; - - list_for_each_entry(memb, &mg->members, list) - _set_node_info(mg, memb->nodeid, nodep++); - - if (option == GFSC_NODES_ALL) { - list_for_each_entry(memb, &mg->members_gone, list) - _set_node_info(mg, memb->nodeid, nodep++); - } - out: - *node_count = count; - *nodes_out = nodes; - return 0; -} - -int set_group_mode(void) -{ - int i = 0, rv, version, limit; - - while (1) { - rv = group_get_version(&version); - - if (rv || version < 0) { - /* we expect to get version of -EAGAIN while groupd - is detecting the mode of everyone; don't retry - as long if we're not getting anything back from - groupd */ - - log_debug("set_group_mode get_version %d ver %d", - rv, version); - - limit = (version == -EAGAIN) ? 
30 : 5; - - if (i++ > limit) { - log_error("cannot get groupd compatibility " - "mode rv %d ver %d", rv, version); - return -1; - } - sleep(1); - continue; - } - - - if (version == GROUP_LIBGROUP) { - group_mode = GROUP_LIBGROUP; - return 0; - } else if (version == GROUP_LIBCPG) { - group_mode = GROUP_LIBCPG; - return 0; - } else { - log_error("set_group_mode invalid ver %d", version); - return -1; - } - } -} - diff --git a/group/gfs_controld/main.c b/group/gfs_controld/main.c index 457a5d6..bf481af 100644 --- a/group/gfs_controld/main.c +++ b/group/gfs_controld/main.c @@ -157,19 +157,9 @@ struct mountgroup *create_mg(char *name) return NULL; memset(mg, 0, sizeof(struct mountgroup)); - if (group_mode == GROUP_LIBGROUP) - mg->old_group_mode = 1; - - INIT_LIST_HEAD(&mg->members); - INIT_LIST_HEAD(&mg->members_gone); - INIT_LIST_HEAD(&mg->plock_resources); - INIT_LIST_HEAD(&mg->saved_messages); INIT_LIST_HEAD(&mg->changes); INIT_LIST_HEAD(&mg->journals); INIT_LIST_HEAD(&mg->node_history); - mg->init = 1; - mg->master_nodeid = -1; - mg->low_nodeid = -1; strncpy(mg->name, name, GFS_MOUNTGROUP_LEN); @@ -340,10 +330,7 @@ static void process_uevent(int ci) if (strcmp(uevent_vals[Env_SUBSYSTEM], "lock_dlm") == 0) return; - if (group_mode == GROUP_LIBGROUP) - do_leave_old(fsname, 0); - else - do_leave(fsname, 0); + do_leave(fsname, 0); } else if (!strcmp(uevent_vals[Env_ACTION], "change")) { int jid, status = -1, first = -1; @@ -363,10 +350,7 @@ static void process_uevent(int ci) (strcmp(uevent_vals[Env_FIRSTMOUNT], "Done") == 0)) first = 1; - if (group_mode == GROUP_LIBGROUP) - process_recovery_uevent_old(fsname, jid, status, first); - else - process_recovery_uevent(fsname, jid, status, first); + process_recovery_uevent(fsname, jid, status, first); } else if (!strcmp(uevent_vals[Env_ACTION], "offline")) { do_withdraw(fsname); @@ -445,29 +429,6 @@ static void query_dump_debug(int fd) do_write(fd, dump_buf, len); } -static void query_dump_plocks(int fd, char *name) -{ - struct mountgroup *mg; - struct gfsc_header h; - int rv; - - mg = find_mg(name); - if (!mg) { - plock_dump_len = 0; - rv = -ENOENT; - } else { - /* writes to plock_dump_buf and sets plock_dump_len */ - rv = fill_plock_dump_buf(mg); - } - - init_header(&h, GFSC_CMD_DUMP_PLOCKS, name, rv, plock_dump_len); - - do_write(fd, &h, sizeof(h)); - - if (plock_dump_len) - do_write(fd, plock_dump_buf, plock_dump_len); -} - /* combines a header and the data and sends it back to the client in a single do_write() call */ @@ -506,12 +467,8 @@ static void query_mountgroup_info(int fd, char *name) } memset(&mountgroup, 0, sizeof(mountgroup)); - mountgroup.group_mode = group_mode; - if (group_mode == GROUP_LIBGROUP) - rv = set_mountgroup_info_group(mg, &mountgroup); - else - rv = set_mountgroup_info(mg, &mountgroup); + rv = set_mountgroup_info(mg, &mountgroup); out: do_reply(fd, GFSC_CMD_MOUNTGROUP_INFO, name, rv, (char *)&mountgroup, sizeof(mountgroup)); @@ -529,10 +486,7 @@ static void query_node_info(int fd, char *name, int nodeid) goto out; } - if (group_mode == GROUP_LIBGROUP) - rv = set_node_info_group(mg, nodeid, &node); - else - rv = set_node_info(mg, nodeid, &node); + rv = set_node_info(mg, nodeid, &node); out: do_reply(fd, GFSC_CMD_NODE_INFO, name, rv, (char *)&node, sizeof(node)); @@ -544,11 +498,7 @@ static void query_mountgroups(int fd, int max) struct gfsc_mountgroup *mgs = NULL; int rv, result; - if (group_mode == GROUP_LIBGROUP) - rv = set_mountgroups_group(&mg_count, &mgs); - else - rv = set_mountgroups(&mg_count, &mgs); - + rv = 
set_mountgroups(&mg_count, &mgs); if (rv < 0) { result = rv; mg_count = 0; @@ -583,11 +533,7 @@ static void query_mountgroup_nodes(int fd, char *name, int option, int max) goto out; } - if (group_mode == GROUP_LIBGROUP) - rv = set_mountgroup_nodes_group(mg, option, &node_count, &nodes); - else - rv = set_mountgroup_nodes(mg, option, &node_count, &nodes); - + rv = set_mountgroup_nodes(mg, option, &node_count, &nodes); if (rv < 0) { result = rv; node_count = 0; @@ -757,11 +703,7 @@ static void do_join(int ci, struct gfsc_mount_args *ma) list_add(&mg->list, &mountgroups); - if (group_mode == GROUP_LIBGROUP) - rv = gfs_join_mountgroup_old(mg, ma); - else - rv = gfs_join_mountgroup(mg); - + rv = gfs_join_mountgroup(mg); if (rv) { log_error("join: group join error %d", rv); list_del(&mg->list); @@ -829,10 +771,7 @@ static void do_mount_done(char *table, int result) mg->kernel_mount_done = 1; mg->kernel_mount_error = result; - if (group_mode == GROUP_LIBGROUP) - send_mount_status_old(mg); - else - gfs_mount_done(mg); + gfs_mount_done(mg); } void client_reply_remount(struct mountgroup *mg, int ci, int result) @@ -870,13 +809,6 @@ static void do_remount(int ci, struct gfsc_mount_args *ma) if ((mg->ro && ro) || (!mg->ro && !ro)) goto out; - if (group_mode == GROUP_LIBGROUP) { - /* the receive calls client_reply_remount */ - mg->remount_client = ci; - send_remount_old(mg, ma); - return; - } - send_remount(mg, ma); out: client_reply_remount(mg, ci, result); @@ -943,10 +875,7 @@ void process_connection(int ci) break; case GFSC_CMD_FS_LEAVE: - if (group_mode == GROUP_LIBGROUP) - do_leave_old(ma->table, h.data); - else - do_leave(ma->table, h.data); + do_leave(ma->table, h.data); break; case GFSC_CMD_FS_MOUNT_DONE: @@ -1068,9 +997,6 @@ static void *process_queries(void *arg) case GFSC_CMD_DUMP_DEBUG: query_dump_debug(f); break; - case GFSC_CMD_DUMP_PLOCKS: - query_dump_plocks(f, h.name); - break; case GFSC_CMD_MOUNTGROUP_INFO: query_mountgroup_info(f, h.name); break; @@ -1158,75 +1084,19 @@ static void loop(void) goto out; client_add(rv, process_uevent, NULL); - group_mode = GROUP_LIBCPG; - - if (cfgd_groupd_compat) { - rv = setup_groupd(); - if (rv < 0) - goto out; - client_add(rv, process_groupd, cluster_dead); - - switch (cfgd_groupd_compat) { - case 1: - group_mode = GROUP_LIBGROUP; - rv = 0; - break; - case 2: - rv = set_group_mode(); - break; - default: - log_error("inval groupd_compat %d", cfgd_groupd_compat); - rv = -1; - break; - } - if (rv < 0) - goto out; - } - log_debug("group_mode %d compat %d", group_mode, cfgd_groupd_compat); - - if (group_mode == GROUP_LIBCPG) { - - /* - * The new, good, way of doing things using libcpg directly. - * code in: cpg-new.c - */ - - rv = setup_cpg(); - if (rv < 0) - goto out; - client_add(rv, process_cpg, cluster_dead); - - rv = set_protocol(); - if (rv < 0) - goto out; - - rv = setup_dlmcontrol(); - if (rv < 0) - goto out; - client_add(rv, process_dlmcontrol, dlmcontrol_dead); - - } else if (group_mode == GROUP_LIBGROUP) { - - /* - * The old, bad, way of doing things using libgroup. 
- * code in: cpg-old.c group.c plock.c - */ - - rv = setup_cpg_old(); - if (rv < 0) - goto out; - client_add(rv, process_cpg_old, cluster_dead); + rv = setup_cpg(); + if (rv < 0) + goto out; + client_add(rv, process_cpg, cluster_dead); - rv = setup_misc_devices(); - if (rv < 0) - goto out; + rv = set_protocol(); + if (rv < 0) + goto out; - rv = setup_plocks(); - if (rv < 0) - goto out; - plock_fd = rv; - plock_ci = client_add(rv, process_plocks, NULL); - } + rv = setup_dlmcontrol(); + if (rv < 0) + goto out; + client_add(rv, process_dlmcontrol, dlmcontrol_dead); for (;;) { rv = poll(pollfd, client_maxi + 1, poll_timeout); @@ -1265,20 +1135,10 @@ static void loop(void) poll_timeout = -1; if (poll_dlm) { - /* only happens for GROUP_LIBCPG */ process_mountgroups(); poll_timeout = 500; } - if (poll_ignore_plock) { - /* only happens for GROUP_LIBGROUP */ - if (!limit_plocks()) { - poll_ignore_plock = 0; - client_back(plock_ci, plock_fd); - } - poll_timeout = 1000; - } - if (dmsetup_wait) { update_dmsetup_wait(); if (dmsetup_wait) { @@ -1292,12 +1152,7 @@ static void loop(void) query_unlock(); } out: - if (group_mode == GROUP_LIBCPG) - close_cpg(); - else if (group_mode == GROUP_LIBGROUP) - close_cpg_old(); - if (cfgd_groupd_compat) - close_groupd(); + close_cpg(); close_logging(); close_ccs(); close_cman(); @@ -1358,33 +1213,13 @@ static void print_usage(void) printf("\n"); printf(" -D Enable debugging to stderr and don't fork\n"); printf(" -L Enable debugging to log file\n"); - printf(" -g groupd compatibility mode, 0 off, 1 on, 2 detect\n"); - printf(" 0: use libcpg, no backward compat, best performance\n"); - printf(" 1: use libgroup for compat with cluster2/rhel5\n"); - printf(" 2: use groupd to detect old, or mode 1, nodes that\n" - " require compat, use libcpg if none found\n"); - printf(" Default is %d\n", DEFAULT_GROUPD_COMPAT); printf(" -w Enable (1) or disable (0) withdraw\n"); printf(" Default is %d\n", DEFAULT_ENABLE_WITHDRAW); - printf(" -p Enable (1) or disable (0) plock code\n"); - printf(" Default is %d\n", DEFAULT_ENABLE_PLOCK); - printf(" -P Enable plock debugging\n"); - - printf(" -l Limit the rate of plock operations\n"); - printf(" Default is %d, set to 0 for no limit\n", DEFAULT_PLOCK_RATE_LIMIT); - printf(" -o Enable (1) or disable (0) plock ownership\n"); - printf(" Default is %d\n", DEFAULT_PLOCK_OWNERSHIP); - printf(" -t plock ownership drop resources time (milliseconds)\n"); - printf(" Default is %u\n", DEFAULT_DROP_RESOURCES_TIME); - printf(" -c plock ownership drop resources count\n"); - printf(" Default is %u\n", DEFAULT_DROP_RESOURCES_COUNT); - printf(" -a plock ownership drop resources age (milliseconds)\n"); - printf(" Default is %u\n", DEFAULT_DROP_RESOURCES_AGE); printf(" -h Print this help, then exit\n"); printf(" -V Print program version information, then exit\n"); } -#define OPTION_STRING "LDKg:w:f:q:d:p:Pl:o:t:c:a:hV" +#define OPTION_STRING "LDw:hV" static void read_arguments(int argc, char **argv) { @@ -1405,51 +1240,11 @@ static void read_arguments(int argc, char **argv) cfgd_debug_logfile = 1; break; - case 'g': - optd_groupd_compat = 1; - cfgd_groupd_compat = atoi(optarg); - break; - case 'w': optd_enable_withdraw = 1; cfgd_enable_withdraw = atoi(optarg); break; - case 'p': - optd_enable_plock = 1; - cfgd_enable_plock = atoi(optarg); - break; - - case 'P': - optd_plock_debug = 1; - cfgd_plock_debug = 1; - break; - - case 'l': - optd_plock_rate_limit = 1; - cfgd_plock_rate_limit = atoi(optarg); - break; - - case 'o': - optd_plock_ownership = 1; - 
cfgd_plock_ownership = atoi(optarg); - break; - - case 't': - optd_drop_resources_time = 1; - cfgd_drop_resources_time = atoi(optarg); - break; - - case 'c': - optd_drop_resources_count = 1; - cfgd_drop_resources_count = atoi(optarg); - break; - - case 'a': - optd_drop_resources_age = 1; - cfgd_drop_resources_age = atoi(optarg); - break; - case 'h': print_usage(); exit(EXIT_SUCCESS); @@ -1559,10 +1354,7 @@ void daemon_dump_save(void) int daemon_debug_opt; int daemon_quit; int cluster_down; -int poll_ignore_plock; int poll_dlm; -int plock_fd; -int plock_ci; struct list_head mountgroups; int cman_quorate; int our_nodeid; @@ -1571,13 +1363,8 @@ char daemon_debug_buf[256]; char dump_buf[GFSC_DUMP_SIZE]; int dump_point; int dump_wrap; -char plock_dump_buf[GFSC_DUMP_SIZE]; -int plock_dump_len; int dmsetup_wait; cpg_handle_t cpg_handle_daemon; int libcpg_flow_control_on; -int group_mode; -uint32_t plock_minor; -uint32_t old_plock_minor; struct list_head withdrawn_mounts; diff --git a/group/gfs_controld/plock.c b/group/gfs_controld/plock.c deleted file mode 100644 index ce17afe..0000000 --- a/group/gfs_controld/plock.c +++ /dev/null @@ -1,2361 +0,0 @@ -/* gfs_controld only handles plocks in rhel5/stable2 compat mode */ - -#include "gfs_daemon.h" -#include "cpg-old.h" -#include "config.h" - -#include - -static uint32_t plock_read_count; -static uint32_t plock_recv_count; -static uint32_t plock_rate_delays; -static struct timeval plock_read_time; -static struct timeval plock_recv_time; -static struct timeval plock_rate_last; - -static int plock_device_fd = -1; -static SaCkptHandleT ckpt_handle; -static SaCkptCallbacksT callbacks = { 0, 0 }; -static SaVersionT version = { 'B', 1, 1 }; -static char section_buf[1024 * 1024]; -static uint32_t section_len; -static int need_fsid_translation = 0; - -struct pack_plock { - uint64_t start; - uint64_t end; - uint64_t owner; - uint32_t pid; - uint32_t nodeid; - uint8_t ex; - uint8_t waiter; - uint16_t pad1; - uint32_t pad; -}; - -#define R_GOT_UNOWN 0x00000001 /* have received owner=0 message */ - -struct resource { - struct list_head list; /* list of resources */ - uint64_t number; - int owner; /* nodeid or 0 for unowned */ - uint32_t flags; - struct timeval last_access; - struct list_head locks; /* one lock for each range */ - struct list_head waiters; - struct list_head pending; /* discovering r owner */ -}; - -#define P_SYNCING 0x00000001 /* plock has been sent as part of sync but not - yet received */ - -struct posix_lock { - struct list_head list; /* resource locks or waiters list */ - uint32_t pid; - uint64_t owner; - uint64_t start; - uint64_t end; - int ex; - int nodeid; - uint32_t flags; -}; - -struct lock_waiter { - struct list_head list; - uint32_t flags; - struct dlm_plock_info info; -}; - - -static void send_own(struct mountgroup *mg, struct resource *r, int owner); -static void save_pending_plock(struct mountgroup *mg, struct resource *r, - struct dlm_plock_info *in); - - -static int got_unown(struct resource *r) -{ - return !!(r->flags & R_GOT_UNOWN); -} - -static void info_bswap_out(struct dlm_plock_info *i) -{ - i->version[0] = cpu_to_le32(i->version[0]); - i->version[1] = cpu_to_le32(i->version[1]); - i->version[2] = cpu_to_le32(i->version[2]); - i->pid = cpu_to_le32(i->pid); - i->nodeid = cpu_to_le32(i->nodeid); - i->rv = cpu_to_le32(i->rv); - i->fsid = cpu_to_le32(i->fsid); - i->number = cpu_to_le64(i->number); - i->start = cpu_to_le64(i->start); - i->end = cpu_to_le64(i->end); - i->owner = cpu_to_le64(i->owner); -} - -static void 
info_bswap_in(struct dlm_plock_info *i) -{ - i->version[0] = le32_to_cpu(i->version[0]); - i->version[1] = le32_to_cpu(i->version[1]); - i->version[2] = le32_to_cpu(i->version[2]); - i->pid = le32_to_cpu(i->pid); - i->nodeid = le32_to_cpu(i->nodeid); - i->rv = le32_to_cpu(i->rv); - i->fsid = le32_to_cpu(i->fsid); - i->number = le64_to_cpu(i->number); - i->start = le64_to_cpu(i->start); - i->end = le64_to_cpu(i->end); - i->owner = le64_to_cpu(i->owner); -} - -static char *op_str(int optype) -{ - switch (optype) { - case DLM_PLOCK_OP_LOCK: - return "LK"; - case DLM_PLOCK_OP_UNLOCK: - return "UN"; - case DLM_PLOCK_OP_GET: - return "GET"; - default: - return "??"; - } -} - -static char *ex_str(int optype, int ex) -{ - if (optype == DLM_PLOCK_OP_UNLOCK || optype == DLM_PLOCK_OP_GET) - return "-"; - if (ex) - return "WR"; - else - return "RD"; -} - -/* - * In kernels before 2.6.26, plocks came from gfs2's lock_dlm module. - * Reading plocks from there as well should allow us to use cluster3 - * on old (RHEL5) kernels. In this case, the fsid we read in plock_info - * structs is the mountgroup id, which we need to translate to the ls id. - */ - -int setup_plocks(void) -{ - SaAisErrorT err; - - plock_read_count = 0; - plock_recv_count = 0; - plock_rate_delays = 0; - gettimeofday(&plock_read_time, NULL); - gettimeofday(&plock_recv_time, NULL); - gettimeofday(&plock_rate_last, NULL); - - err = saCkptInitialize(&ckpt_handle, &callbacks, &version); - if (err != SA_AIS_OK) { - log_error("ckpt init error %d", err); - cfgd_enable_plock = 0; - - /* still try to open and read the control device so that we can - send ENOSYS back to the kernel if it tries to do a plock */ - } - - if (plock_minor) { - need_fsid_translation = 1; - plock_device_fd = open("/dev/misc/dlm_plock", O_RDWR); - } else if (old_plock_minor) { - log_debug("setup_plocks using old lock_dlm interface"); - need_fsid_translation = 0; - plock_device_fd = open("/dev/misc/lock_dlm_plock", O_RDWR); - } - - if (plock_device_fd < 0) { - log_error("Failure to open plock device: %s", strerror(errno)); - return -1; - } - - log_debug("plocks %d", plock_device_fd); - log_debug("plock cpg message size: %u bytes", - (unsigned int) (sizeof(struct gdlm_header) + - sizeof(struct dlm_plock_info))); - - return plock_device_fd; -} - -/* FIXME: unify these two */ - -static unsigned long time_diff_ms(struct timeval *begin, struct timeval *end) -{ - struct timeval result; - timersub(end, begin, &result); - return (result.tv_sec * 1000) + (result.tv_usec / 1000); -} - -static uint64_t dt_usec(struct timeval *start, struct timeval *stop) -{ - uint64_t dt; - - dt = stop->tv_sec - start->tv_sec; - dt *= 1000000; - dt += stop->tv_usec - start->tv_usec; - return dt; -} - -static struct resource *search_resource(struct mountgroup *mg, uint64_t number) -{ - struct resource *r; - - list_for_each_entry(r, &mg->plock_resources, list) { - if (r->number == number) - return r; - } - return NULL; -} - -static int find_resource(struct mountgroup *mg, uint64_t number, int create, - struct resource **r_out) -{ - struct resource *r = NULL; - int rv = 0; - - r = search_resource(mg, number); - if (r) - goto out; - - if (create == 0) { - rv = -ENOENT; - goto out; - } - - r = malloc(sizeof(struct resource)); - if (!r) { - log_error("find_resource no memory %d", errno); - rv = -ENOMEM; - goto out; - } - - memset(r, 0, sizeof(struct resource)); - r->number = number; - INIT_LIST_HEAD(&r->locks); - INIT_LIST_HEAD(&r->waiters); - INIT_LIST_HEAD(&r->pending); - - if (cfgd_plock_ownership) - 
r->owner = -1; - else - r->owner = 0; - - list_add_tail(&r->list, &mg->plock_resources); - out: - if (r) - gettimeofday(&r->last_access, NULL); - *r_out = r; - return rv; -} - -static void put_resource(struct resource *r) -{ - /* with ownership, resources are only freed via drop messages */ - if (cfgd_plock_ownership) - return; - - if (list_empty(&r->locks) && list_empty(&r->waiters)) { - list_del(&r->list); - free(r); - } -} - -static inline int ranges_overlap(uint64_t start1, uint64_t end1, - uint64_t start2, uint64_t end2) -{ - if (end1 < start2 || start1 > end2) - return 0; - return 1; -} - -/** - * overlap_type - returns a value based on the type of overlap - * @s1 - start of new lock range - * @e1 - end of new lock range - * @s2 - start of existing lock range - * @e2 - end of existing lock range - * - */ - -static int overlap_type(uint64_t s1, uint64_t e1, uint64_t s2, uint64_t e2) -{ - int ret; - - /* - * ---r1--- - * ---r2--- - */ - - if (s1 == s2 && e1 == e2) - ret = 0; - - /* - * --r1-- - * ---r2--- - */ - - else if (s1 == s2 && e1 < e2) - ret = 1; - - /* - * --r1-- - * ---r2--- - */ - - else if (s1 > s2 && e1 == e2) - ret = 1; - - /* - * --r1-- - * ---r2--- - */ - - else if (s1 > s2 && e1 < e2) - ret = 2; - - /* - * ---r1--- or ---r1--- or ---r1--- - * --r2-- --r2-- --r2-- - */ - - else if (s1 <= s2 && e1 >= e2) - ret = 3; - - /* - * ---r1--- - * ---r2--- - */ - - else if (s1 > s2 && e1 > e2) - ret = 4; - - /* - * ---r1--- - * ---r2--- - */ - - else if (s1 < s2 && e1 < e2) - ret = 4; - - else - ret = -1; - - return ret; -} - -/* shrink the range start2:end2 by the partially overlapping start:end */ - -static int shrink_range2(uint64_t *start2, uint64_t *end2, - uint64_t start, uint64_t end) -{ - int error = 0; - - if (*start2 < start) - *end2 = start - 1; - else if (*end2 > end) - *start2 = end + 1; - else - error = -1; - return error; -} - -static int shrink_range(struct posix_lock *po, uint64_t start, uint64_t end) -{ - return shrink_range2(&po->start, &po->end, start, end); -} - -static int is_conflict(struct resource *r, struct dlm_plock_info *in, int get) -{ - struct posix_lock *po; - - list_for_each_entry(po, &r->locks, list) { - if (po->nodeid == in->nodeid && po->owner == in->owner) - continue; - if (!ranges_overlap(po->start, po->end, in->start, in->end)) - continue; - - if (in->ex || po->ex) { - if (get) { - in->ex = po->ex; - in->pid = po->pid; - in->start = po->start; - in->end = po->end; - } - return 1; - } - } - return 0; -} - -static int add_lock(struct resource *r, uint32_t nodeid, uint64_t owner, - uint32_t pid, int ex, uint64_t start, uint64_t end) -{ - struct posix_lock *po; - - po = malloc(sizeof(struct posix_lock)); - if (!po) - return -ENOMEM; - memset(po, 0, sizeof(struct posix_lock)); - - po->start = start; - po->end = end; - po->nodeid = nodeid; - po->owner = owner; - po->pid = pid; - po->ex = ex; - list_add_tail(&po->list, &r->locks); - - return 0; -} - -/* RN within RE (and starts or ends on RE boundary) - 1. add new lock for non-overlap area of RE, orig mode - 2. 
convert RE to RN range and mode */ - -static int lock_case1(struct posix_lock *po, struct resource *r, - struct dlm_plock_info *in) -{ - uint64_t start2, end2; - int rv; - - /* non-overlapping area start2:end2 */ - start2 = po->start; - end2 = po->end; - rv = shrink_range2(&start2, &end2, in->start, in->end); - if (rv) - goto out; - - po->start = in->start; - po->end = in->end; - po->ex = in->ex; - - rv = add_lock(r, in->nodeid, in->owner, in->pid, !in->ex, start2, end2); - out: - return rv; -} - -/* RN within RE (RE overlaps RN on both sides) - 1. add new lock for front fragment, orig mode - 2. add new lock for back fragment, orig mode - 3. convert RE to RN range and mode */ - -static int lock_case2(struct posix_lock *po, struct resource *r, - struct dlm_plock_info *in) - -{ - int rv; - - rv = add_lock(r, in->nodeid, in->owner, in->pid, - !in->ex, po->start, in->start - 1); - if (rv) - goto out; - - rv = add_lock(r, in->nodeid, in->owner, in->pid, - !in->ex, in->end + 1, po->end); - if (rv) - goto out; - - po->start = in->start; - po->end = in->end; - po->ex = in->ex; - out: - return rv; -} - -static int lock_internal(struct mountgroup *mg, struct resource *r, - struct dlm_plock_info *in) -{ - struct posix_lock *po, *safe; - int rv = 0; - - list_for_each_entry_safe(po, safe, &r->locks, list) { - if (po->nodeid != in->nodeid || po->owner != in->owner) - continue; - if (!ranges_overlap(po->start, po->end, in->start, in->end)) - continue; - - /* existing range (RE) overlaps new range (RN) */ - - switch(overlap_type(in->start, in->end, po->start, po->end)) { - - case 0: - if (po->ex == in->ex) - goto out; - - /* ranges the same - just update the existing lock */ - po->ex = in->ex; - goto out; - - case 1: - if (po->ex == in->ex) - goto out; - - rv = lock_case1(po, r, in); - goto out; - - case 2: - if (po->ex == in->ex) - goto out; - - rv = lock_case2(po, r, in); - goto out; - - case 3: - list_del(&po->list); - free(po); - break; - - case 4: - if (po->start < in->start) - po->end = in->start - 1; - else - po->start = in->end + 1; - break; - - default: - rv = -1; - goto out; - } - } - - rv = add_lock(r, in->nodeid, in->owner, in->pid, - in->ex, in->start, in->end); - out: - return rv; - -} - -static int unlock_internal(struct mountgroup *mg, struct resource *r, - struct dlm_plock_info *in) -{ - struct posix_lock *po, *safe; - int rv = 0; - - list_for_each_entry_safe(po, safe, &r->locks, list) { - if (po->nodeid != in->nodeid || po->owner != in->owner) - continue; - if (!ranges_overlap(po->start, po->end, in->start, in->end)) - continue; - - /* existing range (RE) overlaps new range (RN) */ - - switch (overlap_type(in->start, in->end, po->start, po->end)) { - - case 0: - /* ranges the same - just remove the existing lock */ - - list_del(&po->list); - free(po); - goto out; - - case 1: - /* RN within RE and starts or ends on RE boundary - - * shrink and update RE */ - - rv = shrink_range(po, in->start, in->end); - goto out; - - case 2: - /* RN within RE - shrink and update RE to be front - * fragment, and add a new lock for back fragment */ - - rv = add_lock(r, in->nodeid, in->owner, in->pid, - po->ex, in->end + 1, po->end); - po->end = in->start - 1; - goto out; - - case 3: - /* RE within RN - remove RE, then continue checking - * because RN could cover other locks */ - - list_del(&po->list); - free(po); - continue; - - case 4: - /* front of RE in RN, or end of RE in RN - shrink and - * update RE, then continue because RN could cover - * other locks */ - - rv = shrink_range(po, in->start, 
in->end); - continue; - - default: - rv = -1; - goto out; - } - } - out: - return rv; -} - -static int add_waiter(struct mountgroup *mg, struct resource *r, - struct dlm_plock_info *in) - -{ - struct lock_waiter *w; - - w = malloc(sizeof(struct lock_waiter)); - if (!w) - return -ENOMEM; - memcpy(&w->info, in, sizeof(struct dlm_plock_info)); - list_add_tail(&w->list, &r->waiters); - return 0; -} - -static void write_result(struct mountgroup *mg, struct dlm_plock_info *in, - int rv) -{ - if (need_fsid_translation) - in->fsid = mg->associated_ls_id; - - in->rv = rv; - write(plock_device_fd, in, sizeof(struct dlm_plock_info)); -} - -static void do_waiters(struct mountgroup *mg, struct resource *r) -{ - struct lock_waiter *w, *safe; - struct dlm_plock_info *in; - int rv; - - list_for_each_entry_safe(w, safe, &r->waiters, list) { - in = &w->info; - - if (is_conflict(r, in, 0)) - continue; - - list_del(&w->list); - - /* - log_group(mg, "take waiter %llx %llx-%llx %d/%u/%llx", - in->number, in->start, in->end, - in->nodeid, in->pid, in->owner); - */ - - rv = lock_internal(mg, r, in); - - if (in->nodeid == our_nodeid) - write_result(mg, in, rv); - - free(w); - } -} - -static void do_lock(struct mountgroup *mg, struct dlm_plock_info *in, - struct resource *r) -{ - int rv; - - if (is_conflict(r, in, 0)) { - if (!in->wait) - rv = -EAGAIN; - else { - rv = add_waiter(mg, r, in); - if (rv) - goto out; - rv = -EINPROGRESS; - } - } else - rv = lock_internal(mg, r, in); - - out: - if (in->nodeid == our_nodeid && rv != -EINPROGRESS) - write_result(mg, in, rv); - - do_waiters(mg, r); - put_resource(r); -} - -static void do_unlock(struct mountgroup *mg, struct dlm_plock_info *in, - struct resource *r) -{ - int rv; - - rv = unlock_internal(mg, r, in); - - if (in->nodeid == our_nodeid) - write_result(mg, in, rv); - - do_waiters(mg, r); - put_resource(r); -} - -/* we don't even get to this function if the getlk isn't from us */ - -static void do_get(struct mountgroup *mg, struct dlm_plock_info *in, - struct resource *r) -{ - int rv; - - if (is_conflict(r, in, 1)) - rv = 1; - else - rv = 0; - - write_result(mg, in, rv); -} - -static void __receive_plock(struct mountgroup *mg, struct dlm_plock_info *in, - int from, struct resource *r) -{ - switch (in->optype) { - case DLM_PLOCK_OP_LOCK: - mg->last_plock_time = time(NULL); - do_lock(mg, in, r); - break; - case DLM_PLOCK_OP_UNLOCK: - mg->last_plock_time = time(NULL); - do_unlock(mg, in, r); - break; - case DLM_PLOCK_OP_GET: - do_get(mg, in, r); - break; - default: - log_error("receive_plock from %d optype %d", from, in->optype); - if (from == our_nodeid) - write_result(mg, in, -EINVAL); - } -} - -/* When mg members receive our options message (for our mount), one of them - saves all plock state received to that point in a checkpoint and then sends - us our journals message. We know to retrieve the plock state from the - checkpoint when we receive our journals message. Any plocks messages that - arrive between seeing our options message and our journals message needs to - be saved and processed after we synchronize our plock state from the - checkpoint. Any plock message received while we're mounting but before we - set save_plocks (when we see our options message) can be ignored because it - should be reflected in the checkpointed state. 
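This is what receive_plock() further below does: while save_plocks is set the raw message is stashed via save_message_old() for later replay, before got_our_journals is set the message is dropped with a log entry, and otherwise it is handed straight to _receive_plock().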
*/ - -static void _receive_plock(struct mountgroup *mg, char *buf, int len, int from) -{ - struct dlm_plock_info info; - struct gdlm_header *hd = (struct gdlm_header *) buf; - struct resource *r = NULL; - struct timeval now; - uint64_t usec; - int rv, create; - - memcpy(&info, buf + sizeof(struct gdlm_header), sizeof(info)); - info_bswap_in(&info); - - log_plock(mg, "receive plock %llx %s %s %llx-%llx %d/%u/%llx w %d", - (unsigned long long)info.number, - op_str(info.optype), - ex_str(info.optype, info.ex), - (unsigned long long)info.start, (unsigned long long)info.end, - info.nodeid, info.pid, (unsigned long long)info.owner, - info.wait); - - plock_recv_count++; - if (!(plock_recv_count % 1000)) { - gettimeofday(&now, NULL); - usec = dt_usec(&plock_recv_time, &now); - log_group(mg, "plock_recv_count %u time %.3f s", - plock_recv_count, usec * 1.e-6); - plock_recv_time = now; - } - - if (info.optype == DLM_PLOCK_OP_GET && from != our_nodeid) - return; - - if (from != hd->nodeid || from != info.nodeid) { - log_error("receive_plock from %d header %d info %d", - from, hd->nodeid, info.nodeid); - return; - } - - create = !cfgd_plock_ownership; - - rv = find_resource(mg, info.number, create, &r); - - if (rv && cfgd_plock_ownership) { - /* There must have been a race with a drop, so we need to - ignore this plock op which will be resent. If we're the one - who sent the plock, we need to send_own() and put it on the - pending list to resend once the owner is established. */ - - log_debug("receive_plock from %d no r %llx", from, - (unsigned long long)info.number); - - if (from != our_nodeid) - return; - - rv = find_resource(mg, info.number, 1, &r); - if (rv) - return; - send_own(mg, r, our_nodeid); - save_pending_plock(mg, r, &info); - return; - } - if (rv) { - /* r not found, rv is -ENOENT, this shouldn't happen because - process_plocks() creates a resource for every op */ - - log_error("receive_plock from %d no r %llx %d", from, - (unsigned long long)info.number, rv); - return; - } - - /* The owner should almost always be 0 here, but other owners may - be possible given odd combinations of races with drop. Odd races to - worry about (some seem pretty improbable): - - - A sends drop, B sends plock, receive drop, receive plock. - This is addressed above. - - - A sends drop, B sends plock, receive drop, B reads plock - and sends own, receive plock, on B we find owner of -1. - - - A sends drop, B sends two plocks, receive drop, receive plocks. - Receiving the first plock is the previous case, receiving the - second plock will find r with owner of -1. - - - A sends drop, B sends two plocks, receive drop, C sends own, - receive plock, B sends own, receive own (C), receive plock, - receive own (B). - - Haven't tried to cook up a scenario that would lead to the - last case below; receiving a plock from ourself and finding - we're the owner of r. 
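To summarize how the owner value is handled below: owner 0 means the op is processed directly; owner -1 means ownership is still being resolved, so an op we sent ourselves is parked with save_pending_plock(); an owner that is some other node is logged as an error and our own op is likewise parked; and the case of finding ourselves as owner is logged and our own op is processed locally anyway.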
*/ - - if (!r->owner) { - __receive_plock(mg, &info, from, r); - - } else if (r->owner == -1) { - log_debug("receive_plock from %d r %llx owner %d", from, - (unsigned long long)info.number, r->owner); - - if (from == our_nodeid) - save_pending_plock(mg, r, &info); - - } else if (r->owner != our_nodeid) { - /* might happen, if frequent change to log_debug */ - log_error("receive_plock from %d r %llx owner %d", from, - (unsigned long long)info.number, r->owner); - - if (from == our_nodeid) - save_pending_plock(mg, r, &info); - - } else if (r->owner == our_nodeid) { - /* might happen, if frequent change to log_debug */ - log_error("receive_plock from %d r %llx owner %d", from, - (unsigned long long)info.number, r->owner); - - if (from == our_nodeid) - __receive_plock(mg, &info, from, r); - } -} - -void receive_plock(struct mountgroup *mg, char *buf, int len, int from) -{ - if (mg->save_plocks) { - save_message_old(mg, buf, len, from, MSG_PLOCK); - return; - } - - if (!mg->got_our_journals) { - log_group(mg, "not saving plock messages yet"); - return; - } - - _receive_plock(mg, buf, len, from); -} - -static int send_struct_info(struct mountgroup *mg, struct dlm_plock_info *in, - int msg_type) -{ - char *buf; - int rv, len; - struct gdlm_header *hd; - - len = sizeof(struct gdlm_header) + sizeof(struct dlm_plock_info); - buf = malloc(len); - if (!buf) { - rv = -ENOMEM; - goto out; - } - memset(buf, 0, len); - - info_bswap_out(in); - - hd = (struct gdlm_header *)buf; - hd->type = msg_type; - hd->nodeid = our_nodeid; - hd->to_nodeid = 0; - - memcpy(buf + sizeof(struct gdlm_header), in, sizeof(*in)); - - rv = send_group_message_old(mg, len, buf); - - free(buf); - out: - if (rv) - log_error("send plock message error %d", rv); - return rv; -} - -static void send_plock(struct mountgroup *mg, struct resource *r, - struct dlm_plock_info *in) -{ - send_struct_info(mg, in, MSG_PLOCK); -} - -static void send_own(struct mountgroup *mg, struct resource *r, int owner) -{ - struct dlm_plock_info info; - - /* if we've already sent an own message for this resource, - (pending list is not empty), then we shouldn't send another */ - - if (!list_empty(&r->pending)) { - log_debug("send_own %llx already pending", - (unsigned long long)r->number); - return; - } - - memset(&info, 0, sizeof(info)); - info.number = r->number; - info.nodeid = owner; - - send_struct_info(mg, &info, MSG_PLOCK_OWN); -} - -static void send_syncs(struct mountgroup *mg, struct resource *r) -{ - struct dlm_plock_info info; - struct posix_lock *po; - struct lock_waiter *w; - int rv; - - list_for_each_entry(po, &r->locks, list) { - memset(&info, 0, sizeof(info)); - info.number = r->number; - info.start = po->start; - info.end = po->end; - info.nodeid = po->nodeid; - info.owner = po->owner; - info.pid = po->pid; - info.ex = po->ex; - - rv = send_struct_info(mg, &info, MSG_PLOCK_SYNC_LOCK); - if (rv) - goto out; - - po->flags |= P_SYNCING; - } - - list_for_each_entry(w, &r->waiters, list) { - memcpy(&info, &w->info, sizeof(info)); - - rv = send_struct_info(mg, &info, MSG_PLOCK_SYNC_WAITER); - if (rv) - goto out; - - w->flags |= P_SYNCING; - } - out: - return; -} - -static void send_drop(struct mountgroup *mg, struct resource *r) -{ - struct dlm_plock_info info; - - memset(&info, 0, sizeof(info)); - info.number = r->number; - - send_struct_info(mg, &info, MSG_PLOCK_DROP); -} - -/* plock op can't be handled until we know the owner value of the resource, - so the op is saved on the pending list until the r owner is established */ - -static void 
save_pending_plock(struct mountgroup *mg, struct resource *r, - struct dlm_plock_info *in) -{ - struct lock_waiter *w; - - w = malloc(sizeof(struct lock_waiter)); - if (!w) { - log_error("save_pending_plock no mem"); - return; - } - memcpy(&w->info, in, sizeof(struct dlm_plock_info)); - list_add_tail(&w->list, &r->pending); -} - -/* plock ops are on pending list waiting for ownership to be established. - owner has now become us, so add these plocks to r */ - -static void add_pending_plocks(struct mountgroup *mg, struct resource *r) -{ - struct lock_waiter *w, *safe; - - list_for_each_entry_safe(w, safe, &r->pending, list) { - __receive_plock(mg, &w->info, our_nodeid, r); - list_del(&w->list); - free(w); - } -} - -/* plock ops are on pending list waiting for ownership to be established. - owner has now become 0, so send these plocks to everyone */ - -static void send_pending_plocks(struct mountgroup *mg, struct resource *r) -{ - struct lock_waiter *w, *safe; - - list_for_each_entry_safe(w, safe, &r->pending, list) { - send_plock(mg, r, &w->info); - list_del(&w->list); - free(w); - } -} - -static void _receive_own(struct mountgroup *mg, char *buf, int len, int from) -{ - struct gdlm_header *hd = (struct gdlm_header *) buf; - struct dlm_plock_info info; - struct resource *r; - int should_not_happen = 0; - int rv; - - memcpy(&info, buf + sizeof(struct gdlm_header), sizeof(info)); - info_bswap_in(&info); - - log_plock(mg, "receive own %llx from %u owner %u", - (unsigned long long)info.number, hd->nodeid, info.nodeid); - - rv = find_resource(mg, info.number, 1, &r); - if (rv) - return; - - if (from == our_nodeid) { - /* - * received our own own message - */ - - if (info.nodeid == 0) { - /* we are setting owner to 0 */ - - if (r->owner == our_nodeid) { - /* we set owner to 0 when we relinquish - ownership */ - should_not_happen = 1; - } else if (r->owner == 0) { - /* this happens when we relinquish ownership */ - r->flags |= R_GOT_UNOWN; - } else { - should_not_happen = 1; - } - - } else if (info.nodeid == our_nodeid) { - /* we are setting owner to ourself */ - - if (r->owner == -1) { - /* we have gained ownership */ - r->owner = our_nodeid; - add_pending_plocks(mg, r); - } else if (r->owner == our_nodeid) { - should_not_happen = 1; - } else if (r->owner == 0) { - send_pending_plocks(mg, r); - } else { - /* resource is owned by other node; - they should set owner to 0 shortly */ - } - - } else { - /* we should only ever set owner to 0 or ourself */ - should_not_happen = 1; - } - } else { - /* - * received own message from another node - */ - - if (info.nodeid == 0) { - /* other node is setting owner to 0 */ - - if (r->owner == -1) { - /* we should have a record of the owner before - it relinquishes */ - should_not_happen = 1; - } else if (r->owner == our_nodeid) { - /* only the owner should relinquish */ - should_not_happen = 1; - } else if (r->owner == 0) { - should_not_happen = 1; - } else { - r->owner = 0; - r->flags |= R_GOT_UNOWN; - send_pending_plocks(mg, r); - } - - } else if (info.nodeid == from) { - /* other node is setting owner to itself */ - - if (r->owner == -1) { - /* normal path for a node becoming owner */ - r->owner = from; - } else if (r->owner == our_nodeid) { - /* we relinquish our ownership: sync our local - plocks to everyone, then set owner to 0 */ - send_syncs(mg, r); - send_own(mg, r, 0); - /* we need to set owner to 0 here because - local ops may arrive before we receive - our send_own message and can't be added - locally */ - r->owner = 0; - } else if (r->owner == 0) { 
- /* can happen because we set owner to 0 before - we receive our send_own sent just above */ - } else { - /* do nothing, current owner should be - relinquishing its ownership */ - } - - } else if (info.nodeid == our_nodeid) { - /* no one else should try to set the owner to us */ - should_not_happen = 1; - } else { - /* a node should only ever set owner to 0 or itself */ - should_not_happen = 1; - } - } - - if (should_not_happen) { - log_error("receive_own from %u %llx info nodeid %d r owner %d", - from, (unsigned long long)r->number, info.nodeid, - r->owner); - } -} - -void receive_own(struct mountgroup *mg, char *buf, int len, int from) -{ - if (mg->save_plocks) { - save_message_old(mg, buf, len, from, MSG_PLOCK_OWN); - return; - } - - _receive_own(mg, buf, len, from); -} - -static void clear_syncing_flag(struct resource *r, struct dlm_plock_info *in) -{ - struct posix_lock *po; - struct lock_waiter *w; - - list_for_each_entry(po, &r->locks, list) { - if ((po->flags & P_SYNCING) && - in->start == po->start && - in->end == po->end && - in->nodeid == po->nodeid && - in->owner == po->owner && - in->pid == po->pid && - in->ex == po->ex) { - po->flags &= ~P_SYNCING; - return; - } - } - - list_for_each_entry(w, &r->waiters, list) { - if ((w->flags & P_SYNCING) && - in->start == w->info.start && - in->end == w->info.end && - in->nodeid == w->info.nodeid && - in->owner == w->info.owner && - in->pid == w->info.pid && - in->ex == w->info.ex) { - w->flags &= ~P_SYNCING; - return; - } - } - - log_error("clear_syncing %llx no match %s %llx-%llx %d/%u/%llx", - (unsigned long long)r->number, in->ex ? "WR" : "RD", - (unsigned long long)in->start, (unsigned long long)in->end, - in->nodeid, in->pid, (unsigned long long)in->owner); -} - -static void _receive_sync(struct mountgroup *mg, char *buf, int len, int from) -{ - struct dlm_plock_info info; - struct gdlm_header *hd = (struct gdlm_header *) buf; - struct resource *r; - int rv; - - memcpy(&info, buf + sizeof(struct gdlm_header), sizeof(info)); - info_bswap_in(&info); - - log_plock(mg, "receive sync %llx from %u %s %llx-%llx %d/%u/%llx", - (unsigned long long)info.number, from, info.ex ? 
"WR" : "RD", - (unsigned long long)info.start, (unsigned long long)info.end, - info.nodeid, info.pid, (unsigned long long)info.owner); - - rv = find_resource(mg, info.number, 0, &r); - if (rv) { - log_error("receive_sync no r %llx from %d", info.number, from); - return; - } - - if (from == our_nodeid) { - /* this plock now in sync on all nodes */ - clear_syncing_flag(r, &info); - return; - } - - if (hd->type == MSG_PLOCK_SYNC_LOCK) - add_lock(r, info.nodeid, info.owner, info.pid, info.ex, - info.start, info.end); - else if (hd->type == MSG_PLOCK_SYNC_WAITER) - add_waiter(mg, r, &info); -} - -void receive_sync(struct mountgroup *mg, char *buf, int len, int from) -{ - struct gdlm_header *hd = (struct gdlm_header *) buf; - - if (mg->save_plocks) { - save_message_old(mg, buf, len, from, hd->type); - return; - } - - _receive_sync(mg, buf, len, from); -} - -static void _receive_drop(struct mountgroup *mg, char *buf, int len, int from) -{ - struct dlm_plock_info info; - struct resource *r; - int rv; - - memcpy(&info, buf + sizeof(struct gdlm_header), sizeof(info)); - info_bswap_in(&info); - - log_plock(mg, "receive drop %llx from %u", - (unsigned long long)info.number, from); - - rv = find_resource(mg, info.number, 0, &r); - if (rv) { - /* we'll find no r if two nodes sent drop at once */ - log_debug("receive_drop from %d no r %llx", from, - (unsigned long long)info.number); - return; - } - - if (r->owner != 0) { - /* - A sent drop, B sent drop, receive drop A, C sent own, - receive drop B (this warning on C, owner -1) - - A sent drop, B sent drop, receive drop A, A sent own, - receive own A, receive drop B (this warning on all, - owner A) */ - log_debug("receive_drop from %d r %llx owner %d", from, - (unsigned long long)r->number, r->owner); - return; - } - - if (!list_empty(&r->pending)) { - /* shouldn't happen */ - log_error("receive_drop from %d r %llx pending op", from, - (unsigned long long)r->number); - return; - } - - /* the decision to drop or not must be based on things that are - guaranteed to be the same on all nodes */ - - if (list_empty(&r->locks) && list_empty(&r->waiters)) { - list_del(&r->list); - free(r); - } else { - /* A sent drop, B sent a plock, receive plock, receive drop */ - log_debug("receive_drop from %d r %llx in use", from, - (unsigned long long)r->number); - } -} - -void receive_drop(struct mountgroup *mg, char *buf, int len, int from) -{ - if (mg->save_plocks) { - save_message_old(mg, buf, len, from, MSG_PLOCK_DROP); - return; - } - - _receive_drop(mg, buf, len, from); -} - -/* We only drop resources from the unowned state to simplify things. - If we want to drop a resource we own, we unown/relinquish it first. 
*/ - -/* FIXME: in the transition from owner = us, to owner = 0, to drop; - we want the second period to be shorter than the first */ - -static int drop_resources(struct mountgroup *mg) -{ - struct resource *r; - struct timeval now; - int count = 0; - - gettimeofday(&now, NULL); - - /* try to drop the oldest, unused resources */ - - list_for_each_entry_reverse(r, &mg->plock_resources, list) { - if (count >= cfgd_drop_resources_count) - break; - if (r->owner && r->owner != our_nodeid) - continue; - if (time_diff_ms(&r->last_access, &now) < - cfgd_drop_resources_age) - continue; - - if (list_empty(&r->locks) && list_empty(&r->waiters)) { - if (r->owner == our_nodeid) { - send_own(mg, r, 0); - r->owner = 0; - } else if (r->owner == 0 && got_unown(r)) { - send_drop(mg, r); - } - - count++; - } - } - - return 0; -} - -/* iterate through directory names looking for matching id: - /sys/kernel/dlm//id */ - -#define DLM_SYSFS_DIR "/sys/kernel/dlm" - -static char ls_name[256]; - -static int get_lockspace_name(uint32_t ls_id) -{ - char path[PATH_MAX]; - DIR *d; - FILE *file; - struct dirent *de; - uint32_t id; - int rv, error; - - d = opendir(DLM_SYSFS_DIR); - if (!d) { - log_debug("%s: opendir failed: %d", path, errno); - return -1; - } - - rv = -1; - - while ((de = readdir(d))) { - if (de->d_name[0] == '.') - continue; - - id = 0; - memset(path, 0, PATH_MAX); - snprintf(path, PATH_MAX, "%s/%s/id", DLM_SYSFS_DIR, de->d_name); - - file = fopen(path, "r"); - if (!file) { - log_error("can't open %s %d", path, errno); - continue; - } - - error = fscanf(file, "%u", &id); - fclose(file); - - if (error != 1) { - log_error("bad read %s %d", path, errno); - continue; - } - if (id != ls_id) { - log_debug("get_lockspace_name skip %x %s", - id, de->d_name); - continue; - } - - log_debug("get_lockspace_name found %x %s", id, de->d_name); - strncpy(ls_name, de->d_name, 256); - rv = 0; - break; - } - - closedir(d); - return rv; -} - -/* find the locskapce with "ls_id" in sysfs, get it's name, then look for - the mg with with the same name in mounts list, return it's id */ - -static void set_associated_id(uint32_t ls_id) -{ - struct mountgroup *mg; - int rv; - - log_debug("set_associated_id ls_id %x %d", ls_id, ls_id); - - memset(&ls_name, 0, sizeof(ls_name)); - - rv = get_lockspace_name(ls_id); - if (rv) { - log_error("no lockspace found with id %x", ls_id); - return; - } - - mg = find_mg(ls_name); - if (!mg) { - log_error("no mountgroup found with name %s for ls_id %x", - ls_name, ls_id); - return; - } - - log_debug("set_associated_id ls %x is mg %x", ls_id, mg->id); - - mg->associated_ls_id = ls_id; -} - -static uint32_t ls_to_mg_id(uint32_t fsid) -{ - struct mountgroup *mg; - int do_set = 1; - - retry: - list_for_each_entry(mg, &mountgroups, list) { - if (mg->associated_ls_id == fsid) - return mg->id; - } - - if (do_set) { - do_set = 0; - set_associated_id(fsid); - goto retry; - } - - return fsid; -} - -int limit_plocks(void) -{ - struct timeval now; - - /* Don't send more messages while the cpg message queue is backed up */ - - if (libcpg_flow_control_on) { - update_flow_control_status(); - if (libcpg_flow_control_on) - return 1; - } - - if (!cfgd_plock_rate_limit || !plock_read_count) - return 0; - - gettimeofday(&now, NULL); - - /* Every time a plock op is read from the kernel, we increment - plock_read_count. After every cfgd_plock_rate_limit (N) reads, - we check the time it's taken to do those N; if the time is less than - a second, then we delay reading any more until a second is up. 
- This way we read a max of N ops from the kernel every second. */ - - if (!(plock_read_count % cfgd_plock_rate_limit)) { - if (time_diff_ms(&plock_rate_last, &now) < 1000) { - plock_rate_delays++; - return 2; - } - plock_rate_last = now; - plock_read_count++; - } - return 0; -} - -void process_plocks(int ci) -{ - struct mountgroup *mg; - struct resource *r; - struct dlm_plock_info info; - struct timeval now; - uint64_t usec; - int rv; - - if (limit_plocks()) { - poll_ignore_plock = 1; - client_ignore(plock_ci, plock_fd); - return; - } - - gettimeofday(&now, NULL); - - memset(&info, 0, sizeof(info)); - - rv = do_read(plock_device_fd, &info, sizeof(info)); - if (rv < 0) { - log_debug("process_plocks: read error %d fd %d\n", - errno, plock_device_fd); - return; - } - - /* kernel doesn't set the nodeid field */ - info.nodeid = our_nodeid; - - if (!cfgd_enable_plock) { - rv = -ENOSYS; - goto fail; - } - - if (need_fsid_translation) - info.fsid = ls_to_mg_id(info.fsid); - - mg = find_mg_id(info.fsid); - if (!mg) { - log_debug("process_plocks: no mg id %x", info.fsid); - rv = -EEXIST; - goto fail; - } - - log_plock(mg, "read plock %llx %s %s %llx-%llx %d/%u/%llx w %d", - (unsigned long long)info.number, - op_str(info.optype), - ex_str(info.optype, info.ex), - (unsigned long long)info.start, (unsigned long long)info.end, - info.nodeid, info.pid, (unsigned long long)info.owner, - info.wait); - - /* report plock rate and any delays since the last report */ - plock_read_count++; - if (!(plock_read_count % 1000)) { - usec = dt_usec(&plock_read_time, &now) ; - log_group(mg, "plock_read_count %u time %.3f s delays %u", - plock_read_count, usec * 1.e-6, plock_rate_delays); - plock_read_time = now; - plock_rate_delays = 0; - } - - rv = find_resource(mg, info.number, 1, &r); - if (rv) - goto fail; - - if (r->owner == 0) { - /* plock state replicated on all nodes */ - send_plock(mg, r, &info); - - } else if (r->owner == our_nodeid) { - /* we are the owner of r, so our plocks are local */ - __receive_plock(mg, &info, our_nodeid, r); - - } else { - /* r owner is -1: r is new, try to become the owner; - r owner > 0: tell other owner to give up ownership; - both done with a message trying to set owner to ourself */ - send_own(mg, r, our_nodeid); - save_pending_plock(mg, r, &info); - } - - if (cfgd_plock_ownership && - time_diff_ms(&mg->drop_resources_last, &now) >= - cfgd_drop_resources_time) { - mg->drop_resources_last = now; - drop_resources(mg); - } - - return; - - fail: - info.rv = rv; - rv = write(plock_device_fd, &info, sizeof(info)); - - return; -} - -void process_saved_plocks(struct mountgroup *mg) -{ - struct save_msg *sm, *sm2; - - if (list_empty(&mg->saved_messages)) - return; - - log_group(mg, "process_saved_plocks"); - - list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) { - switch (sm->type) { - case MSG_PLOCK: - _receive_plock(mg, sm->buf, sm->len, sm->nodeid); - break; - case MSG_PLOCK_OWN: - _receive_own(mg, sm->buf, sm->len, sm->nodeid); - break; - case MSG_PLOCK_DROP: - _receive_drop(mg, sm->buf, sm->len, sm->nodeid); - break; - case MSG_PLOCK_SYNC_LOCK: - case MSG_PLOCK_SYNC_WAITER: - _receive_sync(mg, sm->buf, sm->len, sm->nodeid); - break; - default: - continue; - } - - list_del(&sm->list); - free(sm); - } -} - -void plock_exit(void) -{ - if (cfgd_enable_plock) - saCkptFinalize(ckpt_handle); -} - -/* locks still marked SYNCING should not go into the ckpt; the new node - will get those locks by receiving PLOCK_SYNC messages */ - -static void pack_section_buf(struct mountgroup 
*mg, struct resource *r) -{ - struct pack_plock *pp; - struct posix_lock *po; - struct lock_waiter *w; - int count = 0; - - /* plocks on owned resources are not replicated on other nodes */ - if (r->owner == our_nodeid) - return; - - pp = (struct pack_plock *) &section_buf; - - list_for_each_entry(po, &r->locks, list) { - if (po->flags & P_SYNCING) - continue; - pp->start = cpu_to_le64(po->start); - pp->end = cpu_to_le64(po->end); - pp->owner = cpu_to_le64(po->owner); - pp->pid = cpu_to_le32(po->pid); - pp->nodeid = cpu_to_le32(po->nodeid); - pp->ex = po->ex; - pp->waiter = 0; - pp++; - count++; - } - - list_for_each_entry(w, &r->waiters, list) { - if (w->flags & P_SYNCING) - continue; - pp->start = cpu_to_le64(w->info.start); - pp->end = cpu_to_le64(w->info.end); - pp->owner = cpu_to_le64(w->info.owner); - pp->pid = cpu_to_le32(w->info.pid); - pp->nodeid = cpu_to_le32(w->info.nodeid); - pp->ex = w->info.ex; - pp->waiter = 1; - pp++; - count++; - } - - section_len = count * sizeof(struct pack_plock); -} - -static int unpack_section_buf(struct mountgroup *mg, char *numbuf, int buflen) -{ - struct pack_plock *pp; - struct posix_lock *po; - struct lock_waiter *w; - struct resource *r; - int count = section_len / sizeof(struct pack_plock); - int i, owner = 0; - unsigned long long num; - struct timeval now; - - gettimeofday(&now, NULL); - - r = malloc(sizeof(struct resource)); - if (!r) - return -ENOMEM; - memset(r, 0, sizeof(struct resource)); - INIT_LIST_HEAD(&r->locks); - INIT_LIST_HEAD(&r->waiters); - INIT_LIST_HEAD(&r->pending); - - if (cfgd_plock_ownership) - sscanf(numbuf, "r%llu.%d", &num, &owner); - else - sscanf(numbuf, "r%llu", &num); - - r->number = num; - r->owner = owner; - r->last_access = now; - - pp = (struct pack_plock *) &section_buf; - - for (i = 0; i < count; i++) { - if (!pp->waiter) { - po = malloc(sizeof(struct posix_lock)); - // FIXME: handle failed malloc - po->start = le64_to_cpu(pp->start); - po->end = le64_to_cpu(pp->end); - po->owner = le64_to_cpu(pp->owner); - po->pid = le32_to_cpu(pp->pid); - po->nodeid = le32_to_cpu(pp->nodeid); - po->ex = pp->ex; - list_add_tail(&po->list, &r->locks); - } else { - w = malloc(sizeof(struct lock_waiter)); - // FIXME: handle failed malloc - w->info.start = le64_to_cpu(pp->start); - w->info.end = le64_to_cpu(pp->end); - w->info.owner = le64_to_cpu(pp->owner); - w->info.pid = le32_to_cpu(pp->pid); - w->info.nodeid = le32_to_cpu(pp->nodeid); - w->info.ex = pp->ex; - list_add_tail(&w->list, &r->waiters); - } - pp++; - } - - list_add_tail(&r->list, &mg->plock_resources); - return 0; -} - -static int _unlink_checkpoint(struct mountgroup *mg, SaNameT *name) -{ - SaCkptCheckpointHandleT h; - SaCkptCheckpointDescriptorT s; - SaAisErrorT rv; - int ret = 0; - - h = (SaCkptCheckpointHandleT) mg->cp_handle; - log_group(mg, "unlink ckpt %llx", (unsigned long long)h); - - unlink_retry: - rv = saCkptCheckpointUnlink(ckpt_handle, name); - if (rv == SA_AIS_ERR_TRY_AGAIN) { - log_group(mg, "unlink ckpt retry"); - sleep(1); - goto unlink_retry; - } - if (rv == SA_AIS_OK) - goto out_close; - - log_error("unlink ckpt error %d %s", rv, mg->name); - ret = -1; - - status_retry: - rv = saCkptCheckpointStatusGet(h, &s); - if (rv == SA_AIS_ERR_TRY_AGAIN) { - log_group(mg, "unlink ckpt status retry"); - sleep(1); - goto status_retry; - } - if (rv != SA_AIS_OK) { - log_error("unlink ckpt status error %d %s", rv, mg->name); - goto out_close; - } - - log_group(mg, "unlink ckpt status: size %llu, max sections %u, " - "max section size %llu, section count %u, mem %u", - 
(unsigned long long)s.checkpointCreationAttributes.checkpointSize, - s.checkpointCreationAttributes.maxSections, - (unsigned long long)s.checkpointCreationAttributes.maxSectionSize, - s.numberOfSections, s.memoryUsed); - - out_close: - if (!h) - goto out; - - rv = saCkptCheckpointClose(h); - if (rv == SA_AIS_ERR_TRY_AGAIN) { - log_group(mg, "unlink ckpt close retry"); - sleep(1); - goto out_close; - } - if (rv != SA_AIS_OK) { - log_error("unlink ckpt %llx close err %d %s", - (unsigned long long)h, rv, mg->name); - /* should we return an error here and possibly cause - store_plocks() to fail on this? */ - /* ret = -1; */ - } - out: - mg->cp_handle = 0; - return ret; -} - -int unlink_checkpoint(struct mountgroup *mg) -{ - SaNameT name; - int len; - - len = snprintf((char *)name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s", - mg->name); - name.length = len; - return _unlink_checkpoint(mg, &name); -} - -/* - * section id is r., the maximum string length is: - * "r" prefix = 1 strlen("r") - * max uint64 = 20 strlen("18446744073709551615") - * "." before owner = 1 strlen(".") - * max int = 11 strlen("-2147483647") - * \0 at end = 1 - * --------------------- - * 34 SECTION_NAME_LEN - */ - -#define SECTION_NAME_LEN 34 - -/* Copy all plock state into a checkpoint so new node can retrieve it. The - node creating the ckpt for the mounter needs to be the same node that's - sending the mounter its journals message (i.e. the low nodeid). The new - mounter knows the ckpt is ready to read only after it gets its journals - message. - - If the mounter is becoming the new low nodeid in the group, the node doing - the store closes the ckpt and the new node unlinks the ckpt after reading - it. The ckpt should then disappear and the new node can create a new ckpt - for the next mounter. 
*/ - -void store_plocks(struct mountgroup *mg, int nodeid) -{ - SaCkptCheckpointCreationAttributesT attr; - SaCkptCheckpointHandleT h; - SaCkptSectionIdT section_id; - SaCkptSectionCreationAttributesT section_attr; - SaCkptCheckpointOpenFlagsT flags; - SaNameT name; - SaAisErrorT rv; - char buf[SECTION_NAME_LEN]; - struct resource *r; - struct posix_lock *po; - struct lock_waiter *w; - int r_count, lock_count, total_size, section_size, max_section_size; - int len, owner; - - if (!cfgd_enable_plock) - return; - - /* no change to plock state since we created the last checkpoint */ - if (mg->last_checkpoint_time > mg->last_plock_time) { - log_group(mg, "store_plocks: saved ckpt uptodate"); - goto out; - } - mg->last_checkpoint_time = time(NULL); - - len = snprintf((char *)name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s", - mg->name); - name.length = len; - - /* unlink an old checkpoint before we create a new one */ - if (mg->cp_handle) { - if (_unlink_checkpoint(mg, &name)) - return; - } - - /* loop through all plocks to figure out sizes to set in - the attr fields */ - - r_count = 0; - lock_count = 0; - total_size = 0; - max_section_size = 0; - - list_for_each_entry(r, &mg->plock_resources, list) { - if (r->owner == -1) - continue; - - r_count++; - section_size = 0; - list_for_each_entry(po, &r->locks, list) { - section_size += sizeof(struct pack_plock); - lock_count++; - } - list_for_each_entry(w, &r->waiters, list) { - section_size += sizeof(struct pack_plock); - lock_count++; - } - total_size += section_size; - if (section_size > max_section_size) - max_section_size = section_size; - } - - log_group(mg, "store_plocks: r_count %d, lock_count %d, pp %u bytes", - r_count, lock_count, (unsigned int)sizeof(struct pack_plock)); - - log_group(mg, "store_plocks: total %d bytes, max_section %d bytes", - total_size, max_section_size); - - attr.creationFlags = SA_CKPT_WR_ALL_REPLICAS; - attr.checkpointSize = total_size; - attr.retentionDuration = SA_TIME_MAX; - attr.maxSections = r_count + 1; /* don't know why we need +1 */ - attr.maxSectionSize = max_section_size; - attr.maxSectionIdSize = SECTION_NAME_LEN; - - flags = SA_CKPT_CHECKPOINT_READ | - SA_CKPT_CHECKPOINT_WRITE | - SA_CKPT_CHECKPOINT_CREATE; - - open_retry: - rv = saCkptCheckpointOpen(ckpt_handle, &name, &attr, flags, 0, &h); - if (rv == SA_AIS_ERR_TRY_AGAIN) { - log_group(mg, "store_plocks: ckpt open retry"); - sleep(1); - goto open_retry; - } - if (rv == SA_AIS_ERR_EXIST) { - log_group(mg, "store_plocks: ckpt already exists"); - return; - } - if (rv != SA_AIS_OK) { - log_error("store_plocks: ckpt open error %d %s", rv, mg->name); - return; - } - - log_group(mg, "store_plocks: open ckpt handle %llx", - (unsigned long long)h); - mg->cp_handle = (uint64_t) h; - - /* - If r owner is -1, ckpt nothing. - - If r owner is us, ckpt owner of us and no plocks. - - If r owner is other, ckpt that owner and any plocks we have on r - (they've just been synced but owner=0 msg not recved yet). - - If r owner is 0 and !got_unown, then we've just unowned r; - ckpt owner of us and any plocks that don't have SYNCING set - (plocks with SYNCING will be handled by our sync messages). 
- - If r owner is 0 and got_unown, then ckpt owner 0 and all plocks; - (there should be no SYNCING plocks) */ - - list_for_each_entry(r, &mg->plock_resources, list) { - if (r->owner == -1) - continue; - else if (r->owner == our_nodeid) - owner = our_nodeid; - else if (r->owner) - owner = r->owner; - else if (!r->owner && !got_unown(r)) - owner = our_nodeid; - else if (!r->owner) - owner = 0; - else { - log_error("store_plocks owner %d r %llx", r->owner, - (unsigned long long)r->number); - continue; - } - - memset(&buf, 0, sizeof(buf)); - if (cfgd_plock_ownership) - len = snprintf(buf, SECTION_NAME_LEN, "r%llu.%d", - (unsigned long long)r->number, owner); - else - len = snprintf(buf, SECTION_NAME_LEN, "r%llu", - (unsigned long long)r->number); - - section_id.id = (void *)buf; - section_id.idLen = len + 1; - section_attr.sectionId = &section_id; - section_attr.expirationTime = SA_TIME_END; - - memset(&section_buf, 0, sizeof(section_buf)); - section_len = 0; - - pack_section_buf(mg, r); - - log_group(mg, "store_plocks: section size %u id %u \"%s\"", - section_len, section_id.idLen, buf); - - create_retry: - rv = saCkptSectionCreate(h, &section_attr, &section_buf, - section_len); - if (rv == SA_AIS_ERR_TRY_AGAIN) { - log_group(mg, "store_plocks: ckpt create retry"); - sleep(1); - goto create_retry; - } - if (rv == SA_AIS_ERR_EXIST) { - /* this shouldn't happen in general */ - log_group(mg, "store_plocks: clearing old ckpt"); - saCkptCheckpointClose(h); - _unlink_checkpoint(mg, &name); - goto open_retry; - } - if (rv != SA_AIS_OK) { - log_error("store_plocks: ckpt section create err %d %s", - rv, mg->name); - break; - } - } - - out: - /* If the new nodeid is becoming the low nodeid it will now be in - charge of creating ckpt's for mounters instead of us. */ - - if (nodeid < our_nodeid) { - log_group(mg, "store_plocks: closing ckpt for new low node %d", - nodeid); - saCkptCheckpointClose(h); - mg->cp_handle = 0; - } -} - -/* called by a node that's just been added to the group to get existing plock - state */ - -void retrieve_plocks(struct mountgroup *mg) -{ - SaCkptCheckpointHandleT h; - SaCkptSectionIterationHandleT itr; - SaCkptSectionDescriptorT desc; - SaCkptIOVectorElementT iov; - SaNameT name; - SaAisErrorT rv; - char buf[SECTION_NAME_LEN]; - int len; - - if (!cfgd_enable_plock) - return; - - log_group(mg, "retrieve_plocks"); - - len = snprintf((char *)name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s", - mg->name); - name.length = len; - - open_retry: - rv = saCkptCheckpointOpen(ckpt_handle, &name, NULL, - SA_CKPT_CHECKPOINT_READ, 0, &h); - if (rv == SA_AIS_ERR_TRY_AGAIN) { - log_group(mg, "retrieve_plocks: ckpt open retry"); - sleep(1); - goto open_retry; - } - if (rv != SA_AIS_OK) { - log_error("retrieve_plocks: ckpt open error %d %s", - rv, mg->name); - return; - } - - init_retry: - rv = saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, 0, &itr); - if (rv == SA_AIS_ERR_TRY_AGAIN) { - log_group(mg, "retrieve_plocks: ckpt iterinit retry"); - sleep(1); - goto init_retry; - } - if (rv != SA_AIS_OK) { - log_error("retrieve_plocks: ckpt iterinit error %d %s", - rv, mg->name); - goto out; - } - - while (1) { - next_retry: - rv = saCkptSectionIterationNext(itr, &desc); - if (rv == SA_AIS_ERR_NO_SECTIONS) - break; - if (rv == SA_AIS_ERR_TRY_AGAIN) { - log_group(mg, "retrieve_plocks: ckpt iternext retry"); - sleep(1); - goto next_retry; - } - if (rv != SA_AIS_OK) { - log_error("retrieve_plocks: ckpt iternext error %d %s", - rv, mg->name); - goto out_it; - } - - if (!desc.sectionId.idLen) - continue; - - 
iov.sectionId = desc.sectionId; - iov.dataBuffer = &section_buf; - iov.dataSize = desc.sectionSize; - iov.dataOffset = 0; - - /* for debug print */ - memset(&buf, 0, sizeof(buf)); - snprintf(buf, SECTION_NAME_LEN, "%s", desc.sectionId.id); - - log_group(mg, "retrieve_plocks: section size %llu id %u \"%s\"", - (unsigned long long)iov.dataSize, iov.sectionId.idLen, - buf); - - read_retry: - rv = saCkptCheckpointRead(h, &iov, 1, NULL); - if (rv == SA_AIS_ERR_TRY_AGAIN) { - log_group(mg, "retrieve_plocks: ckpt read retry"); - sleep(1); - goto read_retry; - } - if (rv != SA_AIS_OK) { - log_error("retrieve_plocks: ckpt read error %d %s", - rv, mg->name); - goto out_it; - } - - /* we'll get empty (zero length) sections for resources with - no locks, which exist in ownership mode; the resource - name and owner come from the section id */ - - log_group(mg, "retrieve_plocks: ckpt read %llu bytes", - (unsigned long long)iov.readSize); - section_len = iov.readSize; - - if (section_len % sizeof(struct pack_plock)) { - log_error("retrieve_plocks: bad section len %d %s", - section_len, mg->name); - continue; - } - - unpack_section_buf(mg, (char *)desc.sectionId.id, - desc.sectionId.idLen); - } - - out_it: - saCkptSectionIterationFinalize(itr); - out: - if (mg->low_nodeid == our_nodeid) { - /* we're the new low nodeid, will be master */ - log_group(mg, "retrieve_plocks: unlink ckpt from old master"); - mg->cp_handle = (uint64_t) h; - _unlink_checkpoint(mg, &name); - } else - saCkptCheckpointClose(h); -} - -/* Called when a node has failed, or we're unmounting. For a node failure, we - need to call this when the cpg confchg arrives so that we're guaranteed all - nodes do this in the same sequence wrt other messages. */ - -void purge_plocks(struct mountgroup *mg, int nodeid, int unmount) -{ - struct posix_lock *po, *po2; - struct lock_waiter *w, *w2; - struct resource *r, *r2; - int purged = 0; - - if (!cfgd_enable_plock) - return; - - list_for_each_entry_safe(r, r2, &mg->plock_resources, list) { - list_for_each_entry_safe(po, po2, &r->locks, list) { - if (po->nodeid == nodeid || unmount) { - list_del(&po->list); - free(po); - purged++; - } - } - - list_for_each_entry_safe(w, w2, &r->waiters, list) { - if (w->info.nodeid == nodeid || unmount) { - list_del(&w->list); - free(w); - purged++; - } - } - - /* TODO: haven't thought carefully about how this transition - to owner 0 might interact with other owner messages in - progress. 
*/ - - if (r->owner == nodeid) { - r->owner = 0; - send_pending_plocks(mg, r); - } - - if (!list_empty(&r->waiters)) - do_waiters(mg, r); - - if (!cfgd_plock_ownership && - list_empty(&r->locks) && list_empty(&r->waiters)) { - list_del(&r->list); - free(r); - } - } - - if (purged) - mg->last_plock_time = time(NULL); - - log_group(mg, "purged %d plocks for %d", purged, nodeid); - - /* we may have a saved ckpt that we created for the last mounter, - we need to unlink it so another node can create a new ckpt for - the next mounter after we leave */ - - if (unmount && mg->cp_handle) - unlink_checkpoint(mg); -} - -int fill_plock_dump_buf(struct mountgroup *mg) -{ - struct posix_lock *po; - struct lock_waiter *w; - struct resource *r; - struct timeval now; - int rv = 0; - int len = GFSC_DUMP_SIZE, pos = 0, ret; - - memset(plock_dump_buf, 0, sizeof(plock_dump_buf)); - plock_dump_len = 0; - - gettimeofday(&now, NULL); - - list_for_each_entry(r, &mg->plock_resources, list) { - - if (list_empty(&r->locks) && - list_empty(&r->waiters) && - list_empty(&r->pending)) { - ret = snprintf(plock_dump_buf + pos, len - pos, - "%llu rown %d unused_ms %llu\n", - (unsigned long long)r->number, r->owner, - (unsigned long long)time_diff_ms(&r->last_access, - &now)); - if (ret >= len - pos) { - rv = -ENOSPC; - goto out; - } - pos += ret; - continue; - } - - list_for_each_entry(po, &r->locks, list) { - ret = snprintf(plock_dump_buf + pos, len - pos, - "%llu %s %llu-%llu nodeid %d pid %u owner %llx rown %d\n", - (unsigned long long)r->number, - po->ex ? "WR" : "RD", - (unsigned long long)po->start, - (unsigned long long)po->end, - po->nodeid, po->pid, - (unsigned long long)po->owner, r->owner); - - if (ret >= len - pos) { - rv = -ENOSPC; - goto out; - } - pos += ret; - } - - list_for_each_entry(w, &r->waiters, list) { - ret = snprintf(plock_dump_buf + pos, len - pos, - "%llu %s %llu-%llu nodeid %d pid %u owner %llx rown %d WAITING\n", - (unsigned long long)r->number, - w->info.ex ? "WR" : "RD", - (unsigned long long)w->info.start, - (unsigned long long)w->info.end, - w->info.nodeid, w->info.pid, - (unsigned long long)w->info.owner, r->owner); - - if (ret >= len - pos) { - rv = -ENOSPC; - goto out; - } - pos += ret; - } - - list_for_each_entry(w, &r->pending, list) { - ret = snprintf(plock_dump_buf + pos, len - pos, - "%llu %s %llu-%llu nodeid %d pid %u owner %llx rown %d PENDING\n", - (unsigned long long)r->number, - w->info.ex ? "WR" : "RD", - (unsigned long long)w->info.start, - (unsigned long long)w->info.end, - w->info.nodeid, w->info.pid, - (unsigned long long)w->info.owner, r->owner); - - if (ret >= len - pos) { - rv = -ENOSPC; - goto out; - } - pos += ret; - } - } - out: - plock_dump_len = pos; - return rv; -} - -static void find_minors(void) -{ - FILE *fl; - char name[256]; - uint32_t number; - int found = 0; - int c; - - plock_minor = 0; - old_plock_minor = 0; - - if (!(fl = fopen("/proc/misc", "r"))) { - log_error("/proc/misc fopen failed: %s", strerror(errno)); - return; - } - - while (!feof(fl)) { - if (fscanf(fl, "%d %255s\n", &number, &name[0]) == 2) { - - if (!strcmp(name, "dlm_plock")) { - plock_minor = number; - found++; - } else if (!strcmp(name, "lock_dlm_plock")) { - old_plock_minor = number; - found++; - } - - } else do { - c = fgetc(fl); - } while (c != EOF && c != '\n'); - - if (found == 3) - break; - } - fclose(fl); - - if (!found) - log_error("Is lock_dlm or dlm missing from kernel? 
No misc devices found."); -} - -static int find_udev_device(char *path, uint32_t minor) -{ - struct stat st; - int i; - - for (i = 0; i < 10; i++) { - if (stat(path, &st) == 0 && minor(st.st_rdev) == minor) - return 0; - sleep(1); - } - - log_error("cannot find device %s with minor %d", path, minor); - return -1; -} - -int setup_misc_devices(void) -{ - int rv; - - find_minors(); - - if (plock_minor) { - rv = find_udev_device("/dev/misc/dlm_plock", plock_minor); - if (rv < 0) - return rv; - log_debug("found /dev/misc/dlm_plock minor %u", - plock_minor); - } - - if (!plock_minor && old_plock_minor) { - rv = find_udev_device("/dev/misc/lock_dlm_plock", - old_plock_minor); - if (rv < 0) - return rv; - log_debug("found /dev/misc/lock_dlm_plock minor %u", - old_plock_minor); - } - - return 0; -} - diff --git a/group/gfs_controld/util.c b/group/gfs_controld/util.c index a0650fe..51d274f 100644 --- a/group/gfs_controld/util.c +++ b/group/gfs_controld/util.c @@ -64,8 +64,6 @@ int set_sysfs(struct mountgroup *mg, char *field, int val) return -1; } - mg->got_kernel_mount = 1; - memset(out, 0, sizeof(out)); sprintf(out, "%d", val); @@ -92,8 +90,6 @@ static int get_sysfs(struct mountgroup *mg, char *field, char *buf, int len) return -1; } - mg->got_kernel_mount = 1; - rv = read(fd, buf, len); if (rv < 0) log_error("read %s error %d %d", fname, rv, errno); @@ -166,10 +162,7 @@ static void dmsetup_suspend_done(struct mountgroup *mg, int rv) if (!rv) { mg->withdraw_suspend = 1; - if (mg->old_group_mode) - send_withdraw_old(mg); - else - send_withdraw(mg); + send_withdraw(mg); } }
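
The removed limit_plocks() code above describes a simple read rate limit: every plock op read from the kernel bumps a counter, and after every cfgd_plock_rate_limit reads the elapsed time for that batch is checked; if it took less than a second, reading is deferred until the second is up, so at most N ops per second are consumed. The following is a minimal standalone sketch of that scheme under stated assumptions: RATE_LIMIT, read_count, rate_last and the simulated read loop are hypothetical stand-ins for the daemon's cfgd_plock_rate_limit, plock_read_count, plock_rate_last and kernel device reads; it illustrates the idea and is not the daemon's code.

/* rate_limit_sketch.c: illustrate N-reads-per-second limiting */
#include <stdio.h>
#include <sys/time.h>

#define RATE_LIMIT 100		/* hypothetical N ops allowed per second */

static unsigned int read_count;
static struct timeval rate_last;

static unsigned long long time_diff_ms(struct timeval *begin,
				       struct timeval *end)
{
	return (end->tv_sec - begin->tv_sec) * 1000ULL +
	       (end->tv_usec - begin->tv_usec) / 1000;
}

/* returns nonzero if the caller should stop reading ops for now */
static int limit_reads(void)
{
	struct timeval now;

	if (!RATE_LIMIT || !read_count)
		return 0;

	gettimeofday(&now, NULL);

	/* after every RATE_LIMIT reads, check how long that batch took;
	   if it was under a second, back off until the second is up */
	if (!(read_count % RATE_LIMIT)) {
		if (time_diff_ms(&rate_last, &now) < 1000)
			return 1;
		rate_last = now;
		read_count++;	/* step off the batch boundary so the check
				   is redone only after the next batch */
	}
	return 0;
}

int main(void)
{
	int i, delayed = 0;

	gettimeofday(&rate_last, NULL);

	for (i = 0; i < 1000; i++) {
		if (limit_reads()) {
			delayed++;
			continue;	/* a daemon would poll again later */
		}
		read_count++;		/* pretend we read one op */
	}

	printf("reads %u delayed %d\n", read_count, delayed);
	return 0;
}

Because the simulated loop runs well inside one second, only the first RATE_LIMIT reads are accepted and the rest are counted as delayed, which is the behavior the original comment describes.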