public inbox for cluster-cvs@sourceware.org
help / color / mirror / Atom feed
* gfs2-utils: master - gfs_controld: remove groupd compat
@ 2009-01-09 20:05 David Teigland
0 siblings, 0 replies; only message in thread
From: David Teigland @ 2009-01-09 20:05 UTC (permalink / raw)
To: cluster-cvs-relay
Gitweb: http://git.fedorahosted.org/git/gfs2-utils.git?p=gfs2-utils.git;a=commitdiff;h=8c4b2d6c09b5547fb3f556dc19ccf0e9ffe760f0
Commit: 8c4b2d6c09b5547fb3f556dc19ccf0e9ffe760f0
Parent: 7c2220b33a5bdb50727717d2d4c4475a49a8de6e
Author: David Teigland <teigland@redhat.com>
AuthorDate: Fri Jan 9 14:24:08 2009 -0600
Committer: David Teigland <teigland@redhat.com>
CommitterDate: Fri Jan 9 14:24:08 2009 -0600
gfs_controld: remove groupd compat
Signed-off-by: David Teigland <teigland@redhat.com>
---
group/gfs_controld/Makefile | 5 -
group/gfs_controld/config.c | 72 +--
group/gfs_controld/config.h | 24 -
group/gfs_controld/cpg-old.c | 2442 ---------------------------------------
group/gfs_controld/cpg-old.h | 73 --
group/gfs_controld/gfs_daemon.h | 101 --
group/gfs_controld/group.c | 360 ------
group/gfs_controld/main.c | 257 +----
group/gfs_controld/plock.c | 2361 -------------------------------------
group/gfs_controld/util.c | 9 +-
10 files changed, 25 insertions(+), 5679 deletions(-)
diff --git a/group/gfs_controld/Makefile b/group/gfs_controld/Makefile
index e194610..0bfe9d5 100644
--- a/group/gfs_controld/Makefile
+++ b/group/gfs_controld/Makefile
@@ -15,10 +15,7 @@ OBJS= main.o \
config.o \
crc.o \
cpg-new.o \
- cpg-old.o \
- group.o \
util.o \
- plock.o \
logging.o
CFLAGS += -I${ccsincdir} -I${cmanincdir} -I${logtincdir} -I${dlmcontrolincdir}
@@ -34,10 +31,8 @@ LDFLAGS += -L${logtlibdir} -llogthread
LDFLAGS += -L${corosynclibdir} -lcpg -lpthread
LDFLAGS += -L${openaislibdir} -lSaCkpt
LDFLAGS += -L${fencedlibdir} -lfenced
-LDFLAGS += -L${grouplibdir} -lgroup
LDFLAGS += -L${libdir}
-LDDEPS += ${grouplibdir}/libgroup.a
LDDEPS += ${fencedlibdir}/libfenced.a
${TARGET}: ${OBJS} ${LDDEPS}
diff --git a/group/gfs_controld/config.c b/group/gfs_controld/config.c
index b6e2ebe..896e063 100644
--- a/group/gfs_controld/config.c
+++ b/group/gfs_controld/config.c
@@ -29,29 +29,13 @@ int ccs_handle;
/* was a config value set on command line?, 0 or 1. */
-int optd_groupd_compat;
int optd_debug_logfile;
int optd_enable_withdraw;
-int optd_enable_plock;
-int optd_plock_debug;
-int optd_plock_rate_limit;
-int optd_plock_ownership;
-int optd_drop_resources_time;
-int optd_drop_resources_count;
-int optd_drop_resources_age;
/* actual config value from command line, cluster.conf, or default. */
-int cfgd_groupd_compat = DEFAULT_GROUPD_COMPAT;
int cfgd_debug_logfile = DEFAULT_DEBUG_LOGFILE;
int cfgd_enable_withdraw = DEFAULT_ENABLE_WITHDRAW;
-int cfgd_enable_plock = DEFAULT_ENABLE_PLOCK;
-int cfgd_plock_debug = DEFAULT_PLOCK_DEBUG;
-int cfgd_plock_rate_limit = DEFAULT_PLOCK_RATE_LIMIT;
-int cfgd_plock_ownership = DEFAULT_PLOCK_OWNERSHIP;
-int cfgd_drop_resources_time = DEFAULT_DROP_RESOURCES_TIME;
-int cfgd_drop_resources_count = DEFAULT_DROP_RESOURCES_COUNT;
-int cfgd_drop_resources_age = DEFAULT_DROP_RESOURCES_AGE;
void read_ccs_name(char *path, char *name)
{
@@ -140,28 +124,14 @@ void read_ccs_nodir(struct mountgroup *mg, char *buf)
free(str);
}
-#define GROUPD_COMPAT_PATH "/cluster/group/@groupd_compat"
#define ENABLE_WITHDRAW_PATH "/cluster/gfs_controld/@enable_withdraw"
-#define ENABLE_PLOCK_PATH "/cluster/gfs_controld/@enable_plock"
-#define PLOCK_DEBUG_PATH "/cluster/gfs_controld/@plock_debug"
-#define PLOCK_RATE_LIMIT_PATH "/cluster/gfs_controld/@plock_rate_limit"
-#define PLOCK_OWNERSHIP_PATH "/cluster/gfs_controld/@plock_ownership"
-#define DROP_RESOURCES_TIME_PATH "/cluster/gfs_controld/@drop_resources_time"
-#define DROP_RESOURCES_COUNT_PATH "/cluster/gfs_controld/@drop_resources_count"
-#define DROP_RESOURCES_AGE_PATH "/cluster/gfs_controld/@drop_resources_age"
-
-#define DLM_PLOCK_RATE_LIMIT_PATH "/cluster/dlm/@plock_rate_limit"
-#define DLM_PLOCK_OWNERSHIP_PATH "/cluster/dlm/@plock_ownership"
-#define DLM_DROP_RESOURCES_TIME_PATH "/cluster/dlm/@drop_resources_time"
-#define DLM_DROP_RESOURCES_COUNT_PATH "/cluster/dlm/@drop_resources_count"
-#define DLM_DROP_RESOURCES_AGE_PATH "/cluster/dlm/@drop_resources_age"
int setup_ccs(void)
{
- int cd, rv;
+ int cd;
if (ccs_handle)
- goto update;
+ return 0;
cd = ccs_connect();
if (cd < 0) {
@@ -170,46 +140,8 @@ int setup_ccs(void)
}
ccs_handle = cd;
- /* These config values are set from cluster.conf only if they haven't
- already been set on the command line. */
-
- if (!optd_groupd_compat)
- read_ccs_int(GROUPD_COMPAT_PATH, &cfgd_groupd_compat);
if (!optd_enable_withdraw)
read_ccs_int(ENABLE_WITHDRAW_PATH, &cfgd_enable_withdraw);
- if (!optd_enable_plock)
- read_ccs_int(ENABLE_PLOCK_PATH, &cfgd_enable_plock);
- if (!optd_plock_ownership) {
- rv = read_ccs_int(PLOCK_OWNERSHIP_PATH, &cfgd_plock_ownership);
- if (rv < 0)
- read_ccs_int(DLM_PLOCK_OWNERSHIP_PATH, &cfgd_plock_ownership);
- }
-
- /* The following can be changed while running */
- update:
- if (!optd_plock_debug) {
- read_ccs_int(PLOCK_DEBUG_PATH, &cfgd_plock_debug);
- }
- if (!optd_plock_rate_limit) {
- rv = read_ccs_int(PLOCK_RATE_LIMIT_PATH, &cfgd_plock_rate_limit);
- if (rv < 0)
- read_ccs_int(DLM_PLOCK_RATE_LIMIT_PATH, &cfgd_plock_rate_limit);
- }
- if (!optd_drop_resources_time) {
- rv = read_ccs_int(DROP_RESOURCES_TIME_PATH, &cfgd_drop_resources_time);
- if (rv < 0)
- read_ccs_int(DLM_DROP_RESOURCES_TIME_PATH, &cfgd_drop_resources_time);
- }
- if (!optd_drop_resources_count) {
- rv = read_ccs_int(DROP_RESOURCES_COUNT_PATH, &cfgd_drop_resources_count);
- if (rv < 0)
- read_ccs_int(DLM_DROP_RESOURCES_COUNT_PATH, &cfgd_drop_resources_count);
- }
- if (!optd_drop_resources_age) {
- rv = read_ccs_int(DROP_RESOURCES_AGE_PATH, &cfgd_drop_resources_age);
- if (rv < 0)
- read_ccs_int(DLM_DROP_RESOURCES_AGE_PATH, &cfgd_drop_resources_age);
- }
return 0;
}
diff --git a/group/gfs_controld/config.h b/group/gfs_controld/config.h
index ee0b3b6..bc4788d 100644
--- a/group/gfs_controld/config.h
+++ b/group/gfs_controld/config.h
@@ -1,38 +1,14 @@
#ifndef __CONFIG_DOT_H__
#define __CONFIG_DOT_H__
-#define DEFAULT_GROUPD_COMPAT 2
#define DEFAULT_DEBUG_LOGFILE 0
#define DEFAULT_ENABLE_WITHDRAW 1
-#define DEFAULT_ENABLE_PLOCK 1
-#define DEFAULT_PLOCK_DEBUG 0
-#define DEFAULT_PLOCK_RATE_LIMIT 100
-#define DEFAULT_PLOCK_OWNERSHIP 1
-#define DEFAULT_DROP_RESOURCES_TIME 10000 /* 10 sec */
-#define DEFAULT_DROP_RESOURCES_COUNT 10
-#define DEFAULT_DROP_RESOURCES_AGE 10000 /* 10 sec */
-extern int optd_groupd_compat;
extern int optd_debug_logfile;
extern int optd_enable_withdraw;
-extern int optd_enable_plock;
-extern int optd_plock_debug;
-extern int optd_plock_rate_limit;
-extern int optd_plock_ownership;
-extern int optd_drop_resources_time;
-extern int optd_drop_resources_count;
-extern int optd_drop_resources_age;
-extern int cfgd_groupd_compat;
extern int cfgd_debug_logfile;
extern int cfgd_enable_withdraw;
-extern int cfgd_enable_plock;
-extern int cfgd_plock_debug;
-extern int cfgd_plock_rate_limit;
-extern int cfgd_plock_ownership;
-extern int cfgd_drop_resources_time;
-extern int cfgd_drop_resources_count;
-extern int cfgd_drop_resources_age;
#endif
diff --git a/group/gfs_controld/cpg-old.c b/group/gfs_controld/cpg-old.c
deleted file mode 100644
index b353867..0000000
--- a/group/gfs_controld/cpg-old.c
+++ /dev/null
@@ -1,2442 +0,0 @@
-#include "gfs_daemon.h"
-#include "config.h"
-#include "cpg-old.h"
-#include "libgroup.h"
-
-#define ASSERT(x) \
-do { \
- if (!(x)) { \
- log_error("Assertion failed on line %d of file %s\n" \
- "Assertion: \"%s\"\n", __LINE__, __FILE__, #x); \
- } \
-} while (0)
-
-#define JID_INIT -9
-
-/* mg_member opts bit field */
-
-enum {
- MEMB_OPT_RW = 1,
- MEMB_OPT_RO = 2,
- MEMB_OPT_SPECT = 4,
- MEMB_OPT_RECOVER = 8,
-};
-
-/* mg_member state: local_recovery_status, recovery_status */
-
-enum {
- RS_NEED_RECOVERY = 1,
- RS_SUCCESS,
- RS_GAVEUP,
- RS_NOFS,
- RS_READONLY,
-};
-
-extern group_handle_t gh;
-
-/* cpg message protocol
- 1.0.0 is initial version
- 2.0.0 is incompatible with 1.0.0 and allows plock ownership */
-static unsigned int protocol_v100[3] = {1, 0, 0};
-static unsigned int protocol_v200[3] = {2, 0, 0};
-static unsigned int protocol_active[3];
-
-
-static void send_journals(struct mountgroup *mg, int nodeid);
-
-
-static char *msg_name(int type)
-{
- switch (type) {
- case MSG_JOURNAL:
- return "MSG_JOURNAL";
- case MSG_OPTIONS:
- return "MSG_OPTIONS";
- case MSG_REMOUNT:
- return "MSG_REMOUNT";
- case MSG_PLOCK:
- return "MSG_PLOCK";
- case MSG_MOUNT_STATUS:
- return "MSG_MOUNT_STATUS";
- case MSG_RECOVERY_STATUS:
- return "MSG_RECOVERY_STATUS";
- case MSG_RECOVERY_DONE:
- return "MSG_RECOVERY_DONE";
- case MSG_WITHDRAW:
- return "MSG_WITHDRAW";
- }
- return "unknown";
-}
-
-static int _send_message(cpg_handle_t h, void *buf, int len, int type)
-{
- struct iovec iov;
- cpg_error_t error;
- int retries = 0;
-
- iov.iov_base = buf;
- iov.iov_len = len;
-
- retry:
- error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1);
- if (error == CPG_ERR_TRY_AGAIN) {
- retries++;
- usleep(1000);
- if (!(retries % 100))
- log_error("cpg_mcast_joined retry %d %s",
- retries, msg_name(type));
- goto retry;
- }
- if (error != CPG_OK) {
- log_error("cpg_mcast_joined error %d handle %llx %s",
- error, (unsigned long long)h, msg_name(type));
- return -1;
- }
-
- if (retries)
- log_debug("cpg_mcast_joined retried %d %s",
- retries, msg_name(type));
-
- return 0;
-}
-
-int send_group_message_old(struct mountgroup *mg, int len, char *buf)
-{
- struct gdlm_header *hd = (struct gdlm_header *) buf;
- int type = hd->type;
-
- hd->version[0] = cpu_to_le16(protocol_active[0]);
- hd->version[1] = cpu_to_le16(protocol_active[1]);
- hd->version[2] = cpu_to_le16(protocol_active[2]);
- hd->type = cpu_to_le16(hd->type);
- hd->nodeid = cpu_to_le32(hd->nodeid);
- hd->to_nodeid = cpu_to_le32(hd->to_nodeid);
- memcpy(hd->name, mg->name, strlen(mg->name));
-
- return _send_message(cpg_handle_daemon, buf, len, type);
-}
-
-static struct mg_member *find_memb_nodeid(struct mountgroup *mg, int nodeid)
-{
- struct mg_member *memb;
-
- list_for_each_entry(memb, &mg->members, list) {
- if (memb->nodeid == nodeid)
- return memb;
- }
- return NULL;
-}
-
-static struct mg_member *find_memb_jid(struct mountgroup *mg, int jid)
-{
- struct mg_member *memb;
-
- list_for_each_entry(memb, &mg->members, list) {
- if (memb->jid == jid)
- return memb;
- }
- return NULL;
-}
-
-static void notify_mount_client(struct mountgroup *mg)
-{
- struct mg_member *memb;
-
- if (!mg->mount_client_result && mg->mount_client_delay) {
- log_group(mg, "notify_mount_client delayed");
- return;
- }
-
- client_reply_join_full(mg, mg->mount_client_result);
-
- if (mg->mount_client_result) {
- log_group(mg, "leaving due to mount error: %d",
- mg->mount_client_result);
-
- memb = find_memb_nodeid(mg, our_nodeid);
- if (memb->finished)
- group_leave(gh, mg->name);
- else {
- log_group(mg, "delay leave until after join");
- mg->group_leave_on_finish = 1;
- }
- } else {
- mg->mount_client_notified = 1;
- }
-}
-
-/* we can receive recovery_status messages from other nodes doing start before
- we actually process the corresponding start callback ourselves */
-
-void save_message_old(struct mountgroup *mg, char *buf, int len, int from,
- int type)
-{
- struct save_msg *sm;
-
- sm = malloc(sizeof(struct save_msg) + len);
- if (!sm)
- return;
- memset(sm, 0, sizeof(struct save_msg) + len);
-
- memcpy(&sm->buf, buf, len);
- sm->type = type;
- sm->len = len;
- sm->nodeid = from;
-
- log_group(mg, "save %s from %d len %d", msg_name(type), from, len);
-
- list_add_tail(&sm->list, &mg->saved_messages);
-}
-
-static int first_mounter_recovery(struct mountgroup *mg)
-{
- struct mg_member *memb;
-
- list_for_each_entry(memb, &mg->members, list) {
- if (memb->opts & MEMB_OPT_RECOVER)
- return memb->nodeid;
- }
- return 0;
-}
-
-static int local_first_mounter_recovery(struct mountgroup *mg)
-{
- int nodeid;
-
- nodeid = first_mounter_recovery(mg);
- if (nodeid == our_nodeid)
- return 1;
- return 0;
-}
-
-int remote_first_mounter_recovery(struct mountgroup *mg)
-{
- int nodeid;
-
- nodeid = first_mounter_recovery(mg);
- if (nodeid && (nodeid != our_nodeid))
- return 1;
- return 0;
-}
-
-static void start_done(struct mountgroup *mg)
-{
- log_group(mg, "start_done %d", mg->start_event_nr);
- group_start_done(gh, mg->name, mg->start_event_nr);
-}
-
-void send_withdraw_old(struct mountgroup *mg)
-{
- struct gdlm_header *hd;
- int len;
- char *buf;
-
- len = sizeof(struct gdlm_header);
-
- buf = malloc(len);
- if (!buf)
- return;
- memset(buf, 0, len);
-
- hd = (struct gdlm_header *)buf;
- hd->type = MSG_WITHDRAW;
- hd->nodeid = our_nodeid;
- hd->to_nodeid = 0;
-
- log_group(mg, "send_withdraw");
-
- send_group_message_old(mg, len, buf);
-
- free(buf);
-}
-
-static void receive_withdraw(struct mountgroup *mg, char *buf, int len, int from)
-{
- struct mg_member *memb;
-
- memb = find_memb_nodeid(mg, from);
- if (!memb) {
- log_group(mg, "receive_withdraw no member %d", from);
- return;
- }
- log_group(mg, "receive_withdraw from %d", from);
- memb->withdrawing = 1;
-
- if (from == our_nodeid)
- group_leave(gh, mg->name);
-}
-
-#define SEND_RS_INTS 3
-
-static void send_recovery_status(struct mountgroup *mg)
-{
- struct gdlm_header *hd;
- struct mg_member *memb;
- int len, *p, i, n = 0;
- char *buf;
-
- list_for_each_entry(memb, &mg->members_gone, list) {
- if (memb->local_recovery_status == RS_SUCCESS)
- n++;
- }
-
- len = sizeof(struct gdlm_header) + (n * SEND_RS_INTS * sizeof(int));
-
- buf = malloc(len);
- if (!buf)
- return;
- memset(buf, 0, len);
-
- hd = (struct gdlm_header *)buf;
- hd->type = MSG_RECOVERY_STATUS;
- hd->nodeid = our_nodeid;
- hd->to_nodeid = 0;
- p = (int *) (buf + sizeof(struct gdlm_header));
-
- i = 0;
- list_for_each_entry(memb, &mg->members_gone, list) {
- if (memb->local_recovery_status != RS_SUCCESS)
- continue;
- p[i] = cpu_to_le32(memb->nodeid);
- i++;
- p[i] = cpu_to_le32(memb->jid);
- i++;
- p[i] = cpu_to_le32(memb->local_recovery_status);
- i++;
- }
-
- log_group(mg, "send_recovery_status for %d nodes len %d", n, len);
-
- send_group_message_old(mg, len, buf);
-
- free(buf);
-}
-
-/* Note: we can get more than one node reporting success in recovering
- the journal for a failed node. The first has really recovered it,
- the rest have found the fs clean and report success. */
-
-static void _receive_recovery_status(struct mountgroup *mg, char *buf, int len,
- int from)
-{
- struct mg_member *memb;
- int *p, n, i, nodeid, jid, status, found = 0;
-
- n = (len - sizeof(struct gdlm_header)) / (SEND_RS_INTS * sizeof(int));
-
- p = (int *) (buf + sizeof(struct gdlm_header));
-
- for (i = 0; i < n; i++) {
- nodeid = le32_to_cpu(p[i * SEND_RS_INTS]);
- jid = le32_to_cpu(p[i * SEND_RS_INTS + 1]);
- status = le32_to_cpu(p[i * SEND_RS_INTS + 2]);
-
- ASSERT(status == RS_SUCCESS);
-
- found = 0;
- list_for_each_entry(memb, &mg->members_gone, list) {
- if (memb->nodeid != nodeid)
- continue;
- ASSERT(memb->jid == jid);
- ASSERT(memb->recovery_status == RS_NEED_RECOVERY ||
- memb->recovery_status == RS_SUCCESS);
- memb->recovery_status = status;
- found = 1;
- break;
- }
-
- log_group(mg, "receive_recovery_status from %d len %d "
- "nodeid %d jid %d status %d found %d",
- from, len, nodeid, jid, status, found);
- }
-
- if (from == our_nodeid)
- start_done(mg);
-}
-
-static void process_saved_recovery_status(struct mountgroup *mg)
-{
- struct save_msg *sm, *sm2;
-
- if (list_empty(&mg->saved_messages))
- return;
-
- log_group(mg, "process_saved_recovery_status");
-
- list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
- if (sm->type != MSG_RECOVERY_STATUS)
- continue;
- _receive_recovery_status(mg, sm->buf, sm->len, sm->nodeid);
- list_del(&sm->list);
- free(sm);
- }
-}
-
-static void assign_next_first_mounter(struct mountgroup *mg)
-{
- struct mg_member *memb, *next = NULL;
- int low = -1;
-
- list_for_each_entry(memb, &mg->members, list) {
- if (memb->jid == -2)
- continue;
- if (memb->jid == -9)
- continue;
- if (memb->spectator || memb->readonly || memb->withdrawing ||
- memb->ms_kernel_mount_done)
- continue;
- if (low == -1 || memb->nodeid < low) {
- next = memb;
- low = memb->nodeid;
- }
- }
-
- if (next) {
- log_group(mg, "next first mounter is %d jid %d opts %x",
- next->nodeid, next->jid, next->opts);
- next->opts |= MEMB_OPT_RECOVER;
- ASSERT(next->jid >= 0);
- } else
- log_group(mg, "no next mounter available yet");
-}
-
-#define SEND_MS_INTS 4
-
-void send_mount_status_old(struct mountgroup *mg)
-{
- struct gdlm_header *hd;
- int len, *p;
- char *buf;
-
- len = sizeof(struct gdlm_header) + (SEND_MS_INTS * sizeof(int));
-
- buf = malloc(len);
- if (!buf)
- return;
- memset(buf, 0, len);
-
- hd = (struct gdlm_header *)buf;
- hd->type = MSG_MOUNT_STATUS;
- hd->nodeid = our_nodeid;
- hd->to_nodeid = 0;
-
- p = (int *) (buf + sizeof(struct gdlm_header));
-
- p[0] = cpu_to_le32(mg->first_mounter);
- p[1] = cpu_to_le32(mg->kernel_mount_error);
- p[2] = 0; /* unused */
- p[3] = 0; /* unused */
-
- log_group(mg, "send_mount_status kernel_mount_error %d "
- "first_mounter %d",
- mg->kernel_mount_error,
- mg->first_mounter);
-
- send_group_message_old(mg, len, buf);
-
- free(buf);
-}
-
-static void _receive_mount_status(struct mountgroup *mg, char *buf, int len,
- int from)
-{
- struct mg_member *memb, *us;
- int *p;
-
- p = (int *) (buf + sizeof(struct gdlm_header));
-
- memb = find_memb_nodeid(mg, from);
- if (!memb) {
- log_group(mg, "_receive_mount_status no node %d", from);
- return;
- }
-
- memb->ms_kernel_mount_done = 1;
- memb->ms_first_mounter = le32_to_cpu(p[0]);
- memb->ms_kernel_mount_error = le32_to_cpu(p[1]);
-
- log_group(mg, "_receive_mount_status from %d kernel_mount_error %d "
- "first_mounter %d opts %x", from,
- memb->ms_kernel_mount_error, memb->ms_first_mounter,
- memb->opts);
-
- if (memb->opts & MEMB_OPT_RECOVER) {
- ASSERT(memb->ms_first_mounter);
- }
- if (memb->ms_first_mounter) {
- ASSERT(memb->opts & MEMB_OPT_RECOVER);
- }
-
- if (memb->ms_first_mounter) {
- memb->opts &= ~MEMB_OPT_RECOVER;
-
- if (!memb->ms_kernel_mount_error) {
- /* the first mounter has successfully mounted, we can
- go ahead and mount now */
-
- if (mg->mount_client_delay) {
- mg->mount_client_delay = 0;
- notify_mount_client(mg);
- }
- } else {
- /* first mounter mount failed, next low node should be
- made first mounter */
-
- memb->jid = -2;
- if (from == our_nodeid)
- mg->our_jid = -2;
-
- assign_next_first_mounter(mg);
-
- /* if we became the next first mounter, then notify
- mount client */
-
- us = find_memb_nodeid(mg, our_nodeid);
- if (us->opts & MEMB_OPT_RECOVER) {
- log_group(mg, "we are next first mounter");
- mg->first_mounter = 1;
- mg->first_mounter_done = 0;
- mg->mount_client_delay = 0;
- notify_mount_client(mg);
- }
- }
- }
-}
-
-static void receive_mount_status(struct mountgroup *mg, char *buf, int len,
- int from)
-{
- log_group(mg, "receive_mount_status from %d len %d last_cb %d",
- from, len, mg->last_callback);
-
- if (!mg->got_our_options) {
- log_group(mg, "ignore mount_status from %d", from);
- return;
- }
-
- if (!mg->got_our_journals)
- save_message_old(mg, buf, len, from, MSG_MOUNT_STATUS);
- else
- _receive_mount_status(mg, buf, len, from);
-}
-
-/* We delay processing mount_status msesages until we receive the journals
- message for our own mount. Our journals message is a snapshot of the memb
- list at the time our options message is received on the remote node. We
- ignore any messages that would change the memb list prior to seeing our own
- options message and we save any messages that would change the memb list
- after seeing our own options message and before we receive the memb list
- from the journals message. */
-
-static void process_saved_mount_status(struct mountgroup *mg)
-{
- struct save_msg *sm, *sm2;
-
- if (list_empty(&mg->saved_messages))
- return;
-
- log_group(mg, "process_saved_mount_status");
-
- list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
- if (sm->type != MSG_MOUNT_STATUS)
- continue;
- _receive_mount_status(mg, sm->buf, sm->len, sm->nodeid);
- list_del(&sm->list);
- free(sm);
- }
-}
-
-static void receive_recovery_status(struct mountgroup *mg, char *buf, int len,
- int from)
-{
- switch (mg->last_callback) {
- case DO_STOP:
- save_message_old(mg, buf, len, from, MSG_RECOVERY_STATUS);
- break;
- case DO_START:
- _receive_recovery_status(mg, buf, len, from);
- break;
- default:
- log_group(mg, "receive_recovery_status %d last_callback %d",
- from, mg->last_callback);
- }
-}
-
-/* tell others that all journals are recovered; they should clear
- memb's from members_gone, clear needs_recovery and unblock locks */
-
-static void send_recovery_done(struct mountgroup *mg)
-{
- struct gdlm_header *hd;
- int len;
- char *buf;
-
- len = sizeof(struct gdlm_header);
-
- buf = malloc(len);
- if (!buf)
- return;
- memset(buf, 0, len);
-
- hd = (struct gdlm_header *)buf;
- hd->type = MSG_RECOVERY_DONE;
- hd->nodeid = our_nodeid;
- hd->to_nodeid = 0;
-
- send_group_message_old(mg, len, buf);
-
- free(buf);
-}
-
-static void receive_recovery_done(struct mountgroup *mg, char *buf, int len,
- int from)
-{
- struct mg_member *memb, *safe;
-
- log_group(mg, "receive_recovery_done from %d needs_recovery %d",
- from, mg->needs_recovery);
-
- list_for_each_entry_safe(memb, safe, &mg->members_gone, list) {
- log_group(mg, "receive_recovery_done clear jid %d nodeid %d",
- memb->jid, memb->nodeid);
- list_del(&memb->list);
- free(memb);
- }
-
- mg->needs_recovery = 0;
- mg->kernel_stopped = 0; /* for queries */
- set_sysfs(mg, "block", 0);
-}
-
-void send_remount_old(struct mountgroup *mg, struct gfsc_mount_args *ma)
-{
- struct gdlm_header *hd;
- char *buf;
- int len;
- int ro = strstr(ma->options, "ro") ? 1 : 0;
-
- len = sizeof(struct gdlm_header) + MAX_OPTIONS_LEN;
-
- buf = malloc(len);
- if (!buf)
- return;
- memset(buf, 0, len);
-
- hd = (struct gdlm_header *)buf;
- hd->type = MSG_REMOUNT;
- hd->nodeid = our_nodeid;
- hd->to_nodeid = 0;
-
- strcpy(buf+sizeof(struct gdlm_header), ro ? "ro" : "rw");
-
- log_group(mg, "send_remount_old len %d \"%s\"", len,
- buf+sizeof(struct gdlm_header));
-
- send_group_message_old(mg, len, buf);
-
- free(buf);
-}
-
-static void receive_remount(struct mountgroup *mg, char *buf, int len, int from)
-{
- struct mg_member *memb;
- char *options;
- int rw = 0, ro = 0;
- int result = 0;
-
- options = (char *) (buf + sizeof(struct gdlm_header));
-
- memb = find_memb_nodeid(mg, from);
- if (!memb) {
- log_error("receive_remount: unknown nodeid %d", from);
- return;
- }
-
- if (strstr(options, "rw"))
- rw = 1;
- else if (strstr(options, "ro"))
- ro = 1;
- else {
- result = -EINVAL;
- goto out;
- }
-
- /* FIXME: check if we've even fully completed our normal mount yet
- (received our own mount-status?) if not, then disallow remount */
-
- /* FIXME: going ro->rw may mean we can now do journal or first-mounter
- recovery that we couldn't do before. */
-
- memb->readonly = ro;
- memb->rw = !ro;
-
- if (ro) {
- memb->opts &= ~MEMB_OPT_RW;
- memb->opts |= MEMB_OPT_RO;
- } else {
- memb->opts &= ~MEMB_OPT_RO;
- memb->opts |= MEMB_OPT_RW;
- }
- out:
- if (from == our_nodeid) {
- if (!result) {
- mg->rw = memb->rw;
- mg->ro = memb->readonly;
- }
- client_reply_remount(mg, mg->remount_client, result);
- }
-
- log_group(mg, "receive_remount from %d rw=%d ro=%d opts=%x",
- from, memb->rw, memb->readonly, memb->opts);
-}
-
-static void set_our_memb_options(struct mountgroup *mg)
-{
- struct mg_member *memb;
- memb = find_memb_nodeid(mg, our_nodeid);
- ASSERT(memb);
-
- if (mg->ro) {
- memb->readonly = 1;
- memb->opts |= MEMB_OPT_RO;
- } else if (mg->spectator) {
- memb->spectator = 1;
- memb->opts |= MEMB_OPT_SPECT;
- } else if (mg->rw) {
- memb->rw = 1;
- memb->opts |= MEMB_OPT_RW;
- }
-}
-
-static void send_options(struct mountgroup *mg)
-{
- struct gdlm_header *hd;
- int len;
- char *buf;
-
- len = sizeof(struct gdlm_header) + MAX_OPTIONS_LEN;
-
- buf = malloc(len);
- if (!buf)
- return;
- memset(buf, 0, len);
-
- hd = (struct gdlm_header *)buf;
- hd->type = MSG_OPTIONS;
- hd->nodeid = our_nodeid;
- hd->to_nodeid = 0;
-
- strncpy(buf+sizeof(struct gdlm_header), mg->mount_args.options,
- MAX_OPTIONS_LEN-1);
-
- log_group(mg, "send_options len %d \"%s\"", len,
- buf+sizeof(struct gdlm_header));
-
- send_group_message_old(mg, len, buf);
-
- free(buf);
-}
-
-/* We set the new member's jid to the lowest unused jid. If we're the lowest
- existing member (by nodeid), then send jid info to the new node. */
-
-/* Look at rw/ro/spectator status of all existing mounters and whether
- we need to do recovery. Based on that, decide if the current mount
- mode (ro/spectator) is permitted; if not, set jid = -2. If spectator
- mount and it's ok, set jid = -1. If ro or rw mount and it's ok, set
- real jid. */
-
-static int assign_journal(struct mountgroup *mg, struct mg_member *new)
-{
- struct mg_member *memb, *memb_recover = NULL, *memb_mounted = NULL;
- int i, total, rw_count, ro_count, spect_count, invalid_count;
-
- total = rw_count = ro_count = spect_count = invalid_count = 0;
-
- list_for_each_entry(memb, &mg->members, list) {
- if (memb->nodeid == new->nodeid)
- continue;
- total++;
- if (memb->jid == -2)
- invalid_count++;
- else if (memb->spectator)
- spect_count++;
- else if (memb->rw)
- rw_count++;
- else if (memb->readonly)
- ro_count++;
-
- if (memb->opts & MEMB_OPT_RECOVER) {
- memb_recover = memb;
- log_group(mg, "assign_journal: memb %d has OPT_RECOVER",
- memb->nodeid);
- }
-
- if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error)
- memb_mounted = memb;
- }
-
- log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d "
- "needs_recovery %d", total, invalid_count, rw_count,
- ro_count, spect_count, mg->needs_recovery);
-
- if (new->spectator) {
- log_group(mg, "assign_journal: new spectator allowed");
- new->jid = -1;
- goto out;
- }
-
- for (i = 0; i < 1024; i++) {
- memb = find_memb_jid(mg, i);
- if (!memb) {
- new->jid = i;
- break;
- }
- }
-
- /* Repeat first-mounter recovery: the fs has been mounted and in-use,
- but nodes have failed and none of the current mounters has been able
- to do recovery (all remaining nodes may be ro/spect for example).
- This puts us into the special "needs_recovery" state where new
- mounters are asked to do first-mounter recovery of the fs while
- the current mounters sit in a blocked state. */
-
- if (mg->needs_recovery) {
- if (!memb_recover) {
- log_group(mg, "assign_journal: needs_recovery: "
- "new memb %d gets OPT_RECOVER",
- new->nodeid);
- new->opts |= MEMB_OPT_RECOVER;
- } else {
- log_group(mg, "assign_journal: needs_recovery: "
- "new memb %d memb %d has OPT_RECOVER",
- new->nodeid, memb_recover->nodeid);
- }
- goto out;
- }
-
- /* Initial first-mounter recovery: the fs is coming online, the first
- mg member assumes first-mounter role and other nodes join the mg
- while the first-mounter is working. These non-first mounters wait
- for the first-mounter to finish before notifying mount.gfs. If the
- first-mounter fails, one of them will become the first-mounter. */
-
- /* it shouldn't be possible to have someone doing first mounter
- recovery and also have someone with the fs fully mounted */
-
- if (memb_mounted && memb_recover) {
- log_group(mg, "memb_mounted %d memb_recover %d",
- memb_mounted->nodeid, memb_recover->nodeid);
- ASSERT(0);
- }
-
- /* someone has successfully mounted the fs which means the fs doesn't
- need first mounter recovery */
-
- if (memb_mounted) {
- log_group(mg, "assign_journal: no first recovery needed %d",
- memb_mounted->nodeid);
- goto out;
- }
-
- /* someone is currently doing first mounter recovery, they'll send
- mount_status when they're done letting everyone know the result */
-
- if (memb_recover) {
- log_group(mg, "assign_journal: %d doing first recovery",
- memb_recover->nodeid);
- goto out;
- }
-
- /* when we received our journals, no one was flagged with OPT_RECOVER
- which means no first mounter recovery is needed or is current */
-
- if (mg->global_first_recover_done) {
- log_group(mg, "assign_journal: global_first_recover_done");
- goto out;
- }
-
- /* no one has done kernel mount successfully and no one is doing first
- mounter recovery, the new node gets to try first mounter recovery */
-
- log_group(mg, "kernel_mount_done %d kernel_mount_error %d "
- "first_mounter %d first_mounter_done %d",
- mg->kernel_mount_done, mg->kernel_mount_error,
- mg->first_mounter, mg->first_mounter_done);
-
- log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: "
- "fs not mounted", new->nodeid);
- new->opts |= MEMB_OPT_RECOVER;
-
- out:
- log_group(mg, "assign_journal: new member %d got jid %d opts %x",
- new->nodeid, new->jid, new->opts);
-
- if (mg->master_nodeid == our_nodeid) {
- store_plocks(mg, new->nodeid);
- send_journals(mg, new->nodeid);
- }
- return 0;
-}
-
-static void _receive_options(struct mountgroup *mg, char *buf, int len,
- int from)
-{
- struct mg_member *memb;
- struct gdlm_header *hd;
- char *options;
-
- hd = (struct gdlm_header *)buf;
- options = (char *) (buf + sizeof(struct gdlm_header));
-
- memb = find_memb_nodeid(mg, from);
- if (!memb) {
- log_error("unknown nodeid %d for options message", from);
- return;
- }
-
- if (strstr(options, "spectator")) {
- memb->spectator = 1;
- memb->opts |= MEMB_OPT_SPECT;
- } else if (strstr(options, "rw")) {
- memb->rw = 1;
- memb->opts |= MEMB_OPT_RW;
- } else if (strstr(options, "ro")) {
- memb->readonly = 1;
- memb->opts |= MEMB_OPT_RO;
- }
-
- log_group(mg, "_receive_options from %d rw=%d ro=%d spect=%d opts=%x",
- from, memb->rw, memb->readonly, memb->spectator, memb->opts);
-
- assign_journal(mg, memb);
-}
-
-static void receive_options(struct mountgroup *mg, char *buf, int len, int from)
-{
- struct gdlm_header *hd = (struct gdlm_header *)buf;
- struct mg_member *memb;
-
- log_group(mg, "receive_options from %d len %d last_cb %d",
- from, len, mg->last_callback);
-
- if (hd->nodeid == our_nodeid) {
- mg->got_our_options = 1;
- mg->save_plocks = 1;
- return;
- }
-
- if (!mg->got_our_options) {
- log_group(mg, "ignore options from %d", from);
- return;
- }
-
- /* we can receive an options message before getting the start
- that adds the mounting node that sent the options, or
- we can receive options messages before we get the journals
- message for out own mount */
-
- memb = find_memb_nodeid(mg, from);
-
- if (!memb || !mg->got_our_journals)
- save_message_old(mg, buf, len, from, MSG_OPTIONS);
- else
- _receive_options(mg, buf, len, from);
-}
-
-static void process_saved_options(struct mountgroup *mg)
-{
- struct save_msg *sm, *sm2;
-
- if (list_empty(&mg->saved_messages))
- return;
-
- log_group(mg, "process_saved_options");
-
- list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
- if (sm->type != MSG_OPTIONS)
- continue;
- _receive_options(mg, sm->buf, sm->len, sm->nodeid);
- list_del(&sm->list);
- free(sm);
- }
-}
-
-#define NUM 3
-
-/* send nodeid/jid/opts of every member to nodeid */
-
-static void send_journals(struct mountgroup *mg, int nodeid)
-{
- struct mg_member *memb;
- struct gdlm_header *hd;
- int i, len;
- char *buf;
- int *ids;
-
- len = sizeof(struct gdlm_header) + (mg->memb_count * NUM * sizeof(int));
-
- buf = malloc(len);
- if (!buf)
- return;
- memset(buf, 0, len);
-
- hd = (struct gdlm_header *)buf;
- hd->type = MSG_JOURNAL;
- hd->nodeid = our_nodeid;
- hd->to_nodeid = nodeid;
- ids = (int *) (buf + sizeof(struct gdlm_header));
-
- i = 0;
- list_for_each_entry(memb, &mg->members, list) {
- ids[i] = cpu_to_le32(memb->nodeid);
- i++;
- ids[i] = cpu_to_le32(memb->jid);
- i++;
- ids[i] = cpu_to_le32(memb->opts);
- i++;
- }
-
- log_group(mg, "send_journals to %d len %d count %d", nodeid, len, i);
-
- send_group_message_old(mg, len, buf);
-
- free(buf);
-}
-
-static void received_our_jid(struct mountgroup *mg)
-{
- log_group(mg, "received_our_jid %d", mg->our_jid);
-
- /* we've been given jid of -2 which means we're not permitted
- to mount the fs; probably because we're trying to mount readonly
- but the next mounter is required to be rw */
-
- if (mg->our_jid == -2) {
- mg->mount_client_result = -EUCLEAN;
- goto out;
- }
-
- /* fs needs recovery and existing mounters can't recover it,
- i.e. they're spectator/readonly or the first mounter's
- mount(2) failed, so we're told to do first-mounter recovery
- on the fs. */
-
- if (local_first_mounter_recovery(mg)) {
- log_group(mg, "we're told to do first mounter recovery");
- mg->first_mounter = 1;
- mg->first_mounter_done = 0;
- mg->mount_client_delay = 0;
- mg->save_plocks = 0;
- goto out;
- } else if (remote_first_mounter_recovery(mg)) {
- /* delay notifying mount client until we get a successful
- mount status from the first mounter */
- log_group(mg, "other node doing first mounter recovery, "
- "set mount_client_delay");
- mg->mount_client_delay = 1;
- mg->save_plocks = 0;
- return;
- }
-
- retrieve_plocks(mg);
- mg->save_plocks = 0;
- process_saved_plocks(mg);
- out:
- notify_mount_client(mg);
-}
-
-static void _receive_journals(struct mountgroup *mg, char *buf, int len,
- int from)
-{
- struct mg_member *memb, *memb2;
- struct gdlm_header *hd;
- int *ids, count, i, nodeid, jid, opts;
- int current_first_recover = 0;
-
- hd = (struct gdlm_header *)buf;
-
- count = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int));
- ids = (int *) (buf + sizeof(struct gdlm_header));
-
- for (i = 0; i < count; i++) {
- nodeid = le32_to_cpu(ids[i * NUM]);
- jid = le32_to_cpu(ids[i * NUM + 1]);
- opts = le32_to_cpu(ids[i * NUM + 2]);
-
- log_debug("receive nodeid %d jid %d opts %x",
- nodeid, jid, opts);
-
- memb = find_memb_nodeid(mg, nodeid);
- memb2 = find_memb_jid(mg, jid);
-
- if (!memb || memb2) {
- log_error("invalid journals message "
- "nodeid %d jid %d opts %x",
- nodeid, jid, opts);
- }
- if (!memb)
- continue;
-
- memb->jid = jid;
-
- if (nodeid == our_nodeid) {
- mg->our_jid = jid;
- /* set_our_memb_options() sets rest */
- if (opts & MEMB_OPT_RECOVER)
- memb->opts |= MEMB_OPT_RECOVER;
- } else {
- memb->opts = opts;
- if (opts & MEMB_OPT_RO)
- memb->readonly = 1;
- else if (opts & MEMB_OPT_RW)
- memb->rw = 1;
- else if (opts & MEMB_OPT_SPECT)
- memb->spectator = 1;
- }
-
- if (opts & MEMB_OPT_RECOVER)
- current_first_recover = 1;
- }
-
- /* FIXME: use global_first_recover_done more widely instead of
- as a single special case */
- if (!current_first_recover)
- mg->global_first_recover_done = 1;
-
- process_saved_mount_status(mg);
-
- /* we delay processing any options messages from new mounters
- until after we receive the journals message for our own mount */
-
- process_saved_options(mg);
-
- received_our_jid(mg);
-}
-
-static void receive_journals(struct mountgroup *mg, char *buf, int len,
- int from)
-{
- struct gdlm_header *hd = (struct gdlm_header *)buf;
- struct mg_member *memb;
- int count;
-
- count = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int));
-
- log_group(mg, "receive_journals from %d to %d len %d count %d cb %d",
- from, hd->to_nodeid, len, count, mg->last_callback);
-
- /* just like we can receive an options msg from a newly added node
- before we get the start adding it, we can receive the journals
- message sent to it before we get the start adding it */
-
- memb = find_memb_nodeid(mg, hd->to_nodeid);
- if (!memb) {
- log_group(mg, "receive_journals from %d to unknown %d",
- from, hd->to_nodeid);
- return;
- }
- memb->needs_journals = 0;
-
- if (hd->to_nodeid && hd->to_nodeid != our_nodeid)
- return;
-
- if (mg->got_our_journals) {
- log_group(mg, "receive_journals from %d duplicate", from);
- return;
- }
- mg->got_our_journals = 1;
-
- _receive_journals(mg, buf, len, from);
-}
-
-static void add_ordered_member(struct mountgroup *mg, struct mg_member *new)
-{
- struct mg_member *memb = NULL;
- struct list_head *tmp;
- struct list_head *newlist = &new->list;
- struct list_head *head = &mg->members;
-
- list_for_each(tmp, head) {
- memb = list_entry(tmp, struct mg_member, list);
- if (new->nodeid < memb->nodeid)
- break;
- }
-
- if (!memb)
- list_add_tail(newlist, head);
- else {
- /* FIXME: can use list macro here */
- newlist->prev = tmp->prev;
- newlist->next = tmp;
- tmp->prev->next = newlist;
- tmp->prev = newlist;
- }
-}
-
-static int add_member(struct mountgroup *mg, int nodeid)
-{
- struct mg_member *memb;
-
- memb = malloc(sizeof(struct mg_member));
- if (!memb)
- return -ENOMEM;
-
- memset(memb, 0, sizeof(struct mg_member));
-
- memb->nodeid = nodeid;
- memb->jid = JID_INIT;
- add_ordered_member(mg, memb);
- mg->memb_count++;
-
- if (!mg->init)
- memb->needs_journals = 1;
-
- return 0;
-}
-
-static int is_member(struct mountgroup *mg, int nodeid)
-{
- struct mg_member *memb;
-
- list_for_each_entry(memb, &mg->members, list) {
- if (memb->nodeid == nodeid)
- return 1;
- }
- return 0;
-}
-
-static int is_removed(struct mountgroup *mg, int nodeid)
-{
- struct mg_member *memb;
-
- list_for_each_entry(memb, &mg->members_gone, list) {
- if (memb->nodeid == nodeid)
- return 1;
- }
- return 0;
-}
-
-/* New mounters may be waiting for a journals message that a failed node (as
- master) would have sent. If the master failed and we're the new master,
- then send a journals message to any nodes for whom we've not seen a journals
- message. We also need to checkpoint the plock state for the new nodes to
- read after they get their journals message. */
-
-static void resend_journals(struct mountgroup *mg)
-{
- struct mg_member *memb;
- int stored_plocks = 0;
-
- list_for_each_entry(memb, &mg->members, list) {
- if (!memb->needs_journals)
- continue;
-
- if (!stored_plocks) {
- store_plocks(mg, memb->nodeid);
- stored_plocks = 1;
- }
-
- log_group(mg, "resend_journals to %d", memb->nodeid);
- send_journals(mg, memb->nodeid);
- }
-}
-
-/* The master node is the member of the group with the lowest nodeid who
- was also a member of the last "finished" group, i.e. a member of the
- group the last time it got a finish callback. The job of the master
- is to send state info to new nodes joining the group, and doing that
- requires that the master has all the state to send -- a new joining
- node that has the lowest nodeid doesn't have any state, which is why
- we add the "finished" requirement. */
-
-static void update_master_nodeid(struct mountgroup *mg)
-{
- struct mg_member *memb;
- int new = -1, low = -1;
-
- list_for_each_entry(memb, &mg->members, list) {
- if (low == -1 || memb->nodeid < low)
- low = memb->nodeid;
- if (!memb->finished)
- continue;
- if (new == -1 || memb->nodeid < new)
- new = memb->nodeid;
- }
- mg->master_nodeid = new;
- mg->low_nodeid = low;
-}
-
-/* This can happen before we receive a journals message for our mount. */
-
-static void recover_members(struct mountgroup *mg, int num_nodes,
- int *nodeids, int *pos_out, int *neg_out)
-{
- struct mg_member *memb, *safe, *memb_gone_recover = NULL;
- int i, found, id, pos = 0, neg = 0, prev_master_nodeid;
- int master_failed = 0;
-
- /* move departed nodes from members list to members_gone */
-
- list_for_each_entry_safe(memb, safe, &mg->members, list) {
- found = 0;
- for (i = 0; i < num_nodes; i++) {
- if (memb->nodeid == nodeids[i]) {
- found = 1;
- break;
- }
- }
-
- if (!found) {
- neg++;
-
- list_move(&memb->list, &mg->members_gone);
- memb->gone_event = mg->start_event_nr;
- memb->gone_type = mg->start_type;
- mg->memb_count--;
-
- memb->tell_gfs_to_recover = 0;
- memb->recovery_status = 0;
- memb->local_recovery_status = 0;
-
- /* - journal cb for failed or withdrawing nodes
- - failed node was assigned a journal
- - no journal cb if failed node was spectator
- - no journal cb if we've already done a journl cb */
-
- if ((memb->gone_type == GROUP_NODE_FAILED ||
- memb->withdrawing) &&
- memb->jid != JID_INIT &&
- memb->jid != -2 &&
- !memb->spectator &&
- !memb->wait_gfs_recover_done) {
- memb->tell_gfs_to_recover = 1;
- memb->recovery_status = RS_NEED_RECOVERY;
- memb->local_recovery_status = RS_NEED_RECOVERY;
- }
-
- log_group(mg, "remove member %d tell_gfs_to_recover %d "
- "(%d,%d,%d,%d,%d,%d)",
- memb->nodeid, memb->tell_gfs_to_recover,
- mg->spectator,
- mg->start_type,
- memb->withdrawing,
- memb->jid,
- memb->spectator,
- memb->wait_gfs_recover_done);
-
- if (mg->master_nodeid == memb->nodeid &&
- memb->gone_type == GROUP_NODE_FAILED)
- master_failed = 1;
-
- if (memb->opts & MEMB_OPT_RECOVER)
- memb_gone_recover = memb;
- }
- }
-
- /* add new nodes to members list */
-
- for (i = 0; i < num_nodes; i++) {
- id = nodeids[i];
- if (is_member(mg, id))
- continue;
- add_member(mg, id);
- pos++;
- log_group(mg, "add member %d", id);
- }
-
- prev_master_nodeid = mg->master_nodeid;
- update_master_nodeid(mg);
-
- *pos_out = pos;
- *neg_out = neg;
-
- log_group(mg, "total members %d master_nodeid %d prev %d",
- mg->memb_count, mg->master_nodeid, prev_master_nodeid);
-
-
- /* The master failed and we're the new master, we need to:
-
- - unlink the ckpt that the failed master had open so new ckpts
- can be created down the road
- - resend journals msg to any nodes that needed one from the
- failed master
- - store plocks in ckpt for the new mounters to read when they
- get the journals msg from us */
-
- if (neg && master_failed &&
- (prev_master_nodeid != -1) &&
- (prev_master_nodeid != mg->master_nodeid) &&
- (our_nodeid == mg->master_nodeid)) {
- log_group(mg, "unlink ckpt for failed master %d",
- prev_master_nodeid);
- unlink_checkpoint(mg);
- resend_journals(mg);
- }
-
- /* Do we need a new first mounter?
-
- If we've not gotten a journals message yet (implies we're mounting)
- and there's only one node left in the group (us, after removing the
- failed node), then it's possible that the failed node was doing
- first mounter recovery, so we need to become first mounter.
-
- If we've received a journals message, we can check if the failed
- node was doing first mounter recovery (MEMB_OPT_RECOVER set) and
- if so select the next first mounter. */
-
- if (!neg)
- return;
-
- if (!mg->got_our_journals && mg->memb_count == 1) {
- log_group(mg, "we are left alone, act as first mounter");
- unlink_checkpoint(mg);
- memb = find_memb_nodeid(mg, our_nodeid);
- memb->jid = 0;
- memb->opts |= MEMB_OPT_RECOVER;
- mg->our_jid = 0;
- mg->first_mounter = 1;
- mg->first_mounter_done = 0;
- mg->got_our_options = 1;
- mg->got_our_journals = 1;
- mg->mount_client_delay = 0;
- notify_mount_client(mg);
- return;
- }
-
- if (memb_gone_recover) {
- log_group(mg, "failed node %d had MEMB_OPT_RECOVER",
- memb_gone_recover->nodeid);
- memb_gone_recover->tell_gfs_to_recover = 0;
- }
-
- if (memb_gone_recover && mg->got_our_journals) {
- assign_next_first_mounter(mg);
- memb = find_memb_nodeid(mg, our_nodeid);
- if (memb->opts & MEMB_OPT_RECOVER) {
- log_group(mg, "first mounter failed, we get "
- "MEMB_OPT_RECOVER");
- unlink_checkpoint(mg);
- memb->opts |= MEMB_OPT_RECOVER;
- mg->first_mounter = 1;
- mg->first_mounter_done = 0;
- mg->mount_client_delay = 0;
- notify_mount_client(mg);
- }
- }
-}
-
-int gfs_join_mountgroup_old(struct mountgroup *mg, struct gfsc_mount_args *ma)
-{
- int rv;
-
- if (strlen(ma->options) > MAX_OPTIONS_LEN-1) {
- log_error("join: options too long %zu", strlen(ma->options));
- return -EMLINK;
- }
-
- rv = group_join(gh, mg->name);
- if (rv)
- return -ENOTCONN;
- return 0;
-}
-
-/* recover_members() discovers which nodes need journal recovery
- and moves the memb structs for those nodes into members_gone
- and sets memb->tell_gfs_to_recover on them */
-
-/* we don't want to tell gfs-kernel to do journal recovery for a failed
- node in a number of cases:
- - we're a spectator or readonly mount
- - gfs-kernel is currently withdrawing
- - we're mounting and haven't received a journals message yet
- - we're mounting and got a kernel mount error back from mount.gfs
- - we're mounting and haven't notified mount.gfs yet (to do mount(2))
- - we're mounting and got_kernel_mount is 0, i.e. we've not seen a uevent
- related to the kernel mount yet
- (some of the mounting checks should be obviated by others)
-
- the problem we're trying to avoid here is telling gfs-kernel to do
- recovery when it can't for some reason and then waiting forever for
- a recovery_done signal that will never arrive. */
-
-static void recover_journals(struct mountgroup *mg)
-{
- struct mg_member *memb;
- int rv;
-
- if (mg->spectator ||
- mg->ro ||
- mg->withdraw_suspend ||
- mg->our_jid == JID_INIT ||
- mg->kernel_mount_error ||
- !mg->mount_client_notified ||
- !mg->got_kernel_mount ||
- !mg->kernel_mount_done) {
- log_group(mg, "recover_journals: unable %d,%d,%d,%d,%d,%d,%d,%d",
- mg->spectator,
- mg->ro,
- mg->withdraw_suspend,
- mg->our_jid,
- mg->kernel_mount_error,
- mg->mount_client_notified,
- mg->got_kernel_mount,
- mg->kernel_mount_done);
-
- list_for_each_entry(memb, &mg->members_gone, list) {
- log_group(mg, "member gone %d jid %d "
- "tell_gfs_to_recover %d",
- memb->nodeid, memb->jid,
- memb->tell_gfs_to_recover);
-
- if (memb->tell_gfs_to_recover) {
- memb->tell_gfs_to_recover = 0;
- memb->local_recovery_status = RS_READONLY;
- }
- }
- start_done(mg);
- return;
- }
-
- /* we feed one jid into the kernel for recovery instead of all
- at once because we need to get the result of each independently
- through the single recovery_done sysfs file */
-
- list_for_each_entry(memb, &mg->members_gone, list) {
- if (memb->wait_gfs_recover_done) {
- log_group(mg, "delay new gfs recovery, "
- "wait_gfs_recover_done for nodeid %d jid %d",
- memb->nodeid, memb->jid);
- return;
- }
- }
-
- list_for_each_entry(memb, &mg->members_gone, list) {
- if (!memb->tell_gfs_to_recover)
- continue;
-
- log_group(mg, "recover journal %d nodeid %d",
- memb->jid, memb->nodeid);
-
- rv = set_sysfs(mg, "recover", memb->jid);
- if (rv < 0) {
- memb->local_recovery_status = RS_NOFS;
- continue;
- }
- memb->tell_gfs_to_recover = 0;
- memb->wait_gfs_recover_done = 1;
- return;
- }
-
- /* no more journals to attempt to recover, if we've been successful
- recovering any then send out status, if not then start_done...
- receiving no status message from us before start_done means we
- didn't successfully recover any journals. If we send out status,
- then delay start_done until we get our own message (so all nodes
- will get the status before finish) */
-
- list_for_each_entry(memb, &mg->members_gone, list) {
- if (memb->local_recovery_status == RS_SUCCESS) {
- send_recovery_status(mg);
- log_group(mg, "delay start_done until status recvd");
- return;
- }
- }
-
- start_done(mg);
-}
-
-/* In some cases, we may be joining a mountgroup with needs_recovery
- set (there are journals that need recovery and current members can't
- recover them because they're ro). In this case, we're told to act
- like the first mounter to cause gfs to try to recovery all journals
- when it mounts. When gfs does this, we'll get recovery_done's for
- the individual journals it recovers (ignored) and finally, if all
- journals are ok, an others_may_mount/first_done. */
-
-/* When gfs does first-mount recovery, the mount(2) fails if it can't
- recover one of the journals. If we get o_m_m, then we know it was
- able to successfully recover all the journals. */
-
-/* When we're the first mounter, gfs does recovery on all the journals
- and does "recovery_done" callbacks when it finishes each. We ignore
- these and wait for gfs to be finished with all at which point it calls
- others_may_mount() and first_done is set. */
-
-static int kernel_recovery_done_first(struct mountgroup *mg, int first_done)
-{
- int rv;
-
- if (first_done < 0) {
- /* for back compat, sysfs file deprecated */
- rv = read_sysfs_int(mg, "first_done", &first_done);
- if (rv < 0)
- return rv;
- }
-
- log_group(mg, "kernel_recovery_done_first first_done %d", first_done);
-
- if (mg->kernel_mount_done)
- log_group(mg, "FIXME: assuming kernel_mount_done comes after "
- "first_done");
-
- if (first_done) {
- mg->first_mounter_done = 1;
- send_recovery_done(mg);
- }
-
- return 0;
-}
-
-static int need_kernel_recovery_done(struct mountgroup *mg)
-{
- struct mg_member *memb;
-
- list_for_each_entry(memb, &mg->members_gone, list) {
- if (memb->wait_gfs_recover_done)
- return 1;
- }
- return 0;
-}
-
-/* Note: when a readonly node fails we do consider its journal (and the
- fs) to need recovery... not sure this is really necessary, but
- the readonly node did "own" a journal so it seems proper to recover
- it even if the node wasn't writing to it. So, if there are 3 ro
- nodes mounting the fs and one fails, gfs on the remaining 2 will
- remain blocked until an rw node mounts, and the next mounter must
- be rw. */
-
-int process_recovery_uevent_old(char *name, int jid_done, int status, int first)
-{
- struct mountgroup *mg;
- struct mg_member *memb;
- char *ss;
- int rv, found = 0;
-
- mg = find_mg(name);
- if (!mg) {
- log_error("recovery_done: unknown mount group %s", name);
- return -1;
- }
-
- if (mg->first_mounter && !mg->first_mounter_done)
- return kernel_recovery_done_first(mg, first);
-
- if (jid_done < 0) {
- /* for back compat, sysfs file deprecated */
- rv = read_sysfs_int(mg, "recover_done", &jid_done);
- if (rv < 0)
- return rv;
- }
-
- list_for_each_entry(memb, &mg->members_gone, list) {
- if (memb->jid == jid_done) {
- if (memb->wait_gfs_recover_done) {
- memb->wait_gfs_recover_done = 0;
- found = 1;
- }
- break;
- }
- }
-
- /* We need to ignore recovery_done callbacks in the case where there
- are a bunch of recovery_done callbacks for the first mounter, but
- we detect "first_done" before we've processed all the
- recovery_done's. */
-
- if (!found) {
- log_group(mg, "recovery_done jid %d ignored, first %d,%d",
- jid_done, mg->first_mounter, mg->first_mounter_done);
- return 0;
- }
-
- if (status < 0) {
- /* for back compat, sysfs file deprecated */
- rv = read_sysfs_int(mg, "recover_status", &status);
- if (rv < 0) {
- log_group(mg, "recovery_done jid %d nodeid %d sysfs error %d",
- memb->jid, memb->nodeid, rv);
- memb->local_recovery_status = RS_NOFS;
- goto out;
- }
- }
-
- switch (status) {
- case LM_RD_GAVEUP:
- /*
- * This is unfortunate; it's needed for bz 442451 where
- * gfs-kernel fails to acquire the journal lock on all nodes
- * because a withdrawing node has not yet called
- * dlm_release_lockspace() to free it's journal lock. With
- * this, all nodes should repeatedly try to to recover the
- * journal of the withdrawn node until the withdrawing node
- * clears its dlm locks, and gfs on each of the remaining nodes
- * succeeds in doing the recovery.
- */
-
- if (memb->withdrawing) {
- log_group(mg, "recovery_done jid %d nodeid %d retry "
- "for withdraw", memb->jid, memb->nodeid);
- memb->tell_gfs_to_recover = 1;
- memb->wait_gfs_recover_done = 0;
- usleep(500000);
- }
-
- memb->local_recovery_status = RS_GAVEUP;
- ss = "gaveup";
- break;
- case LM_RD_SUCCESS:
- memb->local_recovery_status = RS_SUCCESS;
- ss = "success";
- break;
- default:
- log_error("recovery_done: jid %d nodeid %d unknown status %d",
- memb->jid, memb->nodeid, status);
- ss = "unknown";
- }
-
- log_group(mg, "recovery_done jid %d nodeid %d %s",
- memb->jid, memb->nodeid, ss);
-
- /* sanity check */
- if (need_kernel_recovery_done(mg))
- log_error("recovery_done: should be no pending gfs recoveries");
-
- out:
- recover_journals(mg);
- return 0;
-}
-
-static void leave_mountgroup(struct mountgroup *mg, int mnterr)
-{
- /* sanity check: we should already have gotten the error from
- the mount.gfs mount_done; so this shouldn't happen */
-
- if (mnterr && !mg->kernel_mount_error) {
- log_error("leave: mount_error is new %d %d",
- mg->kernel_mount_error, mnterr);
- }
-
- mg->leaving = 1;
-
- /* Check to see if we're waiting for a kernel recovery_done to do a
- start_done(). If so, call the start_done() here because we won't be
- getting anything else from gfs-kernel which is now gone. */
-
- if (need_kernel_recovery_done(mg)) {
- log_group(mg, "leave: fill in start_done");
- start_done(mg);
- }
-
- group_leave(gh, mg->name);
-}
-
-void do_leave_old(char *name, int mnterr)
-{
- struct mountgroup *mg;
-
- log_debug("do_leave_old %s mnterr %d", name, mnterr);
-
- list_for_each_entry(mg, &withdrawn_mounts, list) {
- if (strcmp(mg->name, name))
- continue;
- log_group(mg, "leave for withdrawn fs");
- list_del(&mg->list);
- free_mg(mg);
- return;
- }
-
- mg = find_mg(name);
- if (!mg) {
- log_error("do_leave_old: %s not found", name);
- return;
- }
-
- leave_mountgroup(mg, mnterr);
-}
-
-/* When mounting a fs, we first join the mountgroup, then tell mount.gfs
- to procede with the kernel mount. Once we're in the mountgroup, we
- can get a stop callback at any time, which requires us to block the
- fs by setting a sysfs file. If the kernel mount is slow, we can get
- a stop callback and try to set the sysfs file before the kernel mount
- has actually created the sysfs files for the fs. This function delays
- any further processing until the sysfs files exist. */
-
-/* This function returns 0 when the kernel mount is successfully detected
- and we know that do_stop() will be able to block the fs.
- This function returns a negative error if it detects the kernel mount
- has failed which means there's nothing to stop and do_stop() can assume
- an implicit stop. */
-
-/* wait for
- - kernel mount to get to the point of creating sysfs files we
- can read (and that do_stop can then use), or
- - kernel mount to fail causing mount.gfs to send us a MOUNT_DONE
- which we read in process_connection() */
-
-static int wait_for_kernel_mount(struct mountgroup *mg)
-{
- int rv, val;
-
- while (1) {
- /* This is the standard way we leave this loop, where the
- kernel mount gets to the point of creating the sysfs files
- which we see by successfully reading "id". With the
- sysfs files in place, do_stop() will be able to block
- the kernel. */
-
- rv = read_sysfs_int(mg, "block", &val);
- if (!rv)
- break;
- usleep(100000);
-
- /* kernel_mount_done is set by mount_done_old() which is called
- by process_connection() if mount.gfs sends MOUNT_DONE. */
-
- if (mg->kernel_mount_done && !mg->kernel_mount_error) {
- /* mount(2) was successful and we should be able
- to read "id" very shortly... */
- continue;
- }
-
- if (mg->kernel_mount_done && mg->kernel_mount_error) {
- /* mount(2) failed, stop becomes implicit */
- break;
- }
-
- /* this should either do nothing and return immediatley, or
- read a MOUNT_DONE from mount.gfs and call mount_done_old()
- which will set kernel_mount_done and set kernel_mount_error */
-
- process_connection(mg->mount_client);
- }
-
- return rv;
-}
-
-/* The processing of new mounters (send/recv options, send/recv journals,
- notify mount.gfs) is not very integrated with the stop/start/finish
- callbacks from libgroup. A start callback just notifies us of a new
- mounter and the options/journals messages drive things from there.
- Recovery for failed nodes _is_ controlled more directly by the
- stop/start/finish callbacks. So, processing new mounters happens
- independently of recovery and of the libgroup callbacks. One place
- where they need to intersect, though, is in stopping/suspending
- gfs-kernel:
- - When we get a stop callback, we need to be certain that gfs-kernel
- is blocked.
- - When a mounter notifies mount.gfs to go ahead, gfs-kernel will
- shortly begin running in an unblocked fashion as it goes through
- the kernel mounting process.
- Given this, we need to be sure that if gfs-kernel is supposed to be
- blocked, we don't notify mount.gfs to go ahead and do the kernel mount
- since that starts gfs-kernel in an unblocked state. */
-
-/* - if we're unmounting, the kernel is gone, so no problem.
- - if we've just mounted and notified mount.gfs, then wait for kernel
- mount and then block.
- - if we're mounting and have not yet notified mount.gfs, then set
- a flag that delays the notification until block is set to 0. */
-
-int do_stop(struct mountgroup *mg)
-{
- int rv;
-
- if (mg->first_mounter && !mg->kernel_mount_done) {
- log_group(mg, "do_stop skip during first mount recovery");
- goto out;
- }
-
- for (;;) {
- rv = set_sysfs(mg, "block", 1);
- if (!rv) {
- mg->kernel_stopped = 1; /* for queries */
- break;
- }
-
- /* We get an error trying to block gfs, this could be due
- to a number of things:
- 1. if the kernel instance of gfs existed before but now
- we can't see it, that must mean it's been unmounted,
- so it's implicitly stopped
- 2. we're in the process of mounting and gfs hasn't created
- the sysfs files for this fs yet
- 3. we're mounting and mount(2) returned an error
- 4. we're mounting but haven't told mount.gfs to go ahead
- with mount(2) yet
- We also need to handle the situation where we get here in
- case 2 but it turns into case 3 while we're in
- wait_for_kernel_mount() */
-
- if (mg->got_kernel_mount) {
- log_group(mg, "do_stop skipped fs unmounted");
- break;
- }
-
- if (mg->mount_client_notified) {
- if (!mg->kernel_mount_error) {
- log_group(mg, "do_stop wait for kernel mount");
- rv = wait_for_kernel_mount(mg);
- if (rv < 0)
- break;
- } else {
- log_group(mg, "do_stop ignore, failed mount");
- break;
- }
- } else {
- log_group(mg, "do_stop causes mount_client_delay");
- mg->mount_client_delay = 1;
- break;
- }
- }
- out:
- group_stop_done(gh, mg->name);
- return 0;
-}
-
-/* After a start that initiated a recovery, everyone will go and see if they
- can do recovery and try if they can. If a node can't, it does start_done,
- if it tries and fails, it does start_done, if it tries and succeeds it
- sends a message and then does start_done once it receives's it back. So,
- when we get a finish we know that we have all the results from the recovery
- cycle and can judge if everything is recovered properly or not. If so, we
- can unblock locks (in the finish), if not, we leave them blocked (in the
- finish).
-
- If we leave locks blocked in the finish, then they can only be unblocked
- after someone is able to do the recovery that's needed. So, leaving locks
- blocked in a finish because recovery hasn't worked puts us into a special
- state: the fs needs recovery, none of the current mounters has been able to
- recover it, all current mounters have locks blocked in gfs, new mounters
- are allowed, nodes can unmount, new mounters are asked to do first-mounter
- recovery, if one of them succeeds then we can all clear this special state
- and unblock locks (the unblock would happen upon recving the success
- message from the new pseudo-first mounter, not as part of a finish), future
- finishes would then go back to being able to unblock locks.
-
- While in this special state, a new node has been added and asked to do
- first-mounter recovery, other nodes can also be added while the new
- first-mounter is active. These other nodes don't notify mount.gfs.
- They'll receive the result of the first mounter and if it succeeded they'll
- notify mount.gfs, otherwise one of them will become the next first-mounter
- and notify mount.gfs. */
-
-int do_finish(struct mountgroup *mg)
-{
- struct mg_member *memb, *safe;
-
- log_group(mg, "finish %d needs_recovery %d", mg->last_finish,
- mg->needs_recovery);
-
- /* members_gone list are the members that were removed from the
- members list when processing a start. members are removed
- from members_gone if their journals have been recovered */
-
- list_for_each_entry_safe(memb, safe, &mg->members_gone, list) {
- if (!memb->recovery_status) {
- list_del(&memb->list);
- free(memb);
- } else if (memb->recovery_status == RS_SUCCESS) {
- ASSERT(memb->gone_event <= mg->last_finish);
- log_group(mg, "finish: recovered jid %d nodeid %d",
- memb->jid, memb->nodeid);
- list_del(&memb->list);
- free(memb);
- } else {
- log_error("%s finish: needs recovery jid %d nodeid %d "
- "status %d", mg->name, memb->jid,
- memb->nodeid, memb->recovery_status);
- mg->needs_recovery = 1;
- }
- }
-
- list_for_each_entry(memb, &mg->members, list)
- memb->finished = 1;
-
- if (mg->group_leave_on_finish) {
- log_group(mg, "leaving group after delay for join to finish");
- group_leave(gh, mg->name);
- mg->group_leave_on_finish = 0;
- return 0;
- }
-
- if (!mg->needs_recovery) {
- mg->kernel_stopped = 0; /* for queries */
- set_sysfs(mg, "block", 0);
-
- /* we may have been holding back our local mount due to
- being stopped/blocked */
- if (mg->mount_client_delay && !first_mounter_recovery(mg)) {
- mg->mount_client_delay = 0;
- notify_mount_client(mg);
- }
- } else
- log_group(mg, "finish: leave locks blocked for needs_recovery");
-
- return 0;
-}
-
-/*
- * - require the first mounter to be rw, not ro or spectator.
- *
- * - if rw mounter fails, leaving only spectator mounters,
- * require the next mounter to be rw, more ro/spectator mounts should
- * fail until the fs is mounted rw.
- *
- * - if last rw mounter fails and ro mounters are left (possibly with
- * some spectators), disallow any ro->rw remounts, leave gfs blocked,
- * require next mounter to be rw, have next mounter do first mount
- * gfs/journal recovery.
- */
-
-/* called for the initial start on the node that's first to mount the fs.
- (it should be ok to let the first mounter be a spectator, gfs should do
- first recovery and bail out if there are any dirty journals) */
-
-/* FIXME: if journal recovery fails on any of the journals, we should
- fail the mount */
-
-static void start_first_mounter(struct mountgroup *mg)
-{
- struct mg_member *memb;
-
- log_group(mg, "start_first_mounter");
- set_our_memb_options(mg);
- memb = find_memb_nodeid(mg, our_nodeid);
- ASSERT(memb);
-
- if (mg->ro || mg->spectator) {
- memb->jid = -2;
- mg->our_jid = -2;
- log_group(mg, "start_first_mounter not rw ro=%d spect=%d",
- mg->ro , mg->spectator);
- mg->mount_client_result = -EUCLEAN;
- } else {
- memb->opts |= MEMB_OPT_RECOVER;
- memb->jid = 0;
- mg->our_jid = 0;
- mg->first_mounter = 1;
- mg->first_mounter_done = 0;
- mg->got_our_options = 1;
- mg->got_our_journals = 1;
- }
- start_done(mg);
- notify_mount_client(mg);
-}
-
-/* called for the initial start on a rw/ro mounter;
- the existing mounters are running start_participant() */
-
-static void start_participant_init(struct mountgroup *mg)
-{
- log_group(mg, "start_participant_init");
- set_our_memb_options(mg);
- send_options(mg);
- start_done(mg);
-}
-
-/* called for a non-initial start on a normal mounter.
- NB we can get here without having received a journals message for
- our (recent) mount yet in which case we don't know the jid or ro/rw
- status of any members, and don't know our own jid. */
-
-static void start_participant(struct mountgroup *mg, int pos, int neg)
-{
- log_group(mg, "start_participant pos=%d neg=%d", pos, neg);
-
- if (pos) {
- start_done(mg);
- /* we save options messages from nodes for whom we've not
- received a start yet */
- process_saved_options(mg);
- } else if (neg) {
- recover_journals(mg);
- process_saved_recovery_status(mg);
- }
-}
-
-/* called for the initial start on a spectator mounter,
- after _receive_journals() */
-
-static void start_spectator_init_2(struct mountgroup *mg)
-{
- log_group(mg, "start_spectator_init_2 our_jid=%d", mg->our_jid);
-
- /* we've been given jid of -2 which means we're not permitted
- to mount the fs; probably because the next mounter must be rw */
-
- if (mg->our_jid == -2) {
- mg->mount_client_result = -EUCLEAN;
- } else
- ASSERT(mg->our_jid == -1);
-
- notify_mount_client(mg);
-}
-
-/* called for the initial start on a spectator mounter */
-
-static void start_spectator_init(struct mountgroup *mg)
-{
- log_group(mg, "start_spectator_init");
- set_our_memb_options(mg);
- send_options(mg);
- start_done(mg);
- mg->start2_fn = start_spectator_init_2;
-}
-
-/* called for a non-initial start on a spectator mounter */
-
-static void start_spectator(struct mountgroup *mg, int pos, int neg)
-{
- log_group(mg, "start_spectator pos=%d neg=%d", pos, neg);
-
- if (pos) {
- start_done(mg);
- process_saved_options(mg);
- } else if (neg) {
- recover_journals(mg);
- process_saved_recovery_status(mg);
- }
-}
-
-/* If nodeA fails, nodeB is recovering journalA and nodeB fails before
- finishing, then nodeC needs to tell gfs to recover both journalA and
- journalB. We do this by setting tell_gfs_to_recover back to 1 for
- any nodes that are still on the members_gone list. */
-
-static void reset_unfinished_recoveries(struct mountgroup *mg)
-{
- struct mg_member *memb;
-
- list_for_each_entry(memb, &mg->members_gone, list) {
- if (memb->recovery_status &&
- memb->recovery_status != RS_NEED_RECOVERY) {
- log_group(mg, "retry unfinished recovery "
- "jid %d nodeid %d",
- memb->jid, memb->nodeid);
- memb->tell_gfs_to_recover = 1;
- memb->recovery_status = RS_NEED_RECOVERY;
- memb->local_recovery_status = RS_NEED_RECOVERY;
- }
- }
-}
-
-/*
- old method:
- A is rw mount, B mounts rw
-
- do_start do_start
- start_participant start_participant_init
- send_options
- receive_options
- start_participant_2
- discover_journals
- assign B a jid
- send_journals
- group_start_done
- receive_journals
- start_participant_init_2
- group_start_done
- do_finish do_finish
-
- new method: decouples stop/start/finish from mount processing
- A is rw mount, B mounts rw
-
- do_start do_start
- start_participant start_participant_init
- start_done send_options
- start_done
- do_finish do_finish
-
- receive_options
- assign_journal
- send_journals
- receive_journals
- start_participant_init_2
- notify_mount_client
-*/
-
-void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids)
-{
- int pos = 0, neg = 0;
-
- mg->start_event_nr = mg->last_start;
- mg->start_type = type;
-
- log_group(mg, "start %d init %d type %d member_count %d",
- mg->last_start, mg->init, type, member_count);
-
- recover_members(mg, member_count, nodeids, &pos, &neg);
- reset_unfinished_recoveries(mg);
-
- if (mg->init) {
- if (member_count == 1)
- start_first_mounter(mg);
- else if (mg->spectator)
- start_spectator_init(mg);
- else
- start_participant_init(mg);
- mg->init = 0;
- } else {
- if (mg->spectator)
- start_spectator(mg, pos, neg);
- else
- start_participant(mg, pos, neg);
- }
-}
-
-/*
- What repurcussions are there from umount shutting down gfs in the
- kernel before we leave the mountgroup? We can no longer participate
- in recovery even though we're in the group -- what are the end cases
- that we need to deal with where this causes a problem? i.e. there
- is a period of time where the mountgroup=A,B,C but the kernel fs
- is only active on A,B, not C. The mountgroup on A,B can't depend
- on the mg on C to necessarily be able to do some things (recovery).
-
- At least in part, it means that after we do an umount and have
- removed the instance of this fs in the kernel, we'll still get
- stop/start/finish callbacks from groupd for which we'll attempt
- and fail to: block/unblock gfs kernel activity, initiate gfs
- journal recovery, get recovery-done signals fromt eh kernel.
-
- We don't want to hang groupd event processing by failing to send
- an ack (stop_done/start_done) back to groupd when it needs one
- to procede. In the case where we get a start for a failed node
- that needs journal recovery, we have a problem because we wait to
- call group_start_done() until gfs in the kernel to signal that
- the journal recovery is done. If we've unmounted gfs isn't there
- any more to give us this signal and we'll never call start_done.
-
- update: we should be dealing with all these issues correctly now. */
-
-int do_terminate(struct mountgroup *mg)
-{
- purge_plocks(mg, 0, 1);
-
- if (mg->withdraw_suspend) {
- log_group(mg, "termination of our withdraw leave");
- set_sysfs(mg, "withdraw", 1);
- list_move(&mg->list, &withdrawn_mounts);
- } else {
- log_group(mg, "termination of our unmount leave");
- list_del(&mg->list);
- free(mg);
- }
-
- return 0;
-}
-
-static void do_deliver(int nodeid, char *data, int len)
-{
- struct mountgroup *mg;
- struct gdlm_header *hd;
-
- hd = (struct gdlm_header *) data;
-
- mg = find_mg(hd->name);
- if (!mg) {
- /*
- log_error("cpg message from %d len %d no group %s",
- nodeid, len, hd->name);
- */
- return;
- }
-
- hd->version[0] = le16_to_cpu(hd->version[0]);
- hd->version[1] = le16_to_cpu(hd->version[1]);
- hd->version[2] = le16_to_cpu(hd->version[2]);
- hd->type = le16_to_cpu(hd->type);
- hd->nodeid = le32_to_cpu(hd->nodeid);
- hd->to_nodeid = le32_to_cpu(hd->to_nodeid);
-
- /* FIXME: we need to look at how to gracefully fail when we end up
- with mixed incompat versions */
-
- if (hd->version[0] != protocol_active[0]) {
- log_error("reject message from %d version %u.%u.%u vs %u.%u.%u",
- nodeid, hd->version[0], hd->version[1],
- hd->version[2], protocol_active[0],
- protocol_active[1], protocol_active[2]);
- return;
- }
-
- /* If there are some group messages between a new node being added to
- the cpg group and being added to the app group, the new node should
- discard them since they're only relevant to the app group. */
-
- if (!mg->last_callback) {
- log_group(mg, "discard %s len %d from %d",
- msg_name(hd->type), len, nodeid);
- return;
- }
-
- switch (hd->type) {
- case MSG_JOURNAL:
- receive_journals(mg, data, len, nodeid);
- break;
-
- case MSG_OPTIONS:
- receive_options(mg, data, len, nodeid);
- break;
-
- case MSG_REMOUNT:
- receive_remount(mg, data, len, nodeid);
- break;
-
- case MSG_PLOCK:
- receive_plock(mg, data, len, nodeid);
- break;
-
- case MSG_MOUNT_STATUS:
- receive_mount_status(mg, data, len, nodeid);
- break;
-
- case MSG_RECOVERY_STATUS:
- receive_recovery_status(mg, data, len, nodeid);
- break;
-
- case MSG_RECOVERY_DONE:
- receive_recovery_done(mg, data, len, nodeid);
- break;
-
- case MSG_WITHDRAW:
- receive_withdraw(mg, data, len, nodeid);
- break;
-
- case MSG_PLOCK_OWN:
- receive_own(mg, data, len, nodeid);
- break;
-
- case MSG_PLOCK_DROP:
- receive_drop(mg, data, len, nodeid);
- break;
-
- case MSG_PLOCK_SYNC_LOCK:
- case MSG_PLOCK_SYNC_WAITER:
- receive_sync(mg, data, len, nodeid);
- break;
-
- default:
- log_error("unknown message type %d from %d",
- hd->type, hd->nodeid);
- }
-}
-
-static void deliver_cb(cpg_handle_t handle, struct cpg_name *group_name,
- uint32_t nodeid, uint32_t pid, void *data, int data_len)
-{
- do_deliver(nodeid, data, data_len);
-}
-
-/* Not sure if purging plocks (driven by confchg) needs to be synchronized with
- the other recovery steps (driven by libgroup) for a node, don't think so.
- Is it possible for a node to have been cleared from the members_gone list
- before this confchg is processed? */
-
-static void confchg_cb(cpg_handle_t handle, struct cpg_name *group_name,
- struct cpg_address *member_list, int member_list_entries,
- struct cpg_address *left_list, int left_list_entries,
- struct cpg_address *joined_list, int joined_list_entries)
-{
- struct mountgroup *mg;
- int i, nodeid;
-
- for (i = 0; i < left_list_entries; i++) {
- nodeid = left_list[i].nodeid;
- list_for_each_entry(mg, &mountgroups, list) {
- if (is_member(mg, nodeid) || is_removed(mg, nodeid))
- purge_plocks(mg, left_list[i].nodeid, 0);
- }
- }
-}
-
-static cpg_callbacks_t callbacks = {
- .cpg_deliver_fn = deliver_cb,
- .cpg_confchg_fn = confchg_cb,
-};
-
-void process_cpg_old(int ci)
-{
- cpg_error_t error;
-
- error = cpg_dispatch(cpg_handle_daemon, CPG_DISPATCH_ALL);
- if (error != CPG_OK) {
- log_error("cpg_dispatch error %d", error);
- return;
- }
-
- update_flow_control_status();
-}
-
-int setup_cpg_old(void)
-{
- static struct cpg_name name;
- cpg_error_t error;
- int fd = 0;
-
- if (cfgd_plock_ownership)
- memcpy(protocol_active, protocol_v200, sizeof(protocol_v200));
- else
- memcpy(protocol_active, protocol_v100, sizeof(protocol_v100));
-
- error = cpg_initialize(&cpg_handle_daemon, &callbacks);
- if (error != CPG_OK) {
- log_error("daemon cpg_initialize error %d", error);
- return -1;
- }
-
- cpg_fd_get(cpg_handle_daemon, &fd);
- if (fd < 0) {
- log_error("daemon cpg_fd_get error %d", error);
- return -1;
- }
-
- memset(&name, 0, sizeof(name));
- strcpy(name.value, "gfs_controld");
- name.length = 12;
-
- retry:
- error = cpg_join(cpg_handle_daemon, &name);
- if (error == CPG_ERR_TRY_AGAIN) {
- log_debug("daemon cpg_join retry");
- sleep(1);
- goto retry;
- }
- if (error != CPG_OK) {
- log_error("daemon cpg_join error %d", error);
- cpg_finalize(cpg_handle_daemon);
- return -1;
- }
-
- log_debug("setup_cpg_old %d", fd);
- return fd;
-}
-
-void close_cpg_old(void)
-{
- static struct cpg_name name;
- cpg_error_t error;
- int i = 0;
-
- if (!cpg_handle_daemon || cluster_down)
- return;
-
- memset(&name, 0, sizeof(name));
- strcpy(name.value, "gfs_controld");
- name.length = 12;
-
- retry:
- error = cpg_leave(cpg_handle_daemon, &name);
- if (error == CPG_ERR_TRY_AGAIN) {
- sleep(1);
- if (!(++i % 10))
- log_error("daemon cpg_leave error retrying");
- goto retry;
- }
- if (error != CPG_OK)
- log_error("daemon cpg_leave error %d", error);
-}
-
diff --git a/group/gfs_controld/cpg-old.h b/group/gfs_controld/cpg-old.h
deleted file mode 100644
index 0458338..0000000
--- a/group/gfs_controld/cpg-old.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef __CPG_OLD_DOT_H__
-#define __CPG_OLD_DOT_H__
-
-#define DO_STOP 1
-#define DO_START 2
-#define DO_FINISH 3
-#define DO_TERMINATE 4
-#define DO_SETID 5
-
-enum {
-
- MSG_JOURNAL = 1,
- MSG_OPTIONS,
- MSG_REMOUNT,
- MSG_PLOCK,
- MSG_WITHDRAW,
- MSG_MOUNT_STATUS,
- MSG_RECOVERY_STATUS,
- MSG_RECOVERY_DONE,
- MSG_PLOCK_OWN,
- MSG_PLOCK_DROP,
- MSG_PLOCK_SYNC_LOCK,
- MSG_PLOCK_SYNC_WAITER,
-};
-
-/* These lengths are part of the "old" wire protocol. */
-
-#define MAX_OPTIONS_LEN 1024
-#define MSG_NAMELEN 255
-
-struct gdlm_header {
- uint16_t version[3];
- uint16_t type; /* MSG_ */
- uint32_t nodeid; /* sender */
- uint32_t to_nodeid; /* 0 if to all */
- char name[MSG_NAMELEN];
-};
-
-struct save_msg {
- struct list_head list;
- int nodeid;
- int len;
- int type;
- char buf[0];
-};
-
-struct mg_member {
- struct list_head list;
- int nodeid;
- int jid;
-
- int spectator;
- int readonly;
- int rw;
- uint32_t opts;
-
- int tell_gfs_to_recover;
- int wait_gfs_recover_done;
- int gone_event;
- int gone_type;
- int finished;
- int local_recovery_status;
- int recovery_status;
- int withdrawing;
- int needs_journals;
-
- int ms_kernel_mount_done;
- int ms_first_mounter;
- int ms_kernel_mount_error;
-};
-
-#endif
-
diff --git a/group/gfs_controld/gfs_daemon.h b/group/gfs_controld/gfs_daemon.h
index 157865a..ec9c0c6 100644
--- a/group/gfs_controld/gfs_daemon.h
+++ b/group/gfs_controld/gfs_daemon.h
@@ -57,18 +57,10 @@
#define MAXLINE 256
-/* group_mode */
-
-#define GROUP_LIBGROUP 2
-#define GROUP_LIBCPG 3
-
extern int daemon_debug_opt;
extern int daemon_quit;
extern int cluster_down;
extern int poll_dlm;
-extern int poll_ignore_plock;
-extern int plock_fd;
-extern int plock_ci;
extern struct list_head mountgroups;
extern int cman_quorate;
extern int our_nodeid;
@@ -77,14 +69,9 @@ extern char daemon_debug_buf[256];
extern char dump_buf[GFSC_DUMP_SIZE];
extern int dump_point;
extern int dump_wrap;
-extern char plock_dump_buf[GFSC_DUMP_SIZE];
-extern int plock_dump_len;
extern int dmsetup_wait;
extern cpg_handle_t cpg_handle_daemon;
extern int libcpg_flow_control_on;
-extern int group_mode;
-extern uint32_t plock_minor;
-extern uint32_t old_plock_minor;
extern struct list_head withdrawn_mounts;
void daemon_dump_save(void);
@@ -124,7 +111,6 @@ struct mountgroup {
uint32_t id;
struct gfsc_mount_args mount_args;
char name[GFS_MOUNTGROUP_LEN+1];
- int old_group_mode;
int mount_client;
int mount_client_result;
@@ -165,44 +151,6 @@ struct mountgroup {
int first_recovery_msg;
int local_recovery_jid;
int local_recovery_busy;
-
- /* cpg-old stuff for rhel5/stable2 compat */
-
- struct list_head members;
- struct list_head members_gone;
- int memb_count;
- int last_stop;
- int last_start;
- int last_finish;
- int last_callback;
- int start_event_nr;
- int start_type;
- int group_leave_on_finish;
- int init;
- int got_our_options;
- int got_our_journals;
- int delay_send_journals;
- int first_mount_pending_stop;
- int first_mounter_done;
- int global_first_recover_done;
- int emulate_first_mounter;
- int wait_first_done;
- int needs_recovery;
- int low_nodeid;
- int master_nodeid;
- int got_kernel_mount;
- struct list_head saved_messages;
- void *start2_fn;
-
- /* cpg-old plock stuff */
-
- int save_plocks;
- struct list_head plock_resources;
- uint32_t associated_ls_id;
- uint64_t cp_handle;
- time_t last_checkpoint_time;
- time_t last_plock_time;
- struct timeval drop_resources_last;
};
/* these need to match the kernel defines of the same name in lm_interface.h */
@@ -239,39 +187,6 @@ int set_mountgroup_nodes(struct mountgroup *mg, int option, int *node_count,
struct gfsc_node **nodes_out);
void free_mg(struct mountgroup *mg);
-/* cpg-old.c */
-int setup_cpg_old(void);
-void close_cpg_old(void);
-void process_cpg_old(int ci);
-
-int gfs_join_mountgroup_old(struct mountgroup *mg, struct gfsc_mount_args *ma);
-void do_leave_old(char *name, int mnterr);
-int send_group_message_old(struct mountgroup *mg, int len, char *buf);
-void save_message_old(struct mountgroup *mg, char *buf, int len, int from,
- int type);
-void send_withdraw_old(struct mountgroup *mg);
-int process_recovery_uevent_old(char *name, int jid, int status, int first);
-void send_remount_old(struct mountgroup *mg, struct gfsc_mount_args *ma);
-void send_mount_status_old(struct mountgroup *mg);
-int do_stop(struct mountgroup *mg);
-int do_finish(struct mountgroup *mg);
-void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids);
-int do_terminate(struct mountgroup *mg);
-int do_withdraw_old(char *table);
-
-/* group.c */
-int setup_groupd(void);
-void close_groupd(void);
-void process_groupd(int ci);
-int set_mountgroup_info_group(struct mountgroup *mg,
- struct gfsc_mountgroup *out);
-int set_node_info_group(struct mountgroup *mg, int nodeid,
- struct gfsc_node *node);
-int set_mountgroups_group(int *count, struct gfsc_mountgroup **mgs_out);
-int set_mountgroup_nodes_group(struct mountgroup *mg, int option,
- int *node_count, struct gfsc_node **nodes_out);
-int set_group_mode(void);
-
/* main.c */
int do_read(int fd, void *buf, size_t count);
int do_write(int fd, void *buf, size_t count);
@@ -297,22 +212,6 @@ void close_cman(void);
void process_cman(int ci);
void kick_node_from_cluster(int nodeid);
-/* plock.c */
-int setup_plocks(void);
-void process_plocks(int ci);
-int limit_plocks(void);
-void receive_plock(struct mountgroup *mg, char *buf, int len, int from);
-void receive_own(struct mountgroup *mg, char *buf, int len, int from);
-void receive_sync(struct mountgroup *mg, char *buf, int len, int from);
-void receive_drop(struct mountgroup *mg, char *buf, int len, int from);
-void process_saved_plocks(struct mountgroup *mg);
-int unlink_checkpoint(struct mountgroup *mg);
-void store_plocks(struct mountgroup *mg, int nodeid);
-void retrieve_plocks(struct mountgroup *mg);
-void purge_plocks(struct mountgroup *mg, int nodeid, int unmount);
-int fill_plock_dump_buf(struct mountgroup *mg);
-int setup_misc_devices(void);
-
/* util.c */
int we_are_in_fence_domain(void);
int set_sysfs(struct mountgroup *mg, char *field, int val);
diff --git a/group/gfs_controld/group.c b/group/gfs_controld/group.c
deleted file mode 100644
index e5046b2..0000000
--- a/group/gfs_controld/group.c
+++ /dev/null
@@ -1,360 +0,0 @@
-#include "gfs_daemon.h"
-#include "config.h"
-#include "cpg-old.h"
-#include "libgroup.h"
-
-#define LOCK_DLM_GROUP_LEVEL 2
-#define LOCK_DLM_GROUP_NAME "gfs"
-
-/* save all the params from callback functions here because we can't
- do the processing within the callback function itself */
-
-group_handle_t gh;
-static int cb_action;
-static char cb_name[GFS_MOUNTGROUP_LEN+1];
-static int cb_event_nr;
-static unsigned int cb_id;
-static int cb_type;
-static int cb_member_count;
-static int cb_members[MAX_NODES];
-
-
-static void stop_cbfn(group_handle_t h, void *private, char *name)
-{
- cb_action = DO_STOP;
- strcpy(cb_name, name);
-}
-
-static void start_cbfn(group_handle_t h, void *private, char *name,
- int event_nr, int type, int member_count, int *members)
-{
- int i;
-
- cb_action = DO_START;
- strncpy(cb_name, name, GFS_MOUNTGROUP_LEN);
- cb_event_nr = event_nr;
- cb_type = type;
- cb_member_count = member_count;
-
- for (i = 0; i < member_count; i++)
- cb_members[i] = members[i];
-}
-
-static void finish_cbfn(group_handle_t h, void *private, char *name,
- int event_nr)
-{
- cb_action = DO_FINISH;
- strncpy(cb_name, name, GFS_MOUNTGROUP_LEN);
- cb_event_nr = event_nr;
-}
-
-static void terminate_cbfn(group_handle_t h, void *private, char *name)
-{
- cb_action = DO_TERMINATE;
- strncpy(cb_name, name, GFS_MOUNTGROUP_LEN);
-}
-
-static void setid_cbfn(group_handle_t h, void *private, char *name,
- unsigned int id)
-{
- cb_action = DO_SETID;
- strncpy(cb_name, name, GFS_MOUNTGROUP_LEN);
- cb_id = id;
-}
-
-static group_callbacks_t callbacks = {
- stop_cbfn,
- start_cbfn,
- finish_cbfn,
- terminate_cbfn,
- setid_cbfn,
-};
-
-static char *str_members(void)
-{
- static char str_members_buf[MAXLINE];
- int i, ret, pos = 0, len = MAXLINE;
-
- memset(str_members_buf, 0, MAXLINE);
-
- for (i = 0; i < cb_member_count; i++) {
- if (i != 0) {
- ret = snprintf(str_members_buf + pos, len - pos, " ");
- if (ret >= len - pos)
- break;
- pos += ret;
- }
- ret = snprintf(str_members_buf + pos, len - pos, "%d",
- cb_members[i]);
- if (ret >= len - pos)
- break;
- pos += ret;
- }
- return str_members_buf;
-}
-
-void process_groupd(int ci)
-{
- struct mountgroup *mg;
- int error = 0;
-
- error = group_dispatch(gh);
- if (error) {
- log_error("groupd_dispatch error %d errno %d", error, errno);
- goto out;
- }
-
- if (!cb_action)
- goto out;
-
- mg = find_mg(cb_name);
- if (!mg) {
- log_error("callback %d group %s not found", cb_action, cb_name);
- error = -1;
- goto out;
- }
-
- switch (cb_action) {
- case DO_STOP:
- log_debug("groupd cb: stop %s", cb_name);
- mg->last_callback = DO_STOP;
- mg->last_stop = mg->last_start;
- do_stop(mg);
- break;
-
- case DO_START:
- log_debug("groupd cb: start %s type %d count %d members %s",
- cb_name, cb_type, cb_member_count, str_members());
- mg->last_callback = DO_START;
- mg->last_start = cb_event_nr;
- do_start(mg, cb_type, cb_member_count, cb_members);
- break;
-
- case DO_FINISH:
- log_debug("groupd cb: finish %s", cb_name);
- mg->last_callback = DO_FINISH;
- mg->last_finish = cb_event_nr;
- do_finish(mg);
- break;
-
- case DO_TERMINATE:
- log_debug("groupd cb: terminate %s", cb_name);
- mg->last_callback = DO_TERMINATE;
- do_terminate(mg);
- break;
-
- case DO_SETID:
- log_debug("groupd cb: set_id %s %x", cb_name, cb_id);
- mg->id = cb_id;
- break;
-
- default:
- error = -EINVAL;
- }
-
- out:
- cb_action = 0;
-}
-
-int setup_groupd(void)
-{
- int rv;
-
- gh = group_init(NULL, LOCK_DLM_GROUP_NAME, LOCK_DLM_GROUP_LEVEL,
- &callbacks, 10);
- if (!gh) {
- log_error("group_init error %p %d", gh, errno);
- return -ENOTCONN;
- }
-
- rv = group_get_fd(gh);
- if (rv < 0)
- log_error("group_get_fd error %d %d", rv, errno);
-
- log_debug("groupd %d", rv);
-
- return rv;
-}
-
-void close_groupd(void)
-{
- group_exit(gh);
-}
-
-/* most of the query info doesn't apply in the LIBGROUP mode, but we can
- emulate some basic parts of it */
-
-int set_mountgroup_info_group(struct mountgroup *mg,
- struct gfsc_mountgroup *out)
-{
- strncpy(out->name, mg->name, GFS_MOUNTGROUP_LEN);
- out->global_id = mg->id;
-
- if (mg->joining)
- out->flags |= GFSC_MF_JOINING;
- if (mg->leaving)
- out->flags |= GFSC_MF_LEAVING;
- if (mg->kernel_stopped)
- out->flags |= GFSC_MF_KERNEL_STOPPED;
-
- out->cg_prev.member_count = mg->memb_count;
-
- return 0;
-}
-
-static int _set_node_info(struct mountgroup *mg, int nodeid,
- struct gfsc_node *node)
-{
- struct mg_member *memb;
- int is_member = 0, is_gone = 0;
-
- list_for_each_entry(memb, &mg->members, list) {
- if (memb->nodeid != nodeid)
- continue;
- is_member = 1;
- goto found;
- }
- list_for_each_entry(memb, &mg->members_gone, list) {
- if (memb->nodeid != nodeid)
- continue;
- is_gone = 1;
- break;
- }
- if (!is_member && !is_gone)
- goto out;
- found:
- node->nodeid = nodeid;
-
- if (is_member)
- node->flags |= GFSC_NF_MEMBER;
- if (memb->spectator)
- node->flags |= GFSC_NF_SPECTATOR;
- if (memb->readonly)
- node->flags |= GFSC_NF_READONLY;
- if (memb->ms_kernel_mount_done)
- node->flags |= GFSC_NF_KERNEL_MOUNT_DONE;
- if (memb->ms_kernel_mount_error)
- node->flags |= GFSC_NF_KERNEL_MOUNT_ERROR;
-
- node->jid = memb->jid;
-
- if (is_gone && memb->gone_type == GROUP_NODE_FAILED)
- node->failed_reason = 1;
- out:
- return 0;
-}
-
-int set_node_info_group(struct mountgroup *mg, int nodeid,
- struct gfsc_node *node)
-{
- return _set_node_info(mg, nodeid, node);
-}
-
-int set_mountgroups_group(int *count, struct gfsc_mountgroup **mgs_out)
-{
- struct mountgroup *mg;
- struct gfsc_mountgroup *mgs, *mgp;
- int mg_count = 0;
-
- list_for_each_entry(mg, &mountgroups, list)
- mg_count++;
-
- mgs = malloc(mg_count * sizeof(struct gfsc_mountgroup));
- if (!mgs)
- return -ENOMEM;
- memset(mgs, 0, mg_count * sizeof(struct gfsc_mountgroup));
-
- mgp = mgs;
- list_for_each_entry(mg, &mountgroups, list) {
- set_mountgroup_info(mg, mgp++);
- }
-
- *count = mg_count;
- *mgs_out = mgs;
- return 0;
-}
-
-int list_count(struct list_head *head)
-{
- struct list_head *tmp;
- int count = 0;
-
- list_for_each(tmp, head)
- count++;
- return count;
-}
-
-int set_mountgroup_nodes_group(struct mountgroup *mg, int option,
- int *node_count, struct gfsc_node **nodes_out)
-{
- struct gfsc_node *nodes = NULL, *nodep;
- struct mg_member *memb;
- int count = 0;
-
- if (option == GFSC_NODES_ALL) {
- count = mg->memb_count + list_count(&mg->members_gone);
- } else if (option == GFSC_NODES_MEMBERS) {
- count = mg->memb_count;
- } else
- goto out;
-
- nodes = malloc(count * sizeof(struct gfsc_node));
- if (!nodes)
- return -ENOMEM;
- memset(nodes, 0, count * sizeof(struct gfsc_node));
- nodep = nodes;
-
- list_for_each_entry(memb, &mg->members, list)
- _set_node_info(mg, memb->nodeid, nodep++);
-
- if (option == GFSC_NODES_ALL) {
- list_for_each_entry(memb, &mg->members_gone, list)
- _set_node_info(mg, memb->nodeid, nodep++);
- }
- out:
- *node_count = count;
- *nodes_out = nodes;
- return 0;
-}
-
-int set_group_mode(void)
-{
- int i = 0, rv, version, limit;
-
- while (1) {
- rv = group_get_version(&version);
-
- if (rv || version < 0) {
- /* we expect to get version of -EAGAIN while groupd
- is detecting the mode of everyone; don't retry
- as long if we're not getting anything back from
- groupd */
-
- log_debug("set_group_mode get_version %d ver %d",
- rv, version);
-
- limit = (version == -EAGAIN) ? 30 : 5;
-
- if (i++ > limit) {
- log_error("cannot get groupd compatibility "
- "mode rv %d ver %d", rv, version);
- return -1;
- }
- sleep(1);
- continue;
- }
-
-
- if (version == GROUP_LIBGROUP) {
- group_mode = GROUP_LIBGROUP;
- return 0;
- } else if (version == GROUP_LIBCPG) {
- group_mode = GROUP_LIBCPG;
- return 0;
- } else {
- log_error("set_group_mode invalid ver %d", version);
- return -1;
- }
- }
-}
-
diff --git a/group/gfs_controld/main.c b/group/gfs_controld/main.c
index 457a5d6..bf481af 100644
--- a/group/gfs_controld/main.c
+++ b/group/gfs_controld/main.c
@@ -157,19 +157,9 @@ struct mountgroup *create_mg(char *name)
return NULL;
memset(mg, 0, sizeof(struct mountgroup));
- if (group_mode == GROUP_LIBGROUP)
- mg->old_group_mode = 1;
-
- INIT_LIST_HEAD(&mg->members);
- INIT_LIST_HEAD(&mg->members_gone);
- INIT_LIST_HEAD(&mg->plock_resources);
- INIT_LIST_HEAD(&mg->saved_messages);
INIT_LIST_HEAD(&mg->changes);
INIT_LIST_HEAD(&mg->journals);
INIT_LIST_HEAD(&mg->node_history);
- mg->init = 1;
- mg->master_nodeid = -1;
- mg->low_nodeid = -1;
strncpy(mg->name, name, GFS_MOUNTGROUP_LEN);
@@ -340,10 +330,7 @@ static void process_uevent(int ci)
if (strcmp(uevent_vals[Env_SUBSYSTEM], "lock_dlm") == 0)
return;
- if (group_mode == GROUP_LIBGROUP)
- do_leave_old(fsname, 0);
- else
- do_leave(fsname, 0);
+ do_leave(fsname, 0);
} else if (!strcmp(uevent_vals[Env_ACTION], "change")) {
int jid, status = -1, first = -1;
@@ -363,10 +350,7 @@ static void process_uevent(int ci)
(strcmp(uevent_vals[Env_FIRSTMOUNT], "Done") == 0))
first = 1;
- if (group_mode == GROUP_LIBGROUP)
- process_recovery_uevent_old(fsname, jid, status, first);
- else
- process_recovery_uevent(fsname, jid, status, first);
+ process_recovery_uevent(fsname, jid, status, first);
} else if (!strcmp(uevent_vals[Env_ACTION], "offline")) {
do_withdraw(fsname);
@@ -445,29 +429,6 @@ static void query_dump_debug(int fd)
do_write(fd, dump_buf, len);
}
-static void query_dump_plocks(int fd, char *name)
-{
- struct mountgroup *mg;
- struct gfsc_header h;
- int rv;
-
- mg = find_mg(name);
- if (!mg) {
- plock_dump_len = 0;
- rv = -ENOENT;
- } else {
- /* writes to plock_dump_buf and sets plock_dump_len */
- rv = fill_plock_dump_buf(mg);
- }
-
- init_header(&h, GFSC_CMD_DUMP_PLOCKS, name, rv, plock_dump_len);
-
- do_write(fd, &h, sizeof(h));
-
- if (plock_dump_len)
- do_write(fd, plock_dump_buf, plock_dump_len);
-}
-
/* combines a header and the data and sends it back to the client in
a single do_write() call */
@@ -506,12 +467,8 @@ static void query_mountgroup_info(int fd, char *name)
}
memset(&mountgroup, 0, sizeof(mountgroup));
- mountgroup.group_mode = group_mode;
- if (group_mode == GROUP_LIBGROUP)
- rv = set_mountgroup_info_group(mg, &mountgroup);
- else
- rv = set_mountgroup_info(mg, &mountgroup);
+ rv = set_mountgroup_info(mg, &mountgroup);
out:
do_reply(fd, GFSC_CMD_MOUNTGROUP_INFO, name, rv,
(char *)&mountgroup, sizeof(mountgroup));
@@ -529,10 +486,7 @@ static void query_node_info(int fd, char *name, int nodeid)
goto out;
}
- if (group_mode == GROUP_LIBGROUP)
- rv = set_node_info_group(mg, nodeid, &node);
- else
- rv = set_node_info(mg, nodeid, &node);
+ rv = set_node_info(mg, nodeid, &node);
out:
do_reply(fd, GFSC_CMD_NODE_INFO, name, rv,
(char *)&node, sizeof(node));
@@ -544,11 +498,7 @@ static void query_mountgroups(int fd, int max)
struct gfsc_mountgroup *mgs = NULL;
int rv, result;
- if (group_mode == GROUP_LIBGROUP)
- rv = set_mountgroups_group(&mg_count, &mgs);
- else
- rv = set_mountgroups(&mg_count, &mgs);
-
+ rv = set_mountgroups(&mg_count, &mgs);
if (rv < 0) {
result = rv;
mg_count = 0;
@@ -583,11 +533,7 @@ static void query_mountgroup_nodes(int fd, char *name, int option, int max)
goto out;
}
- if (group_mode == GROUP_LIBGROUP)
- rv = set_mountgroup_nodes_group(mg, option, &node_count, &nodes);
- else
- rv = set_mountgroup_nodes(mg, option, &node_count, &nodes);
-
+ rv = set_mountgroup_nodes(mg, option, &node_count, &nodes);
if (rv < 0) {
result = rv;
node_count = 0;
@@ -757,11 +703,7 @@ static void do_join(int ci, struct gfsc_mount_args *ma)
list_add(&mg->list, &mountgroups);
- if (group_mode == GROUP_LIBGROUP)
- rv = gfs_join_mountgroup_old(mg, ma);
- else
- rv = gfs_join_mountgroup(mg);
-
+ rv = gfs_join_mountgroup(mg);
if (rv) {
log_error("join: group join error %d", rv);
list_del(&mg->list);
@@ -829,10 +771,7 @@ static void do_mount_done(char *table, int result)
mg->kernel_mount_done = 1;
mg->kernel_mount_error = result;
- if (group_mode == GROUP_LIBGROUP)
- send_mount_status_old(mg);
- else
- gfs_mount_done(mg);
+ gfs_mount_done(mg);
}
void client_reply_remount(struct mountgroup *mg, int ci, int result)
@@ -870,13 +809,6 @@ static void do_remount(int ci, struct gfsc_mount_args *ma)
if ((mg->ro && ro) || (!mg->ro && !ro))
goto out;
- if (group_mode == GROUP_LIBGROUP) {
- /* the receive calls client_reply_remount */
- mg->remount_client = ci;
- send_remount_old(mg, ma);
- return;
- }
-
send_remount(mg, ma);
out:
client_reply_remount(mg, ci, result);
@@ -943,10 +875,7 @@ void process_connection(int ci)
break;
case GFSC_CMD_FS_LEAVE:
- if (group_mode == GROUP_LIBGROUP)
- do_leave_old(ma->table, h.data);
- else
- do_leave(ma->table, h.data);
+ do_leave(ma->table, h.data);
break;
case GFSC_CMD_FS_MOUNT_DONE:
@@ -1068,9 +997,6 @@ static void *process_queries(void *arg)
case GFSC_CMD_DUMP_DEBUG:
query_dump_debug(f);
break;
- case GFSC_CMD_DUMP_PLOCKS:
- query_dump_plocks(f, h.name);
- break;
case GFSC_CMD_MOUNTGROUP_INFO:
query_mountgroup_info(f, h.name);
break;
@@ -1158,75 +1084,19 @@ static void loop(void)
goto out;
client_add(rv, process_uevent, NULL);
- group_mode = GROUP_LIBCPG;
-
- if (cfgd_groupd_compat) {
- rv = setup_groupd();
- if (rv < 0)
- goto out;
- client_add(rv, process_groupd, cluster_dead);
-
- switch (cfgd_groupd_compat) {
- case 1:
- group_mode = GROUP_LIBGROUP;
- rv = 0;
- break;
- case 2:
- rv = set_group_mode();
- break;
- default:
- log_error("inval groupd_compat %d", cfgd_groupd_compat);
- rv = -1;
- break;
- }
- if (rv < 0)
- goto out;
- }
- log_debug("group_mode %d compat %d", group_mode, cfgd_groupd_compat);
-
- if (group_mode == GROUP_LIBCPG) {
-
- /*
- * The new, good, way of doing things using libcpg directly.
- * code in: cpg-new.c
- */
-
- rv = setup_cpg();
- if (rv < 0)
- goto out;
- client_add(rv, process_cpg, cluster_dead);
-
- rv = set_protocol();
- if (rv < 0)
- goto out;
-
- rv = setup_dlmcontrol();
- if (rv < 0)
- goto out;
- client_add(rv, process_dlmcontrol, dlmcontrol_dead);
-
- } else if (group_mode == GROUP_LIBGROUP) {
-
- /*
- * The old, bad, way of doing things using libgroup.
- * code in: cpg-old.c group.c plock.c
- */
-
- rv = setup_cpg_old();
- if (rv < 0)
- goto out;
- client_add(rv, process_cpg_old, cluster_dead);
+ rv = setup_cpg();
+ if (rv < 0)
+ goto out;
+ client_add(rv, process_cpg, cluster_dead);
- rv = setup_misc_devices();
- if (rv < 0)
- goto out;
+ rv = set_protocol();
+ if (rv < 0)
+ goto out;
- rv = setup_plocks();
- if (rv < 0)
- goto out;
- plock_fd = rv;
- plock_ci = client_add(rv, process_plocks, NULL);
- }
+ rv = setup_dlmcontrol();
+ if (rv < 0)
+ goto out;
+ client_add(rv, process_dlmcontrol, dlmcontrol_dead);
for (;;) {
rv = poll(pollfd, client_maxi + 1, poll_timeout);
@@ -1265,20 +1135,10 @@ static void loop(void)
poll_timeout = -1;
if (poll_dlm) {
- /* only happens for GROUP_LIBCPG */
process_mountgroups();
poll_timeout = 500;
}
- if (poll_ignore_plock) {
- /* only happens for GROUP_LIBGROUP */
- if (!limit_plocks()) {
- poll_ignore_plock = 0;
- client_back(plock_ci, plock_fd);
- }
- poll_timeout = 1000;
- }
-
if (dmsetup_wait) {
update_dmsetup_wait();
if (dmsetup_wait) {
@@ -1292,12 +1152,7 @@ static void loop(void)
query_unlock();
}
out:
- if (group_mode == GROUP_LIBCPG)
- close_cpg();
- else if (group_mode == GROUP_LIBGROUP)
- close_cpg_old();
- if (cfgd_groupd_compat)
- close_groupd();
+ close_cpg();
close_logging();
close_ccs();
close_cman();
@@ -1358,33 +1213,13 @@ static void print_usage(void)
printf("\n");
printf(" -D Enable debugging to stderr and don't fork\n");
printf(" -L Enable debugging to log file\n");
- printf(" -g <num> groupd compatibility mode, 0 off, 1 on, 2 detect\n");
- printf(" 0: use libcpg, no backward compat, best performance\n");
- printf(" 1: use libgroup for compat with cluster2/rhel5\n");
- printf(" 2: use groupd to detect old, or mode 1, nodes that\n"
- " require compat, use libcpg if none found\n");
- printf(" Default is %d\n", DEFAULT_GROUPD_COMPAT);
printf(" -w <num> Enable (1) or disable (0) withdraw\n");
printf(" Default is %d\n", DEFAULT_ENABLE_WITHDRAW);
- printf(" -p <num> Enable (1) or disable (0) plock code\n");
- printf(" Default is %d\n", DEFAULT_ENABLE_PLOCK);
- printf(" -P Enable plock debugging\n");
-
- printf(" -l <limit> Limit the rate of plock operations\n");
- printf(" Default is %d, set to 0 for no limit\n", DEFAULT_PLOCK_RATE_LIMIT);
- printf(" -o <n> Enable (1) or disable (0) plock ownership\n");
- printf(" Default is %d\n", DEFAULT_PLOCK_OWNERSHIP);
- printf(" -t <ms> plock ownership drop resources time (milliseconds)\n");
- printf(" Default is %u\n", DEFAULT_DROP_RESOURCES_TIME);
- printf(" -c <num> plock ownership drop resources count\n");
- printf(" Default is %u\n", DEFAULT_DROP_RESOURCES_COUNT);
- printf(" -a <ms> plock ownership drop resources age (milliseconds)\n");
- printf(" Default is %u\n", DEFAULT_DROP_RESOURCES_AGE);
printf(" -h Print this help, then exit\n");
printf(" -V Print program version information, then exit\n");
}
-#define OPTION_STRING "LDKg:w:f:q:d:p:Pl:o:t:c:a:hV"
+#define OPTION_STRING "LDw:hV"
static void read_arguments(int argc, char **argv)
{
@@ -1405,51 +1240,11 @@ static void read_arguments(int argc, char **argv)
cfgd_debug_logfile = 1;
break;
- case 'g':
- optd_groupd_compat = 1;
- cfgd_groupd_compat = atoi(optarg);
- break;
-
case 'w':
optd_enable_withdraw = 1;
cfgd_enable_withdraw = atoi(optarg);
break;
- case 'p':
- optd_enable_plock = 1;
- cfgd_enable_plock = atoi(optarg);
- break;
-
- case 'P':
- optd_plock_debug = 1;
- cfgd_plock_debug = 1;
- break;
-
- case 'l':
- optd_plock_rate_limit = 1;
- cfgd_plock_rate_limit = atoi(optarg);
- break;
-
- case 'o':
- optd_plock_ownership = 1;
- cfgd_plock_ownership = atoi(optarg);
- break;
-
- case 't':
- optd_drop_resources_time = 1;
- cfgd_drop_resources_time = atoi(optarg);
- break;
-
- case 'c':
- optd_drop_resources_count = 1;
- cfgd_drop_resources_count = atoi(optarg);
- break;
-
- case 'a':
- optd_drop_resources_age = 1;
- cfgd_drop_resources_age = atoi(optarg);
- break;
-
case 'h':
print_usage();
exit(EXIT_SUCCESS);
@@ -1559,10 +1354,7 @@ void daemon_dump_save(void)
int daemon_debug_opt;
int daemon_quit;
int cluster_down;
-int poll_ignore_plock;
int poll_dlm;
-int plock_fd;
-int plock_ci;
struct list_head mountgroups;
int cman_quorate;
int our_nodeid;
@@ -1571,13 +1363,8 @@ char daemon_debug_buf[256];
char dump_buf[GFSC_DUMP_SIZE];
int dump_point;
int dump_wrap;
-char plock_dump_buf[GFSC_DUMP_SIZE];
-int plock_dump_len;
int dmsetup_wait;
cpg_handle_t cpg_handle_daemon;
int libcpg_flow_control_on;
-int group_mode;
-uint32_t plock_minor;
-uint32_t old_plock_minor;
struct list_head withdrawn_mounts;
diff --git a/group/gfs_controld/plock.c b/group/gfs_controld/plock.c
deleted file mode 100644
index ce17afe..0000000
--- a/group/gfs_controld/plock.c
+++ /dev/null
@@ -1,2361 +0,0 @@
-/* gfs_controld only handles plocks in rhel5/stable2 compat mode */
-
-#include "gfs_daemon.h"
-#include "cpg-old.h"
-#include "config.h"
-
-#include <linux/dlm_plock.h>
-
-static uint32_t plock_read_count;
-static uint32_t plock_recv_count;
-static uint32_t plock_rate_delays;
-static struct timeval plock_read_time;
-static struct timeval plock_recv_time;
-static struct timeval plock_rate_last;
-
-static int plock_device_fd = -1;
-static SaCkptHandleT ckpt_handle;
-static SaCkptCallbacksT callbacks = { 0, 0 };
-static SaVersionT version = { 'B', 1, 1 };
-static char section_buf[1024 * 1024];
-static uint32_t section_len;
-static int need_fsid_translation = 0;
-
-struct pack_plock {
- uint64_t start;
- uint64_t end;
- uint64_t owner;
- uint32_t pid;
- uint32_t nodeid;
- uint8_t ex;
- uint8_t waiter;
- uint16_t pad1;
- uint32_t pad;
-};
-
-#define R_GOT_UNOWN 0x00000001 /* have received owner=0 message */
-
-struct resource {
- struct list_head list; /* list of resources */
- uint64_t number;
- int owner; /* nodeid or 0 for unowned */
- uint32_t flags;
- struct timeval last_access;
- struct list_head locks; /* one lock for each range */
- struct list_head waiters;
- struct list_head pending; /* discovering r owner */
-};
-
-#define P_SYNCING 0x00000001 /* plock has been sent as part of sync but not
- yet received */
-
-struct posix_lock {
- struct list_head list; /* resource locks or waiters list */
- uint32_t pid;
- uint64_t owner;
- uint64_t start;
- uint64_t end;
- int ex;
- int nodeid;
- uint32_t flags;
-};
-
-struct lock_waiter {
- struct list_head list;
- uint32_t flags;
- struct dlm_plock_info info;
-};
-
-
-static void send_own(struct mountgroup *mg, struct resource *r, int owner);
-static void save_pending_plock(struct mountgroup *mg, struct resource *r,
- struct dlm_plock_info *in);
-
-
-static int got_unown(struct resource *r)
-{
- return !!(r->flags & R_GOT_UNOWN);
-}
-
-static void info_bswap_out(struct dlm_plock_info *i)
-{
- i->version[0] = cpu_to_le32(i->version[0]);
- i->version[1] = cpu_to_le32(i->version[1]);
- i->version[2] = cpu_to_le32(i->version[2]);
- i->pid = cpu_to_le32(i->pid);
- i->nodeid = cpu_to_le32(i->nodeid);
- i->rv = cpu_to_le32(i->rv);
- i->fsid = cpu_to_le32(i->fsid);
- i->number = cpu_to_le64(i->number);
- i->start = cpu_to_le64(i->start);
- i->end = cpu_to_le64(i->end);
- i->owner = cpu_to_le64(i->owner);
-}
-
-static void info_bswap_in(struct dlm_plock_info *i)
-{
- i->version[0] = le32_to_cpu(i->version[0]);
- i->version[1] = le32_to_cpu(i->version[1]);
- i->version[2] = le32_to_cpu(i->version[2]);
- i->pid = le32_to_cpu(i->pid);
- i->nodeid = le32_to_cpu(i->nodeid);
- i->rv = le32_to_cpu(i->rv);
- i->fsid = le32_to_cpu(i->fsid);
- i->number = le64_to_cpu(i->number);
- i->start = le64_to_cpu(i->start);
- i->end = le64_to_cpu(i->end);
- i->owner = le64_to_cpu(i->owner);
-}
-
-static char *op_str(int optype)
-{
- switch (optype) {
- case DLM_PLOCK_OP_LOCK:
- return "LK";
- case DLM_PLOCK_OP_UNLOCK:
- return "UN";
- case DLM_PLOCK_OP_GET:
- return "GET";
- default:
- return "??";
- }
-}
-
-static char *ex_str(int optype, int ex)
-{
- if (optype == DLM_PLOCK_OP_UNLOCK || optype == DLM_PLOCK_OP_GET)
- return "-";
- if (ex)
- return "WR";
- else
- return "RD";
-}
-
-/*
- * In kernels before 2.6.26, plocks came from gfs2's lock_dlm module.
- * Reading plocks from there as well should allow us to use cluster3
- * on old (RHEL5) kernels. In this case, the fsid we read in plock_info
- * structs is the mountgroup id, which we need to translate to the ls id.
- */
-
-int setup_plocks(void)
-{
- SaAisErrorT err;
-
- plock_read_count = 0;
- plock_recv_count = 0;
- plock_rate_delays = 0;
- gettimeofday(&plock_read_time, NULL);
- gettimeofday(&plock_recv_time, NULL);
- gettimeofday(&plock_rate_last, NULL);
-
- err = saCkptInitialize(&ckpt_handle, &callbacks, &version);
- if (err != SA_AIS_OK) {
- log_error("ckpt init error %d", err);
- cfgd_enable_plock = 0;
-
- /* still try to open and read the control device so that we can
- send ENOSYS back to the kernel if it tries to do a plock */
- }
-
- if (plock_minor) {
- need_fsid_translation = 1;
- plock_device_fd = open("/dev/misc/dlm_plock", O_RDWR);
- } else if (old_plock_minor) {
- log_debug("setup_plocks using old lock_dlm interface");
- need_fsid_translation = 0;
- plock_device_fd = open("/dev/misc/lock_dlm_plock", O_RDWR);
- }
-
- if (plock_device_fd < 0) {
- log_error("Failure to open plock device: %s", strerror(errno));
- return -1;
- }
-
- log_debug("plocks %d", plock_device_fd);
- log_debug("plock cpg message size: %u bytes",
- (unsigned int) (sizeof(struct gdlm_header) +
- sizeof(struct dlm_plock_info)));
-
- return plock_device_fd;
-}
-
-/* FIXME: unify these two */
-
-static unsigned long time_diff_ms(struct timeval *begin, struct timeval *end)
-{
- struct timeval result;
- timersub(end, begin, &result);
- return (result.tv_sec * 1000) + (result.tv_usec / 1000);
-}
-
-static uint64_t dt_usec(struct timeval *start, struct timeval *stop)
-{
- uint64_t dt;
-
- dt = stop->tv_sec - start->tv_sec;
- dt *= 1000000;
- dt += stop->tv_usec - start->tv_usec;
- return dt;
-}
-
-static struct resource *search_resource(struct mountgroup *mg, uint64_t number)
-{
- struct resource *r;
-
- list_for_each_entry(r, &mg->plock_resources, list) {
- if (r->number == number)
- return r;
- }
- return NULL;
-}
-
-static int find_resource(struct mountgroup *mg, uint64_t number, int create,
- struct resource **r_out)
-{
- struct resource *r = NULL;
- int rv = 0;
-
- r = search_resource(mg, number);
- if (r)
- goto out;
-
- if (create == 0) {
- rv = -ENOENT;
- goto out;
- }
-
- r = malloc(sizeof(struct resource));
- if (!r) {
- log_error("find_resource no memory %d", errno);
- rv = -ENOMEM;
- goto out;
- }
-
- memset(r, 0, sizeof(struct resource));
- r->number = number;
- INIT_LIST_HEAD(&r->locks);
- INIT_LIST_HEAD(&r->waiters);
- INIT_LIST_HEAD(&r->pending);
-
- if (cfgd_plock_ownership)
- r->owner = -1;
- else
- r->owner = 0;
-
- list_add_tail(&r->list, &mg->plock_resources);
- out:
- if (r)
- gettimeofday(&r->last_access, NULL);
- *r_out = r;
- return rv;
-}
-
-static void put_resource(struct resource *r)
-{
- /* with ownership, resources are only freed via drop messages */
- if (cfgd_plock_ownership)
- return;
-
- if (list_empty(&r->locks) && list_empty(&r->waiters)) {
- list_del(&r->list);
- free(r);
- }
-}
-
-static inline int ranges_overlap(uint64_t start1, uint64_t end1,
- uint64_t start2, uint64_t end2)
-{
- if (end1 < start2 || start1 > end2)
- return 0;
- return 1;
-}
-
-/**
- * overlap_type - returns a value based on the type of overlap
- * @s1 - start of new lock range
- * @e1 - end of new lock range
- * @s2 - start of existing lock range
- * @e2 - end of existing lock range
- *
- */
-
-static int overlap_type(uint64_t s1, uint64_t e1, uint64_t s2, uint64_t e2)
-{
- int ret;
-
- /*
- * ---r1---
- * ---r2---
- */
-
- if (s1 == s2 && e1 == e2)
- ret = 0;
-
- /*
- * --r1--
- * ---r2---
- */
-
- else if (s1 == s2 && e1 < e2)
- ret = 1;
-
- /*
- * --r1--
- * ---r2---
- */
-
- else if (s1 > s2 && e1 == e2)
- ret = 1;
-
- /*
- * --r1--
- * ---r2---
- */
-
- else if (s1 > s2 && e1 < e2)
- ret = 2;
-
- /*
- * ---r1--- or ---r1--- or ---r1---
- * --r2-- --r2-- --r2--
- */
-
- else if (s1 <= s2 && e1 >= e2)
- ret = 3;
-
- /*
- * ---r1---
- * ---r2---
- */
-
- else if (s1 > s2 && e1 > e2)
- ret = 4;
-
- /*
- * ---r1---
- * ---r2---
- */
-
- else if (s1 < s2 && e1 < e2)
- ret = 4;
-
- else
- ret = -1;
-
- return ret;
-}
-
-/* shrink the range start2:end2 by the partially overlapping start:end */
-
-static int shrink_range2(uint64_t *start2, uint64_t *end2,
- uint64_t start, uint64_t end)
-{
- int error = 0;
-
- if (*start2 < start)
- *end2 = start - 1;
- else if (*end2 > end)
- *start2 = end + 1;
- else
- error = -1;
- return error;
-}
-
-static int shrink_range(struct posix_lock *po, uint64_t start, uint64_t end)
-{
- return shrink_range2(&po->start, &po->end, start, end);
-}
-
-static int is_conflict(struct resource *r, struct dlm_plock_info *in, int get)
-{
- struct posix_lock *po;
-
- list_for_each_entry(po, &r->locks, list) {
- if (po->nodeid == in->nodeid && po->owner == in->owner)
- continue;
- if (!ranges_overlap(po->start, po->end, in->start, in->end))
- continue;
-
- if (in->ex || po->ex) {
- if (get) {
- in->ex = po->ex;
- in->pid = po->pid;
- in->start = po->start;
- in->end = po->end;
- }
- return 1;
- }
- }
- return 0;
-}
-
-static int add_lock(struct resource *r, uint32_t nodeid, uint64_t owner,
- uint32_t pid, int ex, uint64_t start, uint64_t end)
-{
- struct posix_lock *po;
-
- po = malloc(sizeof(struct posix_lock));
- if (!po)
- return -ENOMEM;
- memset(po, 0, sizeof(struct posix_lock));
-
- po->start = start;
- po->end = end;
- po->nodeid = nodeid;
- po->owner = owner;
- po->pid = pid;
- po->ex = ex;
- list_add_tail(&po->list, &r->locks);
-
- return 0;
-}
-
-/* RN within RE (and starts or ends on RE boundary)
- 1. add new lock for non-overlap area of RE, orig mode
- 2. convert RE to RN range and mode */
-
-static int lock_case1(struct posix_lock *po, struct resource *r,
- struct dlm_plock_info *in)
-{
- uint64_t start2, end2;
- int rv;
-
- /* non-overlapping area start2:end2 */
- start2 = po->start;
- end2 = po->end;
- rv = shrink_range2(&start2, &end2, in->start, in->end);
- if (rv)
- goto out;
-
- po->start = in->start;
- po->end = in->end;
- po->ex = in->ex;
-
- rv = add_lock(r, in->nodeid, in->owner, in->pid, !in->ex, start2, end2);
- out:
- return rv;
-}
-
-/* RN within RE (RE overlaps RN on both sides)
- 1. add new lock for front fragment, orig mode
- 2. add new lock for back fragment, orig mode
- 3. convert RE to RN range and mode */
-
-static int lock_case2(struct posix_lock *po, struct resource *r,
- struct dlm_plock_info *in)
-
-{
- int rv;
-
- rv = add_lock(r, in->nodeid, in->owner, in->pid,
- !in->ex, po->start, in->start - 1);
- if (rv)
- goto out;
-
- rv = add_lock(r, in->nodeid, in->owner, in->pid,
- !in->ex, in->end + 1, po->end);
- if (rv)
- goto out;
-
- po->start = in->start;
- po->end = in->end;
- po->ex = in->ex;
- out:
- return rv;
-}
-
-static int lock_internal(struct mountgroup *mg, struct resource *r,
- struct dlm_plock_info *in)
-{
- struct posix_lock *po, *safe;
- int rv = 0;
-
- list_for_each_entry_safe(po, safe, &r->locks, list) {
- if (po->nodeid != in->nodeid || po->owner != in->owner)
- continue;
- if (!ranges_overlap(po->start, po->end, in->start, in->end))
- continue;
-
- /* existing range (RE) overlaps new range (RN) */
-
- switch(overlap_type(in->start, in->end, po->start, po->end)) {
-
- case 0:
- if (po->ex == in->ex)
- goto out;
-
- /* ranges the same - just update the existing lock */
- po->ex = in->ex;
- goto out;
-
- case 1:
- if (po->ex == in->ex)
- goto out;
-
- rv = lock_case1(po, r, in);
- goto out;
-
- case 2:
- if (po->ex == in->ex)
- goto out;
-
- rv = lock_case2(po, r, in);
- goto out;
-
- case 3:
- list_del(&po->list);
- free(po);
- break;
-
- case 4:
- if (po->start < in->start)
- po->end = in->start - 1;
- else
- po->start = in->end + 1;
- break;
-
- default:
- rv = -1;
- goto out;
- }
- }
-
- rv = add_lock(r, in->nodeid, in->owner, in->pid,
- in->ex, in->start, in->end);
- out:
- return rv;
-
-}
-
-static int unlock_internal(struct mountgroup *mg, struct resource *r,
- struct dlm_plock_info *in)
-{
- struct posix_lock *po, *safe;
- int rv = 0;
-
- list_for_each_entry_safe(po, safe, &r->locks, list) {
- if (po->nodeid != in->nodeid || po->owner != in->owner)
- continue;
- if (!ranges_overlap(po->start, po->end, in->start, in->end))
- continue;
-
- /* existing range (RE) overlaps new range (RN) */
-
- switch (overlap_type(in->start, in->end, po->start, po->end)) {
-
- case 0:
- /* ranges the same - just remove the existing lock */
-
- list_del(&po->list);
- free(po);
- goto out;
-
- case 1:
- /* RN within RE and starts or ends on RE boundary -
- * shrink and update RE */
-
- rv = shrink_range(po, in->start, in->end);
- goto out;
-
- case 2:
- /* RN within RE - shrink and update RE to be front
- * fragment, and add a new lock for back fragment */
-
- rv = add_lock(r, in->nodeid, in->owner, in->pid,
- po->ex, in->end + 1, po->end);
- po->end = in->start - 1;
- goto out;
-
- case 3:
- /* RE within RN - remove RE, then continue checking
- * because RN could cover other locks */
-
- list_del(&po->list);
- free(po);
- continue;
-
- case 4:
- /* front of RE in RN, or end of RE in RN - shrink and
- * update RE, then continue because RN could cover
- * other locks */
-
- rv = shrink_range(po, in->start, in->end);
- continue;
-
- default:
- rv = -1;
- goto out;
- }
- }
- out:
- return rv;
-}
-
-static int add_waiter(struct mountgroup *mg, struct resource *r,
- struct dlm_plock_info *in)
-
-{
- struct lock_waiter *w;
-
- w = malloc(sizeof(struct lock_waiter));
- if (!w)
- return -ENOMEM;
- memcpy(&w->info, in, sizeof(struct dlm_plock_info));
- list_add_tail(&w->list, &r->waiters);
- return 0;
-}
-
-static void write_result(struct mountgroup *mg, struct dlm_plock_info *in,
- int rv)
-{
- if (need_fsid_translation)
- in->fsid = mg->associated_ls_id;
-
- in->rv = rv;
- write(plock_device_fd, in, sizeof(struct dlm_plock_info));
-}
-
-static void do_waiters(struct mountgroup *mg, struct resource *r)
-{
- struct lock_waiter *w, *safe;
- struct dlm_plock_info *in;
- int rv;
-
- list_for_each_entry_safe(w, safe, &r->waiters, list) {
- in = &w->info;
-
- if (is_conflict(r, in, 0))
- continue;
-
- list_del(&w->list);
-
- /*
- log_group(mg, "take waiter %llx %llx-%llx %d/%u/%llx",
- in->number, in->start, in->end,
- in->nodeid, in->pid, in->owner);
- */
-
- rv = lock_internal(mg, r, in);
-
- if (in->nodeid == our_nodeid)
- write_result(mg, in, rv);
-
- free(w);
- }
-}
-
-static void do_lock(struct mountgroup *mg, struct dlm_plock_info *in,
- struct resource *r)
-{
- int rv;
-
- if (is_conflict(r, in, 0)) {
- if (!in->wait)
- rv = -EAGAIN;
- else {
- rv = add_waiter(mg, r, in);
- if (rv)
- goto out;
- rv = -EINPROGRESS;
- }
- } else
- rv = lock_internal(mg, r, in);
-
- out:
- if (in->nodeid == our_nodeid && rv != -EINPROGRESS)
- write_result(mg, in, rv);
-
- do_waiters(mg, r);
- put_resource(r);
-}
-
-static void do_unlock(struct mountgroup *mg, struct dlm_plock_info *in,
- struct resource *r)
-{
- int rv;
-
- rv = unlock_internal(mg, r, in);
-
- if (in->nodeid == our_nodeid)
- write_result(mg, in, rv);
-
- do_waiters(mg, r);
- put_resource(r);
-}
-
-/* we don't even get to this function if the getlk isn't from us */
-
-static void do_get(struct mountgroup *mg, struct dlm_plock_info *in,
- struct resource *r)
-{
- int rv;
-
- if (is_conflict(r, in, 1))
- rv = 1;
- else
- rv = 0;
-
- write_result(mg, in, rv);
-}
-
-static void __receive_plock(struct mountgroup *mg, struct dlm_plock_info *in,
- int from, struct resource *r)
-{
- switch (in->optype) {
- case DLM_PLOCK_OP_LOCK:
- mg->last_plock_time = time(NULL);
- do_lock(mg, in, r);
- break;
- case DLM_PLOCK_OP_UNLOCK:
- mg->last_plock_time = time(NULL);
- do_unlock(mg, in, r);
- break;
- case DLM_PLOCK_OP_GET:
- do_get(mg, in, r);
- break;
- default:
- log_error("receive_plock from %d optype %d", from, in->optype);
- if (from == our_nodeid)
- write_result(mg, in, -EINVAL);
- }
-}
-
-/* When mg members receive our options message (for our mount), one of them
- saves all plock state received to that point in a checkpoint and then sends
- us our journals message. We know to retrieve the plock state from the
- checkpoint when we receive our journals message. Any plocks messages that
- arrive between seeing our options message and our journals message needs to
- be saved and processed after we synchronize our plock state from the
- checkpoint. Any plock message received while we're mounting but before we
- set save_plocks (when we see our options message) can be ignored because it
- should be reflected in the checkpointed state. */
-
-static void _receive_plock(struct mountgroup *mg, char *buf, int len, int from)
-{
- struct dlm_plock_info info;
- struct gdlm_header *hd = (struct gdlm_header *) buf;
- struct resource *r = NULL;
- struct timeval now;
- uint64_t usec;
- int rv, create;
-
- memcpy(&info, buf + sizeof(struct gdlm_header), sizeof(info));
- info_bswap_in(&info);
-
- log_plock(mg, "receive plock %llx %s %s %llx-%llx %d/%u/%llx w %d",
- (unsigned long long)info.number,
- op_str(info.optype),
- ex_str(info.optype, info.ex),
- (unsigned long long)info.start, (unsigned long long)info.end,
- info.nodeid, info.pid, (unsigned long long)info.owner,
- info.wait);
-
- plock_recv_count++;
- if (!(plock_recv_count % 1000)) {
- gettimeofday(&now, NULL);
- usec = dt_usec(&plock_recv_time, &now);
- log_group(mg, "plock_recv_count %u time %.3f s",
- plock_recv_count, usec * 1.e-6);
- plock_recv_time = now;
- }
-
- if (info.optype == DLM_PLOCK_OP_GET && from != our_nodeid)
- return;
-
- if (from != hd->nodeid || from != info.nodeid) {
- log_error("receive_plock from %d header %d info %d",
- from, hd->nodeid, info.nodeid);
- return;
- }
-
- create = !cfgd_plock_ownership;
-
- rv = find_resource(mg, info.number, create, &r);
-
- if (rv && cfgd_plock_ownership) {
- /* There must have been a race with a drop, so we need to
- ignore this plock op which will be resent. If we're the one
- who sent the plock, we need to send_own() and put it on the
- pending list to resend once the owner is established. */
-
- log_debug("receive_plock from %d no r %llx", from,
- (unsigned long long)info.number);
-
- if (from != our_nodeid)
- return;
-
- rv = find_resource(mg, info.number, 1, &r);
- if (rv)
- return;
- send_own(mg, r, our_nodeid);
- save_pending_plock(mg, r, &info);
- return;
- }
- if (rv) {
- /* r not found, rv is -ENOENT, this shouldn't happen because
- process_plocks() creates a resource for every op */
-
- log_error("receive_plock from %d no r %llx %d", from,
- (unsigned long long)info.number, rv);
- return;
- }
-
- /* The owner should almost always be 0 here, but other owners may
- be possible given odd combinations of races with drop. Odd races to
- worry about (some seem pretty improbable):
-
- - A sends drop, B sends plock, receive drop, receive plock.
- This is addressed above.
-
- - A sends drop, B sends plock, receive drop, B reads plock
- and sends own, receive plock, on B we find owner of -1.
-
- - A sends drop, B sends two plocks, receive drop, receive plocks.
- Receiving the first plock is the previous case, receiving the
- second plock will find r with owner of -1.
-
- - A sends drop, B sends two plocks, receive drop, C sends own,
- receive plock, B sends own, receive own (C), receive plock,
- receive own (B).
-
- Haven't tried to cook up a scenario that would lead to the
- last case below; receiving a plock from ourself and finding
- we're the owner of r. */
-
- if (!r->owner) {
- __receive_plock(mg, &info, from, r);
-
- } else if (r->owner == -1) {
- log_debug("receive_plock from %d r %llx owner %d", from,
- (unsigned long long)info.number, r->owner);
-
- if (from == our_nodeid)
- save_pending_plock(mg, r, &info);
-
- } else if (r->owner != our_nodeid) {
- /* might happen, if frequent change to log_debug */
- log_error("receive_plock from %d r %llx owner %d", from,
- (unsigned long long)info.number, r->owner);
-
- if (from == our_nodeid)
- save_pending_plock(mg, r, &info);
-
- } else if (r->owner == our_nodeid) {
- /* might happen, if frequent change to log_debug */
- log_error("receive_plock from %d r %llx owner %d", from,
- (unsigned long long)info.number, r->owner);
-
- if (from == our_nodeid)
- __receive_plock(mg, &info, from, r);
- }
-}
-
-void receive_plock(struct mountgroup *mg, char *buf, int len, int from)
-{
- if (mg->save_plocks) {
- save_message_old(mg, buf, len, from, MSG_PLOCK);
- return;
- }
-
- if (!mg->got_our_journals) {
- log_group(mg, "not saving plock messages yet");
- return;
- }
-
- _receive_plock(mg, buf, len, from);
-}
-
-static int send_struct_info(struct mountgroup *mg, struct dlm_plock_info *in,
- int msg_type)
-{
- char *buf;
- int rv, len;
- struct gdlm_header *hd;
-
- len = sizeof(struct gdlm_header) + sizeof(struct dlm_plock_info);
- buf = malloc(len);
- if (!buf) {
- rv = -ENOMEM;
- goto out;
- }
- memset(buf, 0, len);
-
- info_bswap_out(in);
-
- hd = (struct gdlm_header *)buf;
- hd->type = msg_type;
- hd->nodeid = our_nodeid;
- hd->to_nodeid = 0;
-
- memcpy(buf + sizeof(struct gdlm_header), in, sizeof(*in));
-
- rv = send_group_message_old(mg, len, buf);
-
- free(buf);
- out:
- if (rv)
- log_error("send plock message error %d", rv);
- return rv;
-}
-
-static void send_plock(struct mountgroup *mg, struct resource *r,
- struct dlm_plock_info *in)
-{
- send_struct_info(mg, in, MSG_PLOCK);
-}
-
-static void send_own(struct mountgroup *mg, struct resource *r, int owner)
-{
- struct dlm_plock_info info;
-
- /* if we've already sent an own message for this resource,
- (pending list is not empty), then we shouldn't send another */
-
- if (!list_empty(&r->pending)) {
- log_debug("send_own %llx already pending",
- (unsigned long long)r->number);
- return;
- }
-
- memset(&info, 0, sizeof(info));
- info.number = r->number;
- info.nodeid = owner;
-
- send_struct_info(mg, &info, MSG_PLOCK_OWN);
-}
-
-static void send_syncs(struct mountgroup *mg, struct resource *r)
-{
- struct dlm_plock_info info;
- struct posix_lock *po;
- struct lock_waiter *w;
- int rv;
-
- list_for_each_entry(po, &r->locks, list) {
- memset(&info, 0, sizeof(info));
- info.number = r->number;
- info.start = po->start;
- info.end = po->end;
- info.nodeid = po->nodeid;
- info.owner = po->owner;
- info.pid = po->pid;
- info.ex = po->ex;
-
- rv = send_struct_info(mg, &info, MSG_PLOCK_SYNC_LOCK);
- if (rv)
- goto out;
-
- po->flags |= P_SYNCING;
- }
-
- list_for_each_entry(w, &r->waiters, list) {
- memcpy(&info, &w->info, sizeof(info));
-
- rv = send_struct_info(mg, &info, MSG_PLOCK_SYNC_WAITER);
- if (rv)
- goto out;
-
- w->flags |= P_SYNCING;
- }
- out:
- return;
-}
-
-static void send_drop(struct mountgroup *mg, struct resource *r)
-{
- struct dlm_plock_info info;
-
- memset(&info, 0, sizeof(info));
- info.number = r->number;
-
- send_struct_info(mg, &info, MSG_PLOCK_DROP);
-}
-
-/* plock op can't be handled until we know the owner value of the resource,
- so the op is saved on the pending list until the r owner is established */
-
-static void save_pending_plock(struct mountgroup *mg, struct resource *r,
- struct dlm_plock_info *in)
-{
- struct lock_waiter *w;
-
- w = malloc(sizeof(struct lock_waiter));
- if (!w) {
- log_error("save_pending_plock no mem");
- return;
- }
- memcpy(&w->info, in, sizeof(struct dlm_plock_info));
- list_add_tail(&w->list, &r->pending);
-}
-
-/* plock ops are on pending list waiting for ownership to be established.
- owner has now become us, so add these plocks to r */
-
-static void add_pending_plocks(struct mountgroup *mg, struct resource *r)
-{
- struct lock_waiter *w, *safe;
-
- list_for_each_entry_safe(w, safe, &r->pending, list) {
- __receive_plock(mg, &w->info, our_nodeid, r);
- list_del(&w->list);
- free(w);
- }
-}
-
-/* plock ops are on pending list waiting for ownership to be established.
- owner has now become 0, so send these plocks to everyone */
-
-static void send_pending_plocks(struct mountgroup *mg, struct resource *r)
-{
- struct lock_waiter *w, *safe;
-
- list_for_each_entry_safe(w, safe, &r->pending, list) {
- send_plock(mg, r, &w->info);
- list_del(&w->list);
- free(w);
- }
-}
-
-static void _receive_own(struct mountgroup *mg, char *buf, int len, int from)
-{
- struct gdlm_header *hd = (struct gdlm_header *) buf;
- struct dlm_plock_info info;
- struct resource *r;
- int should_not_happen = 0;
- int rv;
-
- memcpy(&info, buf + sizeof(struct gdlm_header), sizeof(info));
- info_bswap_in(&info);
-
- log_plock(mg, "receive own %llx from %u owner %u",
- (unsigned long long)info.number, hd->nodeid, info.nodeid);
-
- rv = find_resource(mg, info.number, 1, &r);
- if (rv)
- return;
-
- if (from == our_nodeid) {
- /*
- * received our own own message
- */
-
- if (info.nodeid == 0) {
- /* we are setting owner to 0 */
-
- if (r->owner == our_nodeid) {
- /* we set owner to 0 when we relinquish
- ownership */
- should_not_happen = 1;
- } else if (r->owner == 0) {
- /* this happens when we relinquish ownership */
- r->flags |= R_GOT_UNOWN;
- } else {
- should_not_happen = 1;
- }
-
- } else if (info.nodeid == our_nodeid) {
- /* we are setting owner to ourself */
-
- if (r->owner == -1) {
- /* we have gained ownership */
- r->owner = our_nodeid;
- add_pending_plocks(mg, r);
- } else if (r->owner == our_nodeid) {
- should_not_happen = 1;
- } else if (r->owner == 0) {
- send_pending_plocks(mg, r);
- } else {
- /* resource is owned by other node;
- they should set owner to 0 shortly */
- }
-
- } else {
- /* we should only ever set owner to 0 or ourself */
- should_not_happen = 1;
- }
- } else {
- /*
- * received own message from another node
- */
-
- if (info.nodeid == 0) {
- /* other node is setting owner to 0 */
-
- if (r->owner == -1) {
- /* we should have a record of the owner before
- it relinquishes */
- should_not_happen = 1;
- } else if (r->owner == our_nodeid) {
- /* only the owner should relinquish */
- should_not_happen = 1;
- } else if (r->owner == 0) {
- should_not_happen = 1;
- } else {
- r->owner = 0;
- r->flags |= R_GOT_UNOWN;
- send_pending_plocks(mg, r);
- }
-
- } else if (info.nodeid == from) {
- /* other node is setting owner to itself */
-
- if (r->owner == -1) {
- /* normal path for a node becoming owner */
- r->owner = from;
- } else if (r->owner == our_nodeid) {
- /* we relinquish our ownership: sync our local
- plocks to everyone, then set owner to 0 */
- send_syncs(mg, r);
- send_own(mg, r, 0);
- /* we need to set owner to 0 here because
- local ops may arrive before we receive
- our send_own message and can't be added
- locally */
- r->owner = 0;
- } else if (r->owner == 0) {
- /* can happen because we set owner to 0 before
- we receive our send_own sent just above */
- } else {
- /* do nothing, current owner should be
- relinquishing its ownership */
- }
-
- } else if (info.nodeid == our_nodeid) {
- /* no one else should try to set the owner to us */
- should_not_happen = 1;
- } else {
- /* a node should only ever set owner to 0 or itself */
- should_not_happen = 1;
- }
- }
-
- if (should_not_happen) {
- log_error("receive_own from %u %llx info nodeid %d r owner %d",
- from, (unsigned long long)r->number, info.nodeid,
- r->owner);
- }
-}
-
-void receive_own(struct mountgroup *mg, char *buf, int len, int from)
-{
- if (mg->save_plocks) {
- save_message_old(mg, buf, len, from, MSG_PLOCK_OWN);
- return;
- }
-
- _receive_own(mg, buf, len, from);
-}
-
-static void clear_syncing_flag(struct resource *r, struct dlm_plock_info *in)
-{
- struct posix_lock *po;
- struct lock_waiter *w;
-
- list_for_each_entry(po, &r->locks, list) {
- if ((po->flags & P_SYNCING) &&
- in->start == po->start &&
- in->end == po->end &&
- in->nodeid == po->nodeid &&
- in->owner == po->owner &&
- in->pid == po->pid &&
- in->ex == po->ex) {
- po->flags &= ~P_SYNCING;
- return;
- }
- }
-
- list_for_each_entry(w, &r->waiters, list) {
- if ((w->flags & P_SYNCING) &&
- in->start == w->info.start &&
- in->end == w->info.end &&
- in->nodeid == w->info.nodeid &&
- in->owner == w->info.owner &&
- in->pid == w->info.pid &&
- in->ex == w->info.ex) {
- w->flags &= ~P_SYNCING;
- return;
- }
- }
-
- log_error("clear_syncing %llx no match %s %llx-%llx %d/%u/%llx",
- (unsigned long long)r->number, in->ex ? "WR" : "RD",
- (unsigned long long)in->start, (unsigned long long)in->end,
- in->nodeid, in->pid, (unsigned long long)in->owner);
-}
-
-static void _receive_sync(struct mountgroup *mg, char *buf, int len, int from)
-{
- struct dlm_plock_info info;
- struct gdlm_header *hd = (struct gdlm_header *) buf;
- struct resource *r;
- int rv;
-
- memcpy(&info, buf + sizeof(struct gdlm_header), sizeof(info));
- info_bswap_in(&info);
-
- log_plock(mg, "receive sync %llx from %u %s %llx-%llx %d/%u/%llx",
- (unsigned long long)info.number, from, info.ex ? "WR" : "RD",
- (unsigned long long)info.start, (unsigned long long)info.end,
- info.nodeid, info.pid, (unsigned long long)info.owner);
-
- rv = find_resource(mg, info.number, 0, &r);
- if (rv) {
- log_error("receive_sync no r %llx from %d", info.number, from);
- return;
- }
-
- if (from == our_nodeid) {
- /* this plock now in sync on all nodes */
- clear_syncing_flag(r, &info);
- return;
- }
-
- if (hd->type == MSG_PLOCK_SYNC_LOCK)
- add_lock(r, info.nodeid, info.owner, info.pid, info.ex,
- info.start, info.end);
- else if (hd->type == MSG_PLOCK_SYNC_WAITER)
- add_waiter(mg, r, &info);
-}
-
-void receive_sync(struct mountgroup *mg, char *buf, int len, int from)
-{
- struct gdlm_header *hd = (struct gdlm_header *) buf;
-
- if (mg->save_plocks) {
- save_message_old(mg, buf, len, from, hd->type);
- return;
- }
-
- _receive_sync(mg, buf, len, from);
-}
-
-static void _receive_drop(struct mountgroup *mg, char *buf, int len, int from)
-{
- struct dlm_plock_info info;
- struct resource *r;
- int rv;
-
- memcpy(&info, buf + sizeof(struct gdlm_header), sizeof(info));
- info_bswap_in(&info);
-
- log_plock(mg, "receive drop %llx from %u",
- (unsigned long long)info.number, from);
-
- rv = find_resource(mg, info.number, 0, &r);
- if (rv) {
- /* we'll find no r if two nodes sent drop at once */
- log_debug("receive_drop from %d no r %llx", from,
- (unsigned long long)info.number);
- return;
- }
-
- if (r->owner != 0) {
- /* - A sent drop, B sent drop, receive drop A, C sent own,
- receive drop B (this warning on C, owner -1)
- - A sent drop, B sent drop, receive drop A, A sent own,
- receive own A, receive drop B (this warning on all,
- owner A) */
- log_debug("receive_drop from %d r %llx owner %d", from,
- (unsigned long long)r->number, r->owner);
- return;
- }
-
- if (!list_empty(&r->pending)) {
- /* shouldn't happen */
- log_error("receive_drop from %d r %llx pending op", from,
- (unsigned long long)r->number);
- return;
- }
-
- /* the decision to drop or not must be based on things that are
- guaranteed to be the same on all nodes */
-
- if (list_empty(&r->locks) && list_empty(&r->waiters)) {
- list_del(&r->list);
- free(r);
- } else {
- /* A sent drop, B sent a plock, receive plock, receive drop */
- log_debug("receive_drop from %d r %llx in use", from,
- (unsigned long long)r->number);
- }
-}
-
-void receive_drop(struct mountgroup *mg, char *buf, int len, int from)
-{
- if (mg->save_plocks) {
- save_message_old(mg, buf, len, from, MSG_PLOCK_DROP);
- return;
- }
-
- _receive_drop(mg, buf, len, from);
-}
-
-/* We only drop resources from the unowned state to simplify things.
- If we want to drop a resource we own, we unown/relinquish it first. */
-
-/* FIXME: in the transition from owner = us, to owner = 0, to drop;
- we want the second period to be shorter than the first */
-
-static int drop_resources(struct mountgroup *mg)
-{
- struct resource *r;
- struct timeval now;
- int count = 0;
-
- gettimeofday(&now, NULL);
-
- /* try to drop the oldest, unused resources */
-
- list_for_each_entry_reverse(r, &mg->plock_resources, list) {
- if (count >= cfgd_drop_resources_count)
- break;
- if (r->owner && r->owner != our_nodeid)
- continue;
- if (time_diff_ms(&r->last_access, &now) <
- cfgd_drop_resources_age)
- continue;
-
- if (list_empty(&r->locks) && list_empty(&r->waiters)) {
- if (r->owner == our_nodeid) {
- send_own(mg, r, 0);
- r->owner = 0;
- } else if (r->owner == 0 && got_unown(r)) {
- send_drop(mg, r);
- }
-
- count++;
- }
- }
-
- return 0;
-}
-
-/* iterate through directory names looking for matching id:
- /sys/kernel/dlm/<name>/id */
-
-#define DLM_SYSFS_DIR "/sys/kernel/dlm"
-
-static char ls_name[256];
-
-static int get_lockspace_name(uint32_t ls_id)
-{
- char path[PATH_MAX];
- DIR *d;
- FILE *file;
- struct dirent *de;
- uint32_t id;
- int rv, error;
-
- d = opendir(DLM_SYSFS_DIR);
- if (!d) {
- log_debug("%s: opendir failed: %d", path, errno);
- return -1;
- }
-
- rv = -1;
-
- while ((de = readdir(d))) {
- if (de->d_name[0] == '.')
- continue;
-
- id = 0;
- memset(path, 0, PATH_MAX);
- snprintf(path, PATH_MAX, "%s/%s/id", DLM_SYSFS_DIR, de->d_name);
-
- file = fopen(path, "r");
- if (!file) {
- log_error("can't open %s %d", path, errno);
- continue;
- }
-
- error = fscanf(file, "%u", &id);
- fclose(file);
-
- if (error != 1) {
- log_error("bad read %s %d", path, errno);
- continue;
- }
- if (id != ls_id) {
- log_debug("get_lockspace_name skip %x %s",
- id, de->d_name);
- continue;
- }
-
- log_debug("get_lockspace_name found %x %s", id, de->d_name);
- strncpy(ls_name, de->d_name, 256);
- rv = 0;
- break;
- }
-
- closedir(d);
- return rv;
-}
-
-/* find the locskapce with "ls_id" in sysfs, get it's name, then look for
- the mg with with the same name in mounts list, return it's id */
-
-static void set_associated_id(uint32_t ls_id)
-{
- struct mountgroup *mg;
- int rv;
-
- log_debug("set_associated_id ls_id %x %d", ls_id, ls_id);
-
- memset(&ls_name, 0, sizeof(ls_name));
-
- rv = get_lockspace_name(ls_id);
- if (rv) {
- log_error("no lockspace found with id %x", ls_id);
- return;
- }
-
- mg = find_mg(ls_name);
- if (!mg) {
- log_error("no mountgroup found with name %s for ls_id %x",
- ls_name, ls_id);
- return;
- }
-
- log_debug("set_associated_id ls %x is mg %x", ls_id, mg->id);
-
- mg->associated_ls_id = ls_id;
-}
-
-static uint32_t ls_to_mg_id(uint32_t fsid)
-{
- struct mountgroup *mg;
- int do_set = 1;
-
- retry:
- list_for_each_entry(mg, &mountgroups, list) {
- if (mg->associated_ls_id == fsid)
- return mg->id;
- }
-
- if (do_set) {
- do_set = 0;
- set_associated_id(fsid);
- goto retry;
- }
-
- return fsid;
-}
-
-int limit_plocks(void)
-{
- struct timeval now;
-
- /* Don't send more messages while the cpg message queue is backed up */
-
- if (libcpg_flow_control_on) {
- update_flow_control_status();
- if (libcpg_flow_control_on)
- return 1;
- }
-
- if (!cfgd_plock_rate_limit || !plock_read_count)
- return 0;
-
- gettimeofday(&now, NULL);
-
- /* Every time a plock op is read from the kernel, we increment
- plock_read_count. After every cfgd_plock_rate_limit (N) reads,
- we check the time it's taken to do those N; if the time is less than
- a second, then we delay reading any more until a second is up.
- This way we read a max of N ops from the kernel every second. */
-
- if (!(plock_read_count % cfgd_plock_rate_limit)) {
- if (time_diff_ms(&plock_rate_last, &now) < 1000) {
- plock_rate_delays++;
- return 2;
- }
- plock_rate_last = now;
- plock_read_count++;
- }
- return 0;
-}
-
-void process_plocks(int ci)
-{
- struct mountgroup *mg;
- struct resource *r;
- struct dlm_plock_info info;
- struct timeval now;
- uint64_t usec;
- int rv;
-
- if (limit_plocks()) {
- poll_ignore_plock = 1;
- client_ignore(plock_ci, plock_fd);
- return;
- }
-
- gettimeofday(&now, NULL);
-
- memset(&info, 0, sizeof(info));
-
- rv = do_read(plock_device_fd, &info, sizeof(info));
- if (rv < 0) {
- log_debug("process_plocks: read error %d fd %d\n",
- errno, plock_device_fd);
- return;
- }
-
- /* kernel doesn't set the nodeid field */
- info.nodeid = our_nodeid;
-
- if (!cfgd_enable_plock) {
- rv = -ENOSYS;
- goto fail;
- }
-
- if (need_fsid_translation)
- info.fsid = ls_to_mg_id(info.fsid);
-
- mg = find_mg_id(info.fsid);
- if (!mg) {
- log_debug("process_plocks: no mg id %x", info.fsid);
- rv = -EEXIST;
- goto fail;
- }
-
- log_plock(mg, "read plock %llx %s %s %llx-%llx %d/%u/%llx w %d",
- (unsigned long long)info.number,
- op_str(info.optype),
- ex_str(info.optype, info.ex),
- (unsigned long long)info.start, (unsigned long long)info.end,
- info.nodeid, info.pid, (unsigned long long)info.owner,
- info.wait);
-
- /* report plock rate and any delays since the last report */
- plock_read_count++;
- if (!(plock_read_count % 1000)) {
- usec = dt_usec(&plock_read_time, &now) ;
- log_group(mg, "plock_read_count %u time %.3f s delays %u",
- plock_read_count, usec * 1.e-6, plock_rate_delays);
- plock_read_time = now;
- plock_rate_delays = 0;
- }
-
- rv = find_resource(mg, info.number, 1, &r);
- if (rv)
- goto fail;
-
- if (r->owner == 0) {
- /* plock state replicated on all nodes */
- send_plock(mg, r, &info);
-
- } else if (r->owner == our_nodeid) {
- /* we are the owner of r, so our plocks are local */
- __receive_plock(mg, &info, our_nodeid, r);
-
- } else {
- /* r owner is -1: r is new, try to become the owner;
- r owner > 0: tell other owner to give up ownership;
- both done with a message trying to set owner to ourself */
- send_own(mg, r, our_nodeid);
- save_pending_plock(mg, r, &info);
- }
-
- if (cfgd_plock_ownership &&
- time_diff_ms(&mg->drop_resources_last, &now) >=
- cfgd_drop_resources_time) {
- mg->drop_resources_last = now;
- drop_resources(mg);
- }
-
- return;
-
- fail:
- info.rv = rv;
- rv = write(plock_device_fd, &info, sizeof(info));
-
- return;
-}
-
-void process_saved_plocks(struct mountgroup *mg)
-{
- struct save_msg *sm, *sm2;
-
- if (list_empty(&mg->saved_messages))
- return;
-
- log_group(mg, "process_saved_plocks");
-
- list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
- switch (sm->type) {
- case MSG_PLOCK:
- _receive_plock(mg, sm->buf, sm->len, sm->nodeid);
- break;
- case MSG_PLOCK_OWN:
- _receive_own(mg, sm->buf, sm->len, sm->nodeid);
- break;
- case MSG_PLOCK_DROP:
- _receive_drop(mg, sm->buf, sm->len, sm->nodeid);
- break;
- case MSG_PLOCK_SYNC_LOCK:
- case MSG_PLOCK_SYNC_WAITER:
- _receive_sync(mg, sm->buf, sm->len, sm->nodeid);
- break;
- default:
- continue;
- }
-
- list_del(&sm->list);
- free(sm);
- }
-}
-
-void plock_exit(void)
-{
- if (cfgd_enable_plock)
- saCkptFinalize(ckpt_handle);
-}
-
-/* locks still marked SYNCING should not go into the ckpt; the new node
- will get those locks by receiving PLOCK_SYNC messages */
-
-static void pack_section_buf(struct mountgroup *mg, struct resource *r)
-{
- struct pack_plock *pp;
- struct posix_lock *po;
- struct lock_waiter *w;
- int count = 0;
-
- /* plocks on owned resources are not replicated on other nodes */
- if (r->owner == our_nodeid)
- return;
-
- pp = (struct pack_plock *) §ion_buf;
-
- list_for_each_entry(po, &r->locks, list) {
- if (po->flags & P_SYNCING)
- continue;
- pp->start = cpu_to_le64(po->start);
- pp->end = cpu_to_le64(po->end);
- pp->owner = cpu_to_le64(po->owner);
- pp->pid = cpu_to_le32(po->pid);
- pp->nodeid = cpu_to_le32(po->nodeid);
- pp->ex = po->ex;
- pp->waiter = 0;
- pp++;
- count++;
- }
-
- list_for_each_entry(w, &r->waiters, list) {
- if (w->flags & P_SYNCING)
- continue;
- pp->start = cpu_to_le64(w->info.start);
- pp->end = cpu_to_le64(w->info.end);
- pp->owner = cpu_to_le64(w->info.owner);
- pp->pid = cpu_to_le32(w->info.pid);
- pp->nodeid = cpu_to_le32(w->info.nodeid);
- pp->ex = w->info.ex;
- pp->waiter = 1;
- pp++;
- count++;
- }
-
- section_len = count * sizeof(struct pack_plock);
-}
-
-static int unpack_section_buf(struct mountgroup *mg, char *numbuf, int buflen)
-{
- struct pack_plock *pp;
- struct posix_lock *po;
- struct lock_waiter *w;
- struct resource *r;
- int count = section_len / sizeof(struct pack_plock);
- int i, owner = 0;
- unsigned long long num;
- struct timeval now;
-
- gettimeofday(&now, NULL);
-
- r = malloc(sizeof(struct resource));
- if (!r)
- return -ENOMEM;
- memset(r, 0, sizeof(struct resource));
- INIT_LIST_HEAD(&r->locks);
- INIT_LIST_HEAD(&r->waiters);
- INIT_LIST_HEAD(&r->pending);
-
- if (cfgd_plock_ownership)
- sscanf(numbuf, "r%llu.%d", &num, &owner);
- else
- sscanf(numbuf, "r%llu", &num);
-
- r->number = num;
- r->owner = owner;
- r->last_access = now;
-
- pp = (struct pack_plock *) §ion_buf;
-
- for (i = 0; i < count; i++) {
- if (!pp->waiter) {
- po = malloc(sizeof(struct posix_lock));
- // FIXME: handle failed malloc
- po->start = le64_to_cpu(pp->start);
- po->end = le64_to_cpu(pp->end);
- po->owner = le64_to_cpu(pp->owner);
- po->pid = le32_to_cpu(pp->pid);
- po->nodeid = le32_to_cpu(pp->nodeid);
- po->ex = pp->ex;
- list_add_tail(&po->list, &r->locks);
- } else {
- w = malloc(sizeof(struct lock_waiter));
- // FIXME: handle failed malloc
- w->info.start = le64_to_cpu(pp->start);
- w->info.end = le64_to_cpu(pp->end);
- w->info.owner = le64_to_cpu(pp->owner);
- w->info.pid = le32_to_cpu(pp->pid);
- w->info.nodeid = le32_to_cpu(pp->nodeid);
- w->info.ex = pp->ex;
- list_add_tail(&w->list, &r->waiters);
- }
- pp++;
- }
-
- list_add_tail(&r->list, &mg->plock_resources);
- return 0;
-}
-
-static int _unlink_checkpoint(struct mountgroup *mg, SaNameT *name)
-{
- SaCkptCheckpointHandleT h;
- SaCkptCheckpointDescriptorT s;
- SaAisErrorT rv;
- int ret = 0;
-
- h = (SaCkptCheckpointHandleT) mg->cp_handle;
- log_group(mg, "unlink ckpt %llx", (unsigned long long)h);
-
- unlink_retry:
- rv = saCkptCheckpointUnlink(ckpt_handle, name);
- if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(mg, "unlink ckpt retry");
- sleep(1);
- goto unlink_retry;
- }
- if (rv == SA_AIS_OK)
- goto out_close;
-
- log_error("unlink ckpt error %d %s", rv, mg->name);
- ret = -1;
-
- status_retry:
- rv = saCkptCheckpointStatusGet(h, &s);
- if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(mg, "unlink ckpt status retry");
- sleep(1);
- goto status_retry;
- }
- if (rv != SA_AIS_OK) {
- log_error("unlink ckpt status error %d %s", rv, mg->name);
- goto out_close;
- }
-
- log_group(mg, "unlink ckpt status: size %llu, max sections %u, "
- "max section size %llu, section count %u, mem %u",
- (unsigned long long)s.checkpointCreationAttributes.checkpointSize,
- s.checkpointCreationAttributes.maxSections,
- (unsigned long long)s.checkpointCreationAttributes.maxSectionSize,
- s.numberOfSections, s.memoryUsed);
-
- out_close:
- if (!h)
- goto out;
-
- rv = saCkptCheckpointClose(h);
- if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(mg, "unlink ckpt close retry");
- sleep(1);
- goto out_close;
- }
- if (rv != SA_AIS_OK) {
- log_error("unlink ckpt %llx close err %d %s",
- (unsigned long long)h, rv, mg->name);
- /* should we return an error here and possibly cause
- store_plocks() to fail on this? */
- /* ret = -1; */
- }
- out:
- mg->cp_handle = 0;
- return ret;
-}
-
-int unlink_checkpoint(struct mountgroup *mg)
-{
- SaNameT name;
- int len;
-
- len = snprintf((char *)name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s",
- mg->name);
- name.length = len;
- return _unlink_checkpoint(mg, &name);
-}
-
-/*
- * section id is r<inodenum>.<owner>, the maximum string length is:
- * "r" prefix = 1 strlen("r")
- * max uint64 = 20 strlen("18446744073709551615")
- * "." before owner = 1 strlen(".")
- * max int = 11 strlen("-2147483647")
- * \0 at end = 1
- * ---------------------
- * 34 SECTION_NAME_LEN
- */
-
-#define SECTION_NAME_LEN 34
-
-/* Copy all plock state into a checkpoint so new node can retrieve it. The
- node creating the ckpt for the mounter needs to be the same node that's
- sending the mounter its journals message (i.e. the low nodeid). The new
- mounter knows the ckpt is ready to read only after it gets its journals
- message.
-
- If the mounter is becoming the new low nodeid in the group, the node doing
- the store closes the ckpt and the new node unlinks the ckpt after reading
- it. The ckpt should then disappear and the new node can create a new ckpt
- for the next mounter. */
-
-void store_plocks(struct mountgroup *mg, int nodeid)
-{
- SaCkptCheckpointCreationAttributesT attr;
- SaCkptCheckpointHandleT h;
- SaCkptSectionIdT section_id;
- SaCkptSectionCreationAttributesT section_attr;
- SaCkptCheckpointOpenFlagsT flags;
- SaNameT name;
- SaAisErrorT rv;
- char buf[SECTION_NAME_LEN];
- struct resource *r;
- struct posix_lock *po;
- struct lock_waiter *w;
- int r_count, lock_count, total_size, section_size, max_section_size;
- int len, owner;
-
- if (!cfgd_enable_plock)
- return;
-
- /* no change to plock state since we created the last checkpoint */
- if (mg->last_checkpoint_time > mg->last_plock_time) {
- log_group(mg, "store_plocks: saved ckpt uptodate");
- goto out;
- }
- mg->last_checkpoint_time = time(NULL);
-
- len = snprintf((char *)name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s",
- mg->name);
- name.length = len;
-
- /* unlink an old checkpoint before we create a new one */
- if (mg->cp_handle) {
- if (_unlink_checkpoint(mg, &name))
- return;
- }
-
- /* loop through all plocks to figure out sizes to set in
- the attr fields */
-
- r_count = 0;
- lock_count = 0;
- total_size = 0;
- max_section_size = 0;
-
- list_for_each_entry(r, &mg->plock_resources, list) {
- if (r->owner == -1)
- continue;
-
- r_count++;
- section_size = 0;
- list_for_each_entry(po, &r->locks, list) {
- section_size += sizeof(struct pack_plock);
- lock_count++;
- }
- list_for_each_entry(w, &r->waiters, list) {
- section_size += sizeof(struct pack_plock);
- lock_count++;
- }
- total_size += section_size;
- if (section_size > max_section_size)
- max_section_size = section_size;
- }
-
- log_group(mg, "store_plocks: r_count %d, lock_count %d, pp %u bytes",
- r_count, lock_count, (unsigned int)sizeof(struct pack_plock));
-
- log_group(mg, "store_plocks: total %d bytes, max_section %d bytes",
- total_size, max_section_size);
-
- attr.creationFlags = SA_CKPT_WR_ALL_REPLICAS;
- attr.checkpointSize = total_size;
- attr.retentionDuration = SA_TIME_MAX;
- attr.maxSections = r_count + 1; /* don't know why we need +1 */
- attr.maxSectionSize = max_section_size;
- attr.maxSectionIdSize = SECTION_NAME_LEN;
-
- flags = SA_CKPT_CHECKPOINT_READ |
- SA_CKPT_CHECKPOINT_WRITE |
- SA_CKPT_CHECKPOINT_CREATE;
-
- open_retry:
- rv = saCkptCheckpointOpen(ckpt_handle, &name, &attr, flags, 0, &h);
- if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(mg, "store_plocks: ckpt open retry");
- sleep(1);
- goto open_retry;
- }
- if (rv == SA_AIS_ERR_EXIST) {
- log_group(mg, "store_plocks: ckpt already exists");
- return;
- }
- if (rv != SA_AIS_OK) {
- log_error("store_plocks: ckpt open error %d %s", rv, mg->name);
- return;
- }
-
- log_group(mg, "store_plocks: open ckpt handle %llx",
- (unsigned long long)h);
- mg->cp_handle = (uint64_t) h;
-
- /* - If r owner is -1, ckpt nothing.
- - If r owner is us, ckpt owner of us and no plocks.
- - If r owner is other, ckpt that owner and any plocks we have on r
- (they've just been synced but owner=0 msg not recved yet).
- - If r owner is 0 and !got_unown, then we've just unowned r;
- ckpt owner of us and any plocks that don't have SYNCING set
- (plocks with SYNCING will be handled by our sync messages).
- - If r owner is 0 and got_unown, then ckpt owner 0 and all plocks;
- (there should be no SYNCING plocks) */
-
- list_for_each_entry(r, &mg->plock_resources, list) {
- if (r->owner == -1)
- continue;
- else if (r->owner == our_nodeid)
- owner = our_nodeid;
- else if (r->owner)
- owner = r->owner;
- else if (!r->owner && !got_unown(r))
- owner = our_nodeid;
- else if (!r->owner)
- owner = 0;
- else {
- log_error("store_plocks owner %d r %llx", r->owner,
- (unsigned long long)r->number);
- continue;
- }
-
- memset(&buf, 0, sizeof(buf));
- if (cfgd_plock_ownership)
- len = snprintf(buf, SECTION_NAME_LEN, "r%llu.%d",
- (unsigned long long)r->number, owner);
- else
- len = snprintf(buf, SECTION_NAME_LEN, "r%llu",
- (unsigned long long)r->number);
-
- section_id.id = (void *)buf;
- section_id.idLen = len + 1;
- section_attr.sectionId = §ion_id;
- section_attr.expirationTime = SA_TIME_END;
-
- memset(§ion_buf, 0, sizeof(section_buf));
- section_len = 0;
-
- pack_section_buf(mg, r);
-
- log_group(mg, "store_plocks: section size %u id %u \"%s\"",
- section_len, section_id.idLen, buf);
-
- create_retry:
- rv = saCkptSectionCreate(h, §ion_attr, §ion_buf,
- section_len);
- if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(mg, "store_plocks: ckpt create retry");
- sleep(1);
- goto create_retry;
- }
- if (rv == SA_AIS_ERR_EXIST) {
- /* this shouldn't happen in general */
- log_group(mg, "store_plocks: clearing old ckpt");
- saCkptCheckpointClose(h);
- _unlink_checkpoint(mg, &name);
- goto open_retry;
- }
- if (rv != SA_AIS_OK) {
- log_error("store_plocks: ckpt section create err %d %s",
- rv, mg->name);
- break;
- }
- }
-
- out:
- /* If the new nodeid is becoming the low nodeid it will now be in
- charge of creating ckpt's for mounters instead of us. */
-
- if (nodeid < our_nodeid) {
- log_group(mg, "store_plocks: closing ckpt for new low node %d",
- nodeid);
- saCkptCheckpointClose(h);
- mg->cp_handle = 0;
- }
-}
-
-/* called by a node that's just been added to the group to get existing plock
- state */
-
-void retrieve_plocks(struct mountgroup *mg)
-{
- SaCkptCheckpointHandleT h;
- SaCkptSectionIterationHandleT itr;
- SaCkptSectionDescriptorT desc;
- SaCkptIOVectorElementT iov;
- SaNameT name;
- SaAisErrorT rv;
- char buf[SECTION_NAME_LEN];
- int len;
-
- if (!cfgd_enable_plock)
- return;
-
- log_group(mg, "retrieve_plocks");
-
- len = snprintf((char *)name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s",
- mg->name);
- name.length = len;
-
- open_retry:
- rv = saCkptCheckpointOpen(ckpt_handle, &name, NULL,
- SA_CKPT_CHECKPOINT_READ, 0, &h);
- if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(mg, "retrieve_plocks: ckpt open retry");
- sleep(1);
- goto open_retry;
- }
- if (rv != SA_AIS_OK) {
- log_error("retrieve_plocks: ckpt open error %d %s",
- rv, mg->name);
- return;
- }
-
- init_retry:
- rv = saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, 0, &itr);
- if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(mg, "retrieve_plocks: ckpt iterinit retry");
- sleep(1);
- goto init_retry;
- }
- if (rv != SA_AIS_OK) {
- log_error("retrieve_plocks: ckpt iterinit error %d %s",
- rv, mg->name);
- goto out;
- }
-
- while (1) {
- next_retry:
- rv = saCkptSectionIterationNext(itr, &desc);
- if (rv == SA_AIS_ERR_NO_SECTIONS)
- break;
- if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(mg, "retrieve_plocks: ckpt iternext retry");
- sleep(1);
- goto next_retry;
- }
- if (rv != SA_AIS_OK) {
- log_error("retrieve_plocks: ckpt iternext error %d %s",
- rv, mg->name);
- goto out_it;
- }
-
- if (!desc.sectionId.idLen)
- continue;
-
- iov.sectionId = desc.sectionId;
- iov.dataBuffer = §ion_buf;
- iov.dataSize = desc.sectionSize;
- iov.dataOffset = 0;
-
- /* for debug print */
- memset(&buf, 0, sizeof(buf));
- snprintf(buf, SECTION_NAME_LEN, "%s", desc.sectionId.id);
-
- log_group(mg, "retrieve_plocks: section size %llu id %u \"%s\"",
- (unsigned long long)iov.dataSize, iov.sectionId.idLen,
- buf);
-
- read_retry:
- rv = saCkptCheckpointRead(h, &iov, 1, NULL);
- if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(mg, "retrieve_plocks: ckpt read retry");
- sleep(1);
- goto read_retry;
- }
- if (rv != SA_AIS_OK) {
- log_error("retrieve_plocks: ckpt read error %d %s",
- rv, mg->name);
- goto out_it;
- }
-
- /* we'll get empty (zero length) sections for resources with
- no locks, which exist in ownership mode; the resource
- name and owner come from the section id */
-
- log_group(mg, "retrieve_plocks: ckpt read %llu bytes",
- (unsigned long long)iov.readSize);
- section_len = iov.readSize;
-
- if (section_len % sizeof(struct pack_plock)) {
- log_error("retrieve_plocks: bad section len %d %s",
- section_len, mg->name);
- continue;
- }
-
- unpack_section_buf(mg, (char *)desc.sectionId.id,
- desc.sectionId.idLen);
- }
-
- out_it:
- saCkptSectionIterationFinalize(itr);
- out:
- if (mg->low_nodeid == our_nodeid) {
- /* we're the new low nodeid, will be master */
- log_group(mg, "retrieve_plocks: unlink ckpt from old master");
- mg->cp_handle = (uint64_t) h;
- _unlink_checkpoint(mg, &name);
- } else
- saCkptCheckpointClose(h);
-}
-
-/* Called when a node has failed, or we're unmounting. For a node failure, we
- need to call this when the cpg confchg arrives so that we're guaranteed all
- nodes do this in the same sequence wrt other messages. */
-
-void purge_plocks(struct mountgroup *mg, int nodeid, int unmount)
-{
- struct posix_lock *po, *po2;
- struct lock_waiter *w, *w2;
- struct resource *r, *r2;
- int purged = 0;
-
- if (!cfgd_enable_plock)
- return;
-
- list_for_each_entry_safe(r, r2, &mg->plock_resources, list) {
- list_for_each_entry_safe(po, po2, &r->locks, list) {
- if (po->nodeid == nodeid || unmount) {
- list_del(&po->list);
- free(po);
- purged++;
- }
- }
-
- list_for_each_entry_safe(w, w2, &r->waiters, list) {
- if (w->info.nodeid == nodeid || unmount) {
- list_del(&w->list);
- free(w);
- purged++;
- }
- }
-
- /* TODO: haven't thought carefully about how this transition
- to owner 0 might interact with other owner messages in
- progress. */
-
- if (r->owner == nodeid) {
- r->owner = 0;
- send_pending_plocks(mg, r);
- }
-
- if (!list_empty(&r->waiters))
- do_waiters(mg, r);
-
- if (!cfgd_plock_ownership &&
- list_empty(&r->locks) && list_empty(&r->waiters)) {
- list_del(&r->list);
- free(r);
- }
- }
-
- if (purged)
- mg->last_plock_time = time(NULL);
-
- log_group(mg, "purged %d plocks for %d", purged, nodeid);
-
- /* we may have a saved ckpt that we created for the last mounter,
- we need to unlink it so another node can create a new ckpt for
- the next mounter after we leave */
-
- if (unmount && mg->cp_handle)
- unlink_checkpoint(mg);
-}
-
-int fill_plock_dump_buf(struct mountgroup *mg)
-{
- struct posix_lock *po;
- struct lock_waiter *w;
- struct resource *r;
- struct timeval now;
- int rv = 0;
- int len = GFSC_DUMP_SIZE, pos = 0, ret;
-
- memset(plock_dump_buf, 0, sizeof(plock_dump_buf));
- plock_dump_len = 0;
-
- gettimeofday(&now, NULL);
-
- list_for_each_entry(r, &mg->plock_resources, list) {
-
- if (list_empty(&r->locks) &&
- list_empty(&r->waiters) &&
- list_empty(&r->pending)) {
- ret = snprintf(plock_dump_buf + pos, len - pos,
- "%llu rown %d unused_ms %llu\n",
- (unsigned long long)r->number, r->owner,
- (unsigned long long)time_diff_ms(&r->last_access,
- &now));
- if (ret >= len - pos) {
- rv = -ENOSPC;
- goto out;
- }
- pos += ret;
- continue;
- }
-
- list_for_each_entry(po, &r->locks, list) {
- ret = snprintf(plock_dump_buf + pos, len - pos,
- "%llu %s %llu-%llu nodeid %d pid %u owner %llx rown %d\n",
- (unsigned long long)r->number,
- po->ex ? "WR" : "RD",
- (unsigned long long)po->start,
- (unsigned long long)po->end,
- po->nodeid, po->pid,
- (unsigned long long)po->owner, r->owner);
-
- if (ret >= len - pos) {
- rv = -ENOSPC;
- goto out;
- }
- pos += ret;
- }
-
- list_for_each_entry(w, &r->waiters, list) {
- ret = snprintf(plock_dump_buf + pos, len - pos,
- "%llu %s %llu-%llu nodeid %d pid %u owner %llx rown %d WAITING\n",
- (unsigned long long)r->number,
- w->info.ex ? "WR" : "RD",
- (unsigned long long)w->info.start,
- (unsigned long long)w->info.end,
- w->info.nodeid, w->info.pid,
- (unsigned long long)w->info.owner, r->owner);
-
- if (ret >= len - pos) {
- rv = -ENOSPC;
- goto out;
- }
- pos += ret;
- }
-
- list_for_each_entry(w, &r->pending, list) {
- ret = snprintf(plock_dump_buf + pos, len - pos,
- "%llu %s %llu-%llu nodeid %d pid %u owner %llx rown %d PENDING\n",
- (unsigned long long)r->number,
- w->info.ex ? "WR" : "RD",
- (unsigned long long)w->info.start,
- (unsigned long long)w->info.end,
- w->info.nodeid, w->info.pid,
- (unsigned long long)w->info.owner, r->owner);
-
- if (ret >= len - pos) {
- rv = -ENOSPC;
- goto out;
- }
- pos += ret;
- }
- }
- out:
- plock_dump_len = pos;
- return rv;
-}
-
-static void find_minors(void)
-{
- FILE *fl;
- char name[256];
- uint32_t number;
- int found = 0;
- int c;
-
- plock_minor = 0;
- old_plock_minor = 0;
-
- if (!(fl = fopen("/proc/misc", "r"))) {
- log_error("/proc/misc fopen failed: %s", strerror(errno));
- return;
- }
-
- while (!feof(fl)) {
- if (fscanf(fl, "%d %255s\n", &number, &name[0]) == 2) {
-
- if (!strcmp(name, "dlm_plock")) {
- plock_minor = number;
- found++;
- } else if (!strcmp(name, "lock_dlm_plock")) {
- old_plock_minor = number;
- found++;
- }
-
- } else do {
- c = fgetc(fl);
- } while (c != EOF && c != '\n');
-
- if (found == 3)
- break;
- }
- fclose(fl);
-
- if (!found)
- log_error("Is lock_dlm or dlm missing from kernel? No misc devices found.");
-}
-
-static int find_udev_device(char *path, uint32_t minor)
-{
- struct stat st;
- int i;
-
- for (i = 0; i < 10; i++) {
- if (stat(path, &st) == 0 && minor(st.st_rdev) == minor)
- return 0;
- sleep(1);
- }
-
- log_error("cannot find device %s with minor %d", path, minor);
- return -1;
-}
-
-int setup_misc_devices(void)
-{
- int rv;
-
- find_minors();
-
- if (plock_minor) {
- rv = find_udev_device("/dev/misc/dlm_plock", plock_minor);
- if (rv < 0)
- return rv;
- log_debug("found /dev/misc/dlm_plock minor %u",
- plock_minor);
- }
-
- if (!plock_minor && old_plock_minor) {
- rv = find_udev_device("/dev/misc/lock_dlm_plock",
- old_plock_minor);
- if (rv < 0)
- return rv;
- log_debug("found /dev/misc/lock_dlm_plock minor %u",
- old_plock_minor);
- }
-
- return 0;
-}
-
diff --git a/group/gfs_controld/util.c b/group/gfs_controld/util.c
index a0650fe..51d274f 100644
--- a/group/gfs_controld/util.c
+++ b/group/gfs_controld/util.c
@@ -64,8 +64,6 @@ int set_sysfs(struct mountgroup *mg, char *field, int val)
return -1;
}
- mg->got_kernel_mount = 1;
-
memset(out, 0, sizeof(out));
sprintf(out, "%d", val);
@@ -92,8 +90,6 @@ static int get_sysfs(struct mountgroup *mg, char *field, char *buf, int len)
return -1;
}
- mg->got_kernel_mount = 1;
-
rv = read(fd, buf, len);
if (rv < 0)
log_error("read %s error %d %d", fname, rv, errno);
@@ -166,10 +162,7 @@ static void dmsetup_suspend_done(struct mountgroup *mg, int rv)
if (!rv) {
mg->withdraw_suspend = 1;
- if (mg->old_group_mode)
- send_withdraw_old(mg);
- else
- send_withdraw(mg);
+ send_withdraw(mg);
}
}
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2009-01-09 20:05 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-01-09 20:05 gfs2-utils: master - gfs_controld: remove groupd compat David Teigland
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).