public inbox for cluster-cvs@sourceware.org
help / color / mirror / Atom feed
* cluster: RHEL54 - clogd/dm-log-clustered.ko: Fix for bugs 506843, 479749, 507400
@ 2009-07-27 18:10 Jonathan Brassow
  0 siblings, 0 replies; only message in thread
From: Jonathan Brassow @ 2009-07-27 18:10 UTC (permalink / raw)
  To: cluster-cvs-relay

Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=fa15cb16f0de9ddf06ddb50f2fcf26dd9c257931
Commit:        fa15cb16f0de9ddf06ddb50f2fcf26dd9c257931
Parent:        6326334d93408ebe5ec43d0bef6365c7c82e8849
Author:        Jonathan Brassow <jbrassow@redhat.com>
AuthorDate:    Mon Jul 27 13:07:36 2009 -0500
Committer:     Jonathan Brassow <jbrassow@redhat.com>
CommitterDate: Mon Jul 27 13:09:26 2009 -0500

clogd/dm-log-clustered.ko: Fix for bugs 506843, 479749, 507400

Device-mapper userspace logs (like the clustered log) are
identified by a universally unique identifier (UUID).  This
identifier is used to associate requests from the kernel to
a specific log in userspace.  The UUID must be unique everywhere,
since multiple machines may use this identifier when sharing a
log, as is the case for cluster logs.

Sometimes, device-mapper/LVM may re-use a UUID.  This is the
case during 'pvmove's, when moving from one segment of an LV
to another, or when resizing a mirror, etc.  In these cases,
a new log is created with the same UUID and loaded in the
"inactive" slot.  When a device-mapper "resume" is issued,
the "live" table is deactivated and the new "inactive" table
becomes "live".  (The "inactive" table can also be removed
via a device-mapper 'clear' command.)

The above two issues were colliding in the 3 bugs (possibly
more) mentioned in the title.  More than one log was being
created with the same UUID, and there was no way to distinguish
between them.  So, sometimes the wrong log would be swapped
out during the exchange.

The solution is to create a 'uuid_instance', or perhaps a
'luid': a locally unique identifier to go along with the
UUID.  This new identifier is used to determine exactly which
log is being referenced by the kernel when the log exchange
is made.  The identifier is not universally safe, but it does
not need to be, since create/destroy/suspend/resume operations
are bound to a specific machine; and these are the operations
that make up the exchange.
---
 cmirror-kernel/src/dm-clog-tfr.c |    7 +-
 cmirror-kernel/src/dm-clog-tfr.h |    9 ++-
 cmirror-kernel/src/dm-clog.c     |   61 ++++++++++++---
 cmirror/src/cluster.c            |   41 +++++++----
 cmirror/src/cluster.h            |    2 +-
 cmirror/src/functions.c          |  155 ++++++++++++++++++++++----------------
 cmirror/src/functions.h          |    8 +-
 cmirror/src/local.c              |    1 +
 8 files changed, 185 insertions(+), 99 deletions(-)

diff --git a/cmirror-kernel/src/dm-clog-tfr.c b/cmirror-kernel/src/dm-clog-tfr.c
index 3ceb320..ddcf359 100644
--- a/cmirror-kernel/src/dm-clog-tfr.c
+++ b/cmirror-kernel/src/dm-clog-tfr.c
@@ -105,7 +105,7 @@ static int fill_pkg(struct cn_msg *msg, struct clog_tfr *tfr)
 		} else if (tfr->data_size > *(pkg->data_size)) {
 			DMERR("Insufficient space to receive package [%s]::",
 			      RQ_TYPE(tfr->request_type));
-			DMERR("  tfr->data_size    = %u", tfr->data_size);
+			DMERR("  tfr->data_size    = %llu", tfr->data_size);
 			DMERR("  *(pkg->data_size) = %u", *(pkg->data_size));
 
 			*(pkg->data_size) = 0;
@@ -147,6 +147,7 @@ static void cn_clog_callback(void *data)
 /*
  * dm_clog_consult_server
  * @uuid: log's uuid (must be DM_UUID_LEN in size)
+ * @uuid_instance: further identifier (if more than 1 log with same uuid)
  * @request_type:
  * @data: data to tx to the server
  * @data_size: size of data in bytes
@@ -158,7 +159,8 @@ static void cn_clog_callback(void *data)
  *
  * Returns: 0 on success, -EXXX on failure
  */
-int dm_clog_consult_server(const char *uuid, int request_type,
+int dm_clog_consult_server(const char *uuid, uint32_t uuid_instance,
+			   int request_type,
 			   char *data, int data_size,
 			   char *rdata, int *rdata_size)
 {
@@ -188,6 +190,7 @@ resend:
 
 	memset(tfr, 0, DM_CLOG_PREALLOCED_SIZE - overhead_size);
 	memcpy(tfr->uuid, uuid, DM_UUID_LEN);
+	tfr->uuid_instance = uuid_instance;
 	tfr->seq = seq++;
 	tfr->request_type = request_type;
 	tfr->data_size = data_size;
diff --git a/cmirror-kernel/src/dm-clog-tfr.h b/cmirror-kernel/src/dm-clog-tfr.h
index bdf4b6d..802db8b 100644
--- a/cmirror-kernel/src/dm-clog-tfr.h
+++ b/cmirror-kernel/src/dm-clog-tfr.h
@@ -52,13 +52,15 @@
 struct clog_tfr {
 	uint64_t private[2];
 	char uuid[DM_UUID_LEN]; /* Ties a request to a specific mirror log */
+	char uuid_padding[3];   /* DM_UUID_LEN == 129 */
+	uint32_t uuid_instance; /* Allows more than one instance w/ same uuid */
 
-	int error;              /* Used by server to inform of errors */
+	int32_t error;              /* Used by server to inform of errors */
 	uint32_t originator;    /* Cluster ID of this machine */
 
 	uint32_t seq;           /* Sequence number for request */
 	uint32_t request_type;  /* DM_CLOG_* */
-	uint32_t data_size;     /* How much data (not including this struct) */
+	uint64_t data_size;     /* How much data (not including this struct) */
 	char data[0];
 };
 
@@ -67,7 +69,8 @@ struct clog_tfr {
 
 int dm_clog_tfr_init(void);
 void dm_clog_tfr_exit(void);
-int dm_clog_consult_server(const char *uuid, int request_type,
+int dm_clog_consult_server(const char *uuid, uint32_t uuid_instance,
+			   int request_type,
 			   char *data, int data_size,
 			   char *rdata, int *rdata_size);
 #endif
diff --git a/cmirror-kernel/src/dm-clog.c b/cmirror-kernel/src/dm-clog.c
index f21823e..17e7486 100644
--- a/cmirror-kernel/src/dm-clog.c
+++ b/cmirror-kernel/src/dm-clog.c
@@ -18,10 +18,13 @@ struct flush_entry {
 };
 
 struct log_c {
+	struct list_head list;
+
 	struct dm_target *ti;
 	uint32_t region_size;
 	region_t region_count;
 	char uuid[DM_UUID_LEN];
+	uint32_t uuid_instance;
 
 	char *ctr_str; /* Gives ability to restart if userspace dies */
 	uint32_t ctr_size;
@@ -41,6 +44,8 @@ struct log_c {
 	struct dm_dev *disk_log;
 };
 
+static struct list_head log_list_head;
+static spinlock_t log_list_lock;
 static mempool_t *flush_entry_pool = NULL;
 
 static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
@@ -64,7 +69,7 @@ int cluster_do_request(struct log_c *lc, const char *uuid, int request_type,
 	 * restored.
 	 */
 retry:
-	r = dm_clog_consult_server(uuid, request_type, data,
+	r = dm_clog_consult_server(uuid, lc->uuid_instance, request_type, data,
 				   data_size, rdata, rdata_size);
 
 	if (r != -ESRCH)
@@ -75,13 +80,15 @@ retry:
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(2*HZ);
 		DMWARN("Attempting to contact cluster log server...");
-		r = dm_clog_consult_server(uuid, DM_CLOG_CTR, lc->ctr_str,
+		r = dm_clog_consult_server(uuid, lc->uuid_instance,
+					   DM_CLOG_CTR, lc->ctr_str,
 					   lc->ctr_size, NULL, NULL);
 		if (!r)
 			break;
 	}
 	DMINFO("Reconnected to cluster log server... CTR complete");
-	r = dm_clog_consult_server(uuid, DM_CLOG_RESUME, NULL,
+	r = dm_clog_consult_server(uuid, lc->uuid_instance,
+				   DM_CLOG_RESUME, NULL,
 				   0, NULL, NULL);
 	if (!r)
 		goto retry;
@@ -100,7 +107,7 @@ static int cluster_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 	int str_size;
 	int offset = (disk_log) ? 1 : 0;
 	char *ctr_str = NULL;
-	struct log_c *lc = NULL;
+	struct log_c *lc = NULL, *tmp;
 	uint32_t region_size;
 	region_t region_count;
 
@@ -129,6 +136,15 @@ static int cluster_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 	spin_lock_init(&lc->flush_lock);
 	INIT_LIST_HEAD(&lc->flush_list);
 
+	lc->uuid_instance = 1;
+	spin_lock(&log_list_lock);
+	list_for_each_entry(tmp, &log_list_head, list)
+		if (!strncmp(tmp->uuid, lc->uuid, DM_UUID_LEN) &&
+		    (tmp->uuid_instance >= lc->uuid_instance))
+			lc->uuid_instance = tmp->uuid_instance + 1;
+	list_add(&lc->list, &log_list_head);
+	spin_unlock(&log_list_lock);
+
 	for (i = 0, str_size = 0; i < argc; i++)
 		str_size += strlen(argv[i]) + 1; /* +1 for space between args */
 
@@ -146,7 +162,7 @@ static int cluster_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 	str_size += sprintf(ctr_str + str_size, "%llu", ti->len);
 
 	/* Send table string */
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_CTR,
+	r = dm_clog_consult_server(lc->uuid, lc->uuid_instance, DM_CLOG_CTR,
 				   ctr_str, str_size, NULL, NULL);
 
 	if (r == -ESRCH)
@@ -242,11 +258,15 @@ static void cluster_dtr(struct dm_dirty_log *log)
 	int r;
 	struct log_c *lc = (struct log_c *)log->context;
 
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_DTR,
+	r = dm_clog_consult_server(lc->uuid, lc->uuid_instance, DM_CLOG_DTR,
 				   NULL, 0,
 				   NULL, NULL);
 
 	/* FIXME: What do we do on failure? */
+
+	spin_lock(&log_list_lock);
+	list_del(&lc->list);
+	spin_unlock(&log_list_lock);
 	if (lc->disk_log)
 		dm_put_device(lc->ti, lc->disk_log);
 	kfree(lc->ctr_str);
@@ -264,7 +284,8 @@ static int cluster_presuspend(struct dm_dirty_log *log)
 	int r;
 	struct log_c *lc = (struct log_c *)log->context;
 
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_PRESUSPEND,
+	r = dm_clog_consult_server(lc->uuid, lc->uuid_instance,
+				   DM_CLOG_PRESUSPEND,
 				   NULL, 0,
 				   NULL, NULL);
 
@@ -280,7 +301,8 @@ static int cluster_postsuspend(struct dm_dirty_log *log)
 	int r;
 	struct log_c *lc = (struct log_c *)log->context;
 
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_POSTSUSPEND,
+	r = dm_clog_consult_server(lc->uuid, lc->uuid_instance,
+				   DM_CLOG_POSTSUSPEND,
 				   NULL, 0,
 				   NULL, NULL);
 
@@ -297,7 +319,7 @@ static int cluster_resume(struct dm_dirty_log *log)
 	struct log_c *lc = (struct log_c *)log->context;
 
 	lc->in_sync_hint = 0;
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_RESUME,
+	r = dm_clog_consult_server(lc->uuid, lc->uuid_instance, DM_CLOG_RESUME,
 				   NULL, 0,
 				   NULL, NULL);
 
@@ -536,7 +558,11 @@ static int cluster_get_resync_work(struct dm_dirty_log *log, region_t *region)
 	int r;
 	int rdata_size;
 	struct log_c *lc = (struct log_c *)log->context;
-	struct { int i; region_t r; } pkg;
+	struct { 
+		int32_t i;
+		uint32_t arch_padding;
+		region_t r;
+	} pkg;
 
 	if (lc->in_sync_hint >= lc->region_count)
 		return 0;
@@ -564,7 +590,11 @@ static void cluster_set_region_sync(struct dm_dirty_log *log,
 {
 	int r;
 	struct log_c *lc = (struct log_c *)log->context;
-	struct { region_t r; int i; } pkg;
+	struct {
+		region_t r;
+		uint32_t arch_padding;
+		int32_t i;
+	} pkg;
 
 	pkg.r = region;
 	pkg.i = in_sync;
@@ -659,7 +689,11 @@ static int cluster_is_remote_recovering(struct dm_dirty_log *log, region_t regio
 	int r;
 	struct log_c *lc = (struct log_c *)log->context;
 	static unsigned long long limit = 0;
-	struct { int is_recovering; uint64_t in_sync_hint; } pkg;
+	struct {
+		int32_t is_recovering;
+		uint32_t arch_padding;
+		uint64_t in_sync_hint;
+	} pkg;
 	int rdata_size = sizeof(pkg);
 
 	/*
@@ -734,6 +768,9 @@ static int __init cluster_dirty_log_init(void)
 {
 	int r = 0;
 
+	INIT_LIST_HEAD(&log_list_head);
+	spin_lock_init(&log_list_lock);
+
 	flush_entry_pool = mempool_create(100, flush_entry_alloc,
 					  flush_entry_free, NULL);
 
diff --git a/cmirror/src/cluster.c b/cmirror/src/cluster.c
index 63af0dd..1ecef33 100644
--- a/cmirror/src/cluster.c
+++ b/cmirror/src/cluster.c
@@ -82,6 +82,7 @@ static int log_resp_rec = 0;
 struct checkpoint_data {
 	uint32_t requester;
 	char uuid[CPG_MAX_NAME_LENGTH];
+	uint32_t uuid_instance;
 
 	int bitmap_size; /* in bytes */
 	char *sync_bits;
@@ -101,6 +102,7 @@ struct clog_cpg {
 	uint32_t lowest_id;
 	cpg_handle_t handle;
 	struct cpg_name name;
+	uint32_t uuid_instance;
 
 	/* Are we the first, or have we received checkpoint? */
 	int state;
@@ -144,6 +146,12 @@ int cluster_send(struct clog_tfr *tfr)
 		return -ENOENT;
 	}
 
+	/*
+	 * uuid_instance is only valid per machine, it looses its meaning
+	 * when sent to the cluster.
+	 */
+	tfr->uuid_instance = 0;
+
 	iov.iov_base = tfr;
 	iov.iov_len = sizeof(struct clog_tfr) + tfr->data_size;
 
@@ -349,9 +357,11 @@ static struct checkpoint_data *prepare_checkpoint(struct clog_cpg *entry,
 	memset(new, 0, sizeof(*new));
 	new->requester = cp_requester;
 	strncpy(new->uuid, entry->name.value, entry->name.length);
+	new->uuid_instance = entry->uuid_instance;
 
-	new->bitmap_size = push_state(entry->name.value, "clean_bits",
-				      &new->clean_bits, cp_requester);
+	new->bitmap_size = push_state(entry->name.value, entry->uuid_instance,
+				      "clean_bits", &new->clean_bits,
+				      cp_requester);
 	if (new->bitmap_size <= 0) {
 		LOG_ERROR("Failed to store clean_bits to checkpoint for node %u",
 			  new->requester);
@@ -359,7 +369,7 @@ static struct checkpoint_data *prepare_checkpoint(struct clog_cpg *entry,
 		return NULL;
 	}
 
-	new->bitmap_size = push_state(entry->name.value,
+	new->bitmap_size = push_state(entry->name.value, entry->uuid_instance,
 				      "sync_bits", &new->sync_bits, cp_requester);
 	if (new->bitmap_size <= 0) {
 		LOG_ERROR("Failed to store sync_bits to checkpoint for node %u",
@@ -369,7 +379,9 @@ static struct checkpoint_data *prepare_checkpoint(struct clog_cpg *entry,
 		return NULL;
 	}
 
-	r = push_state(entry->name.value, "recovering_region", &new->recovering_region, cp_requester);
+	r = push_state(entry->name.value, entry->uuid_instance,
+		       "recovering_region", &new->recovering_region,
+		       cp_requester);
 	if (r <= 0) {
 		LOG_ERROR("Failed to store recovering_region to checkpoint for node %u",
 			  new->requester);
@@ -696,7 +708,7 @@ init_retry:
 		}
 
 		if (iov.readSize) {
-			if (pull_state(entry->name.value,
+			if (pull_state(entry->name.value, entry->uuid_instance,
 				       (char *)desc.sectionId.id, bitmap,
 				       iov.readSize)) {
 				LOG_ERROR("Error loading state");
@@ -1087,7 +1099,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 
 		new->next = match->checkpoint_list;
 		match->checkpoint_list = new;
-	}		
+	}
 
 out:
 	/* nothing happens after this point.  It is just for debugging */
@@ -1229,7 +1241,7 @@ static void cpg_leave_callback(struct clog_cpg *match,
 		cpg_fd_get(match->handle, &fd);
 		links_unregister(fd);
 
-		cluster_postsuspend(match->name.value);
+		cluster_postsuspend(match->name.value, match->uuid_instance);
 
 		list_for_each_safe(p, n, &match->working_list) {
 			list_del_init(p);
@@ -1281,7 +1293,7 @@ static void cpg_leave_callback(struct clog_cpg *match,
 			j--;
 		}
 	}
-	match->checkpoints_needed = j;			
+	match->checkpoints_needed = j;
 
 	if (left->nodeid < my_cluster_id) {
 		match->delay = (match->delay > 0) ? match->delay - 1 : 0;
@@ -1434,7 +1446,7 @@ unlink_retry:
 	return 1;
 }
 
-int create_cluster_cpg(char *str)
+int create_cluster_cpg(char *uuid, uint32_t uuid_instance)
 {
 	int r;
 	int size;
@@ -1442,8 +1454,8 @@ int create_cluster_cpg(char *str)
 	struct clog_cpg *tmp, *tmp2;
 
 	list_for_each_entry_safe(tmp, tmp2, &clog_cpg_list, list)
-		if (!strncmp(tmp->name.value, str, CPG_MAX_NAME_LENGTH)) {
-			LOG_ERROR("Log entry already exists: %s", str);
+		if (!strncmp(tmp->name.value, uuid, CPG_MAX_NAME_LENGTH)) {
+			LOG_ERROR("Log entry already exists: %s", uuid);
 			return -EEXIST;
 		}
 
@@ -1458,10 +1470,11 @@ int create_cluster_cpg(char *str)
 	INIT_LIST_HEAD(&new->startup_list);
 	INIT_LIST_HEAD(&new->working_list);
 
-	size = ((strlen(str) + 1) > CPG_MAX_NAME_LENGTH) ?
-		CPG_MAX_NAME_LENGTH : (strlen(str) + 1);
-	strncpy(new->name.value, str, size);
+	size = ((strlen(uuid) + 1) > CPG_MAX_NAME_LENGTH) ?
+		CPG_MAX_NAME_LENGTH : (strlen(uuid) + 1);
+	strncpy(new->name.value, uuid, size);
 	new->name.length = size;
+	new->uuid_instance = uuid_instance;
 
 	/*
 	 * Ensure there are no stale checkpoints around before we join
diff --git a/cmirror/src/cluster.h b/cmirror/src/cluster.h
index 9c98085..176024b 100644
--- a/cmirror/src/cluster.h
+++ b/cmirror/src/cluster.h
@@ -5,7 +5,7 @@ int init_cluster(void);
 void cleanup_cluster(void);
 void cluster_debug(void);
 
-int create_cluster_cpg(char *str);
+int create_cluster_cpg(char *uuid, uint32_t uuid_instance);
 int destroy_cluster_cpg(char *str);
 
 int cluster_send(struct clog_tfr *tfr);
diff --git a/cmirror/src/functions.c b/cmirror/src/functions.c
index d37c75a..f7270a6 100644
--- a/cmirror/src/functions.c
+++ b/cmirror/src/functions.c
@@ -49,7 +49,7 @@ struct log_c {
 	struct list_head list;
 
 	char uuid[DM_UUID_LEN];
-	uint32_t ref_count;
+	uint32_t uuid_instance;
 
 	time_t delay; /* limits how fast a resume can happen after suspend */
 	int touched;
@@ -148,7 +148,7 @@ static uint64_t count_bits32(uint32_t *addr, uint32_t count)
  *
  * Returns: log if found, NULL otherwise
  */
-static struct log_c *get_log(const char *uuid)
+static struct log_c *get_log(const char *uuid, uint32_t uuid_instance)
 {
 	struct list_head *l;
 	struct log_c *lc;
@@ -156,7 +156,9 @@ static struct log_c *get_log(const char *uuid)
 	/* FIXME: Need prefetch to do this right */
 	__list_for_each(l, &log_list) {
 		lc = list_entry(l, struct log_c, list);
-		if (!strcmp(lc->uuid, uuid))
+		if (!strcmp(lc->uuid, uuid) &&
+		    (!uuid_instance ||
+		     (lc->uuid_instance == uuid_instance)))
 			return lc;
 	}
 
@@ -172,7 +174,7 @@ static struct log_c *get_log(const char *uuid)
  *
  * Returns: log if found, NULL otherwise
  */
-static struct log_c *get_pending_log(const char *uuid)
+static struct log_c *get_pending_log(const char *uuid, uint32_t uuid_instance)
 {
 	struct list_head *l;
 	struct log_c *lc;
@@ -180,7 +182,9 @@ static struct log_c *get_pending_log(const char *uuid)
 	/* FIXME: Need prefetch to do this right */
 	__list_for_each(l, &log_pending_list) {
 		lc = list_entry(l, struct log_c, list);
-		if (!strcmp(lc->uuid, uuid))
+		if (!strcmp(lc->uuid, uuid) &&
+		    (!uuid_instance ||
+		     (lc->uuid_instance == uuid_instance)))
 			return lc;
 	}
 
@@ -346,7 +350,8 @@ static int find_disk_path(char *major_minor_str, char *path_rtn, int *unlink_pat
 	return r ? -errno : 0;
 }
 
-static int _clog_ctr(int argc, char **argv, uint64_t device_size)
+static int _clog_ctr(int argc, char **argv, uint32_t uuid_instance,
+		     uint64_t device_size)
 {
 	int i;
 	int r = 0;
@@ -435,16 +440,23 @@ static int _clog_ctr(int argc, char **argv, uint64_t device_size)
 	lc->skip_bit_warning = region_count;
 	lc->disk_fd = -1;
 	lc->log_dev_failed = 0;
-	lc->ref_count = 1;
 	strncpy(lc->uuid, argv[1 + disk_log], DM_UUID_LEN);
+	lc->uuid_instance = uuid_instance;
 
-	if ((dup = get_log(lc->uuid)) ||
-	    (dup = get_pending_log(lc->uuid))) {
-		LOG_DBG("[%s] Inc reference count on cluster log",
-			  SHORT_UUID(lc->uuid));
+	if ((dup = get_log(lc->uuid, lc->uuid_instance))) {
+		LOG_ERROR("Duplicate log UUID and uuid_instance!");
+		LOG_ERROR("UUID: %s", lc->uuid);
+		LOG_ERROR("uuid_instance: %u", lc->uuid_instance);
 		free(lc);
-		dup->ref_count++;
-		return 0;
+		return -EINVAL;
+	}
+
+	if ((dup = get_pending_log(lc->uuid, lc->uuid_instance))) {
+		LOG_ERROR("Duplicate pending log UUID and uuid_instance!");
+		LOG_ERROR("UUID: %s", lc->uuid);
+		LOG_ERROR("uuid_instance: %u", lc->uuid_instance);
+		free(lc);
+		return -EINVAL;
 	}
 
 	INIT_LIST_HEAD(&lc->mark_list);
@@ -545,8 +557,9 @@ static int clog_ctr(struct clog_tfr *tfr)
 
 	if (strlen(tfr->data) != tfr->data_size) {
 		LOG_ERROR("Received constructor request with bad data");
-		LOG_ERROR("strlen(tfr->data)[%d] != tfr->data_size[%d]",
-			  (int)strlen(tfr->data), tfr->data_size);
+		LOG_ERROR("strlen(tfr->data)[%d] != tfr->data_size[%llu]",
+			  (int)strlen(tfr->data),
+			  (unsigned long long)tfr->data_size);
 		LOG_ERROR("tfr->data = '%s' [%d]",
 			  tfr->data, (int)strlen(tfr->data));
 		return -EINVAL;
@@ -570,7 +583,7 @@ static int clog_ctr(struct clog_tfr *tfr)
 	}
 
 	argc--;  /* We pass in the device_size separate */
-	r = _clog_ctr(argc, argv, device_size);
+	r = _clog_ctr(argc, argv, tfr->uuid_instance, device_size);
 
 	/* We join the CPG when we resume */
 
@@ -579,7 +592,8 @@ static int clog_ctr(struct clog_tfr *tfr)
 
 	free(argv);
 	if (r)
-		LOG_ERROR("Failed to create cluster log (%s)", tfr->uuid);
+		LOG_ERROR("Failed to create cluster log (%s)",
+			  SHORT_UUID(tfr->uuid));
 	else
 		LOG_DBG("[%s] Cluster log created",
 			  SHORT_UUID(tfr->uuid));
@@ -594,32 +608,21 @@ static int clog_ctr(struct clog_tfr *tfr)
  */
 static int clog_dtr(struct clog_tfr *tfr)
 {
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	if (lc) {
 		/*
 		 * The log should not be on the official list.  There
 		 * should have been a suspend first.
 		 */
-		lc->ref_count--;
-		if (!lc->ref_count) {
-			LOG_ERROR("[%s] DTR before SUS: leaving CPG",
-				  SHORT_UUID(tfr->uuid));
-			destroy_cluster_cpg(tfr->uuid);
-		}
-	} else if ((lc = get_pending_log(tfr->uuid))) {
-		lc->ref_count--;
-	} else {
+		LOG_ERROR("[%s] DTR before SUS: leaving CPG",
+			  SHORT_UUID(tfr->uuid));
+		destroy_cluster_cpg(tfr->uuid);
+	} else if (!(lc = get_pending_log(tfr->uuid, tfr->uuid_instance))) {
 		LOG_ERROR("clog_dtr called on log that is not official or pending");
 		return -EINVAL;
 	}
 
-	if (lc->ref_count) {
-		LOG_DBG("[%s] Dec reference count on cluster log",
-			  SHORT_UUID(lc->uuid));
-		return 0;
-	}
-
 	LOG_DBG("[%s] Cluster log removed", SHORT_UUID(lc->uuid));
 
 	list_del_init(&lc->list);
@@ -641,7 +644,7 @@ static int clog_dtr(struct clog_tfr *tfr)
  */
 static int clog_presuspend(struct clog_tfr *tfr)
 {
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
 		return -EINVAL;
@@ -661,7 +664,7 @@ static int clog_presuspend(struct clog_tfr *tfr)
  */
 static int clog_postsuspend(struct clog_tfr *tfr)
 {
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
 		return -EINVAL;
@@ -682,9 +685,9 @@ static int clog_postsuspend(struct clog_tfr *tfr)
  * @tfr
  *
  */
-int cluster_postsuspend(char *uuid)
+int cluster_postsuspend(char *uuid, uint32_t uuid_instance)
 {
-	struct log_c *lc = get_log(uuid);
+	struct log_c *lc = get_log(uuid, uuid_instance);
 
 	if (!lc)
 		return -EINVAL;
@@ -709,7 +712,7 @@ static int clog_resume(struct clog_tfr *tfr)
 {
 	uint32_t i;
 	int commit_log = 0;
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 	size_t size = lc->bitset_uint32_count * sizeof(uint32_t);
 
 	if (!lc)
@@ -835,11 +838,11 @@ int local_resume(struct clog_tfr *tfr)
 {
 	int r;
 	time_t t;
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc) {
 		/* Is the log in the pending list? */
-		lc = get_pending_log(tfr->uuid);
+		lc = get_pending_log(tfr->uuid, tfr->uuid_instance);
 		if (!lc) {
 			LOG_ERROR("clog_resume called on log that is not official or pending");
 			return -EINVAL;
@@ -874,12 +877,22 @@ int local_resume(struct clog_tfr *tfr)
 			sleep(3 - t);
 
 		/* Join the CPG */
-		r = create_cluster_cpg(tfr->uuid);
+		r = create_cluster_cpg(tfr->uuid, tfr->uuid_instance);
 		if (r) {
 			LOG_ERROR("clog_resume:  Failed to create cluster CPG");
 			return r;
 		}
 
+		if (get_log(lc->uuid, lc->uuid_instance)) {
+			/*
+			 * Not being able to identify a log uniquely is fatal,
+			 * and represents a programming error.
+			 */
+			LOG_ERROR("[%s/%u]  DUPLICATE LOG WITH SAME UUID[_INSTANCE]!",
+				  SHORT_UUID(lc->uuid), lc->uuid_instance);
+			exit(EXIT_FAILURE);
+		}
+
 		/* move log to official list */
 		list_del_init(&lc->list);
 		list_add(&lc->list, &log_list);
@@ -901,7 +914,7 @@ int local_resume(struct clog_tfr *tfr)
 static int clog_get_region_size(struct clog_tfr *tfr)
 {
 	uint64_t *rtn = (uint64_t *)tfr->data;
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	LOG_PRINT("WARNING: kernel should not be calling clog_get_region_size");
 	if (!lc)
@@ -924,7 +937,7 @@ static int clog_is_clean(struct clog_tfr *tfr)
 {
 	int *rtn = (int *)tfr->data;
 	uint64_t region = *((uint64_t *)(tfr->data));
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
 		return -EINVAL;
@@ -949,7 +962,7 @@ static int clog_in_sync(struct clog_tfr *tfr)
 {
 	int *rtn = (int *)tfr->data;
 	uint64_t region = *((uint64_t *)(tfr->data));
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
 		return -EINVAL;
@@ -978,7 +991,7 @@ static int clog_in_sync(struct clog_tfr *tfr)
 static int clog_flush(struct clog_tfr *tfr, int server)
 {
 	int r = 0;
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 	
 	if (!lc)
 		return -EINVAL;
@@ -1066,7 +1079,7 @@ static int clog_mark_region(struct clog_tfr *tfr)
 	int r;
 	int count;
 	uint64_t *region;
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
 		return -EINVAL;
@@ -1133,7 +1146,7 @@ static int clog_clear_region(struct clog_tfr *tfr)
 	int r;
 	int count;
 	uint64_t *region;
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
 		return -EINVAL;
@@ -1164,8 +1177,12 @@ static int clog_clear_region(struct clog_tfr *tfr)
  */
 static int clog_get_resync_work(struct clog_tfr *tfr)
 {
-	struct {int i; uint64_t r; } *pkg = (void *)tfr->data;
-	struct log_c *lc = get_log(tfr->uuid);
+	struct {
+		int32_t i;
+		uint32_t arch_padding;
+		uint64_t r;
+	} *pkg = (void *)tfr->data;
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
 		return -EINVAL;
@@ -1254,8 +1271,12 @@ static int clog_get_resync_work(struct clog_tfr *tfr)
  */
 static int clog_set_region_sync(struct clog_tfr *tfr)
 {
-	struct { uint64_t region; int in_sync; } *pkg = (void *)tfr->data;
-	struct log_c *lc = get_log(tfr->uuid);
+	struct {
+		uint64_t region;
+		uint32_t arch_padding;
+		int32_t in_sync;
+	} *pkg = (void *)tfr->data;
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
 		return -EINVAL;
@@ -1334,7 +1355,7 @@ static int clog_set_region_sync(struct clog_tfr *tfr)
 static int clog_get_sync_count(struct clog_tfr *tfr)
 {
 	uint64_t *sync_count = (uint64_t *)tfr->data;
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	/*
 	 * FIXME: Mirror requires us to be able to ask for
@@ -1343,7 +1364,7 @@ static int clog_get_sync_count(struct clog_tfr *tfr)
 	 * the stored value may not be accurate.
 	 */
 	if (!lc)
-		lc = get_pending_log(tfr->uuid);
+		lc = get_pending_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
 		return -EINVAL;
@@ -1402,10 +1423,10 @@ static int disk_status_info(struct log_c *lc, struct clog_tfr *tfr)
 static int clog_status_info(struct clog_tfr *tfr)
 {
 	int r;
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
-		lc = get_pending_log(tfr->uuid);
+		lc = get_pending_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
 		return -EINVAL;
@@ -1461,10 +1482,10 @@ static int disk_status_table(struct log_c *lc, struct clog_tfr *tfr)
 static int clog_status_table(struct clog_tfr *tfr)
 {
 	int r;
-	struct log_c *lc = get_log(tfr->uuid);
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
-		lc = get_pending_log(tfr->uuid);
+		lc = get_pending_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
 		return -EINVAL;
@@ -1485,8 +1506,12 @@ static int clog_status_table(struct clog_tfr *tfr)
 static int clog_is_remote_recovering(struct clog_tfr *tfr)
 {
 	uint64_t region = *((uint64_t *)(tfr->data));
-	struct { int is_recovering; uint64_t in_sync_hint; } *pkg = (void *)tfr->data;
-	struct log_c *lc = get_log(tfr->uuid);
+	struct {
+		int32_t is_recovering;
+		uint32_t arch_padding;
+		uint64_t in_sync_hint;
+	} *pkg = (void *)tfr->data;
+	struct log_c *lc = get_log(tfr->uuid, tfr->uuid_instance);
 
 	if (!lc)
 		return -EINVAL;
@@ -1667,7 +1692,8 @@ static void print_bits(char *buf, int size, int print)
 }
 
 /* int store_bits(const char *uuid, const char *which, char **buf)*/
-int push_state(const char *uuid, const char *which, char **buf, uint32_t debug_who)
+int push_state(const char *uuid, uint32_t uuid_instance, const char *which,
+	       char **buf, uint32_t debug_who)
 {
 	int bitset_size;
 	struct log_c *lc;
@@ -1675,7 +1701,7 @@ int push_state(const char *uuid, const char *which, char **buf, uint32_t debug_w
 	if (*buf)
 		LOG_ERROR("store_bits: *buf != NULL");
 
-	lc = get_log(uuid);
+	lc = get_log(uuid, uuid_instance);
 	if (!lc) {
 		LOG_ERROR("store_bits: No log found for %s", uuid);
 		return -EINVAL;
@@ -1721,7 +1747,8 @@ int push_state(const char *uuid, const char *which, char **buf, uint32_t debug_w
 }
 
 /*int load_bits(const char *uuid, const char *which, char *buf, int size)*/
-int pull_state(const char *uuid, const char *which, char *buf, int size)
+int pull_state(const char *uuid, uint32_t uuid_instance,
+	       const char *which, char *buf, int size)
 {
 	int bitset_size;
 	struct log_c *lc;
@@ -1729,7 +1756,7 @@ int pull_state(const char *uuid, const char *which, char *buf, int size)
 	if (!buf)
 		LOG_ERROR("pull_state: buf == NULL");
 
-	lc = get_log(uuid);
+	lc = get_log(uuid, uuid_instance);
 	if (!lc) {
 		LOG_ERROR("pull_state: No log found for %s", uuid);
 		return -EINVAL;
@@ -1773,7 +1800,7 @@ int log_get_state(struct clog_tfr *tfr)
 {
 	struct log_c *lc;
 
-	lc = get_log(tfr->uuid);
+	lc = get_log(tfr->uuid, tfr->uuid_instance);
 	if (!lc)
 		return -EINVAL;
 
diff --git a/cmirror/src/functions.h b/cmirror/src/functions.h
index 7c01c64..63ed49a 100644
--- a/cmirror/src/functions.h
+++ b/cmirror/src/functions.h
@@ -7,11 +7,13 @@
 #define LOG_SUSPENDED 2
 
 int local_resume(struct clog_tfr *tfr);
-int cluster_postsuspend(char *);
+int cluster_postsuspend(char *, uint32_t);
 
 int do_request(struct clog_tfr *tfr, int server);
-int push_state(const char *uuid, const char *which, char **buf, uint32_t debug_who);
-int pull_state(const char *uuid, const char *which, char *buf, int size);
+int push_state(const char *uuid, uint32_t uuid_instance,
+	       const char *which, char **buf, uint32_t debug_who);
+int pull_state(const char *uuid, uint32_t uuid_instance,
+	       const char *which, char *buf, int size);
 
 int log_get_state(struct clog_tfr *tfr);
 int log_status(void);
diff --git a/cmirror/src/local.c b/cmirror/src/local.c
index 7f9a403..cf64855 100644
--- a/cmirror/src/local.c
+++ b/cmirror/src/local.c
@@ -263,6 +263,7 @@ static int do_local_work(void *data)
 	case DM_CLOG_GET_REGION_SIZE:
 	default:
 		LOG_ERROR("Invalid log request received, ignoring.");
+
 		return 0;
 	}
 


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2009-07-27 18:10 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-07-27 18:10 cluster: RHEL54 - clogd/dm-log-clustered.ko: Fix for bugs 506843, 479749, 507400 Jonathan Brassow

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).