public inbox for cluster-cvs@sourceware.org
help / color / mirror / Atom feed
* Cluster Project branch, RHEL5, updated. cmirror_1_1_15-150-gb291647
@ 2008-07-18 15:19 jbrassow
  0 siblings, 0 replies; only message in thread
From: jbrassow @ 2008-07-18 15:19 UTC (permalink / raw)
  To: cluster-cvs, cluster-devel

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "Cluster Project".

http://sources.redhat.com/git/gitweb.cgi?p=cluster.git;a=commitdiff;h=b2916471b1b7c79dba7f9624a1a148240375891f

The branch, RHEL5 has been updated
       via  b2916471b1b7c79dba7f9624a1a148240375891f (commit)
      from  705c5ceb17da0daf018c688ac478b4fae2371e3d (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit b2916471b1b7c79dba7f9624a1a148240375891f
Author: Jonathan Brassow <jbrassow@redhat.com>
Date:   Fri Jul 18 10:16:45 2008 -0500

    dm-log-clustered: Fix bug that would cause communication problems
    
    When an error is received from user space, the caller of the
    communication function is given the return code along with a
    parameter indicating that no data is available.  However,
    if the error was EAGAIN, the data_size variable was still set
    to 0.  So, when the retry was attempted and data returned from
    userspace, the kernel would think that there was not enough
    room to store the request.
    
    Also did quite a bit of code clean-up (no functional changes)
    in userspace, such as a number->string translation for openAIS
    error codes.

-----------------------------------------------------------------------

Summary of changes:
 cmirror-kernel/src/dm-clog-tfr.c |    8 +-
 cmirror/src/cluster.c            |  237 ++++++++++++++++++--------------------
 2 files changed, 117 insertions(+), 128 deletions(-)

diff --git a/cmirror-kernel/src/dm-clog-tfr.c b/cmirror-kernel/src/dm-clog-tfr.c
index 95d4bd2..7932f77 100644
--- a/cmirror-kernel/src/dm-clog-tfr.c
+++ b/cmirror-kernel/src/dm-clog-tfr.c
@@ -96,7 +96,13 @@ static int fill_pkg(struct cn_msg *msg, struct clog_tfr *tfr)
 
 		if (msg) {
 			pkg->error = -msg->ack;
-			*(pkg->data_size) = 0;
+			/*
+			 * If we are trying again, we will need to know our
+			 * storage capacity.  Otherwise, along with the
+			 * error code, we make explicit that we have no data.
+			 */
+			if (pkg->error != -EAGAIN)
+				*(pkg->data_size) = 0;
 		} else if (tfr->data_size > *(pkg->data_size)) {
 			DMERR("Insufficient space to receive package [%s]::",
 			      RQ_TYPE(tfr->request_type));
diff --git a/cmirror/src/cluster.c b/cmirror/src/cluster.c
index 3a18687..68ffefb 100644
--- a/cmirror/src/cluster.c
+++ b/cmirror/src/cluster.c
@@ -19,35 +19,36 @@
 #include "logging.h"
 #include "link_mon.h"
 
-/* Open AIS error codes
-        SA_AIS_OK = 1,
-        SA_AIS_ERR_LIBRARY = 2,
-        SA_AIS_ERR_VERSION = 3,
-        SA_AIS_ERR_INIT = 4,
-        SA_AIS_ERR_TIMEOUT = 5,
-        SA_AIS_ERR_TRY_AGAIN = 6,
-        SA_AIS_ERR_INVALID_PARAM = 7,
-        SA_AIS_ERR_NO_MEMORY = 8,
-        SA_AIS_ERR_BAD_HANDLE = 9,
-        SA_AIS_ERR_BUSY = 10,
-        SA_AIS_ERR_ACCESS = 11,
-        SA_AIS_ERR_NOT_EXIST = 12,
-        SA_AIS_ERR_NAME_TOO_LONG = 13,
-        SA_AIS_ERR_EXIST = 14,
-        SA_AIS_ERR_NO_SPACE = 15,
-        SA_AIS_ERR_INTERRUPT = 16,
-        SA_AIS_ERR_NAME_NOT_FOUND = 17,
-        SA_AIS_ERR_NO_RESOURCES = 18,
-        SA_AIS_ERR_NOT_SUPPORTED = 19,
-        SA_AIS_ERR_BAD_OPERATION = 20,
-        SA_AIS_ERR_FAILED_OPERATION = 21,
-        SA_AIS_ERR_MESSAGE_ERROR = 22,
-        SA_AIS_ERR_QUEUE_FULL = 23,
-        SA_AIS_ERR_QUEUE_NOT_AVAILABLE = 24,
-        SA_AIS_ERR_BAD_FLAGS = 25,
-        SA_AIS_ERR_TOO_BIG = 26,
-        SA_AIS_ERR_NO_SECTIONS = 27
-*/
+/* Open AIS error codes */
+#define str_ais_error(x) \
+	((x) == SA_AIS_OK) ? "SA_AIS_OK" : \
+	((x) == SA_AIS_ERR_LIBRARY) ? "SA_AIS_ERR_LIBRARY" : \
+	((x) == SA_AIS_ERR_VERSION) ? "SA_AIS_ERR_VERSION" : \
+	((x) == SA_AIS_ERR_INIT) ? "SA_AIS_ERR_INIT" : \
+	((x) == SA_AIS_ERR_TIMEOUT) ? "SA_AIS_ERR_TIMEOUT" : \
+	((x) == SA_AIS_ERR_TRY_AGAIN) ? "SA_AIS_ERR_TRY_AGAIN" : \
+	((x) == SA_AIS_ERR_INVALID_PARAM) ? "SA_AIS_ERR_INVALID_PARAM" : \
+	((x) == SA_AIS_ERR_NO_MEMORY) ? "SA_AIS_ERR_NO_MEMORY" : \
+	((x) == SA_AIS_ERR_BAD_HANDLE) ? "SA_AIS_ERR_BAD_HANDLE" : \
+	((x) == SA_AIS_ERR_BUSY) ? "SA_AIS_ERR_BUSY" : \
+	((x) == SA_AIS_ERR_ACCESS) ? "SA_AIS_ERR_ACCESS" : \
+	((x) == SA_AIS_ERR_NOT_EXIST) ? "SA_AIS_ERR_NOT_EXIST" : \
+	((x) == SA_AIS_ERR_NAME_TOO_LONG) ? "SA_AIS_ERR_NAME_TOO_LONG" : \
+	((x) == SA_AIS_ERR_EXIST) ? "SA_AIS_ERR_EXIST" : \
+	((x) == SA_AIS_ERR_NO_SPACE) ? "SA_AIS_ERR_NO_SPACE" : \
+	((x) == SA_AIS_ERR_INTERRUPT) ? "SA_AIS_ERR_INTERRUPT" : \
+	((x) == SA_AIS_ERR_NAME_NOT_FOUND) ? "SA_AIS_ERR_NAME_NOT_FOUND" : \
+	((x) == SA_AIS_ERR_NO_RESOURCES) ? "SA_AIS_ERR_NO_RESOURCES" : \
+	((x) == SA_AIS_ERR_NOT_SUPPORTED) ? "SA_AIS_ERR_NOT_SUPPORTED" : \
+	((x) == SA_AIS_ERR_BAD_OPERATION) ? "SA_AIS_ERR_BAD_OPERATION" : \
+	((x) == SA_AIS_ERR_FAILED_OPERATION) ? "SA_AIS_ERR_FAILED_OPERATION" : \
+	((x) == SA_AIS_ERR_MESSAGE_ERROR) ? "SA_AIS_ERR_MESSAGE_ERROR" : \
+	((x) == SA_AIS_ERR_QUEUE_FULL) ? "SA_AIS_ERR_QUEUE_FULL" : \
+	((x) == SA_AIS_ERR_QUEUE_NOT_AVAILABLE) ? "SA_AIS_ERR_QUEUE_NOT_AVAILABLE" : \
+	((x) == SA_AIS_ERR_BAD_FLAGS) ? "SA_AIS_ERR_BAD_FLAGS" : \
+	((x) == SA_AIS_ERR_TOO_BIG) ? "SA_AIS_ERR_TOO_BIG" : \
+	((x) == SA_AIS_ERR_NO_SECTIONS) ? "SA_AIS_ERR_NO_SECTIONS" : \
+	"ais_error_unknown"
 
 #define DM_CLOG_RESPONSE 0x1000 /* in last byte of 32-bit value */
 #define DM_CLOG_CHECKPOINT_READY 21
@@ -145,10 +146,7 @@ int cluster_send(struct clog_tfr *tfr)
 		return 0;
 
 	/* error codes found in openais/cpg.h */
-	LOG_ERROR("cpg_mcast_joined error: %d%s", r,
-		  (r == SA_AIS_ERR_TRY_AGAIN) ? "/SA_AIS_ERR_TRY_AGAIN" :
-		  (r == CPG_ERR_BAD_HANDLE) ? "/CPG_ERR_BAD_HANDLE" :
-		  (r == CPG_ERR_ACCESS) ? "/CPG_ERR_ACCESS" : "");
+	LOG_ERROR("cpg_mcast_joined error: %s", str_ais_error(r));
 
 	tfr->error = -EBADE;
 	return -EBADE;
@@ -358,16 +356,19 @@ static int export_checkpoint(struct checkpoint_data *cp)
 
 	LOG_DBG("Sending checkpointed data to %u", cp->requester);
 
-	len = snprintf((char *)(name.value), SA_MAX_NAME_LENGTH, "bitmaps_%s_%u",
-		       SHORT_UUID(cp->uuid), cp->requester);
+	len = snprintf((char *)(name.value), SA_MAX_NAME_LENGTH,
+		       "bitmaps_%s_%u", SHORT_UUID(cp->uuid), cp->requester);
 	name.length = len;
 
+	len = strlen(cp->recovering_region) + 1;
+
 	attr.creationFlags = SA_CKPT_WR_ALL_REPLICAS;
-	attr.checkpointSize = cp->bitmap_size * 2 + strlen(cp->recovering_region) + 1;
+	attr.checkpointSize = cp->bitmap_size * 2 + len;
+
 	attr.retentionDuration = SA_TIME_MAX;
 	attr.maxSections = 4;      /* don't know why we need +1 */
-	attr.maxSectionSize = (cp->bitmap_size > (strlen(cp->recovering_region) + 1)) ?
-		cp->bitmap_size : (strlen(cp->recovering_region) + 1);
+
+	attr.maxSectionSize = (cp->bitmap_size > len) ?	cp->bitmap_size : len;
 	attr.maxSectionIdSize = 22;
 
 	flags = SA_CKPT_CHECKPOINT_READ |
@@ -388,8 +389,9 @@ open_retry:
 	}
 
 	if (rv != SA_AIS_OK) {
-		LOG_ERROR("[%s] Failed to open checkpoint for %u:  Reason = %d",
-			  SHORT_UUID(cp->uuid), cp->requester, rv);
+		LOG_ERROR("[%s] Failed to open checkpoint for %u: %s",
+			  SHORT_UUID(cp->uuid), cp->requester,
+			  str_ais_error(rv));
 		return -EIO; /* FIXME: better error */
 	}
 
@@ -402,21 +404,23 @@ open_retry:
 	section_attr.expirationTime = SA_TIME_END;
 
 sync_create_retry:
-	rv = saCkptSectionCreate(h, &section_attr, cp->sync_bits, cp->bitmap_size);
+	rv = saCkptSectionCreate(h, &section_attr,
+				 cp->sync_bits, cp->bitmap_size);
 	if (rv == SA_AIS_ERR_TRY_AGAIN) {
-		LOG_ERROR("export_checkpoint: sync create retry");
+		LOG_ERROR("Sync checkpoint section create retry");
 		sleep(1);
 		goto sync_create_retry;
 	}
 
 	if (rv == SA_AIS_ERR_EXIST) {
-		LOG_DBG("export_checkpoint: sync checkpoint section already exists");
+		LOG_DBG("Sync checkpoint section already exists");
 		saCkptCheckpointClose(h);
 		return -EEXIST;
 	}
 
 	if (rv != SA_AIS_OK) {
-		LOG_ERROR("export_checkpoint: sync checkpoint section creation failed");
+		LOG_ERROR("Sync checkpoint section creation failed: %s",
+			  str_ais_error(rv));
 		saCkptCheckpointClose(h);
 		return -EIO; /* FIXME: better error */
 	}
@@ -432,19 +436,20 @@ sync_create_retry:
 clean_create_retry:
 	rv = saCkptSectionCreate(h, &section_attr, cp->clean_bits, cp->bitmap_size);
 	if (rv == SA_AIS_ERR_TRY_AGAIN) {
-		LOG_ERROR("export_checkpoint: clean create retry");
+		LOG_ERROR("Clean checkpoint section create retry");
 		sleep(1);
 		goto clean_create_retry;
 	}
 
 	if (rv == SA_AIS_ERR_EXIST) {
-		LOG_DBG("export_checkpoint: clean checkpoint section already exists");
+		LOG_DBG("Clean checkpoint section already exists");
 		saCkptCheckpointClose(h);
 		return -EEXIST;
 	}
 
 	if (rv != SA_AIS_OK) {
-		LOG_ERROR("export_checkpoint: clean checkpoint section creation failed");
+		LOG_ERROR("Clean checkpoint section creation failed: %s",
+			  str_ais_error(rv));
 		saCkptCheckpointClose(h);
 		return -EIO; /* FIXME: better error */
 	}
@@ -461,19 +466,20 @@ rr_create_retry:
 	rv = saCkptSectionCreate(h, &section_attr, cp->recovering_region,
 				 strlen(cp->recovering_region) + 1);
 	if (rv == SA_AIS_ERR_TRY_AGAIN) {
-		LOG_ERROR("export_checkpoint: RR create retry");
+		LOG_ERROR("RR checkpoint section create retry");
 		sleep(1);
 		goto rr_create_retry;
 	}
 
 	if (rv == SA_AIS_ERR_EXIST) {
-		LOG_DBG("export_checkpoint: RR checkpoint section already exists");
+		LOG_DBG("RR checkpoint section already exists");
 		saCkptCheckpointClose(h);
 		return -EEXIST;
 	}
 
 	if (rv != SA_AIS_OK) {
-		LOG_ERROR("export_checkpoint: RR checkpoint section creation failed");
+		LOG_ERROR("RR checkpoint section creation failed: %s",
+			  str_ais_error(rv));
 		saCkptCheckpointClose(h);
 		return -EIO; /* FIXME: better error */
 	}
@@ -509,21 +515,6 @@ rr_create_retry:
 	return 0;
 }
 
-void ckpt_print (char *str, SaCkptCheckpointHandleT handle)
-{
-	SaCkptCheckpointDescriptorT descriptor;
-	SaAisErrorT rv;
-
-retry_statusget:
-	rv = saCkptCheckpointStatusGet (handle, &descriptor);
-	if (rv == SA_AIS_ERR_TRY_AGAIN)
-		goto retry_statusget;
-	
-	LOG_DBG("printing [%s] sections [%d] result [%d]",
-		str, descriptor.numberOfSections, rv);
-}
-
-
 static int import_checkpoint(struct clog_cpg *entry, int no_read)
 {
 	int rtn = 0;
@@ -554,12 +545,11 @@ open_retry:
 	}
 
 	if (rv != SA_AIS_OK) {
-		LOG_ERROR("Failed to open checkpoint");
+		LOG_ERROR("[%s] Failed to open checkpoint: %s",
+			  SHORT_UUID(entry->name.value), str_ais_error(rv));
 		return -EIO; /* FIXME: better error */
 	}
 
-	ckpt_print ("Before unlink", h);
-
 unlink_retry:
 	rv = saCkptCheckpointUnlink(ckpt_handle, &name);
 	if (rv == SA_AIS_ERR_TRY_AGAIN) {
@@ -573,10 +563,9 @@ unlink_retry:
 		goto no_read;
 	}
 
-	ckpt_print ("After unlink", h);
-
 init_retry:
-	rv = saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, SA_TIME_END, &itr);
+	rv = saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY,
+					      SA_TIME_END, &itr);
 	if (rv == SA_AIS_ERR_TRY_AGAIN) {
 		LOG_ERROR("import_checkpoint: sync create retry");
 		sleep(1);
@@ -584,7 +573,8 @@ init_retry:
 	}
 
 	if (rv != SA_AIS_OK) {
-		LOG_ERROR("import_checkpoint: sync checkpoint section creation failed");
+		LOG_ERROR("[%s] Sync checkpoint section creation failed: %s",
+			  SHORT_UUID(entry->name.value), str_ais_error(rv));
 		return -EIO; /* FIXME: better error */
 	}
 
@@ -602,11 +592,13 @@ init_retry:
 	}
 	saCkptSectionIterationFinalize(itr);
 	if (len != 3) {
-		LOG_ERROR("import_checkpoint: %d checkpoint sections found", len);
+		LOG_ERROR("import_checkpoint: %d checkpoint sections found",
+			  len);
 		sleep(1);
 		goto init_retry;
 	}
-	saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, SA_TIME_END, &itr);
+	saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY,
+					 SA_TIME_END, &itr);
 
 	while (1) {
 		rv = saCkptSectionIterationNext(itr, &desc);
@@ -620,7 +612,8 @@ init_retry:
 		}
 
 		if (rv != SA_AIS_OK) {
-			LOG_ERROR("import_checkpoint: clean checkpoint section creation failed");
+			LOG_ERROR("import_checkpoint: clean checkpoint section "
+				  "creation failed: %s", str_ais_error(rv));
 			rtn = -EIO; /* FIXME: better error */
 			goto fail;
 		}
@@ -645,20 +638,15 @@ init_retry:
 		}
 
 		if (rv != SA_AIS_OK) {
-			LOG_ERROR("import_checkpoint: ckpt read error");
+			LOG_ERROR("import_checkpoint: ckpt read error: %s",
+				  str_ais_error(rv));
 			rtn = -EIO; /* FIXME: better error */
 			goto fail;
 		}
 
-		/* FIXME: Is this catching something special?
-		if (!iov.readSize) {
-			LOG_ERROR("%s section empty", (char *)desc.sectionId.id);
-			continue;
-		}
-		*/
-
 		if (iov.readSize) {
-			if (pull_state(entry->name.value, (char *)desc.sectionId.id, bitmap,
+			if (pull_state(entry->name.value,
+				       (char *)desc.sectionId.id, bitmap,
 				       iov.readSize)) {
 				LOG_ERROR("Error loading state");
 				rtn = -EIO;
@@ -676,67 +664,62 @@ fail:
 no_read:
 	saCkptCheckpointClose(h);
 
-	/*
-	LOG_PRINT("Testing if chkpoint exists after unlink/close");
-	rv = saCkptCheckpointOpen(ckpt_handle, &name, NULL,
-				  SA_CKPT_CHECKPOINT_READ, 0, &h);
-	if (rv != SA_AIS_OK) {
-		LOG_PRINT("Checkpoint was not removed!!!");
-		saCkptCheckpointClose(h);
-	} else
-		LOG_PRINT("   rv == %d", rv);
-	*/
-
 	free(bitmap);
 	return rtn;
 }
 
+static void do_checkpoints(struct clog_cpg *entry)
+{
+	struct checkpoint_data *cp;
+
+	for (cp = entry->checkpoint_list; cp;) {
+		LOG_DBG("[%s] Checkpoint data available for node %u",
+			SHORT_UUID(entry->name.value), cp->requester);
+
+		/*
+		 * FIXME: Check return code.  Could send failure
+		 * notice in tfr in export_checkpoint function
+		 * by setting tfr->error
+		 */
+		switch (export_checkpoint(cp)) {
+		case -EEXIST:
+			LOG_DBG("[%s] Checkpoint for %u already handled",
+				SHORT_UUID(entry->name.value), cp->requester);
+		case 0:
+			entry->checkpoint_list = cp->next;
+			free_checkpoint(cp);
+			cp = entry->checkpoint_list;
+			break;
+		default:
+			/* FIXME: Skipping will cause list corruption */
+			LOG_ERROR("[%s] Failed to export checkpoint for %u",
+				  SHORT_UUID(entry->name.value), cp->requester);
+		}
+	}
+}
+
 static int do_cluster_work(void *data)
 {
 	int r = SA_AIS_OK;
 	struct clog_cpg *entry, *tmp;
-	struct checkpoint_data *cp;
 
 	list_for_each_entry_safe(entry, tmp, &clog_cpg_list, list) {
 		r = cpg_dispatch(entry->handle, CPG_DISPATCH_ALL);
 		if (r != SA_AIS_OK)
-			LOG_ERROR("cpg_dispatch failed: %d", r);
+			LOG_ERROR("cpg_dispatch failed: %s", str_ais_error(r));
 
 		if (entry->free_me) {
 			free(entry);
 			continue;
 		}
-
-		for (cp = entry->checkpoint_list; cp;) {
-			LOG_DBG("[%s] Checkpoint data available for node %u",
-				SHORT_UUID(entry->name.value), cp->requester);
-
-			/*
-			 * FIXME: Check return code.  Could send failure
-			 * notice in tfr in export_checkpoint function
-			 * by setting tfr->error
-			 */
-			switch (export_checkpoint(cp)) {
-			case -EEXIST:
-				LOG_DBG("[%s] Checkpoint for %u already handled by someone else",
-					SHORT_UUID(entry->name.value), cp->requester);
-			case 0:
-				entry->checkpoint_list = cp->next;
-				free_checkpoint(cp);
-				cp = entry->checkpoint_list;
-				break;
-			default:
-				/* FIXME: Skipping will cause list corruption */
-				LOG_ERROR("[%s] Failed to export checkpoint for %u",
-					  SHORT_UUID(entry->name.value), cp->requester);
-			}
-		}
+		do_checkpoints(entry);
 	}
 	return (r == SA_AIS_OK) ? 0 : -1;  /* FIXME: good error number? */
 }
 
 static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
-				 uint32_t nodeid, uint32_t pid, void *msg, int msg_len)
+				 uint32_t nodeid, uint32_t pid,
+				 void *msg, int msg_len)
 {
 	int i;
 	int r = 0;
@@ -785,7 +768,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname,
 	 * get our config callback.  However, since we can't respond after
 	 * leaving, we simply return.
 	 */
-	if (match->cpg_state != VALID)
+	if (match->state == LEAVING)
 		return;
 
 	i_am_server = (my_cluster_id == match->lowest_id) ? 1 : 0;
@@ -1098,10 +1081,10 @@ static void cpg_leave_callback(struct clog_cpg *match,
 			 */
 			if (!strcmp(match->name.value, tfr->uuid) &&
 			    (tfr->request_type != DM_CLOG_POSTSUSPEND)){
-				LOG_PRINT("[%s] Resending %s due to new server(%u)",
+				LOG_PRINT("[%s] Resending %s due to new server(%u -> %u)",
 					  SHORT_UUID(match->name.value),
 					  RQ_TYPE(tfr->request_type),
-					  match->lowest_id);
+					  lowest, match->lowest_id);
 				if (cluster_send(tfr))
 					LOG_ERROR("Failed resend");
 			}
@@ -1225,11 +1208,11 @@ int destroy_cluster_cpg(char *str)
 	list_for_each_entry_safe(del, tmp, &clog_cpg_list, list)
 		if (!strncmp(del->name.value, str, CPG_MAX_NAME_LENGTH)) {
 			del->cpg_state = INVALID;
+			del->state = LEAVING;
 			r = cpg_leave(del->handle, &del->name);
 			if (r != CPG_OK)
 				LOG_ERROR("Error leaving CPG!");
 			break;
-			del->state = LEAVING;
 		}
 
 	return 0;


hooks/post-receive
--
Cluster Project


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2008-07-18 15:19 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-07-18 15:19 Cluster Project branch, RHEL5, updated. cmirror_1_1_15-150-gb291647 jbrassow

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).