From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 12508 invoked by alias); 18 Jul 2008 15:19:14 -0000 Received: (qmail 12476 invoked by uid 9478); 18 Jul 2008 15:19:13 -0000 Date: Fri, 18 Jul 2008 15:19:00 -0000 Message-ID: <20080718151913.12461.qmail@sourceware.org> From: jbrassow@sourceware.org To: cluster-cvs@sources.redhat.com, cluster-devel@redhat.com Subject: Cluster Project branch, RHEL5, updated. cmirror_1_1_15-150-gb291647 X-Git-Refname: refs/heads/RHEL5 X-Git-Reftype: branch X-Git-Oldrev: 705c5ceb17da0daf018c688ac478b4fae2371e3d X-Git-Newrev: b2916471b1b7c79dba7f9624a1a148240375891f Mailing-List: contact cluster-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: cluster-cvs-owner@sourceware.org X-SW-Source: 2008-q3/txt/msg00100.txt.bz2 This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "Cluster Project". http://sources.redhat.com/git/gitweb.cgi?p=cluster.git;a=commitdiff;h=b2916471b1b7c79dba7f9624a1a148240375891f The branch, RHEL5 has been updated via b2916471b1b7c79dba7f9624a1a148240375891f (commit) from 705c5ceb17da0daf018c688ac478b4fae2371e3d (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit b2916471b1b7c79dba7f9624a1a148240375891f Author: Jonathan Brassow Date: Fri Jul 18 10:16:45 2008 -0500 dm-log-clustered: Fix bug that would cause communication problems When an error is received from user space, the caller of the communication function is given the return code along with a parameter indicating that no data is available. However, if the error was EAGAIN, the data_size variable was still set to 0. 
So, when the retry was attempted and data returned from userspace, the kernel would think that there was not enough room to store the request. Also did quite a bit of code clean-up (no functional changes) in userspace.... like a number->string translation for openAIS error codes, etc... ----------------------------------------------------------------------- Summary of changes: cmirror-kernel/src/dm-clog-tfr.c | 8 +- cmirror/src/cluster.c | 237 ++++++++++++++++++-------------------- 2 files changed, 117 insertions(+), 128 deletions(-) diff --git a/cmirror-kernel/src/dm-clog-tfr.c b/cmirror-kernel/src/dm-clog-tfr.c index 95d4bd2..7932f77 100644 --- a/cmirror-kernel/src/dm-clog-tfr.c +++ b/cmirror-kernel/src/dm-clog-tfr.c @@ -96,7 +96,13 @@ static int fill_pkg(struct cn_msg *msg, struct clog_tfr *tfr) if (msg) { pkg->error = -msg->ack; - *(pkg->data_size) = 0; + /* + * If we are trying again, we will need to know our + * storage capacity. Otherwise, along with the + * error code, we make explicit that we have no data. 
+ */ + if (pkg->error != -EAGAIN) + *(pkg->data_size) = 0; } else if (tfr->data_size > *(pkg->data_size)) { DMERR("Insufficient space to receive package [%s]::", RQ_TYPE(tfr->request_type)); diff --git a/cmirror/src/cluster.c b/cmirror/src/cluster.c index 3a18687..68ffefb 100644 --- a/cmirror/src/cluster.c +++ b/cmirror/src/cluster.c @@ -19,35 +19,36 @@ #include "logging.h" #include "link_mon.h" -/* Open AIS error codes - SA_AIS_OK = 1, - SA_AIS_ERR_LIBRARY = 2, - SA_AIS_ERR_VERSION = 3, - SA_AIS_ERR_INIT = 4, - SA_AIS_ERR_TIMEOUT = 5, - SA_AIS_ERR_TRY_AGAIN = 6, - SA_AIS_ERR_INVALID_PARAM = 7, - SA_AIS_ERR_NO_MEMORY = 8, - SA_AIS_ERR_BAD_HANDLE = 9, - SA_AIS_ERR_BUSY = 10, - SA_AIS_ERR_ACCESS = 11, - SA_AIS_ERR_NOT_EXIST = 12, - SA_AIS_ERR_NAME_TOO_LONG = 13, - SA_AIS_ERR_EXIST = 14, - SA_AIS_ERR_NO_SPACE = 15, - SA_AIS_ERR_INTERRUPT = 16, - SA_AIS_ERR_NAME_NOT_FOUND = 17, - SA_AIS_ERR_NO_RESOURCES = 18, - SA_AIS_ERR_NOT_SUPPORTED = 19, - SA_AIS_ERR_BAD_OPERATION = 20, - SA_AIS_ERR_FAILED_OPERATION = 21, - SA_AIS_ERR_MESSAGE_ERROR = 22, - SA_AIS_ERR_QUEUE_FULL = 23, - SA_AIS_ERR_QUEUE_NOT_AVAILABLE = 24, - SA_AIS_ERR_BAD_FLAGS = 25, - SA_AIS_ERR_TOO_BIG = 26, - SA_AIS_ERR_NO_SECTIONS = 27 -*/ +/* Open AIS error codes */ +#define str_ais_error(x) \ + ((x) == SA_AIS_OK) ? "SA_AIS_OK" : \ + ((x) == SA_AIS_ERR_LIBRARY) ? "SA_AIS_ERR_LIBRARY" : \ + ((x) == SA_AIS_ERR_VERSION) ? "SA_AIS_ERR_VERSION" : \ + ((x) == SA_AIS_ERR_INIT) ? "SA_AIS_ERR_INIT" : \ + ((x) == SA_AIS_ERR_TIMEOUT) ? "SA_AIS_ERR_TIMEOUT" : \ + ((x) == SA_AIS_ERR_TRY_AGAIN) ? "SA_AIS_ERR_TRY_AGAIN" : \ + ((x) == SA_AIS_ERR_INVALID_PARAM) ? "SA_AIS_ERR_INVALID_PARAM" : \ + ((x) == SA_AIS_ERR_NO_MEMORY) ? "SA_AIS_ERR_NO_MEMORY" : \ + ((x) == SA_AIS_ERR_BAD_HANDLE) ? "SA_AIS_ERR_BAD_HANDLE" : \ + ((x) == SA_AIS_ERR_BUSY) ? "SA_AIS_ERR_BUSY" : \ + ((x) == SA_AIS_ERR_ACCESS) ? "SA_AIS_ERR_ACCESS" : \ + ((x) == SA_AIS_ERR_NOT_EXIST) ? "SA_AIS_ERR_NOT_EXIST" : \ + ((x) == SA_AIS_ERR_NAME_TOO_LONG) ? 
"SA_AIS_ERR_NAME_TOO_LONG" : \ + ((x) == SA_AIS_ERR_EXIST) ? "SA_AIS_ERR_EXIST" : \ + ((x) == SA_AIS_ERR_NO_SPACE) ? "SA_AIS_ERR_NO_SPACE" : \ + ((x) == SA_AIS_ERR_INTERRUPT) ? "SA_AIS_ERR_INTERRUPT" : \ + ((x) == SA_AIS_ERR_NAME_NOT_FOUND) ? "SA_AIS_ERR_NAME_NOT_FOUND" : \ + ((x) == SA_AIS_ERR_NO_RESOURCES) ? "SA_AIS_ERR_NO_RESOURCES" : \ + ((x) == SA_AIS_ERR_NOT_SUPPORTED) ? "SA_AIS_ERR_NOT_SUPPORTED" : \ + ((x) == SA_AIS_ERR_BAD_OPERATION) ? "SA_AIS_ERR_BAD_OPERATION" : \ + ((x) == SA_AIS_ERR_FAILED_OPERATION) ? "SA_AIS_ERR_FAILED_OPERATION" : \ + ((x) == SA_AIS_ERR_MESSAGE_ERROR) ? "SA_AIS_ERR_MESSAGE_ERROR" : \ + ((x) == SA_AIS_ERR_QUEUE_FULL) ? "SA_AIS_ERR_QUEUE_FULL" : \ + ((x) == SA_AIS_ERR_QUEUE_NOT_AVAILABLE) ? "SA_AIS_ERR_QUEUE_NOT_AVAILABLE" : \ + ((x) == SA_AIS_ERR_BAD_FLAGS) ? "SA_AIS_ERR_BAD_FLAGS" : \ + ((x) == SA_AIS_ERR_TOO_BIG) ? "SA_AIS_ERR_TOO_BIG" : \ + ((x) == SA_AIS_ERR_NO_SECTIONS) ? "SA_AIS_ERR_NO_SECTIONS" : \ + "ais_error_unknown" #define DM_CLOG_RESPONSE 0x1000 /* in last byte of 32-bit value */ #define DM_CLOG_CHECKPOINT_READY 21 @@ -145,10 +146,7 @@ int cluster_send(struct clog_tfr *tfr) return 0; /* error codes found in openais/cpg.h */ - LOG_ERROR("cpg_mcast_joined error: %d%s", r, - (r == SA_AIS_ERR_TRY_AGAIN) ? "/SA_AIS_ERR_TRY_AGAIN" : - (r == CPG_ERR_BAD_HANDLE) ? "/CPG_ERR_BAD_HANDLE" : - (r == CPG_ERR_ACCESS) ? 
"/CPG_ERR_ACCESS" : ""); + LOG_ERROR("cpg_mcast_joined error: %s", str_ais_error(r)); tfr->error = -EBADE; return -EBADE; @@ -358,16 +356,19 @@ static int export_checkpoint(struct checkpoint_data *cp) LOG_DBG("Sending checkpointed data to %u", cp->requester); - len = snprintf((char *)(name.value), SA_MAX_NAME_LENGTH, "bitmaps_%s_%u", - SHORT_UUID(cp->uuid), cp->requester); + len = snprintf((char *)(name.value), SA_MAX_NAME_LENGTH, + "bitmaps_%s_%u", SHORT_UUID(cp->uuid), cp->requester); name.length = len; + len = strlen(cp->recovering_region) + 1; + attr.creationFlags = SA_CKPT_WR_ALL_REPLICAS; - attr.checkpointSize = cp->bitmap_size * 2 + strlen(cp->recovering_region) + 1; + attr.checkpointSize = cp->bitmap_size * 2 + len; + attr.retentionDuration = SA_TIME_MAX; attr.maxSections = 4; /* don't know why we need +1 */ - attr.maxSectionSize = (cp->bitmap_size > (strlen(cp->recovering_region) + 1)) ? - cp->bitmap_size : (strlen(cp->recovering_region) + 1); + + attr.maxSectionSize = (cp->bitmap_size > len) ? 
cp->bitmap_size : len; attr.maxSectionIdSize = 22; flags = SA_CKPT_CHECKPOINT_READ | @@ -388,8 +389,9 @@ open_retry: } if (rv != SA_AIS_OK) { - LOG_ERROR("[%s] Failed to open checkpoint for %u: Reason = %d", - SHORT_UUID(cp->uuid), cp->requester, rv); + LOG_ERROR("[%s] Failed to open checkpoint for %u: %s", + SHORT_UUID(cp->uuid), cp->requester, + str_ais_error(rv)); return -EIO; /* FIXME: better error */ } @@ -402,21 +404,23 @@ open_retry: section_attr.expirationTime = SA_TIME_END; sync_create_retry: - rv = saCkptSectionCreate(h, &section_attr, cp->sync_bits, cp->bitmap_size); + rv = saCkptSectionCreate(h, &section_attr, + cp->sync_bits, cp->bitmap_size); if (rv == SA_AIS_ERR_TRY_AGAIN) { - LOG_ERROR("export_checkpoint: sync create retry"); + LOG_ERROR("Sync checkpoint section create retry"); sleep(1); goto sync_create_retry; } if (rv == SA_AIS_ERR_EXIST) { - LOG_DBG("export_checkpoint: sync checkpoint section already exists"); + LOG_DBG("Sync checkpoint section already exists"); saCkptCheckpointClose(h); return -EEXIST; } if (rv != SA_AIS_OK) { - LOG_ERROR("export_checkpoint: sync checkpoint section creation failed"); + LOG_ERROR("Sync checkpoint section creation failed: %s", + str_ais_error(rv)); saCkptCheckpointClose(h); return -EIO; /* FIXME: better error */ } @@ -432,19 +436,20 @@ sync_create_retry: clean_create_retry: rv = saCkptSectionCreate(h, &section_attr, cp->clean_bits, cp->bitmap_size); if (rv == SA_AIS_ERR_TRY_AGAIN) { - LOG_ERROR("export_checkpoint: clean create retry"); + LOG_ERROR("Clean checkpoint section create retry"); sleep(1); goto clean_create_retry; } if (rv == SA_AIS_ERR_EXIST) { - LOG_DBG("export_checkpoint: clean checkpoint section already exists"); + LOG_DBG("Clean checkpoint section already exists"); saCkptCheckpointClose(h); return -EEXIST; } if (rv != SA_AIS_OK) { - LOG_ERROR("export_checkpoint: clean checkpoint section creation failed"); + LOG_ERROR("Clean checkpoint section creation failed: %s", + str_ais_error(rv)); saCkptCheckpointClose(h); 
return -EIO; /* FIXME: better error */ } @@ -461,19 +466,20 @@ rr_create_retry: rv = saCkptSectionCreate(h, &section_attr, cp->recovering_region, strlen(cp->recovering_region) + 1); if (rv == SA_AIS_ERR_TRY_AGAIN) { - LOG_ERROR("export_checkpoint: RR create retry"); + LOG_ERROR("RR checkpoint section create retry"); sleep(1); goto rr_create_retry; } if (rv == SA_AIS_ERR_EXIST) { - LOG_DBG("export_checkpoint: RR checkpoint section already exists"); + LOG_DBG("RR checkpoint section already exists"); saCkptCheckpointClose(h); return -EEXIST; } if (rv != SA_AIS_OK) { - LOG_ERROR("export_checkpoint: RR checkpoint section creation failed"); + LOG_ERROR("RR checkpoint section creation failed: %s", + str_ais_error(rv)); saCkptCheckpointClose(h); return -EIO; /* FIXME: better error */ } @@ -509,21 +515,6 @@ rr_create_retry: return 0; } -void ckpt_print (char *str, SaCkptCheckpointHandleT handle) -{ - SaCkptCheckpointDescriptorT descriptor; - SaAisErrorT rv; - -retry_statusget: - rv = saCkptCheckpointStatusGet (handle, &descriptor); - if (rv == SA_AIS_ERR_TRY_AGAIN) - goto retry_statusget; - - LOG_DBG("printing [%s] sections [%d] result [%d]", - str, descriptor.numberOfSections, rv); -} - - static int import_checkpoint(struct clog_cpg *entry, int no_read) { int rtn = 0; @@ -554,12 +545,11 @@ open_retry: } if (rv != SA_AIS_OK) { - LOG_ERROR("Failed to open checkpoint"); + LOG_ERROR("[%s] Failed to open checkpoint: %s", + SHORT_UUID(entry->name.value), str_ais_error(rv)); return -EIO; /* FIXME: better error */ } - ckpt_print ("Before unlink", h); - unlink_retry: rv = saCkptCheckpointUnlink(ckpt_handle, &name); if (rv == SA_AIS_ERR_TRY_AGAIN) { @@ -573,10 +563,9 @@ unlink_retry: goto no_read; } - ckpt_print ("After unlink", h); - init_retry: - rv = saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, SA_TIME_END, &itr); + rv = saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, + SA_TIME_END, &itr); if (rv == SA_AIS_ERR_TRY_AGAIN) { LOG_ERROR("import_checkpoint: sync 
create retry"); sleep(1); @@ -584,7 +573,8 @@ init_retry: } if (rv != SA_AIS_OK) { - LOG_ERROR("import_checkpoint: sync checkpoint section creation failed"); + LOG_ERROR("[%s] Sync checkpoint section creation failed: %s", + SHORT_UUID(entry->name.value), str_ais_error(rv)); return -EIO; /* FIXME: better error */ } @@ -602,11 +592,13 @@ init_retry: } saCkptSectionIterationFinalize(itr); if (len != 3) { - LOG_ERROR("import_checkpoint: %d checkpoint sections found", len); + LOG_ERROR("import_checkpoint: %d checkpoint sections found", + len); sleep(1); goto init_retry; } - saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, SA_TIME_END, &itr); + saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, + SA_TIME_END, &itr); while (1) { rv = saCkptSectionIterationNext(itr, &desc); @@ -620,7 +612,8 @@ init_retry: } if (rv != SA_AIS_OK) { - LOG_ERROR("import_checkpoint: clean checkpoint section creation failed"); + LOG_ERROR("import_checkpoint: clean checkpoint section " + "creation failed: %s", str_ais_error(rv)); rtn = -EIO; /* FIXME: better error */ goto fail; } @@ -645,20 +638,15 @@ init_retry: } if (rv != SA_AIS_OK) { - LOG_ERROR("import_checkpoint: ckpt read error"); + LOG_ERROR("import_checkpoint: ckpt read error: %s", + str_ais_error(rv)); rtn = -EIO; /* FIXME: better error */ goto fail; } - /* FIXME: Is this catching something special? 
- if (!iov.readSize) { - LOG_ERROR("%s section empty", (char *)desc.sectionId.id); - continue; - } - */ - if (iov.readSize) { - if (pull_state(entry->name.value, (char *)desc.sectionId.id, bitmap, + if (pull_state(entry->name.value, + (char *)desc.sectionId.id, bitmap, iov.readSize)) { LOG_ERROR("Error loading state"); rtn = -EIO; @@ -676,67 +664,62 @@ fail: no_read: saCkptCheckpointClose(h); - /* - LOG_PRINT("Testing if chkpoint exists after unlink/close"); - rv = saCkptCheckpointOpen(ckpt_handle, &name, NULL, - SA_CKPT_CHECKPOINT_READ, 0, &h); - if (rv != SA_AIS_OK) { - LOG_PRINT("Checkpoint was not removed!!!"); - saCkptCheckpointClose(h); - } else - LOG_PRINT(" rv == %d", rv); - */ - free(bitmap); return rtn; } +static void do_checkpoints(struct clog_cpg *entry) +{ + struct checkpoint_data *cp; + + for (cp = entry->checkpoint_list; cp;) { + LOG_DBG("[%s] Checkpoint data available for node %u", + SHORT_UUID(entry->name.value), cp->requester); + + /* + * FIXME: Check return code. Could send failure + * notice in tfr in export_checkpoint function + * by setting tfr->error + */ + switch (export_checkpoint(cp)) { + case -EEXIST: + LOG_DBG("[%s] Checkpoint for %u already handled", + SHORT_UUID(entry->name.value), cp->requester); + case 0: + entry->checkpoint_list = cp->next; + free_checkpoint(cp); + cp = entry->checkpoint_list; + break; + default: + /* FIXME: Skipping will cause list corruption */ + LOG_ERROR("[%s] Failed to export checkpoint for %u", + SHORT_UUID(entry->name.value), cp->requester); + } + } +} + static int do_cluster_work(void *data) { int r = SA_AIS_OK; struct clog_cpg *entry, *tmp; - struct checkpoint_data *cp; list_for_each_entry_safe(entry, tmp, &clog_cpg_list, list) { r = cpg_dispatch(entry->handle, CPG_DISPATCH_ALL); if (r != SA_AIS_OK) - LOG_ERROR("cpg_dispatch failed: %d", r); + LOG_ERROR("cpg_dispatch failed: %s", str_ais_error(r)); if (entry->free_me) { free(entry); continue; } - - for (cp = entry->checkpoint_list; cp;) { - LOG_DBG("[%s] 
Checkpoint data available for node %u", - SHORT_UUID(entry->name.value), cp->requester); - - /* - * FIXME: Check return code. Could send failure - * notice in tfr in export_checkpoint function - * by setting tfr->error - */ - switch (export_checkpoint(cp)) { - case -EEXIST: - LOG_DBG("[%s] Checkpoint for %u already handled by someone else", - SHORT_UUID(entry->name.value), cp->requester); - case 0: - entry->checkpoint_list = cp->next; - free_checkpoint(cp); - cp = entry->checkpoint_list; - break; - default: - /* FIXME: Skipping will cause list corruption */ - LOG_ERROR("[%s] Failed to export checkpoint for %u", - SHORT_UUID(entry->name.value), cp->requester); - } - } + do_checkpoints(entry); } return (r == SA_AIS_OK) ? 0 : -1; /* FIXME: good error number? */ } static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname, - uint32_t nodeid, uint32_t pid, void *msg, int msg_len) + uint32_t nodeid, uint32_t pid, + void *msg, int msg_len) { int i; int r = 0; @@ -785,7 +768,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname, * get our config callback. However, since we can't respond after * leaving, we simply return. */ - if (match->cpg_state != VALID) + if (match->state == LEAVING) return; i_am_server = (my_cluster_id == match->lowest_id) ? 
1 : 0; @@ -1098,10 +1081,10 @@ static void cpg_leave_callback(struct clog_cpg *match, */ if (!strcmp(match->name.value, tfr->uuid) && (tfr->request_type != DM_CLOG_POSTSUSPEND)){ - LOG_PRINT("[%s] Resending %s due to new server(%u)", + LOG_PRINT("[%s] Resending %s due to new server(%u -> %u)", SHORT_UUID(match->name.value), RQ_TYPE(tfr->request_type), - match->lowest_id); + lowest, match->lowest_id); if (cluster_send(tfr)) LOG_ERROR("Failed resend"); } @@ -1225,11 +1208,11 @@ int destroy_cluster_cpg(char *str) list_for_each_entry_safe(del, tmp, &clog_cpg_list, list) if (!strncmp(del->name.value, str, CPG_MAX_NAME_LENGTH)) { del->cpg_state = INVALID; + del->state = LEAVING; r = cpg_leave(del->handle, &del->name); if (r != CPG_OK) LOG_ERROR("Error leaving CPG!"); break; - del->state = LEAVING; } return 0; hooks/post-receive -- Cluster Project