From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 26385 invoked by alias); 21 Apr 2009 19:12:09 -0000 Received: (qmail 26378 invoked by alias); 21 Apr 2009 19:12:09 -0000 X-SWARE-Spam-Status: No, hits=-1.3 required=5.0 tests=AWL,BAYES_00,J_CHICKENPOX_64,SPF_HELO_PASS X-Spam-Status: No, hits=-1.3 required=5.0 tests=AWL,BAYES_00,J_CHICKENPOX_64,SPF_HELO_PASS X-Spam-Check-By: sourceware.org X-Spam-Checker-Version: SpamAssassin 3.2.5 (2008-06-10) on bastion2.fedora.phx.redhat.com Subject: cluster: RHEL5 - clogd: Fix for bug 491340 - copy percentage can get stuck during sync attempt To: cluster-cvs-relay@redhat.com X-Project: Cluster Project X-Git-Module: cluster.git X-Git-Refname: refs/heads/RHEL5 X-Git-Reftype: branch X-Git-Oldrev: ff07f9f1f267a66de0b7428a5f3c5fee0c5e4d6f X-Git-Newrev: 9d2e36adb7a14378ee5934ea273094428a278089 From: Jonathan Brassow Message-Id: <20090421191140.9B1091201D8@lists.fedorahosted.org> Date: Tue, 21 Apr 2009 19:12:00 -0000 X-Scanned-By: MIMEDefang 2.58 on 172.16.52.254 Mailing-List: contact cluster-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: cluster-cvs-owner@sourceware.org X-SW-Source: 2009-q2/txt/msg00087.txt.bz2 Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=9d2e36adb7a14378ee5934ea273094428a278089 Commit: 9d2e36adb7a14378ee5934ea273094428a278089 Parent: ff07f9f1f267a66de0b7428a5f3c5fee0c5e4d6f Author: Jonathan Brassow AuthorDate: Mon Apr 20 11:02:50 2009 -0500 Committer: Jonathan Brassow CommitterDate: Tue Apr 21 14:11:24 2009 -0500 clogd: Fix for bug 491340 - copy percentage can get stuck during sync attempt May also help fix 480000. Introduce a delay between suspend/resume cycles to prevent the receipt of stale checkpoint data. --- cmirror/src/cluster.c | 162 ++++++++++++++++++++++++++++------------------- cmirror/src/functions.c | 86 +++++++++++++++++++++++-- cmirror/src/functions.h | 2 +- 3 files changed, 179 insertions(+), 71 deletions(-) diff --git a/cmirror/src/cluster.c b/cmirror/src/cluster.c index a23e39e..63af0dd 100644 --- a/cmirror/src/cluster.c +++ b/cmirror/src/cluster.c @@ -68,9 +68,14 @@ static SaCkptHandleT ckpt_handle = 0; static SaCkptCallbacksT callbacks = { 0, 0 }; static SaVersionT version = { 'B', 1, 1 }; -#define DEBUGGING_HISTORY 200 +#define DEBUGGING_HISTORY 100 static char debugging[DEBUGGING_HISTORY][128]; static int idx = 0; +#define LOG_SPRINT(f, arg...) do {\ + idx++; \ + idx = idx % DEBUGGING_HISTORY; \ + sprintf(debugging[idx], f, ## arg); \ + } while (0) static int log_resp_rec = 0; @@ -218,8 +223,8 @@ static int handle_cluster_request(struct clog_cpg *entry, r = do_request(t, server); r = kernel_send(t); - if (r) - LOG_ERROR("Failed to send response to kernel"); + if (r < 0) + LOG_ERROR("Failed to send resume response to kernel"); } return r; } @@ -346,7 +351,7 @@ static struct checkpoint_data *prepare_checkpoint(struct clog_cpg *entry, strncpy(new->uuid, entry->name.value, entry->name.length); new->bitmap_size = push_state(entry->name.value, "clean_bits", - &new->clean_bits); + &new->clean_bits, cp_requester); if (new->bitmap_size <= 0) { LOG_ERROR("Failed to store clean_bits to checkpoint for node %u", new->requester); @@ -355,7 +360,7 @@ static struct checkpoint_data *prepare_checkpoint(struct clog_cpg *entry, } new->bitmap_size = push_state(entry->name.value, - "sync_bits", &new->sync_bits); + "sync_bits", &new->sync_bits, cp_requester); if (new->bitmap_size <= 0) { LOG_ERROR("Failed to store sync_bits to checkpoint for node %u", new->requester); @@ -364,7 +369,7 @@ static struct checkpoint_data *prepare_checkpoint(struct clog_cpg *entry, return NULL; } - r = push_state(entry->name.value, "recovering_region", &new->recovering_region); + r = push_state(entry->name.value, "recovering_region", &new->recovering_region, cp_requester); if (r <= 0) { LOG_ERROR("Failed to store recovering_region to checkpoint for node %u", new->requester); @@ -550,7 +555,7 @@ rr_create_retry: tfr->request_type = DM_CLOG_CHECKPOINT_READY; tfr->originator = cp->requester; /* FIXME: hack to overload meaning of originator */ strncpy(tfr->uuid, cp->uuid, CPG_MAX_NAME_LENGTH); - tfr->seq = my_cluster_id; /* Just for debugging */ + tfr->seq = my_cluster_id; r = cluster_send(tfr); if (r) @@ -714,15 +719,11 @@ no_read: return rtn; } -static void do_checkpoints(struct clog_cpg *entry) +static void do_checkpoints(struct clog_cpg *entry, int leaving) { struct checkpoint_data *cp; for (cp = entry->checkpoint_list; cp;) { - LOG_COND(log_checkpoint, - "[%s] Checkpoint data available for node %u", - SHORT_UUID(entry->name.value), cp->requester); - /* * FIXME: Check return code. Could send failure * notice in tfr in export_checkpoint function @@ -730,18 +731,34 @@ static void do_checkpoints(struct clog_cpg *entry) */ switch (export_checkpoint(cp)) { case -EEXIST: + LOG_SPRINT("[%s] Checkpoint for %u already handled%s", + SHORT_UUID(entry->name.value), cp->requester, + (leaving) ? "(L)": ""); LOG_COND(log_checkpoint, - "[%s] Checkpoint for %u already handled", - SHORT_UUID(entry->name.value), cp->requester); + "[%s] Checkpoint for %u already handled%s", + SHORT_UUID(entry->name.value), cp->requester, + (leaving) ? "(L)": ""); + entry->checkpoint_list = cp->next; + free_checkpoint(cp); + cp = entry->checkpoint_list; + break; case 0: + LOG_SPRINT("[%s] Checkpoint data available for node %u%s", + SHORT_UUID(entry->name.value), cp->requester, + (leaving) ? "(L)": ""); + LOG_COND(log_checkpoint, + "[%s] Checkpoint data available for node %u%s", + SHORT_UUID(entry->name.value), cp->requester, + (leaving) ? "(L)": ""); entry->checkpoint_list = cp->next; free_checkpoint(cp); cp = entry->checkpoint_list; break; default: /* FIXME: Skipping will cause list corruption */ - LOG_ERROR("[%s] Failed to export checkpoint for %u", - SHORT_UUID(entry->name.value), cp->requester); + LOG_ERROR("[%s] Failed to export checkpoint for %u%s", + SHORT_UUID(entry->name.value), cp->requester, + (leaving) ? "(L)": ""); } } } @@ -784,11 +801,10 @@ static int resend_requests(struct clog_cpg *entry) "[%s] Skipping resend of %s/#%u...", SHORT_UUID(entry->name.value), _RQ_TYPE(tfr->request_type), tfr->seq); - idx++; - idx = idx % DEBUGGING_HISTORY; - sprintf(debugging[idx], "### No resend: [%s] %s/%u ###", - SHORT_UUID(entry->name.value), _RQ_TYPE(tfr->request_type), - tfr->seq); + LOG_SPRINT("### No resend: [%s] %s/%u ###", + SHORT_UUID(entry->name.value), + _RQ_TYPE(tfr->request_type), tfr->seq); + tfr->data_size = 0; kernel_send(tfr); @@ -804,11 +820,9 @@ static int resend_requests(struct clog_cpg *entry) SHORT_UUID(entry->name.value), _RQ_TYPE(tfr->request_type), tfr->seq, entry->lowest_id); - idx++; - idx = idx % DEBUGGING_HISTORY; - sprintf(debugging[idx], "*** Resending: [%s] %s/%u ***", - SHORT_UUID(entry->name.value), _RQ_TYPE(tfr->request_type), - tfr->seq); + LOG_SPRINT("*** Resending: [%s] %s/%u ***", + SHORT_UUID(entry->name.value), + _RQ_TYPE(tfr->request_type), tfr->seq); r = cluster_send(tfr); if (r < 0) LOG_ERROR("Failed resend"); @@ -833,7 +847,7 @@ static int do_cluster_work(void *data) free(entry); continue; } - do_checkpoints(entry); + do_checkpoints(entry, 0); resend_requests(entry); } @@ -866,6 +880,8 @@ static int flush_startup_list(struct clog_cpg *entry) free(tfr); continue; } + LOG_SPRINT("[%s] Checkpoint prepared for %u", + SHORT_UUID(entry->name.value), tfr->originator); LOG_COND(log_checkpoint, "[%s] Checkpoint prepared for %u", SHORT_UUID(entry->name.value), tfr->originator); new->next = entry->checkpoint_list; @@ -886,6 +902,7 @@ static int flush_startup_list(struct clog_cpg *entry) } free(tfr); } + return 0; } @@ -962,6 +979,7 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname, LOG_COND(log_resend_requests, "[%s] %u is leaving, delay = %d", SHORT_UUID(tfr->uuid), nodeid, match->delay); } + tfr->originator = nodeid; /* don't really need this, but nice for debug */ goto out; } } @@ -979,18 +997,28 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname, if (tfr->request_type == DM_CLOG_CHECKPOINT_READY) { if (my_cluster_id == tfr->originator) { /* Redundant checkpoints ignored if match->valid */ + LOG_SPRINT("[%s] CHECKPOINT_READY notification from %u", + SHORT_UUID(tfr->uuid), nodeid); if (import_checkpoint(match, (match->state != INVALID))) { + LOG_SPRINT("[%s] Failed to import checkpoint from %u", + SHORT_UUID(tfr->uuid), nodeid); LOG_ERROR("[%s] Failed to import checkpoint from %u", SHORT_UUID(tfr->uuid), nodeid); + kill(getpid(), SIGUSR1); /* Could we retry? */ goto out; } else if (match->state == INVALID) { + LOG_SPRINT("[%s] Checkpoint data received from %u. Log is now valid", + SHORT_UUID(match->name.value), nodeid); LOG_COND(log_checkpoint, "[%s] Checkpoint data received from %u. Log is now valid", SHORT_UUID(match->name.value), nodeid); match->state = VALID; flush_startup_list(match); + } else { + LOG_SPRINT("[%s] Redundant checkpoint from %u ignored.", + SHORT_UUID(tfr->uuid), nodeid); } } goto out; @@ -1031,15 +1059,14 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname, } /* - * If the log is now valid and resumed, we can queue the checkpoints + * If the log is now valid, we can queue the checkpoints */ for (i = match->checkpoints_needed; i; ) { struct checkpoint_data *new; if (log_get_state(tfr) != LOG_RESUMED) { - LOG_COND(log_checkpoint, - "[%s] Withholding checkpoints until log is valid", - SHORT_UUID(tfr->uuid)); + LOG_DBG("[%s] Withholding checkpoints until log is valid (%s from %u)", + SHORT_UUID(tfr->uuid), _RQ_TYPE(tfr->request_type), nodeid); break; } @@ -1051,6 +1078,9 @@ static void cpg_message_callback(cpg_handle_t handle, struct cpg_name *gname, SHORT_UUID(tfr->uuid), match->checkpoint_requesters[i]); break; } + LOG_SPRINT("[%s] Checkpoint prepared for %u* (%s)", + SHORT_UUID(tfr->uuid), match->checkpoint_requesters[i], + (log_get_state(tfr) != LOG_RESUMED)? "LOG_RESUMED": "LOG_SUSPENDED"); LOG_COND(log_checkpoint, "[%s] Checkpoint prepared for %u*", SHORT_UUID(tfr->uuid), match->checkpoint_requesters[i]); match->checkpoints_needed--; @@ -1084,17 +1114,17 @@ out: } } else if (!(tfr->request_type & DM_CLOG_RESPONSE) || (tfr->originator == my_cluster_id)) { - int len; - idx++; - idx = idx % DEBUGGING_HISTORY; - len = sprintf(debugging[idx], - "SEQ#=%u, UUID=%s, TYPE=%s, ORIG=%u, RESP=%s", - tfr->seq, - SHORT_UUID(tfr->uuid), - _RQ_TYPE(tfr->request_type), - tfr->originator, (response) ? "YES" : "NO"); - if (response) - sprintf(debugging[idx] + len, ", RSPR=%u", nodeid); + if (!response) + LOG_SPRINT("SEQ#=%u, UUID=%s, TYPE=%s, ORIG=%u, RESP=%s", + tfr->seq, SHORT_UUID(tfr->uuid), + _RQ_TYPE(tfr->request_type), + tfr->originator, (response) ? "YES" : "NO"); + else + LOG_SPRINT("SEQ#=%u, UUID=%s, TYPE=%s, ORIG=%u, RESP=%s, RSPR=%u", + tfr->seq, SHORT_UUID(tfr->uuid), + _RQ_TYPE(tfr->request_type), + tfr->originator, (response) ? "YES" : "NO", + nodeid); } } @@ -1107,6 +1137,7 @@ static void cpg_join_callback(struct clog_cpg *match, int my_pid = getpid(); uint32_t lowest = match->lowest_id; struct clog_tfr *tfr; + char dbuf[32]; /* Assign my_cluster_id */ if ((my_cluster_id == 0xDEAD) && (joined->pid == my_pid)) @@ -1122,8 +1153,12 @@ static void cpg_join_callback(struct clog_cpg *match, if (joined->nodeid == my_cluster_id) goto out; - LOG_COND(log_checkpoint, "[%s] Joining node, %u needs checkpoint", - SHORT_UUID(match->name.value), joined->nodeid); + memset(dbuf, 0, sizeof(dbuf)); + for (i = 0; i < (member_list_entries-1); i++) + sprintf(dbuf+strlen(dbuf), "%u-", member_list[i].nodeid); + sprintf(dbuf+strlen(dbuf), "(%u)", joined->nodeid); + LOG_COND(log_checkpoint, "[%s] Joining node, %u needs checkpoint [%s]", + SHORT_UUID(match->name.value), joined->nodeid, dbuf); /* * FIXME: remove checkpoint_requesters/checkpoints_needed, and use @@ -1168,10 +1203,8 @@ out: LOG_COND(log_membership_change, "[%s] Server unchanged at %u (%u joined)", SHORT_UUID(match->name.value), lowest, joined->nodeid); - idx++; - idx = idx % DEBUGGING_HISTORY; - sprintf(debugging[idx], "+++ UUID=%s %u join +++", - SHORT_UUID(match->name.value), joined->nodeid); + LOG_SPRINT("+++ UUID=%s %u join +++", + SHORT_UUID(match->name.value), joined->nodeid); } static void cpg_leave_callback(struct clog_cpg *match, @@ -1185,12 +1218,8 @@ static void cpg_leave_callback(struct clog_cpg *match, struct clog_tfr *tfr; struct checkpoint_data *p_cp, *c_cp; - { - idx++; - idx = idx % DEBUGGING_HISTORY; - sprintf(debugging[idx], "--- UUID=%s %u left ---", - SHORT_UUID(match->name.value), left->nodeid); - } + LOG_SPRINT("--- UUID=%s %u left ---", + SHORT_UUID(match->name.value), left->nodeid); /* Am I leaving? */ if (my_cluster_id == left->nodeid) { @@ -1218,7 +1247,7 @@ static void cpg_leave_callback(struct clog_cpg *match, match->state = INVALID; } - /* Remove any pending checkpoints for the leaving node */ + /* Remove any pending checkpoints for the leaving node. */ for (p_cp = NULL, c_cp = match->checkpoint_list; c_cp && (c_cp->requester != left->nodeid); p_cp = c_cp, c_cp = c_cp->next); @@ -1252,7 +1281,7 @@ static void cpg_leave_callback(struct clog_cpg *match, j--; } } - match->checkpoints_needed = j; + match->checkpoints_needed = j; if (left->nodeid < my_cluster_id) { match->delay = (match->delay > 0) ? match->delay - 1 : 0; @@ -1435,9 +1464,7 @@ int create_cluster_cpg(char *str) new->name.length = size; /* - * Look for checkpoints before joining to see if - * someone wrote a checkpoint after I left a previous - * session. + * Ensure there are no stale checkpoints around before we join */ if (remove_checkpoint(new) == 1) LOG_COND(log_checkpoint, @@ -1502,8 +1529,15 @@ static int _destroy_cluster_cpg(struct clog_cpg *del) * We must send any left over checkpoints before * leaving. If we don't, an incoming node could * be stuck with no checkpoint and stall. + do_checkpoints(del); --- THIS COULD BE CAUSING OUR PROBLEMS: + + - Incoming node deletes old checkpoints before joining + - A stale checkpoint is issued here by leaving node + - (leaving node leaves) + - Incoming node joins cluster and finds stale checkpoint. + - (leaving node leaves - option 2) */ - do_checkpoints(del); + do_checkpoints(del, 1); state = del->state; @@ -1537,13 +1571,11 @@ int destroy_cluster_cpg(char *str) int init_cluster(void) { + int i; SaAisErrorT rv; - { - int i; - for (i = 0; i < DEBUGGING_HISTORY; i++) - debugging[i][0] = '\0'; - } + for (i = 0; i < DEBUGGING_HISTORY; i++) + debugging[i][0] = '\0'; INIT_LIST_HEAD(&clog_cpg_list); rv = saCkptInitialize(&ckpt_handle, &callbacks, &version); diff --git a/cmirror/src/functions.c b/cmirror/src/functions.c index c106889..45fdc1a 100644 --- a/cmirror/src/functions.c +++ b/cmirror/src/functions.c @@ -11,6 +11,7 @@ #include #define __USE_GNU /* for O_DIRECT */ #include +#include #include "linux/dm-clog-tfr.h" #include "list.h" #include "functions.h" @@ -50,6 +51,7 @@ struct log_c { char uuid[DM_UUID_LEN]; uint32_t ref_count; + time_t delay; /* limits how fast a resume can happen after suspend */ int touched; uint32_t region_size; uint32_t region_count; @@ -60,6 +62,7 @@ struct log_c { uint32_t *sync_bits; uint32_t recoverer; uint64_t recovering_region; /* -1 means not recovering */ + uint64_t skip_bit_warning; /* used to warn if region skipped */ int sync_search; int resume_override; @@ -429,6 +432,7 @@ static int _clog_ctr(int argc, char **argv, uint64_t device_size) lc->block_on_error = block_on_error; lc->sync_search = 0; lc->recovering_region = (uint64_t)-1; + lc->skip_bit_warning = region_count; lc->disk_fd = -1; lc->log_dev_failed = 0; lc->ref_count = 1; @@ -668,6 +672,7 @@ static int clog_postsuspend(struct clog_tfr *tfr) lc->state = LOG_SUSPENDED; lc->recovering_region = (uint64_t)-1; lc->recoverer = (uint32_t)-1; + lc->delay = time(NULL); return 0; } @@ -714,6 +719,9 @@ static int clog_resume(struct clog_tfr *tfr) case 1000: LOG_ERROR("[%s] Additional resume issued before suspend", SHORT_UUID(tfr->uuid)); +#ifdef DEBUG + kill(getpid(), SIGUSR1); +#endif return 0; case 0: lc->resume_override = 1000; @@ -826,6 +834,7 @@ out: int local_resume(struct clog_tfr *tfr) { int r; + time_t t; struct log_c *lc = get_log(tfr->uuid); if (!lc) { @@ -836,6 +845,34 @@ int local_resume(struct clog_tfr *tfr) return -EINVAL; } + t = time(NULL); + t -= lc->delay; + /* + * This should be considered a temporary fix. It addresses + * a problem that exists when nodes suspend/resume in rapid + * succession. While the problem is very rare, it has been + * seen to happen in real-world-like testing. + * + * The problem: + * - Node A joins cluster + * - Node B joins cluster + * - Node A prepares checkpoint + * - Node A gets ready to write checkpoint + * - Node B leaves + * - Node B joins + * - Node A finishes write of checkpoint + * - Node B receives checkpoint meant for previous session + * -- Node B can now be non-coherent + * + * This timer will solve the problem for now, but could be + * replaced by a generation number sent with the resume + * command from the kernel. The generation number would + * be included in the name of the checkpoint to prevent + * reading stale data. + */ + if ((t < 3) && (t >= 0)) + sleep(3 - t); + /* Join the CPG */ r = create_cluster_cpg(tfr->uuid); if (r) { @@ -1155,6 +1192,7 @@ static int clog_get_resync_work(struct clog_tfr *tfr) (unsigned long long)lc->recovering_region); pkg->r = lc->recovering_region; pkg->i = 1; + LOG_COND(log_resend_requests, "***** RE-REQUEST *****"); } else { LOG_SPRINT("GET - SEQ#=%u, UUID=%s, nodeid = %u:: " "Someone already recovering (%llu)", @@ -1233,10 +1271,30 @@ static int clog_set_region_sync(struct clog_tfr *tfr) } else { log_set_bit(lc, lc->sync_bits, pkg->region); lc->sync_count++; + + /* The rest of this section is all for debugging */ LOG_SPRINT("SET - SEQ#=%u, UUID=%s, nodeid = %u:: " "Setting region (%llu)", tfr->seq, SHORT_UUID(lc->uuid), tfr->originator, (unsigned long long)pkg->region); + if (pkg->region == lc->skip_bit_warning) + lc->skip_bit_warning = lc->region_count; + + if (pkg->region > (lc->skip_bit_warning + 5)) { + LOG_ERROR("*** Region #%llu skipped during recovery ***", + (unsigned long long)lc->skip_bit_warning); + lc->skip_bit_warning = lc->region_count; +#ifdef DEBUG + kill(getpid(), SIGUSR1); +#endif + } + + if (!log_test_bit(lc->sync_bits, + (pkg->region) ? pkg->region - 1 : 0)) { + LOG_SPRINT("*** Previous bit not set ***"); + lc->skip_bit_warning = (pkg->region) ? + pkg->region - 1 : 0; + } } } else if (log_test_bit(lc->sync_bits, pkg->region)) { lc->sync_count--; @@ -1254,6 +1312,9 @@ static int clog_set_region_sync(struct clog_tfr *tfr) "sync_count(%llu) != bitmap count(%llu)", tfr->seq, SHORT_UUID(lc->uuid), tfr->originator, (unsigned long long)lc->sync_count, reset); +#ifdef DEBUG + kill(getpid(), SIGUSR1); +#endif lc->sync_count = reset; } @@ -1291,6 +1352,19 @@ static int clog_get_sync_count(struct clog_tfr *tfr) tfr->data_size = sizeof(*sync_count); + if (lc->sync_count != count_bits32(lc->sync_bits, lc->bitset_uint32_count)) { + unsigned long long reset = count_bits32(lc->sync_bits, lc->bitset_uint32_count); + + LOG_SPRINT("get_sync_count - SEQ#=%u, UUID=%s, nodeid = %u:: " + "sync_count(%llu) != bitmap count(%llu)", + tfr->seq, SHORT_UUID(lc->uuid), tfr->originator, + (unsigned long long)lc->sync_count, reset); +#ifdef DEBUG + kill(getpid(), SIGUSR1); +#endif + lc->sync_count = reset; + } + return 0; } @@ -1593,7 +1667,7 @@ static void print_bits(char *buf, int size, int print) } /* int store_bits(const char *uuid, const char *which, char **buf)*/ -int push_state(const char *uuid, const char *which, char **buf) +int push_state(const char *uuid, const char *which, char **buf, uint32_t debug_who) { int bitset_size; struct log_c *lc; @@ -1614,10 +1688,12 @@ int push_state(const char *uuid, const char *which, char **buf) sprintf(*buf, "%llu %u", (unsigned long long)lc->recovering_region, lc->recoverer); - LOG_SPRINT("CKPT SEND - SEQ#=X, UUID=%s, nodeid = X:: " - "recovering_region=%llu, recoverer=%u", - SHORT_UUID(lc->uuid), - (unsigned long long)lc->recovering_region, lc->recoverer); + LOG_SPRINT("CKPT SEND - SEQ#=X, UUID=%s, nodeid = %u:: " + "recovering_region=%llu, recoverer=%u, sync_count=%llu", + SHORT_UUID(lc->uuid), debug_who, + (unsigned long long)lc->recovering_region, + lc->recoverer, + (unsigned long long)count_bits32(lc->sync_bits, lc->bitset_uint32_count)); return 64; } diff --git a/cmirror/src/functions.h b/cmirror/src/functions.h index 5e0fab2..7c01c64 100644 --- a/cmirror/src/functions.h +++ b/cmirror/src/functions.h @@ -10,7 +10,7 @@ int local_resume(struct clog_tfr *tfr); int cluster_postsuspend(char *); int do_request(struct clog_tfr *tfr, int server); -int push_state(const char *uuid, const char *which, char **buf); +int push_state(const char *uuid, const char *which, char **buf, uint32_t debug_who); int pull_state(const char *uuid, const char *which, char *buf, int size); int log_get_state(struct clog_tfr *tfr);