From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 21165 invoked by alias); 30 Jul 2008 20:33:45 -0000 Received: (qmail 21158 invoked by alias); 30 Jul 2008 20:33:45 -0000 X-Spam-Status: No, hits=3.2 required=5.0 tests=BAYES_50,J_CHICKENPOX_31,J_CHICKENPOX_63,J_CHICKENPOX_64,J_CHICKENPOX_66,J_CHICKENPOX_83,KAM_MX,SPF_HELO_PASS X-Spam-Check-By: sourceware.org X-Spam-Checker-Version: SpamAssassin 3.2.4 (2008-01-01) on bastion.fedora.phx.redhat.com X-Spam-Level: Subject: master - groupd: detect group_mode To: cluster-cvs-relay@redhat.com X-Project: Cluster Project X-Git-Module: cluster.git X-Git-Refname: refs/heads/master X-Git-Reftype: branch X-Git-Oldrev: d9a282126c54e371f783c2a3f5ff24b53e685f87 X-Git-Newrev: f090526867f97d09f3813b7e0e1ca09ed650d71d From: David Teigland Message-Id: <20080730203257.BDABE120022@lists.fedorahosted.org> Date: Wed, 30 Jul 2008 20:33:00 -0000 X-Scanned-By: MIMEDefang 2.58 on 172.16.52.254 Mailing-List: contact cluster-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: cluster-cvs-owner@sourceware.org X-SW-Source: 2008-q3/txt/msg00175.txt.bz2 Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=f090526867f97d09f3813b7e0e1ca09ed650d71d Commit: f090526867f97d09f3813b7e0e1ca09ed650d71d Parent: d9a282126c54e371f783c2a3f5ff24b53e685f87 Author: David Teigland AuthorDate: Fri Jul 25 14:52:19 2008 -0500 Committer: David Teigland CommitterDate: Wed Jul 30 12:28:53 2008 -0500 groupd: detect group_mode command line -g2, or cluster.conf groupd_compat="2". If old cluster2/RHEL5 nodes are in the cluster, cluster3 nodes will adopt the groupd compatibility mode to interoperate with them (this is the compat mode you get directly with -g1 or groupd_compat="1"). 
If no cluster2/RHEL5 nodes are in the cluster, cluster3 nodes will use the new group mode that doesn't go through libgroup, and is not compatible with cluster2/RHEL5 nodes (this is the non-compat mode you get directly with -g0 or groupd_compat="0"). This new mode (groupd_compat="2") is the default to favor the case of rolling cluster2->cluster3 upgrades, where cluster2 nodes and cluster3 nodes need to interoperate in a single cluster for a limited time. After all cluster2 nodes have been upgraded to cluster3, groupd_compat="0" can be added to cluster.conf the next time the entire cluster is taken down. It is still best to set groupd_compat="0" in cluster.conf for: . new clusters that don't require compatibility with cluster2 nodes . old clusters that are taken offline while the nodes are all upgraded from cluster2 to cluster3 Signed-off-by: David Teigland --- group/daemon/app.c | 3 + group/daemon/cpg.c | 369 ++++++++++++++++++++++++++++++++++++++++++++ group/daemon/gd_internal.h | 24 +++- group/daemon/main.c | 94 +++++++++++- group/lib/libgroup.c | 26 +++ group/lib/libgroup.h | 2 + 6 files changed, 514 insertions(+), 4 deletions(-) diff --git a/group/daemon/app.c b/group/daemon/app.c index 93f5c21..30c11da 100644 --- a/group/daemon/app.c +++ b/group/daemon/app.c @@ -1799,6 +1799,9 @@ int process_apps(void) group_t *g, *safe; int rv = 0; + if (group_mode != GROUP_LIBGROUP) + return 0; + list_for_each_entry_safe(g, safe, &gd_groups, list) { rv += process_app(g); deliver_app_messages(g); diff --git a/group/daemon/cpg.c b/group/daemon/cpg.c index e062f7d..cbfe313 100644 --- a/group/daemon/cpg.c +++ b/group/daemon/cpg.c @@ -21,7 +21,274 @@ static int saved_left_count; static cpg_handle_t saved_handle; static struct cpg_name saved_name; static int message_flow_control_on; +static struct list_head group_nodes; +static uint64_t send_version_first; + +#define CLUSTER2 2 +#define CLUSTER3 3 + +struct group_version { + uint32_t nodeid; + uint16_t cluster; + uint16_t 
group_mode; + uint16_t groupd_compat; + uint16_t groupd_count; + uint32_t unused; +}; + +struct group_node { + uint32_t nodeid; + uint32_t got_from; + int got_version; + uint64_t add_time; + struct group_version ver; + struct list_head list; +}; + +static void block_old_nodes(void); + +static char *mode_str(int m) +{ + switch (m) { + case GROUP_PENDING: + return "PENDING"; + case GROUP_LIBGROUP: + return "LIBGROUP"; + case GROUP_LIBCPG: + return "LIBCPG"; + default: + return "UNKNOWN"; + } +} + +static struct group_node *get_group_node(int nodeid) +{ + struct group_node *node; + + list_for_each_entry(node, &group_nodes, list) { + if (node->nodeid == nodeid) + return node; + } + return NULL; +} + +static void group_node_add(int nodeid) +{ + struct group_node *node; + + node = get_group_node(nodeid); + if (node) + return; + + node = malloc(sizeof(struct group_node)); + if (!node) + return; + memset(node, 0, sizeof(struct group_node)); + + node->nodeid = nodeid; + node->add_time = time(NULL); + list_add_tail(&node->list, &group_nodes); +} + +static void group_node_del(int nodeid) +{ + struct group_node *node; + + node = get_group_node(nodeid); + if (!node) { + log_print("group_node_del %d no node", nodeid); + return; + } + + list_del(&node->list); + free(node); +} + +static void version_copy_in(struct group_version *ver) +{ + ver->nodeid = le32_to_cpu(ver->nodeid); + ver->cluster = le16_to_cpu(ver->cluster); + ver->group_mode = le16_to_cpu(ver->group_mode); + ver->groupd_compat = le16_to_cpu(ver->groupd_compat); + ver->groupd_count = le16_to_cpu(ver->groupd_count); +} + +static void _send_version(int nodeid, int cluster, int mode, int compat) +{ + group_t g, *gp; + char *buf; + msg_t *msg; + int len; + int count = 0; + struct group_version *ver; + + list_for_each_entry(gp, &gd_groups, list) + count++; + + /* just so log_group will work */ + memset(&g, 0, sizeof(group_t)); + strcpy(g.name, "groupd"); + + len = sizeof(msg_t) + sizeof(struct group_version); + + buf = 
malloc(len); + if (!buf) + return; + memset(buf, 0, len); + + msg = (msg_t *)buf; + ver = (struct group_version *)(buf + sizeof(msg_t)); + + msg->ms_type = MSG_GROUP_VERSION; + msg_bswap_out(msg); + + log_debug("send_version nodeid %d cluster %d mode %s compat %d", + nodeid, cluster, mode_str(mode), compat); + + ver->nodeid = cpu_to_le32(nodeid); + ver->cluster = cpu_to_le16(cluster); + ver->group_mode = cpu_to_le16(mode); + ver->groupd_compat = cpu_to_le16(compat); + ver->groupd_count = cpu_to_le16(count); + + send_message_groupd(&g, buf, len, MSG_GROUP_VERSION); +} + +static void send_version(void) +{ + _send_version(our_nodeid, CLUSTER3, group_mode, cfgd_groupd_compat); +} + +static void set_group_mode(void) +{ + struct group_node *node; + int need_version, pending_count; + + need_version = 0; + pending_count = 0; + + list_for_each_entry(node, &group_nodes, list) { + if (!node->got_version) { + need_version++; + continue; + } + if (node->ver.group_mode == GROUP_PENDING) { + pending_count++; + continue; + } + + /* If we receive any non-pending group mode, adopt it + immediately. 
*/ + + group_mode = node->ver.group_mode; + + log_print("group mode is %s", mode_str(group_mode)); + log_debug("set_group_mode %s matching nodeid %d got_from %d", + mode_str(group_mode), node->nodeid, node->got_from); + break; + } + + if (group_mode == GROUP_LIBCPG) + block_old_nodes(); +} + +static void receive_version(int from, msg_t *msg, int len) +{ + struct group_node *node; + struct group_version *ver; + + if (group_mode != GROUP_PENDING) + return; + + ver = (struct group_version *)((char *)msg + sizeof(msg_t)); + + version_copy_in(ver); + + node = get_group_node(ver->nodeid); + if (!node) { + log_print("receive_version from %d nodeid %d not found", + from, ver->nodeid); + return; + } + + /* ignore a repeat of what we've seen before */ + + if (node->got_version && from == node->got_from && + node->ver.group_mode == ver->group_mode) + return; + log_debug("receive_version from %d nodeid %d cluster %d mode %s " + "compat %d", from, ver->nodeid, ver->cluster, + mode_str(ver->group_mode), ver->groupd_compat); + + node->got_version = 1; + node->got_from = from; + memcpy(&node->ver, ver, sizeof(struct group_version)); + + set_group_mode(); +} + +void group_mode_check_timeout(void) +{ + struct group_node *node; + int need_version, pending_count; + uint64_t now; + + if (group_mode != GROUP_PENDING) + return; + + if (!send_version_first) + return; + + /* Wait for cfgd_groupd_wait seconds to receive a version message from + an added node, after which we'll send a version message for it, + calling it a cluster2 node; receiving this will cause everyone to + immediately set mode to LIBGROUP. 
*/ + + need_version = 0; + pending_count = 0; + now = time(NULL); + + list_for_each_entry(node, &group_nodes, list) { + if (node->got_version) { + pending_count++; + continue; + } + need_version++; + + if (now - node->add_time >= cfgd_groupd_wait) { + log_print("send version for nodeid %d times %llu %llu", + node->nodeid, + (unsigned long long)node->add_time, + (unsigned long long)now); + _send_version(node->nodeid, CLUSTER2, GROUP_LIBGROUP,1); + } + } + + if (need_version) { + log_debug("group_mode_check_timeout need %d pending %d", + need_version, pending_count); + return; + } + + /* we have a version from everyone, and they all are pending; + wait for cfgd_groupd_mode_delay to give any old cluster2 nodes + a chance to join and cause us to use LIBGROUP */ + + if (now - send_version_first < cfgd_groupd_mode_delay) { + log_debug("group_mode_check_timeout delay times %llu %llu", + (unsigned long long)send_version_first, + (unsigned long long)now); + return; + } + + /* everyone is cluster3/pending so we can use LIBCPG; receiving + this will cause everyone to immediately set mode to LIBCPG */ + + log_debug("send version LIBCPG all %d pending", pending_count); + + _send_version(our_nodeid, CLUSTER3, GROUP_LIBCPG, cfgd_groupd_compat); +} static node_t *find_group_node(group_t *g, int nodeid) { @@ -193,6 +460,27 @@ void process_groupd_confchg(void) log_debug("groupd confchg total %d left %d joined %d", saved_member_count, saved_left_count, saved_joined_count); + if (!send_version_first) { + for (i = 0; i < saved_member_count; i++) { + group_node_add(saved_member[i].nodeid); + log_debug("groupd init %d", saved_member[i].nodeid); + } + + send_version_first = time(NULL); + } else { + for (i = 0; i < saved_left_count; i++) { + group_node_del(saved_left[i].nodeid); + log_debug("groupd del %d", saved_left[i].nodeid); + } + for (i = 0; i < saved_joined_count; i++) { + group_node_add(saved_joined[i].nodeid); + log_debug("groupd add %d", saved_joined[i].nodeid); + } + } + + if 
(saved_joined_count) + send_version(); + memcpy(&groupd_cpg_member, &saved_member, sizeof(saved_member)); groupd_cpg_member_count = saved_member_count; @@ -307,6 +595,11 @@ void deliver_cb(cpg_handle_t handle, struct cpg_name *group_name, msg_bswap_in(msg); + if (msg->ms_type == MSG_GROUP_VERSION) { + receive_version(nodeid, msg, data_len); + return; + } + if (msg->ms_type == MSG_GLOBAL_ID) { to_nodeid = msg->ms_global_id & 0x0000FFFF; counter = (msg->ms_global_id >> 16) & 0x0000FFFF; @@ -334,6 +627,9 @@ void deliver_cb(cpg_handle_t handle, struct cpg_name *group_name, return; } } else { + if (group_mode != GROUP_LIBGROUP) + return; + g = find_group_by_handle(handle); if (!g) { len = group_name->length; @@ -377,6 +673,9 @@ void process_confchg(void) return; } + if (group_mode != GROUP_LIBGROUP) + return; + g = find_group_by_handle(saved_handle); if (!g) { log_debug("confchg: no group for handle %llx name %s", @@ -533,6 +832,8 @@ int setup_cpg(void) cpg_error_t error; int fd; + INIT_LIST_HEAD(&group_nodes); + error = cpg_initialize(&groupd_handle, &callbacks); if (error != CPG_OK) { log_print("cpg_initialize error %d", error); @@ -562,6 +863,7 @@ int setup_cpg(void) log_debug("setup_cpg groupd_handle %llx", (unsigned long long)groupd_handle); + return 0; } @@ -680,3 +982,70 @@ int send_message(group_t *g, void *buf, int len) return _send_message(g->cpg_handle, g, buf, len); } +static void block_old_group(char *name, int level) +{ + group_t *g; + app_t *a; + cpg_error_t error; + cpg_handle_t h; + struct cpg_name cpgname; + int rv, fd, ci, i = 0; + + rv = create_group(name, level, &g); + if (rv) + return; + a = create_app(g); + if (!a) + return; + + error = cpg_initialize(&h, &callbacks); + if (error != CPG_OK) { + log_print("cpg_initialize error %d", error); + return; + } + + cpg_fd_get(h, &fd); + + ci = client_add(fd, process_cpg, NULL); + + g->cpg_client = ci; + g->cpg_handle = h; + g->cpg_fd = fd; + g->joining = 1; + a->client = ci; + + memset(&cpgname, 0, 
sizeof(cpgname)); + sprintf(cpgname.value, "%d_%s", level, name); + cpgname.length = strlen(cpgname.value) + 1; + + retry: + error = cpg_join(h, &cpgname); + if (error == CPG_ERR_TRY_AGAIN) { + log_debug("cpg_join error retry"); + sleep(1); + if (!(++i % 10)) + log_print("cpg_join error retrying"); + goto retry; + } + if (error != CPG_OK) { + log_print("cpg_join error %d", error); + cpg_finalize(h); + return; + } +} + +/* Problem: GROUP_LIBCPG is selected during version detection, then + an old cluster2 node starts (people aren't supposed to do this, but it may + happen, so it's nice to do what we can to address it). groupd on the old + cluster2 node, using libgroup, will allow new groups to be formed on it. + Solution is a hack: when the cluster3 nodes select LIBCPG mode, they also + create unused/placeholder cpg's with the names of old known cluster2 groups, + which blocks them being fully joined by old groupd's that may come along. */ + +static void block_old_nodes(void) +{ + block_old_group("default", 0); + block_old_group("clvmd", 1); + block_old_group("rgmanager", 1); +} + diff --git a/group/daemon/gd_internal.h b/group/daemon/gd_internal.h index 59f765b..ed29457 100644 --- a/group/daemon/gd_internal.h +++ b/group/daemon/gd_internal.h @@ -47,9 +47,25 @@ extern struct list_head gd_groups; extern struct list_head gd_levels[MAX_LEVELS]; extern uint32_t gd_event_nr; -#define DEFAULT_DEBUG_LOGSYS 0 +#define GROUP_PENDING 1 +#define GROUP_LIBGROUP 2 +#define GROUP_LIBCPG 3 +extern int group_mode; + +#define DEFAULT_GROUPD_COMPAT 2 +#define DEFAULT_GROUPD_WAIT 5 +#define DEFAULT_GROUPD_MODE_DELAY 2 +#define DEFAULT_DEBUG_LOGSYS 0 + +extern int optd_groupd_compat; +extern int optd_groupd_wait; +extern int optd_groupd_mode_delay; extern int optd_debug_logsys; + +extern int cfgd_groupd_compat; +extern int cfgd_groupd_wait; +extern int cfgd_groupd_mode_delay; extern int cfgd_debug_logsys; void daemon_dump_save(void); @@ -184,9 +200,10 @@ struct app { #define 
MSG_APP_RECOVER 3 #define MSG_APP_INTERNAL 4 #define MSG_GLOBAL_ID 5 +#define MSG_GROUP_VERSION 6 #define MSG_VER_MAJOR 1 -#define MSG_VER_MINOR 0 +#define MSG_VER_MINOR 1 #define MSG_VER_PATCH 0 struct msg { @@ -287,6 +304,7 @@ int send_message(group_t *g, void *buf, int len); int send_message_groupd(group_t *g, void *buf, int len, int type); void copy_groupd_data(group_data_t *data); int in_groupd_cpg(int nodeid); +void group_mode_check_timeout(void); /* joinleave.c */ void remove_group(group_t *g); @@ -294,6 +312,8 @@ int do_join(char *name, int level, int ci); int do_leave(char *name, int level); node_t *new_node(int nodeid); group_t *find_group_level(char *name, int level); +int create_group(char *name, int level, group_t **g_out); +app_t *create_app(group_t *g); /* logging.c */ diff --git a/group/daemon/main.c b/group/daemon/main.c index 1f0cf08..8059908 100644 --- a/group/daemon/main.c +++ b/group/daemon/main.c @@ -120,6 +120,10 @@ void read_ccs_int(char *path, int *config_val) free(str); } +#define GROUPD_COMPAT_PATH "/cluster/group/@groupd_compat" +#define GROUPD_WAIT_PATH "/cluster/group/@groupd_wait" +#define GROUPD_MODE_DELAY_PATH "/cluster/group/@groupd_mode_delay" + int setup_ccs(void) { int i = 0, cd; @@ -133,6 +137,18 @@ int setup_ccs(void) ccs_handle = cd; + /* These config values are set from cluster.conf only if they haven't + already been set on the command line. 
*/ + + if (!optd_groupd_compat) + read_ccs_int(GROUPD_COMPAT_PATH, &cfgd_groupd_compat); + + if (!optd_groupd_wait) + read_ccs_int(GROUPD_WAIT_PATH, &cfgd_groupd_wait); + + if (!optd_groupd_mode_delay) + read_ccs_int(GROUPD_MODE_DELAY_PATH, &cfgd_groupd_mode_delay); + return 0; } @@ -369,6 +385,7 @@ enum { DO_GET_GROUP, DO_DUMP, DO_LOG, + DO_GET_VERSION, }; int get_action(char *buf) @@ -413,6 +430,9 @@ int get_action(char *buf) if (!strncmp(act, "get_group", 16)) return DO_GET_GROUP; + if (!strncmp(act, "get_version", 16)) + return DO_GET_VERSION; + if (!strncmp(act, "dump", 16)) return DO_DUMP; @@ -602,6 +622,20 @@ static int do_get_group(int ci, int argc, char **argv) return 0; } +static int do_get_version(int ci) +{ + int mode; + int rv; + + mode = group_mode; + + rv = do_write(client[ci].fd, &mode, sizeof(mode)); + if (rv < 0) + log_print("do_get_version write error"); + + return 0; +} + static int do_dump(int fd) { int len; @@ -715,6 +749,10 @@ static void process_connection(int ci) do_get_group(ci, argc, argv); break; + case DO_GET_VERSION: + do_get_version(ci); + break; + case DO_DUMP: do_dump(client[ci].fd); close(client[ci].fd); @@ -786,6 +824,12 @@ void cluster_dead(int ci) daemon_quit = 1; } +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? 
_min1 : _min2; }) + static void loop(void) { int poll_timeout = -1; @@ -809,6 +853,13 @@ static void loop(void) setup_logging(); + if (cfgd_groupd_compat == 0) + group_mode = GROUP_LIBCPG; + else if (cfgd_groupd_compat == 1) + group_mode = GROUP_LIBGROUP; + else if (cfgd_groupd_compat == 2) + group_mode = GROUP_PENDING; + rv = check_uncontrolled_groups(); if (rv < 0) goto out; @@ -853,6 +904,13 @@ static void loop(void) rv = 0; rv += process_apps(); } while (rv); + + poll_timeout = -1; + + if (group_mode == GROUP_PENDING) { + group_mode_check_timeout(); + poll_timeout = 1000 * min(cfgd_groupd_wait, cfgd_groupd_mode_delay); + } } out: close_ccs(); @@ -914,11 +972,20 @@ static void print_usage(void) printf("\n"); printf(" -D Enable debugging code and don't fork\n"); printf(" -L Enable (1) or disable (0) debugging to logsys (default %d)\n", DEFAULT_DEBUG_LOGSYS); + printf(" -g group compatibility mode, 0 off, 1 on, 2 detect\n"); + printf(" 0: use libcpg, no backward compat, best performance\n"); + printf(" 1: use libgroup for compat with cluster2/stable2/rhel5\n"); + printf(" 2: detect old, or mode 0, nodes that require compat, use libcpg if none found\n"); + printf(" Default is %d\n", DEFAULT_GROUPD_COMPAT); + printf(" -w seconds to wait for a node's version message before assuming an old version requiring compat mode\n"); + printf(" Default is %d", DEFAULT_GROUPD_WAIT); + printf(" -d seconds to delay the mode selection to give time for an old version to join and force compat mode\n"); + printf(" Default is %d", DEFAULT_GROUPD_MODE_DELAY); printf(" -h Print this help, then exit\n"); printf(" -V Print program version information, then exit\n"); } -#define OPTION_STRING "L:DhVv" +#define OPTION_STRING "L:Dg:w:d:hVv" static void read_arguments(int argc, char **argv) { @@ -939,6 +1006,21 @@ static void read_arguments(int argc, char **argv) cfgd_debug_logsys = atoi(optarg); break; + case 'g': + optd_groupd_compat = 1; + cfgd_groupd_compat = atoi(optarg); + break; + + case 
'w': + optd_groupd_wait = 1; + cfgd_groupd_wait = atoi(optarg); + break; + + case 'd': + optd_groupd_mode_delay = 1; + cfgd_groupd_mode_delay = atoi(optarg); + break; + case 'h': print_usage(); exit(EXIT_SUCCESS); @@ -1068,7 +1150,15 @@ int dump_wrap; struct list_head gd_groups; struct list_head gd_levels[MAX_LEVELS]; uint32_t gd_event_nr; +int group_mode; +int optd_groupd_compat; +int optd_groupd_wait; +int optd_groupd_mode_delay; int optd_debug_logsys; -int cfgd_debug_logsys = DEFAULT_DEBUG_LOGSYS; + +int cfgd_groupd_compat = DEFAULT_GROUPD_COMPAT; +int cfgd_groupd_wait = DEFAULT_GROUPD_WAIT; +int cfgd_groupd_mode_delay = DEFAULT_GROUPD_MODE_DELAY; +int cfgd_debug_logsys = DEFAULT_DEBUG_LOGSYS; diff --git a/group/lib/libgroup.c b/group/lib/libgroup.c index 59e26e8..82fe3fb 100644 --- a/group/lib/libgroup.c +++ b/group/lib/libgroup.c @@ -497,3 +497,29 @@ int group_get_group(int level, const char *name, group_data_t *data) return rv; } +int group_get_version(int *version) +{ + char buf[GROUPD_MSGLEN]; + char data_buf[sizeof(int)]; + int fd, rv; + + fd = connect_groupd(); + if (fd < 0) + return fd; + + memset(buf, 0, sizeof(buf)); + snprintf(buf, sizeof(buf), "get_version"); + + rv = do_write(fd, &buf, GROUPD_MSGLEN); + if (rv < 0) + goto out; + + rv = do_read(fd, version, sizeof(int)); + if (rv < 0) + goto out; + rv = 0; + out: + close(fd); + return rv; +} + diff --git a/group/lib/libgroup.h b/group/lib/libgroup.h index fdc1e10..82ef7e1 100644 --- a/group/lib/libgroup.h +++ b/group/lib/libgroup.h @@ -72,6 +72,8 @@ typedef struct group_data { int group_get_groups(int max, int *count, group_data_t *data); int group_get_group(int level, const char *name, group_data_t *data); +int group_get_version(int *version); + #ifdef __cplusplus } #endif