From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 12662 invoked by alias); 13 May 2009 15:14:39 -0000 Received: (qmail 12653 invoked by alias); 13 May 2009 15:14:38 -0000 X-SWARE-Spam-Status: No, hits=-1.4 required=5.0 tests=AWL,BAYES_00,J_CHICKENPOX_43,J_CHICKENPOX_63,SPF_HELO_PASS X-Spam-Status: No, hits=-1.4 required=5.0 tests=AWL,BAYES_00,J_CHICKENPOX_43,J_CHICKENPOX_63,SPF_HELO_PASS X-Spam-Check-By: sourceware.org X-Spam-Checker-Version: SpamAssassin 3.2.5 (2008-06-10) on bastion2.fedora.phx.redhat.com Subject: cluster: RHEL5 - qdisk: Add reporting for I/O hangs to quorum disk To: cluster-cvs-relay@redhat.com X-Project: Cluster Project X-Git-Module: cluster.git X-Git-Refname: refs/heads/RHEL5 X-Git-Reftype: branch X-Git-Oldrev: bb1e50295e8dcf36d7ce9ad22196fd7f89fba899 X-Git-Newrev: 83a61282601bff7dd26e8bcf4ebd4b1f38d6e25c From: Lon Hohberger Message-Id: <20090513151407.AD8C21201C1@lists.fedorahosted.org> Date: Wed, 13 May 2009 15:14:00 -0000 X-Scanned-By: MIMEDefang 2.58 on 172.16.52.254 Mailing-List: contact cluster-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: cluster-cvs-owner@sourceware.org X-SW-Source: 2009-q2/txt/msg00304.txt.bz2 Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=83a61282601bff7dd26e8bcf4ebd4b1f38d6e25c Commit: 83a61282601bff7dd26e8bcf4ebd4b1f38d6e25c Parent: bb1e50295e8dcf36d7ce9ad22196fd7f89fba899 Author: Lon Hohberger AuthorDate: Fri May 8 13:23:04 2009 -0400 Committer: Lon Hohberger CommitterDate: Wed May 13 11:13:56 2009 -0400 qdisk: Add reporting for I/O hangs to quorum disk Signed-off-by: Lon Hohberger --- cman/qdisk/Makefile | 10 ++-- cman/qdisk/disk.c | 21 +++++++- cman/qdisk/iostate.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++ cman/qdisk/iostate.h | 17 ++++++ cman/qdisk/main.c | 6 ++ 5 files changed, 189 insertions(+), 7 deletions(-) diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile index 
23d0890..f58806b 100644 --- a/cman/qdisk/Makefile +++ b/cman/qdisk/Makefile @@ -28,12 +28,12 @@ install: all install ${TARGET} ${sbindir} qdiskd: disk.o crc32.o disk_util.o main.o score.o bitmap.o clulog.o \ - gettid.o proc.o daemon_init.o scandisk.o ../lib/libcman.a - gcc -o $@ $^ -lpthread -L../lib -L${ccslibdir} -lccs + gettid.o proc.o daemon_init.o scandisk.o iostate.o ../lib/libcman.a + gcc -o $@ $^ -lpthread -L../lib -L${ccslibdir} -lccs -lrt -mkqdisk: disk.o crc32.o disk_util.o \ - proc.o mkqdisk.o scandisk.o - gcc -o $@ $^ +mkqdisk: disk.o crc32.o disk_util.o iostate.o \ + proc.o mkqdisk.o scandisk.o clulog.o gettid.o + gcc -o $@ $^ -lrt %.o: %.c $(CC) -c -o $@ $^ $(INCLUDES) $(CFLAGS) diff --git a/cman/qdisk/disk.c b/cman/qdisk/disk.c index 8cf7b5a..6771e06 100644 --- a/cman/qdisk/disk.c +++ b/cman/qdisk/disk.c @@ -44,6 +44,7 @@ #include #include #include +#include "iostate.h" static int diskRawRead(target_info_t *disk, char *buf, int len); uint32_t clu_crc32(const char *data, size_t count); @@ -236,7 +237,9 @@ qdisk_open(char *name, target_info_t *disk) disk->d_pagesz = sysconf(_SC_PAGESIZE); /* Check to verify that the partition is large enough.*/ + io_state(STATE_LSEEK); ret = lseek(disk->d_fd, END_OF_DISK(disk->d_blksz), SEEK_SET); + io_state(STATE_NONE); if (ret < 0) { perror("open_partition: seek"); close(disk->d_fd); @@ -340,7 +343,9 @@ diskRawReadShadow(target_info_t *disk, off_t readOffset, char *buf, int len) char *data; int datalen; + io_state(STATE_LSEEK); ret = lseek(disk->d_fd, readOffset, SEEK_SET); + io_state(STATE_NONE); if (ret != readOffset) { #if 0 fprintf(stderr, @@ -405,7 +410,10 @@ diskRawRead(target_info_t *disk, char *buf, int len) if (bounceNeeded == 0) { /* Already aligned and even multiple of 512, no bounceio * required. 
*/ - return (read(disk->d_fd, buf, len)); + io_state(STATE_READ); + readret = read(disk->d_fd, buf, len); + io_state(STATE_NONE); + return readret; } if (len > disk->d_blksz) { @@ -434,7 +442,9 @@ diskRawRead(target_info_t *disk, char *buf, int len) return -1; } + io_state(STATE_READ); readret = read(disk->d_fd, alignedBuf, readlen); + io_state(STATE_NONE); if (readret > 0) { if (readret > len) { memcpy(alignedBuf, buf, len); @@ -477,7 +487,10 @@ diskRawWrite(target_info_t *disk, char *buf, int len) if (bounceNeeded == 0) { /* Already aligned and even multiple of 512, no bounceio * required. */ - return (write(disk->d_fd, buf, len)); + io_state(STATE_WRITE); + ret = write(disk->d_fd, buf, len); + io_state(STATE_NONE); + return ret; } if (len > disk->d_blksz) { @@ -514,7 +527,9 @@ diskRawWrite(target_info_t *disk, char *buf, int len) } memcpy(buf, alignedBuf, len); + io_state(STATE_WRITE); ret = write(disk->d_fd, alignedBuf, writelen); + io_state(STATE_NONE); if (ret > len) { ret = len; } @@ -542,7 +557,9 @@ diskRawWriteShadow(target_info_t *disk, __off64_t writeOffset, char *buf, int le return (-1); } + io_state(STATE_LSEEK); retval_seek = lseek(disk->d_fd, writeOffset, SEEK_SET); + io_state(STATE_NONE); if (retval_seek != writeOffset) { fprintf(stderr, "diskRawWriteShadow: can't seek to offset %d\n", diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c new file mode 100644 index 0000000..f4f2329 --- /dev/null +++ b/cman/qdisk/iostate.c @@ -0,0 +1,142 @@ +#include +#include +#include +#include +#include +#include +#include "iostate.h" + +static iostate_t main_state = 0; +static int main_incarnation = 0; +static int qdisk_timeout = 0, sleeptime = 0; +static int thread_active = 0; +static pthread_t io_nanny_tid = 0; +static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t state_cond = PTHREAD_COND_INITIALIZER; + +struct state_table { + iostate_t state; + const char *value; +}; + +static struct state_table io_state_table[] = { +{ 
STATE_NONE, "none" }, +{ STATE_WRITE, "write" }, +{ STATE_READ, "read" }, +{ STATE_LSEEK, "seek" }, +{ -1, NULL } }; + +static const char * +state_to_string(iostate_t state) +{ + static const char *ret = "unknown"; + int i; + + for (i=0; io_state_table[i].value; i++) { + if (io_state_table[i].state == state) { + ret = io_state_table[i].value; + break; + } + } + + return ret; +} + + +void +io_state(iostate_t state) +{ + pthread_mutex_lock(&state_mutex); + main_state = state; + main_incarnation++; /* it does not matter if this wraps. */ + pthread_mutex_unlock(&state_mutex); + + /* Optimization: Don't signal on STATE_NONE */ + if (state != STATE_NONE) + pthread_cond_broadcast(&state_cond); +} + + +static void * +io_nanny_thread(void *arg) +{ + struct timespec wait_time; + iostate_t last_main_state = 0, current_main_state = 0; + int last_main_incarnation = 0, current_main_incarnation = 0; + int logged_incarnation = 0; + + /* Start with wherever we're at now */ + pthread_mutex_lock(&state_mutex); + current_main_state = last_main_state = main_state; + current_main_incarnation = last_main_incarnation = main_incarnation; + pthread_mutex_unlock(&state_mutex); + + while (thread_active) { + pthread_mutex_lock(&state_mutex); + clock_gettime(CLOCK_REALTIME, &wait_time); + wait_time.tv_sec += sleeptime; + pthread_cond_timedwait(&state_cond, &state_mutex, &wait_time); + current_main_state = main_state; + current_main_incarnation = main_incarnation; + pthread_mutex_unlock(&state_mutex); + + if (!thread_active) + break; + + if (!current_main_state) + continue; + + /* if the state or incarnation changed, the main qdiskd + * thread is healthy */ + if (current_main_state != last_main_state || + current_main_incarnation != last_main_incarnation) { + last_main_state = current_main_state; + last_main_incarnation = current_main_incarnation; + continue; + } + + /* Don't log things twice */ + if (logged_incarnation == current_main_incarnation) + continue; + logged_incarnation = 
current_main_incarnation; + + clulog(LOG_WARNING, "qdiskd: %s " + "(system call) has hung for %d seconds\n", + state_to_string(current_main_state), sleeptime); + clulog(LOG_WARNING, + "In %d more seconds, we will be evicted\n", + (qdisk_timeout-sleeptime)); + } + + return NULL; +} + + +int +io_nanny_start(int timeout) +{ + int ret; + + pthread_mutex_lock(&state_mutex); + + sleeptime = timeout / 2; + qdisk_timeout = timeout; + thread_active = 1; + + ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL); + pthread_mutex_unlock(&state_mutex); + + return ret; +} + + +int +io_nanny_stop(void) +{ + thread_active = 0; + pthread_cond_broadcast(&state_cond); + pthread_join(io_nanny_tid, NULL); + io_nanny_tid = 0; + + return 0; +} diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h new file mode 100644 index 0000000..7dd7bf6 --- /dev/null +++ b/cman/qdisk/iostate.h @@ -0,0 +1,17 @@ +#ifndef _IOSTATE_H +#define _IOSTATE_H + +typedef enum { + STATE_NONE = 0, + STATE_READ = 1, + STATE_WRITE = 2, + STATE_LSEEK = 3, + STATE_UNKNOWN = 4 +} iostate_t; + +void io_state(iostate_t state); + +int io_nanny_start(int timeout); +int io_nanny_stop(void); + +#endif diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c index e235883..090c71e 100644 --- a/cman/qdisk/main.c +++ b/cman/qdisk/main.c @@ -43,6 +43,7 @@ #include #include "score.h" #include "clulog.h" +#include "iostate.h" #if (!defined(LIBCMAN_VERSION) || \ (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION < 2)) #include @@ -1592,9 +1593,14 @@ main(int argc, char **argv) goto out; } */ + + io_nanny_start(ctx.qc_tko * ctx.qc_interval); + if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0) cman_unregister_quorum_device(ctx.qc_ch); + io_nanny_stop(); + quorum_logout(&ctx); /* free cman handle to avoid leak in cman */ out: