From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 22956 invoked by alias); 14 May 2009 13:38:58 -0000 Received: (qmail 22950 invoked by alias); 14 May 2009 13:38:57 -0000 X-SWARE-Spam-Status: No, hits=-1.4 required=5.0 tests=AWL,BAYES_00,J_CHICKENPOX_43,J_CHICKENPOX_63,SPF_HELO_PASS X-Spam-Status: No, hits=-1.4 required=5.0 tests=AWL,BAYES_00,J_CHICKENPOX_43,J_CHICKENPOX_63,SPF_HELO_PASS X-Spam-Check-By: sourceware.org X-Spam-Checker-Version: SpamAssassin 3.2.5 (2008-06-10) on bastion2.fedora.phx.redhat.com Subject: cluster: STABLE3 - qdisk: Add reporting for I/O hangs to quorum disk To: cluster-cvs-relay@redhat.com X-Project: Cluster Project X-Git-Module: cluster.git X-Git-Refname: refs/heads/STABLE3 X-Git-Reftype: branch X-Git-Oldrev: f0918fef3046b9362cfe9349a4c6589d1b96e3d0 X-Git-Newrev: 6c4dea2b599fc9f461a7e1063f36a772c8e7d15f From: Lon Hohberger Message-Id: <20090514133801.69310120231@lists.fedorahosted.org> Date: Thu, 14 May 2009 13:38:00 -0000 X-Scanned-By: MIMEDefang 2.58 on 172.16.52.254 Mailing-List: contact cluster-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: cluster-cvs-owner@sourceware.org X-SW-Source: 2009-q2/txt/msg00323.txt.bz2 Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=6c4dea2b599fc9f461a7e1063f36a772c8e7d15f Commit: 6c4dea2b599fc9f461a7e1063f36a772c8e7d15f Parent: f0918fef3046b9362cfe9349a4c6589d1b96e3d0 Author: Lon Hohberger AuthorDate: Fri May 8 13:23:04 2009 -0400 Committer: Lon Hohberger CommitterDate: Thu May 14 09:37:52 2009 -0400 qdisk: Add reporting for I/O hangs to quorum disk Signed-off-by: Lon Hohberger --- cman/qdisk/Makefile | 5 +- cman/qdisk/disk.c | 21 +++++++- cman/qdisk/iostate.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++ cman/qdisk/iostate.h | 17 ++++++ cman/qdisk/main.c | 7 +++ 5 files changed, 188 insertions(+), 4 deletions(-) diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile index 
0b3629d..68e20cd 100644 --- a/cman/qdisk/Makefile +++ b/cman/qdisk/Makefile @@ -18,7 +18,7 @@ CFLAGS += -I$(S) CFLAGS += -I${incdir} LDFLAGS += -L${logtlibdir} -llogthread -lpthread -LDFLAGS += -L${zliblibdir} -lz +LDFLAGS += -L${zliblibdir} -lz -lrt LDFLAGS += -L${libdir} EXTRA_LDFLAGS += -L${cmanlibdir} -L${ccslibdir} -lcman -lccs @@ -33,7 +33,8 @@ OBJS2= mkqdisk.o SHAREDOBJS= disk.o \ disk_util.o \ proc.o \ - scandisk.o + scandisk.o \ + iostate.o ${TARGET1}: ${SHAREDOBJS} ${OBJS1} $(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS) diff --git a/cman/qdisk/disk.c b/cman/qdisk/disk.c index e349698..680da2f 100644 --- a/cman/qdisk/disk.c +++ b/cman/qdisk/disk.c @@ -27,6 +27,7 @@ #include #include #include +#include "iostate.h" static int diskRawRead(target_info_t *disk, char *buf, int len); @@ -229,7 +230,9 @@ qdisk_open(char *name, target_info_t *disk) disk->d_pagesz = sysconf(_SC_PAGESIZE); /* Check to verify that the partition is large enough.*/ + io_state(STATE_LSEEK); ret = lseek(disk->d_fd, END_OF_DISK(disk->d_blksz), SEEK_SET); + io_state(STATE_NONE); if (ret < 0) { logt_print(LOG_DEBUG, "open_partition: seek"); close(disk->d_fd); @@ -332,7 +335,9 @@ diskRawReadShadow(target_info_t *disk, off_t readOffset, char *buf, int len) shared_header_t *hdrp; char *data; + io_state(STATE_LSEEK); ret = lseek(disk->d_fd, readOffset, SEEK_SET); + io_state(STATE_NONE); if (ret != readOffset) { logt_print(LOG_DEBUG, "diskRawReadShadow: can't seek to offset %d.\n", @@ -391,7 +396,10 @@ diskRawRead(target_info_t *disk, char *buf, int len) if (bounceNeeded == 0) { /* Already aligned and even multiple of 512, no bounceio * required. 
*/ - return (read(disk->d_fd, buf, len)); + io_state(STATE_READ); + readret = read(disk->d_fd, buf, len); + io_state(STATE_NONE); + return readret; } if (len > disk->d_blksz) { @@ -420,7 +428,9 @@ diskRawRead(target_info_t *disk, char *buf, int len) return -1; } + io_state(STATE_READ); readret = read(disk->d_fd, alignedBuf, readlen); + io_state(STATE_NONE); if (readret > 0) { if (readret > len) { memcpy(alignedBuf, buf, len); @@ -463,7 +473,10 @@ diskRawWrite(target_info_t *disk, char *buf, int len) if (bounceNeeded == 0) { /* Already aligned and even multiple of 512, no bounceio * required. */ - return (write(disk->d_fd, buf, len)); + io_state(STATE_WRITE); + ret = write(disk->d_fd, buf, len); + io_state(STATE_NONE); + return ret; } if (len > disk->d_blksz) { @@ -500,7 +513,9 @@ diskRawWrite(target_info_t *disk, char *buf, int len) } memcpy(buf, alignedBuf, len); + io_state(STATE_WRITE); ret = write(disk->d_fd, alignedBuf, writelen); + io_state(STATE_NONE); if (ret > len) { ret = len; } @@ -528,7 +543,9 @@ diskRawWriteShadow(target_info_t *disk, __off64_t writeOffset, char *buf, int le return (-1); } + io_state(STATE_LSEEK); retval_seek = lseek(disk->d_fd, writeOffset, SEEK_SET); + io_state(STATE_NONE); if (retval_seek != writeOffset) { logt_print(LOG_ERR, "diskRawWriteShadow: can't seek to offset %d\n", diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c new file mode 100644 index 0000000..f195c45 --- /dev/null +++ b/cman/qdisk/iostate.c @@ -0,0 +1,142 @@ +#include +#include +#include +#include +#include +#include +#include "iostate.h" + +static iostate_t main_state = 0; +static int main_incarnation = 0; +static int qdisk_timeout = 0, sleeptime = 0; +static int thread_active = 0; +static pthread_t io_nanny_tid = 0; +static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t state_cond = PTHREAD_COND_INITIALIZER; + +struct state_table { + iostate_t state; + const char *value; +}; + +static struct state_table io_state_table[] = { +{ 
STATE_NONE, "none" }, +{ STATE_WRITE, "write" }, +{ STATE_READ, "read" }, +{ STATE_LSEEK, "seek" }, +{ -1, NULL } }; + +static const char * +state_to_string(iostate_t state) +{ + static const char *ret = "unknown"; + int i; + + for (i=0; io_state_table[i].value; i++) { + if (io_state_table[i].state == state) { + ret = io_state_table[i].value; + break; + } + } + + return ret; +} + + +void +io_state(iostate_t state) +{ + pthread_mutex_lock(&state_mutex); + main_state = state; + main_incarnation++; /* it does not matter if this wraps. */ + pthread_mutex_unlock(&state_mutex); + + /* Optimization: Don't signal on STATE_NONE */ + if (state != STATE_NONE) + pthread_cond_broadcast(&state_cond); +} + + +static void * +io_nanny_thread(void *arg) +{ + struct timespec wait_time; + iostate_t last_main_state = 0, current_main_state = 0; + int last_main_incarnation = 0, current_main_incarnation = 0; + int logged_incarnation = 0; + + /* Start with wherever we're at now */ + pthread_mutex_lock(&state_mutex); + current_main_state = last_main_state = main_state; + current_main_incarnation = last_main_incarnation = main_incarnation; + pthread_mutex_unlock(&state_mutex); + + while (thread_active) { + pthread_mutex_lock(&state_mutex); + clock_gettime(CLOCK_REALTIME, &wait_time); + wait_time.tv_sec += sleeptime; + pthread_cond_timedwait(&state_cond, &state_mutex, &wait_time); + current_main_state = main_state; + current_main_incarnation = main_incarnation; + pthread_mutex_unlock(&state_mutex); + + if (!thread_active) + break; + + if (!current_main_state) + continue; + + /* if the state or incarnation changed, the main qdiskd + * thread is healthy */ + if (current_main_state != last_main_state || + current_main_incarnation != last_main_incarnation) { + last_main_state = current_main_state; + last_main_incarnation = current_main_incarnation; + continue; + } + + /* Don't log things twice */ + if (logged_incarnation == current_main_incarnation) + continue; + logged_incarnation = 
current_main_incarnation; + + logt_print(LOG_WARNING, "qdiskd: %s " + "(system call) has hung for %d seconds\n", + state_to_string(current_main_state), sleeptime); + logt_print(LOG_WARNING, + "In %d more seconds, we will be evicted\n", + (qdisk_timeout-sleeptime)); + } + + return NULL; +} + + +int +io_nanny_start(int timeout) +{ + int ret; + + pthread_mutex_lock(&state_mutex); + + sleeptime = timeout / 2; + qdisk_timeout = timeout; + thread_active = 1; + + ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL); + pthread_mutex_unlock(&state_mutex); + + return ret; +} + + +int +io_nanny_stop(void) +{ + thread_active = 0; + pthread_cond_broadcast(&state_cond); + pthread_join(io_nanny_tid, NULL); + io_nanny_tid = 0; + + return 0; +} diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h new file mode 100644 index 0000000..7dd7bf6 --- /dev/null +++ b/cman/qdisk/iostate.h @@ -0,0 +1,17 @@ +#ifndef _IOSTATE_H +#define _IOSTATE_H + +typedef enum { + STATE_NONE = 0, + STATE_READ = 1, + STATE_WRITE = 2, + STATE_LSEEK = 3, + STATE_UNKNOWN = 4 +} iostate_t; + +void io_state(iostate_t state); + +int io_nanny_start(int timeout); +int io_nanny_stop(void); + +#endif diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c index d1865cd..a6be5a8 100644 --- a/cman/qdisk/main.c +++ b/cman/qdisk/main.c @@ -28,6 +28,8 @@ #define LOG_DAEMON_NAME "qdiskd" #define LOG_MODE_DEFAULT LOG_MODE_OUTPUT_SYSLOG|LOG_MODE_OUTPUT_FILE +#include "iostate.h" + /* from main.c */ void set_priority(int queue, int prio); @@ -1793,9 +1795,14 @@ main(int argc, char **argv) goto out; } */ + + io_nanny_start(ctx.qc_tko * ctx.qc_interval); + if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0) cman_unregister_quorum_device(ctx.qc_cman_admin); + io_nanny_stop(); + quorum_logout(&ctx); out: /* free cman handle to avoid leak in cman */