public inbox for cluster-cvs@sourceware.org
help / color / mirror / Atom feed
* cluster: RHEL4 - qdisk: Add reporting for I/O hangs to quorum disk
@ 2009-05-13 15:41 Lon Hohberger
  0 siblings, 0 replies; only message in thread
From: Lon Hohberger @ 2009-05-13 15:41 UTC (permalink / raw)
  To: cluster-cvs-relay

Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=779a71b145323ad97e5b73a58178e20a357b5a11
Commit:        779a71b145323ad97e5b73a58178e20a357b5a11
Parent:        5796a179508472d26ef41a77fe49d5c0de4de159
Author:        Lon Hohberger <lhh@redhat.com>
AuthorDate:    Fri May 8 13:23:04 2009 -0400
Committer:     Lon Hohberger <lhh@redhat.com>
CommitterDate: Wed May 13 11:29:17 2009 -0400

qdisk: Add reporting for I/O hangs to quorum disk

Signed-off-by: Lon Hohberger <lhh@redhat.com>
---
 cman/qdisk/Makefile  |   10 ++--
 cman/qdisk/disk.c    |   21 +++++++-
 cman/qdisk/iostate.c |  142 ++++++++++++++++++++++++++++++++++++++++++++++++++
 cman/qdisk/iostate.h |   17 ++++++
 cman/qdisk/main.c    |    4 ++
 5 files changed, 187 insertions(+), 7 deletions(-)

diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile
index 905f1a2..632e47b 100644
--- a/cman/qdisk/Makefile
+++ b/cman/qdisk/Makefile
@@ -41,12 +41,12 @@ install: ${TARGET}
 	install ${TARGET} ${sbindir}
 
 qdiskd: disk.o crc32.o disk_util.o main.o score.o bitmap.o clulog.o \
-	gettid.o proc.o daemon_init.o ../lib/libcman.a
-	gcc -o $@ $^ -lpthread -L../lib -lccs
+	gettid.o proc.o daemon_init.o iostate.o ../lib/libcman.a
+	gcc -o $@ $^ -lpthread -L../lib -lccs -lrt
 
-mkqdisk: disk.o crc32.o disk_util.o \
-	 proc.o mkqdisk.o
-	gcc -o $@ $^ 
+mkqdisk: disk.o crc32.o disk_util.o iostate.o \
+	 proc.o mkqdisk.o clulog.o gettid.o
+	gcc -o $@ $^ -lrt
 
 
 %.o: %.c
diff --git a/cman/qdisk/disk.c b/cman/qdisk/disk.c
index cd9a462..723b35f 100644
--- a/cman/qdisk/disk.c
+++ b/cman/qdisk/disk.c
@@ -44,6 +44,7 @@
 #include <unistd.h>
 #include <time.h>
 #include <linux/fs.h>
+#include "iostate.h"
 
 static int diskRawRead(target_info_t *disk, char *buf, int len);
 uint32_t clu_crc32(const char *data, size_t count);
@@ -235,7 +236,9 @@ qdisk_open(char *name, target_info_t *disk)
 	disk->d_pagesz = sysconf(_SC_PAGESIZE);
 
 	/* Check to verify that the partition is large enough.*/
+	io_state(STATE_LSEEK);
 	ret = lseek(disk->d_fd, END_OF_DISK(disk->d_blksz), SEEK_SET);
+	io_state(STATE_NONE);
 	if (ret < 0) {
 		perror("open_partition: seek");
 		return -1;
@@ -337,7 +340,9 @@ diskRawReadShadow(target_info_t *disk, off_t readOffset, char *buf, int len)
 	char *data;
 	int datalen;
 
+	io_state(STATE_LSEEK);
 	ret = lseek(disk->d_fd, readOffset, SEEK_SET);
+	io_state(STATE_NONE);
 	if (ret != readOffset) {
 #if 0
 		fprintf(stderr,
@@ -402,7 +407,10 @@ diskRawRead(target_info_t *disk, char *buf, int len)
 	if (bounceNeeded == 0) {
 		/* Already aligned and even multiple of 512, no bounceio
 		 * required. */
-		return (read(disk->d_fd, buf, len));
+		io_state(STATE_READ);
+		readret = read(disk->d_fd, buf, len);
+		io_state(STATE_NONE);
+		return readret;
 	}
 
 	if (len > disk->d_blksz) {
@@ -431,7 +439,9 @@ diskRawRead(target_info_t *disk, char *buf, int len)
 		return -1;
 	}
 
+	io_state(STATE_READ);
 	readret = read(disk->d_fd, alignedBuf, readlen);
+	io_state(STATE_NONE);
 	if (readret > 0) {
 		if (readret > len) {
 			memcpy(alignedBuf, buf, len);
@@ -474,7 +484,10 @@ diskRawWrite(target_info_t *disk, char *buf, int len)
 	if (bounceNeeded == 0) {
 		/* Already aligned and even multiple of 512, no bounceio
 		 * required. */
-		return (write(disk->d_fd, buf, len));
+		io_state(STATE_WRITE);
+		ret = write(disk->d_fd, buf, len);
+		io_state(STATE_NONE);
+		return ret;
 	}
 
 	if (len > disk->d_blksz) {
@@ -511,7 +524,9 @@ diskRawWrite(target_info_t *disk, char *buf, int len)
 	}
 
 	memcpy(buf, alignedBuf, len);
+	io_state(STATE_WRITE);
 	ret = write(disk->d_fd, alignedBuf, writelen);
+	io_state(STATE_NONE);
 	if (ret > len) {
 		ret = len;
 	}
@@ -539,7 +554,9 @@ diskRawWriteShadow(target_info_t *disk, __off64_t writeOffset, char *buf, int le
 		return (-1);
 	}
 
+	io_state(STATE_LSEEK);
 	retval_seek = lseek(disk->d_fd, writeOffset, SEEK_SET);
+	io_state(STATE_NONE);
 	if (retval_seek != writeOffset) {
 		fprintf(stderr,
 		       "diskRawWriteShadow: can't seek to offset %d\n",
diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c
new file mode 100644
index 0000000..f4f2329
--- /dev/null
+++ b/cman/qdisk/iostate.c
@@ -0,0 +1,142 @@
+#include <pthread.h>
+#include <iostate.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/time.h>
+#include <clulog.h>
+#include "iostate.h"
+
+static iostate_t main_state = 0;
+static int main_incarnation = 0;
+static int qdisk_timeout = 0, sleeptime = 0;
+static int thread_active = 0;
+static pthread_t io_nanny_tid = 0;
+static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t state_cond = PTHREAD_COND_INITIALIZER;
+
+struct state_table {
+	iostate_t state;
+	const char *value;
+};
+
+static struct state_table io_state_table[] = {
+{	STATE_NONE,	"none"	},
+{	STATE_WRITE,	"write"	},
+{	STATE_READ,	"read"	},
+{	STATE_LSEEK,	"seek"	},
+{	-1,		NULL	} };
+
+static const char *
+state_to_string(iostate_t state)
+{
+	static const char *ret = "unknown";
+	int i;
+
+	for (i=0; io_state_table[i].value; i++) {
+		if (io_state_table[i].state == state) {
+			ret = io_state_table[i].value;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+
+void
+io_state(iostate_t state)
+{
+	pthread_mutex_lock(&state_mutex);
+	main_state = state;
+	main_incarnation++; /* it does not matter if this wraps. */
+	pthread_mutex_unlock(&state_mutex);
+
+	/* Optimization: Don't signal on STATE_NONE */
+	if (state != STATE_NONE)
+		pthread_cond_broadcast(&state_cond);
+}
+
+
+static void *
+io_nanny_thread(void *arg)
+{
+	struct timespec wait_time;
+	iostate_t last_main_state = 0, current_main_state = 0;
+	int last_main_incarnation = 0, current_main_incarnation = 0;
+	int logged_incarnation = 0;
+
+	/* Start with wherever we're at now */
+	pthread_mutex_lock(&state_mutex);
+	current_main_state = last_main_state = main_state;
+	current_main_incarnation = last_main_incarnation = main_incarnation;
+	pthread_mutex_unlock(&state_mutex);
+
+	while (thread_active) {
+		pthread_mutex_lock(&state_mutex);
+    		clock_gettime(CLOCK_REALTIME, &wait_time);
+		wait_time.tv_sec += sleeptime;
+		pthread_cond_timedwait(&state_cond, &state_mutex, &wait_time);
+		current_main_state = main_state;
+		current_main_incarnation = main_incarnation;
+		pthread_mutex_unlock(&state_mutex);
+
+		if (!thread_active)
+			break;
+
+		if (!current_main_state)
+			continue;
+
+		/* if the state or incarnation changed, the main qdiskd
+		 * thread is healthy */
+		if (current_main_state != last_main_state ||
+		    current_main_incarnation != last_main_incarnation) {
+			last_main_state = current_main_state;
+			last_main_incarnation = current_main_incarnation;
+			continue;
+		}
+
+		/* Don't log things twice */
+		if (logged_incarnation == current_main_incarnation)
+			continue;
+		logged_incarnation = current_main_incarnation;
+
+		clulog(LOG_WARNING, "qdiskd: %s "
+			   "(system call) has hung for %d seconds\n",
+			   state_to_string(current_main_state), sleeptime);
+		clulog(LOG_WARNING,
+			   "In %d more seconds, we will be evicted\n",
+			   (qdisk_timeout-sleeptime));
+	}
+
+	return NULL;
+}
+
+
+int
+io_nanny_start(int timeout)
+{
+	int ret;
+
+	pthread_mutex_lock(&state_mutex);
+
+	sleeptime = timeout / 2;
+	qdisk_timeout = timeout;
+	thread_active = 1;
+
+	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL);
+	pthread_mutex_unlock(&state_mutex);
+
+	return ret;
+}
+
+
+int
+io_nanny_stop(void)
+{
+	thread_active = 0;
+	pthread_cond_broadcast(&state_cond);
+	pthread_join(io_nanny_tid, NULL);
+	io_nanny_tid = 0;
+
+	return 0;
+}
diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h
new file mode 100644
index 0000000..7dd7bf6
--- /dev/null
+++ b/cman/qdisk/iostate.h
@@ -0,0 +1,17 @@
+#ifndef _IOSTATE_H
+#define _IOSTATE_H
+
+typedef enum {
+	STATE_NONE	= 0,
+	STATE_READ	= 1,
+	STATE_WRITE	= 2,
+	STATE_LSEEK	= 3,
+	STATE_UNKNOWN	= 4
+} iostate_t;
+
+void io_state(iostate_t state);
+
+int io_nanny_start(int timeout);
+int io_nanny_stop(void);
+
+#endif
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index 7b0a6ce..538bc88 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -42,6 +42,7 @@
 #include <ccs.h>
 #include "score.h"
 #include "clulog.h"
+#include "iostate.h"
 #if (!defined(LIBCMAN_VERSION) || \
      (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION < 2))
 #include <cluster/cnxman-socket.h>
@@ -1574,10 +1575,13 @@ main(int argc, char **argv)
 		return -1;
 	}
 	*/
+	io_nanny_start(ctx.qc_tko * ctx.qc_interval);
 
 	if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0)
 		cman_unregister_quorum_device(ctx.qc_ch);
 
+	io_nanny_stop();
+
 	quorum_logout(&ctx);
 	qd_destroy(&ctx);
 


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2009-05-13 15:41 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-05-13 15:41 cluster: RHEL4 - qdisk: Add reporting for I/O hangs to quorum disk Lon Hohberger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).