public inbox for cluster-cvs@sourceware.org
help / color / mirror / Atom feed
* cluster: STABLE2 - Speed up gfs_grow
@ 2009-04-07 22:21 Bob Peterson
  0 siblings, 0 replies; only message in thread
From: Bob Peterson @ 2009-04-07 22:21 UTC (permalink / raw)
  To: cluster-cvs-relay

Gitweb:        http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=1bd1fc12720a42fd7f5116573cbd48597a2acdfc
Commit:        1bd1fc12720a42fd7f5116573cbd48597a2acdfc
Parent:        b518f58fd3a512bf01566dbf595ecbbda119963b
Author:        Bob Peterson <rpeterso@redhat.com>
AuthorDate:    Tue Apr 7 17:19:30 2009 -0500
Committer:     Bob Peterson <rpeterso@redhat.com>
CommitterDate: Tue Apr 7 17:19:30 2009 -0500

Speed up gfs_grow

bz 485451 -  gfs_grow very slow with 1k file system block size

This patch speeds up gfs_grow by doing two main things:
First, it no longer opens, seeks and closes the device for every
RG block write; the device is opened once and the blocks of each
resource group are written sequentially after a single seek.
Second, it combines writes to the rindex file so that a
whole page may be allocated and written at a time.  That
avoids some very slow glock issues, especially when the block size
is less than the page size.
---
 gfs/gfs_grow/main.c |  132 +++++++++++++++++++++++++++++++++++----------------
 1 files changed, 91 insertions(+), 41 deletions(-)

diff --git a/gfs/gfs_grow/main.c b/gfs/gfs_grow/main.c
index eb30e21..661d364 100644
--- a/gfs/gfs_grow/main.c
+++ b/gfs/gfs_grow/main.c
@@ -109,7 +109,7 @@ device_geometry(char *device)
  * Returns: Error code, or amount of data read
  */
 
-int
+static int
 jread(int fd, char *file, void *buf, uint64_t size, uint64_t *offset)
 {
 	struct gfs_ioctl gi;
@@ -140,7 +140,7 @@ jread(int fd, char *file, void *buf, uint64_t size, uint64_t *offset)
  * Returns: Error code, or the amount of data written
  */
 
-int
+static int
 jwrite(int fd, char *file, void *buf, uint64_t size, uint64_t *offset)
 {
 	struct gfs_ioctl gi;
@@ -314,39 +314,25 @@ read_rgrps(int fs_fd)
  */
 
 static void
-write_a_block(uint64_t where, struct gfs_rgrp *rg)
+write_a_block(int fd, struct gfs_rgrp *rg)
 {
 	char buffer[4096];
-	uint64_t fsoffset = where * (uint64_t) fs_sb.sb_bsize;
-	int fd = open(device, O_RDWR);
-	struct gfs_meta_header mh;
-	mh.mh_magic = GFS_MAGIC;
-	mh.mh_type = GFS_METATYPE_RB;
-	mh.mh_format = GFS_FORMAT_RB;
 
-	if (fd < 0) {
-		perror(device);
-		exit(EXIT_FAILURE);
-	}
-	if (where < fssize) {
-		fprintf(stderr,
-			"Sanity check failed: Caught trying to write to live filesystem!\n");
-		exit(EXIT_FAILURE);
-	}
-	memset(buffer, 0, 4096);
+	memset(buffer, 0, fs_sb.sb_bsize);
 	if (rg)
 		gfs_rgrp_out(rg, buffer);
-	else
+	else {
+		struct gfs_meta_header mh;
+
+		mh.mh_magic = GFS_MAGIC;
+		mh.mh_type = GFS_METATYPE_RB;
+		mh.mh_format = GFS_FORMAT_RB;
 		gfs_meta_header_out(&mh, buffer);
-	if (lseek(fd, fsoffset, SEEK_SET) != fsoffset) {
-		perror(device);
-		exit(EXIT_FAILURE);
 	}
 	if (write(fd, buffer, fs_sb.sb_bsize) != fs_sb.sb_bsize) {
 		perror("write_zero_block");
 		exit(EXIT_FAILURE);
 	}
-	close(fd);
 }
 
 /**
@@ -359,16 +345,24 @@ write_a_block(uint64_t where, struct gfs_rgrp *rg)
  */
 
 static void
-write_whole_rgrp(struct rglist_entry *rgl)
+write_whole_rgrp(int fd, struct rglist_entry *rgl)
 {
 	uint32_t l;
 	uint32_t nzb = rgl->ri.ri_length;
-	uint64_t addr = rgl->ri.ri_addr;
+	uint64_t fsoffset = rgl->ri.ri_addr * (uint64_t) fs_sb.sb_bsize;
 
-	write_a_block(addr++, &rgl->rg);
+	if (fsoffset < fssize) {
+		fprintf(stderr,
+			"Sanity check failed: Caught trying to write to live filesystem!\n");
+		exit(EXIT_FAILURE);
+	}
+	if (lseek(fd, fsoffset, SEEK_SET) != fsoffset) {
+		perror(device);
+		exit(EXIT_FAILURE);
+	}
+	write_a_block(fd, &rgl->rg);
 	for (l = 1; l < nzb; l++)
-		write_a_block(addr++, NULL);
-	sync();
+		write_a_block(fd, NULL);
 }
 
 /**
@@ -415,9 +409,18 @@ write_rindex(int fs_fd)
 {
 	osi_list_t *tmp, *head;
 	struct rglist_entry *rgl;
-	char buffer[sizeof(struct gfs_rindex)];
-	uint64_t offset;
-
+	char *buffer;
+	int b, data_per_blk, data_per_page, firstblock;
+	int chunks_this_page, page_freebytes, data_this_page;
+	long page_size;
+	uint64_t offset, size;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	buffer = malloc(page_size * 2);
+	if (!buffer) {
+		fprintf(stderr, "Error: out of memory.\n");
+		exit(EXIT_FAILURE);
+	}
 	offset = get_length(fs_fd, "rindex");
 
 	/*
@@ -425,23 +428,63 @@ write_rindex(int fs_fd)
 	 * If things mess up here, it could be very difficult to put right
 	 */
 	tmp = head = &rglist_new;
+	firstblock = 1;
+	data_per_blk = fs_sb.sb_bsize - sizeof(struct gfs_meta_header);
+	data_per_page = (page_size / fs_sb.sb_bsize) * data_per_blk;
 	for (;;) {
-		tmp = tmp->next;
-		if (tmp == head)
-			break;
-		rgl = osi_list_entry(tmp, struct rglist_entry, list);
-		gfs_rindex_out(&rgl->ri, buffer);
-		if (jwrite(fs_fd, "rindex", buffer,
-			   sizeof(struct gfs_rindex), &offset) !=
-		    sizeof(struct gfs_rindex)) {
+		size = 0;
+		/* We used to write new rindex entries out one by one.
+		   However, that can be very slow, especially if there's a
+		   load on the file system.  That's because each write is
+		   done with the glock sync option and the journal will need
+		   to be synced before the glock is freed up for the next
+		   write.  (The rindex file is journaled data). If the block
+		   size matches the page size, the syncs happen much faster.
+		   If the block size is smaller, it takes a huge amount of
+		   time to get all the blocks to settle within the page.
+
+		   What I'm trying to do here is optimize our writes so
+		   that multiple blocks may be allocated during a single
+		   write request, all under the same glock, in order to
+		   get a page-full of data written.  This makes the syncing
+		   go much faster.  The exception is the very first write
+		   (see note below). */
+		data_this_page = offset % data_per_page;
+		page_freebytes = page_size - data_this_page;
+		if (page_freebytes < sizeof(struct gfs_rindex))
+			page_freebytes = data_per_page;
+		chunks_this_page = (page_freebytes /
+				    sizeof(struct gfs_rindex)) + 1;
+		for (b = 0; b < chunks_this_page; b++) {
+			tmp = tmp->next;
+			if (tmp == head)
+				break;
+			rgl = osi_list_entry(tmp, struct rglist_entry, list);
+			gfs_rindex_out(&rgl->ri, buffer + size);
+			size += sizeof(struct gfs_rindex);
+			/* Write the first block on its own.  This minimizes
+			   the chance of running out of blocks on a nearly
+			   full file system.  Once that first block is written,
+			   the file system should have more free blocks to
+			   allocate in chunks. */
+			if (firstblock) {
+				firstblock = 0;
+				break;
+			}
+		}
+		if (size &&
+		    jwrite(fs_fd, "rindex", buffer, size, &offset) != size) {
 			perror("write: rindex");
 			fprintf(stderr, "Aborting...\n");
 			exit(EXIT_FAILURE);
 		}
+		if (tmp == head)
+			break;
 	}
 	/*
 	 * This is the end of the critical section
 	 */
+	free(buffer);
 }
 
 /**
@@ -459,15 +502,22 @@ write_rgrps(int fs_fd)
 {
 	osi_list_t *tmp, *head;
 	struct rglist_entry *rgl;
+	int fd = open(device, O_RDWR);
 
+	if (fd < 0) {
+		perror(device);
+		exit(EXIT_FAILURE);
+	}
 	tmp = head = &rglist_new;
 	for (;;) {
 		tmp = tmp->next;
 		if (tmp == head)
 			break;
 		rgl = osi_list_entry(tmp, struct rglist_entry, list);
-		write_whole_rgrp(rgl);
+		write_whole_rgrp(fd, rgl);
 	}
+	sync();
+	close(fd);
 
 	sync();
 	sync();


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2009-04-07 22:21 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-07 22:21 cluster: STABLE2 - Speed up gfs_grow Bob Peterson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).