cluster: STABLE2 - Speed up gfs_grow
From: Bob Peterson @ 2009-04-07 22:21 UTC
To: cluster-cvs-relay
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=1bd1fc12720a42fd7f5116573cbd48597a2acdfc
Commit: 1bd1fc12720a42fd7f5116573cbd48597a2acdfc
Parent: b518f58fd3a512bf01566dbf595ecbbda119963b
Author: Bob Peterson <rpeterso@redhat.com>
AuthorDate: Tue Apr 7 17:19:30 2009 -0500
Committer: Bob Peterson <rpeterso@redhat.com>
CommitterDate: Tue Apr 7 17:19:30 2009 -0500
Speed up gfs_grow
bz 485451 - gfs_grow very slow with 1k file system block size
This patch speeds up gfs_grow in two main ways:
First, it no longer does a close/open/seek cycle for every
resource group (RG) block write. Second, it combines writes
to the rindex file so that a whole page may be allocated and
written at a time. That avoids very slow glock
synchronization, especially when the block size is smaller
than the page size; a sketch of the batching idea follows.
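(Illustration, not part of the patch.) A minimal standalone sketch of
the rindex batching idea, using a hypothetical write_batched() helper
and a RECORD_SIZE stand-in for sizeof(struct gfs_rindex): records are
packed into page-sized chunks so that each write() (and therefore each
glock-driven journal sync) covers a whole page instead of one record.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define RECORD_SIZE 96	/* stand-in for sizeof(struct gfs_rindex) */

/* Pack fixed-size records into page-sized writes so each syscall
   (and each journal sync forced by the glock) covers a full page. */
static void write_batched(int fd, const char *records, size_t count)
{
	long page_size = sysconf(_SC_PAGESIZE);
	size_t per_page = (size_t)page_size / RECORD_SIZE;
	size_t done = 0;

	while (done < count) {
		size_t n = count - done;

		if (n > per_page)
			n = per_page;
		if (write(fd, records + done * RECORD_SIZE,
			  n * RECORD_SIZE) != (ssize_t)(n * RECORD_SIZE)) {
			perror("write");
			exit(EXIT_FAILURE);
		}
		done += n;
	}
}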
---
gfs/gfs_grow/main.c | 132 +++++++++++++++++++++++++++++++++++----------------
1 files changed, 91 insertions(+), 41 deletions(-)
diff --git a/gfs/gfs_grow/main.c b/gfs/gfs_grow/main.c
index eb30e21..661d364 100644
--- a/gfs/gfs_grow/main.c
+++ b/gfs/gfs_grow/main.c
@@ -109,7 +109,7 @@ device_geometry(char *device)
* Returns: Error code, or amount of data read
*/
-int
+static int
jread(int fd, char *file, void *buf, uint64_t size, uint64_t *offset)
{
struct gfs_ioctl gi;
@@ -140,7 +140,7 @@ jread(int fd, char *file, void *buf, uint64_t size, uint64_t *offset)
* Returns: Error code, or the amount of data written
*/
-int
+static int
jwrite(int fd, char *file, void *buf, uint64_t size, uint64_t *offset)
{
struct gfs_ioctl gi;
@@ -314,39 +314,25 @@ read_rgrps(int fs_fd)
*/
static void
-write_a_block(uint64_t where, struct gfs_rgrp *rg)
+write_a_block(int fd, struct gfs_rgrp *rg)
{
char buffer[4096];
- uint64_t fsoffset = where * (uint64_t) fs_sb.sb_bsize;
- int fd = open(device, O_RDWR);
- struct gfs_meta_header mh;
- mh.mh_magic = GFS_MAGIC;
- mh.mh_type = GFS_METATYPE_RB;
- mh.mh_format = GFS_FORMAT_RB;
- if (fd < 0) {
- perror(device);
- exit(EXIT_FAILURE);
- }
- if (where < fssize) {
- fprintf(stderr,
- "Sanity check failed: Caught trying to write to live filesystem!\n");
- exit(EXIT_FAILURE);
- }
- memset(buffer, 0, 4096);
+ memset(buffer, 0, fs_sb.sb_bsize);
if (rg)
gfs_rgrp_out(rg, buffer);
- else
+ else {
+ struct gfs_meta_header mh;
+
+ mh.mh_magic = GFS_MAGIC;
+ mh.mh_type = GFS_METATYPE_RB;
+ mh.mh_format = GFS_FORMAT_RB;
gfs_meta_header_out(&mh, buffer);
- if (lseek(fd, fsoffset, SEEK_SET) != fsoffset) {
- perror(device);
- exit(EXIT_FAILURE);
}
if (write(fd, buffer, fs_sb.sb_bsize) != fs_sb.sb_bsize) {
perror("write_zero_block");
exit(EXIT_FAILURE);
}
- close(fd);
}
/**
@@ -359,16 +345,24 @@ write_a_block(uint64_t where, struct gfs_rgrp *rg)
*/
static void
-write_whole_rgrp(struct rglist_entry *rgl)
+write_whole_rgrp(int fd, struct rglist_entry *rgl)
{
uint32_t l;
uint32_t nzb = rgl->ri.ri_length;
- uint64_t addr = rgl->ri.ri_addr;
+ uint64_t fsoffset = rgl->ri.ri_addr * (uint64_t) fs_sb.sb_bsize;
- write_a_block(addr++, &rgl->rg);
+ if (fsoffset < fssize) {
+ fprintf(stderr,
+ "Sanity check failed: Caught trying to write to live filesystem!\n");
+ exit(EXIT_FAILURE);
+ }
+ if (lseek(fd, fsoffset, SEEK_SET) != fsoffset) {
+ perror(device);
+ exit(EXIT_FAILURE);
+ }
+ write_a_block(fd, &rgl->rg);
for (l = 1; l < nzb; l++)
- write_a_block(addr++, NULL);
- sync();
+ write_a_block(fd, NULL);
}
/**
@@ -415,9 +409,18 @@ write_rindex(int fs_fd)
{
osi_list_t *tmp, *head;
struct rglist_entry *rgl;
- char buffer[sizeof(struct gfs_rindex)];
- uint64_t offset;
-
+ char *buffer;
+ int b, data_per_blk, data_per_page, firstblock;
+ int chunks_this_page, page_freebytes, data_this_page;
+ long page_size;
+ uint64_t offset, size;
+
+ page_size = sysconf(_SC_PAGESIZE);
+ buffer = malloc(page_size * 2);
+ if (!buffer) {
+ fprintf(stderr, "Error: out of memory.\n");
+ exit(EXIT_FAILURE);
+ }
offset = get_length(fs_fd, "rindex");
/*
@@ -425,23 +428,63 @@ write_rindex(int fs_fd)
* If things mess up here, it could be very difficult to put right
*/
tmp = head = &rglist_new;
+ firstblock = 1;
+ data_per_blk = fs_sb.sb_bsize - sizeof(struct gfs_meta_header);
+ data_per_page = (page_size / fs_sb.sb_bsize) * data_per_blk;
for (;;) {
- tmp = tmp->next;
- if (tmp == head)
- break;
- rgl = osi_list_entry(tmp, struct rglist_entry, list);
- gfs_rindex_out(&rgl->ri, buffer);
- if (jwrite(fs_fd, "rindex", buffer,
- sizeof(struct gfs_rindex), &offset) !=
- sizeof(struct gfs_rindex)) {
+ size = 0;
+ /* We used to write new rindex entries out one by one.
+ However, that can be very slow, especially if there's a
+ load on the file system. That's because each write is
+ done with the glock sync option and the journal will need
+ to be synced before the glock is freed up for the next
+ write. (The rindex file is journaled data). If the block
+ size matches the page size, the syncs happen much faster.
+ If the block size is smaller, it takes a huge amount of
+ time to get all the blocks to settle within the page.
+
+ What I'm trying to do here is optimize our writes so
+ that multiple blocks may be allocated during a single
+ write request, all under the same glock, in order to
+ get a page-full of data written. This makes the syncing
+ go much faster. The exception is the very first write
+ (see note below). */
+ data_this_page = offset % data_per_page;
+ page_freebytes = page_size - data_this_page;
+ if (page_freebytes < sizeof(struct gfs_rindex))
+ page_freebytes = data_per_page;
+ chunks_this_page = (page_freebytes /
+ sizeof(struct gfs_rindex)) + 1;
+ for (b = 0; b < chunks_this_page; b++) {
+ tmp = tmp->next;
+ if (tmp == head)
+ break;
+ rgl = osi_list_entry(tmp, struct rglist_entry, list);
+ gfs_rindex_out(&rgl->ri, buffer + size);
+ size += sizeof(struct gfs_rindex);
+ /* Write the first block on its own. This minimizes
+ the chance of running out of blocks on a nearly
+ full file system. Once that first block is written,
+ the file system should have more free blocks to
+ allocate in chunks. */
+ if (firstblock) {
+ firstblock = 0;
+ break;
+ }
+ }
+ if (size &&
+ jwrite(fs_fd, "rindex", buffer, size, &offset) != size) {
perror("write: rindex");
fprintf(stderr, "Aborting...\n");
exit(EXIT_FAILURE);
}
+ if (tmp == head)
+ break;
}
/*
* This is the end of the critical section
*/
+ free(buffer);
}
/**
@@ -459,15 +502,22 @@ write_rgrps(int fs_fd)
{
osi_list_t *tmp, *head;
struct rglist_entry *rgl;
+ int fd = open(device, O_RDWR);
+ if (fd < 0) {
+ perror(device);
+ exit(EXIT_FAILURE);
+ }
tmp = head = &rglist_new;
for (;;) {
tmp = tmp->next;
if (tmp == head)
break;
rgl = osi_list_entry(tmp, struct rglist_entry, list);
- write_whole_rgrp(rgl);
+ write_whole_rgrp(fd, rgl);
}
+ sync();
+ close(fd);
sync();
sync();
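(Illustration only; hypothetical names, not the patch code.) The other
half of the speedup is the open-once pattern above: write_rgrps() opens
the device a single time, write_whole_rgrp() seeks once per resource
group, and sequential write()s advance the file offset from there. A
minimal sketch of that pattern, assuming the block size never exceeds
4096 bytes (as the patch's own buffer[4096] does):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Zero nblocks blocks of bsize bytes starting at start_byte, with one
   open(), one lseek(), and sequential write()s (no per-block reopen). */
static void write_zero_blocks(const char *device, uint64_t start_byte,
			      uint32_t nblocks, uint32_t bsize)
{
	char buf[4096];		/* assumes bsize <= 4096 */
	int fd = open(device, O_RDWR);
	uint32_t i;

	if (fd < 0) {
		perror(device);
		exit(EXIT_FAILURE);
	}
	memset(buf, 0, sizeof(buf));
	if (lseek(fd, (off_t)start_byte, SEEK_SET) != (off_t)start_byte) {
		perror(device);
		exit(EXIT_FAILURE);
	}
	for (i = 0; i < nblocks; i++) {
		/* each write() advances the offset; no re-seek needed */
		if (write(fd, buf, bsize) != (ssize_t)bsize) {
			perror("write");
			exit(EXIT_FAILURE);
		}
	}
	if (fsync(fd) != 0)	/* flush once at the end, not per block */
		perror("fsync");
	close(fd);
}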