From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 24876 invoked by alias); 8 Apr 2009 14:48:28 -0000 Received: (qmail 24858 invoked by alias); 8 Apr 2009 14:48:26 -0000 X-SWARE-Spam-Status: No, hits=-0.1 required=5.0 tests=AWL,BAYES_00,J_CHICKENPOX_46,J_CHICKENPOX_53,J_CHICKENPOX_63,J_CHICKENPOX_66,SPF_HELO_PASS X-Spam-Status: No, hits=-0.1 required=5.0 tests=AWL,BAYES_00,J_CHICKENPOX_46,J_CHICKENPOX_53,J_CHICKENPOX_63,J_CHICKENPOX_66,SPF_HELO_PASS X-Spam-Check-By: sourceware.org X-Spam-Checker-Version: SpamAssassin 3.2.5 (2008-06-10) on bastion2.fedora.phx.redhat.com Subject: cluster: RHEL4 - Speed up gfs_grow To: cluster-cvs-relay@redhat.com X-Project: Cluster Project X-Git-Module: cluster.git X-Git-Refname: refs/heads/RHEL4 X-Git-Reftype: branch X-Git-Oldrev: c70c5ab16cdf2243421e5e9a2510bc80913dc321 X-Git-Newrev: a0fde41c5ff521846aeceea4d2ef1d37b0e56615 From: Bob Peterson Message-Id: <20090408144800.BCB94120339@lists.fedorahosted.org> Date: Wed, 08 Apr 2009 14:48:00 -0000 X-Scanned-By: MIMEDefang 2.58 on 172.16.52.254 Mailing-List: contact cluster-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: cluster-cvs-owner@sourceware.org X-SW-Source: 2009-q2/txt/msg00044.txt.bz2 Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=a0fde41c5ff521846aeceea4d2ef1d37b0e56615 Commit: a0fde41c5ff521846aeceea4d2ef1d37b0e56615 Parent: c70c5ab16cdf2243421e5e9a2510bc80913dc321 Author: Bob Peterson AuthorDate: Wed Apr 8 08:46:35 2009 -0500 Committer: Bob Peterson CommitterDate: Wed Apr 8 08:46:35 2009 -0500 Speed up gfs_grow bz 485451 - gfs_grow very slow with 1k file system block size This patch speeds up gfs_grow by doing two main things: First, it doesn't close/open/seek for every RG block write. Second, it combines writes to the rindex file so that a whole page may be allocated and written at a time. That avoids some very slow glock issues, especially when block size is less than page size. --- gfs/gfs_grow/Makefile | 2 +- gfs/gfs_grow/main.c | 132 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 92 insertions(+), 42 deletions(-) diff --git a/gfs/gfs_grow/Makefile b/gfs/gfs_grow/Makefile index 008366a..320a42e 100644 --- a/gfs/gfs_grow/Makefile +++ b/gfs/gfs_grow/Makefile @@ -39,7 +39,7 @@ LOADLIBS+= -liddev all: ${TARGET} -gfs_grow: +gfs_grow: main.c ${CC} ${CFLAGS} ${INCLUDE} main.c ondisk.c ${LDFLAGS} ${LOADLIBS} -o $@ copytobin: all diff --git a/gfs/gfs_grow/main.c b/gfs/gfs_grow/main.c index 4a2494a..dae9dae 100644 --- a/gfs/gfs_grow/main.c +++ b/gfs/gfs_grow/main.c @@ -123,7 +123,7 @@ device_geometry(char *device) * Returns: Error code, or amount of data read */ -int +static int jread(int fd, char *file, void *buf, uint64_t size, uint64_t *offset) { struct gfs_ioctl gi; @@ -154,7 +154,7 @@ jread(int fd, char *file, void *buf, uint64_t size, uint64_t *offset) * Returns: Error code, or the amount of data written */ -int +static int jwrite(int fd, char *file, void *buf, uint64_t size, uint64_t *offset) { struct gfs_ioctl gi; @@ -328,39 +328,25 @@ read_rgrps(int fs_fd) */ static void -write_a_block(uint64_t where, struct gfs_rgrp *rg) +write_a_block(int fd, struct gfs_rgrp *rg) { char buffer[4096]; - uint64_t fsoffset = where * (uint64_t) fs_sb.sb_bsize; - int fd = open(device, O_RDWR); - struct gfs_meta_header mh; - mh.mh_magic = GFS_MAGIC; - mh.mh_type = GFS_METATYPE_RB; - mh.mh_format = GFS_FORMAT_RB; - if (fd < 0) { - perror(device); - exit(EXIT_FAILURE); - } - if (where < fssize) { - fprintf(stderr, - "Sanity check failed: Caught trying to write to live filesystem!\n"); - exit(EXIT_FAILURE); - } - memset(buffer, 0, 4096); + memset(buffer, 0, fs_sb.sb_bsize); if (rg) gfs_rgrp_out(rg, buffer); - else + else { + struct gfs_meta_header mh; + + mh.mh_magic = GFS_MAGIC; + mh.mh_type = GFS_METATYPE_RB; + mh.mh_format = GFS_FORMAT_RB; gfs_meta_header_out(&mh, buffer); - if (lseek(fd, fsoffset, SEEK_SET) != fsoffset) { - perror(device); - exit(EXIT_FAILURE); } if (write(fd, buffer, fs_sb.sb_bsize) != fs_sb.sb_bsize) { perror("write_zero_block"); exit(EXIT_FAILURE); } - close(fd); } /** @@ -373,16 +359,24 @@ write_a_block(uint64_t where, struct gfs_rgrp *rg) */ static void -write_whole_rgrp(struct rglist_entry *rgl) +write_whole_rgrp(int fd, struct rglist_entry *rgl) { uint32_t l; uint32_t nzb = rgl->ri.ri_length; - uint64_t addr = rgl->ri.ri_addr; + uint64_t fsoffset = rgl->ri.ri_addr * (uint64_t) fs_sb.sb_bsize; - write_a_block(addr++, &rgl->rg); + if (fsoffset < fssize) { + fprintf(stderr, + "Sanity check failed: Caught trying to write to live filesystem!\n"); + exit(EXIT_FAILURE); + } + if (lseek(fd, fsoffset, SEEK_SET) != fsoffset) { + perror(device); + exit(EXIT_FAILURE); + } + write_a_block(fd, &rgl->rg); for (l = 1; l < nzb; l++) - write_a_block(addr++, NULL); - sync(); + write_a_block(fd, NULL); } /** @@ -429,9 +423,18 @@ write_rindex(int fs_fd) { osi_list_t *tmp, *head; struct rglist_entry *rgl; - char buffer[sizeof(struct gfs_rindex)]; - uint64_t offset; - + char *buffer; + int b, data_per_blk, data_per_page, firstblock; + int chunks_this_page, page_freebytes, data_this_page; + long page_size; + uint64_t offset, size; + + page_size = sysconf(_SC_PAGESIZE); + buffer = malloc(page_size * 2); + if (!buffer) { + fprintf(stderr, "Error: out of memory.\n"); + exit(EXIT_FAILURE); + } offset = get_length(fs_fd, "rindex"); /* @@ -439,23 +442,63 @@ write_rindex(int fs_fd) * If things mess up here, it could be very difficult to put right */ tmp = head = &rglist_new; + firstblock = 1; + data_per_blk = fs_sb.sb_bsize - sizeof(struct gfs_meta_header); + data_per_page = (page_size / fs_sb.sb_bsize) * data_per_blk; for (;;) { - tmp = tmp->next; - if (tmp == head) - break; - rgl = osi_list_entry(tmp, struct rglist_entry, list); - gfs_rindex_out(&rgl->ri, buffer); - if (jwrite(fs_fd, "rindex", buffer, - sizeof(struct gfs_rindex), &offset) != - sizeof(struct gfs_rindex)) { + size = 0; + /* We used to write new rindex entries out one by one. + However, that can be very slow, especially if there's a + load on the file system. That's because each write is + done with the glock sync option and the journal will need + to be synced before the glock is freed up for the next + write. (The rindex file is journaled data). If the block + size matches the page size, the syncs happen much faster. + If the block size is smaller, it takes a huge amount of + time to get all the blocks to settle within the page. + + What I'm trying to do here is optimize our writes so + that multiple blocks may be allocated during a single + write request, all under the same glock, in order to + get a page-full of data written. This makes the syncing + go much faster. The exception is the very first write + (see note below). */ + data_this_page = offset % data_per_page; + page_freebytes = page_size - data_this_page; + if (page_freebytes < sizeof(struct gfs_rindex)) + page_freebytes = data_per_page; + chunks_this_page = (page_freebytes / + sizeof(struct gfs_rindex)) + 1; + for (b = 0; b < chunks_this_page; b++) { + tmp = tmp->next; + if (tmp == head) + break; + rgl = osi_list_entry(tmp, struct rglist_entry, list); + gfs_rindex_out(&rgl->ri, buffer + size); + size += sizeof(struct gfs_rindex); + /* Write the first block on its own. This minimizes + the chance of running out of blocks on a nearly + full file system. Once that first block is written, + the file system should have more free blocks to + allocate in chunks. */ + if (firstblock) { + firstblock = 0; + break; + } + } + if (size && + jwrite(fs_fd, "rindex", buffer, size, &offset) != size) { perror("write: rindex"); fprintf(stderr, "Aborting...\n"); exit(EXIT_FAILURE); } + if (tmp == head) + break; } /* * This is the end of the critical section */ + free(buffer); } /** @@ -473,15 +516,22 @@ write_rgrps(int fs_fd) { osi_list_t *tmp, *head; struct rglist_entry *rgl; + int fd = open(device, O_RDWR); + if (fd < 0) { + perror(device); + exit(EXIT_FAILURE); + } tmp = head = &rglist_new; for (;;) { tmp = tmp->next; if (tmp == head) break; rgl = osi_list_entry(tmp, struct rglist_entry, list); - write_whole_rgrp(rgl); + write_whole_rgrp(fd, rgl); } + sync(); + close(fd); sync(); sync();