Subject: gfs1-utils: master - Speed up gfs_grow
To: cluster-cvs-relay@redhat.com
X-Project: Cluster Project
X-Git-Module: gfs1-utils.git
X-Git-Refname: refs/heads/master
X-Git-Reftype: branch
X-Git-Oldrev: e45ae8a00674c0d37e21a0094e77562e223aa7f6
X-Git-Newrev: 2c1e64ea123fe74d7d97ac145acbd357eb86b156
From: Bob Peterson
Message-Id: <20090407221441.5728B1201B6@lists.fedorahosted.org>
Date: Tue, 07 Apr 2009 22:15:00 -0000

Gitweb:        http://git.fedorahosted.org/git/gfs1-utils.git?p=gfs1-utils.git;a=commitdiff;h=2c1e64ea123fe74d7d97ac145acbd357eb86b156
Commit:        2c1e64ea123fe74d7d97ac145acbd357eb86b156
Parent:        e45ae8a00674c0d37e21a0094e77562e223aa7f6
Author:        Bob Peterson
AuthorDate:    Tue Apr 7 17:12:42 2009 -0500
Committer:     Bob Peterson
CommitterDate: Tue Apr 7 17:12:42 2009 -0500

Speed up gfs_grow

bz 485451 - gfs_grow very slow with 1k file system block size

This patch speeds up gfs_grow by doing two main things.  First, it no
longer closes, reopens, and seeks the device for every RG block write.
Second, it combines writes to the rindex file so that a whole page may
be allocated and written at a time.  That avoids some very slow glock
sync issues, especially when the block size is smaller than the page
size.
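To see the shape of the second change in isolation, here is a minimal
stand-alone sketch of the page-chunking arithmetic that write_rindex()
gains in the patch below.  It is illustrative only and not part of the
patch: the page, block, metadata-header, and rindex-entry sizes are
made-up example values, not the real GFS on-disk sizes.

/*
 * Illustrative sketch only -- not part of the patch.  It mirrors the
 * chunking arithmetic from write_rindex() with made-up example sizes
 * so the batching behaviour can be seen in isolation.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE   4096  /* stand-in for sysconf(_SC_PAGESIZE)            */
#define BLOCK_SIZE  1024  /* example 1k file system block size (bz 485451) */
#define MH_SIZE       24  /* hypothetical sizeof(struct gfs_meta_header)   */
#define RINDEX_SIZE   96  /* hypothetical sizeof(struct gfs_rindex)        */

int main(void)
{
	/* Each journaled-data block loses MH_SIZE bytes to its header,
	   so a page's worth of blocks holds this much rindex data. */
	int data_per_blk = BLOCK_SIZE - MH_SIZE;
	int data_per_page = (PAGE_SIZE / BLOCK_SIZE) * data_per_blk;
	uint64_t offset = 0;    /* current end of the rindex file       */
	int remaining = 25;     /* pretend we have 25 new RGs to add    */
	int firstwrite = 1;

	while (remaining > 0) {
		/* How much of the current page of data is already used,
		   and how many entries can still be packed into it. */
		int data_this_page = offset % data_per_page;
		int page_freebytes = PAGE_SIZE - data_this_page;
		int chunk;

		if (page_freebytes < RINDEX_SIZE)
			page_freebytes = data_per_page;
		chunk = page_freebytes / RINDEX_SIZE + 1;

		/* The patch writes the very first entry on its own so a
		   nearly full file system can free blocks before the big
		   batched allocations start. */
		if (firstwrite) {
			chunk = 1;
			firstwrite = 0;
		}
		if (chunk > remaining)
			chunk = remaining;

		printf("write %2d entries (%4d bytes) at offset %llu\n",
		       chunk, chunk * RINDEX_SIZE,
		       (unsigned long long)offset);

		offset += (uint64_t)chunk * RINDEX_SIZE;
		remaining -= chunk;
	}
	return 0;
}

With these example numbers the sketch writes one entry on its own and
then batches of many entries per write, which is the effect the patch
is after: one glock/journal sync per page of data instead of one per
rindex entry.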
---
 gfs/gfs_grow/main.c |  132 +++++++++++++++++++++++++++++++++++----------------
 1 files changed, 91 insertions(+), 41 deletions(-)

diff --git a/gfs/gfs_grow/main.c b/gfs/gfs_grow/main.c
index eb30e21..661d364 100644
--- a/gfs/gfs_grow/main.c
+++ b/gfs/gfs_grow/main.c
@@ -109,7 +109,7 @@ device_geometry(char *device)
  * Returns: Error code, or amount of data read
  */
 
-int
+static int
 jread(int fd, char *file, void *buf, uint64_t size, uint64_t *offset)
 {
 	struct gfs_ioctl gi;
@@ -140,7 +140,7 @@ jread(int fd, char *file, void *buf, uint64_t size, uint64_t *offset)
  * Returns: Error code, or the amount of data written
  */
 
-int
+static int
 jwrite(int fd, char *file, void *buf, uint64_t size, uint64_t *offset)
 {
 	struct gfs_ioctl gi;
@@ -314,39 +314,25 @@ read_rgrps(int fs_fd)
  */
 
 static void
-write_a_block(uint64_t where, struct gfs_rgrp *rg)
+write_a_block(int fd, struct gfs_rgrp *rg)
 {
 	char buffer[4096];
-	uint64_t fsoffset = where * (uint64_t) fs_sb.sb_bsize;
-	int fd = open(device, O_RDWR);
-	struct gfs_meta_header mh;
 
-	mh.mh_magic = GFS_MAGIC;
-	mh.mh_type = GFS_METATYPE_RB;
-	mh.mh_format = GFS_FORMAT_RB;
-	if (fd < 0) {
-		perror(device);
-		exit(EXIT_FAILURE);
-	}
-	if (where < fssize) {
-		fprintf(stderr,
-			"Sanity check failed: Caught trying to write to live filesystem!\n");
-		exit(EXIT_FAILURE);
-	}
-	memset(buffer, 0, 4096);
+	memset(buffer, 0, fs_sb.sb_bsize);
 	if (rg)
 		gfs_rgrp_out(rg, buffer);
-	else
+	else {
+		struct gfs_meta_header mh;
+
+		mh.mh_magic = GFS_MAGIC;
+		mh.mh_type = GFS_METATYPE_RB;
+		mh.mh_format = GFS_FORMAT_RB;
 		gfs_meta_header_out(&mh, buffer);
-	if (lseek(fd, fsoffset, SEEK_SET) != fsoffset) {
-		perror(device);
-		exit(EXIT_FAILURE);
 	}
 	if (write(fd, buffer, fs_sb.sb_bsize) != fs_sb.sb_bsize) {
 		perror("write_zero_block");
 		exit(EXIT_FAILURE);
 	}
-	close(fd);
 }
 
 /**
@@ -359,16 +345,24 @@ write_a_block(uint64_t where, struct gfs_rgrp *rg)
  */
 
 static void
-write_whole_rgrp(struct rglist_entry *rgl)
+write_whole_rgrp(int fd, struct rglist_entry *rgl)
 {
 	uint32_t l;
 	uint32_t nzb = rgl->ri.ri_length;
-	uint64_t addr = rgl->ri.ri_addr;
+	uint64_t fsoffset = rgl->ri.ri_addr * (uint64_t) fs_sb.sb_bsize;
 
-	write_a_block(addr++, &rgl->rg);
+	if (fsoffset < fssize) {
+		fprintf(stderr,
+			"Sanity check failed: Caught trying to write to live filesystem!\n");
+		exit(EXIT_FAILURE);
+	}
+	if (lseek(fd, fsoffset, SEEK_SET) != fsoffset) {
+		perror(device);
+		exit(EXIT_FAILURE);
+	}
+	write_a_block(fd, &rgl->rg);
 	for (l = 1; l < nzb; l++)
-		write_a_block(addr++, NULL);
-	sync();
+		write_a_block(fd, NULL);
 }
 
 /**
@@ -415,9 +409,18 @@ write_rindex(int fs_fd)
 {
 	osi_list_t *tmp, *head;
 	struct rglist_entry *rgl;
-	char buffer[sizeof(struct gfs_rindex)];
-	uint64_t offset;
-
+	char *buffer;
+	int b, data_per_blk, data_per_page, firstblock;
+	int chunks_this_page, page_freebytes, data_this_page;
+	long page_size;
+	uint64_t offset, size;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	buffer = malloc(page_size * 2);
+	if (!buffer) {
+		fprintf(stderr, "Error: out of memory.\n");
+		exit(EXIT_FAILURE);
+	}
 	offset = get_length(fs_fd, "rindex");
 
 	/*
@@ -425,23 +428,63 @@ write_rindex(int fs_fd)
 	 * If things mess up here, it could be very difficult to put right
 	 */
 	tmp = head = &rglist_new;
+	firstblock = 1;
+	data_per_blk = fs_sb.sb_bsize - sizeof(struct gfs_meta_header);
+	data_per_page = (page_size / fs_sb.sb_bsize) * data_per_blk;
 	for (;;) {
-		tmp = tmp->next;
-		if (tmp == head)
-			break;
-		rgl = osi_list_entry(tmp, struct rglist_entry, list);
-		gfs_rindex_out(&rgl->ri, buffer);
-		if (jwrite(fs_fd, "rindex", buffer,
-			   sizeof(struct gfs_rindex), &offset) !=
-		    sizeof(struct gfs_rindex)) {
+		size = 0;
+		/* We used to write new rindex entries out one by one.
+		   However, that can be very slow, especially if there's a
+		   load on the file system.  That's because each write is
+		   done with the glock sync option and the journal will need
+		   to be synced before the glock is freed up for the next
+		   write. (The rindex file is journaled data). If the block
+		   size matches the page size, the syncs happen much faster.
+		   If the block size is smaller, it takes a huge amount of
+		   time to get all the blocks to settle within the page.
+
+		   What I'm trying to do here is optimize our writes so
+		   that multiple blocks may be allocated during a single
+		   write request, all under the same glock, in order to
+		   get a page-full of data written.  This makes the syncing
+		   go much faster.  The exception is the very first write
+		   (see note below). */
+		data_this_page = offset % data_per_page;
+		page_freebytes = page_size - data_this_page;
+		if (page_freebytes < sizeof(struct gfs_rindex))
+			page_freebytes = data_per_page;
+		chunks_this_page = (page_freebytes /
+				    sizeof(struct gfs_rindex)) + 1;
+		for (b = 0; b < chunks_this_page; b++) {
+			tmp = tmp->next;
+			if (tmp == head)
+				break;
+			rgl = osi_list_entry(tmp, struct rglist_entry, list);
+			gfs_rindex_out(&rgl->ri, buffer + size);
+			size += sizeof(struct gfs_rindex);
+			/* Write the first block on its own.  This minimizes
+			   the chance of running out of blocks on a nearly
+			   full file system.  Once that first block is written,
+			   the file system should have more free blocks to
+			   allocate in chunks. */
+			if (firstblock) {
+				firstblock = 0;
+				break;
+			}
+		}
+		if (size &&
+		    jwrite(fs_fd, "rindex", buffer, size, &offset) != size) {
			perror("write: rindex");
			fprintf(stderr, "Aborting...\n");
			exit(EXIT_FAILURE);
		}
+		if (tmp == head)
+			break;
 	}
 	/*
 	 * This is the end of the critical section
 	 */
+	free(buffer);
 }
 
 /**
@@ -459,15 +502,22 @@ write_rgrps(int fs_fd)
 {
 	osi_list_t *tmp, *head;
 	struct rglist_entry *rgl;
+	int fd = open(device, O_RDWR);
 
+	if (fd < 0) {
+		perror(device);
+		exit(EXIT_FAILURE);
+	}
 	tmp = head = &rglist_new;
 	for (;;) {
 		tmp = tmp->next;
 		if (tmp == head)
 			break;
 		rgl = osi_list_entry(tmp, struct rglist_entry, list);
-		write_whole_rgrp(rgl);
+		write_whole_rgrp(fd, rgl);
 	}
+	sync();
+	close(fd);
 	sync();
 	sync();