public inbox for cluster-cvs@sourceware.org help / color / mirror / Atom feed
From: Bob Peterson <rpeterso@fedoraproject.org> To: cluster-cvs-relay@redhat.com Subject: cluster: STABLE2 - Speed up gfs_grow Date: Tue, 07 Apr 2009 22:21:00 -0000 [thread overview] Message-ID: <20090407222026.CE1AF1201B6@lists.fedorahosted.org> (raw) Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=1bd1fc12720a42fd7f5116573cbd48597a2acdfc Commit: 1bd1fc12720a42fd7f5116573cbd48597a2acdfc Parent: b518f58fd3a512bf01566dbf595ecbbda119963b Author: Bob Peterson <rpeterso@redhat.com> AuthorDate: Tue Apr 7 17:19:30 2009 -0500 Committer: Bob Peterson <rpeterso@redhat.com> CommitterDate: Tue Apr 7 17:19:30 2009 -0500 Speed up gfs_grow bz 485451 - gfs_grow very slow with 1k file system block size This patch speeds up gfs_grow by doing two main things: First, it doesn't close/open/seek for every RG block write. Second, it combines writes to the rindex file so that a whole page may be allocated and written at a time. That avoids some very slow glock issues, especially when block size is less than page size. 
--- gfs/gfs_grow/main.c | 132 +++++++++++++++++++++++++++++++++++---------------- 1 files changed, 91 insertions(+), 41 deletions(-) diff --git a/gfs/gfs_grow/main.c b/gfs/gfs_grow/main.c index eb30e21..661d364 100644 --- a/gfs/gfs_grow/main.c +++ b/gfs/gfs_grow/main.c @@ -109,7 +109,7 @@ device_geometry(char *device) * Returns: Error code, or amount of data read */ -int +static int jread(int fd, char *file, void *buf, uint64_t size, uint64_t *offset) { struct gfs_ioctl gi; @@ -140,7 +140,7 @@ jread(int fd, char *file, void *buf, uint64_t size, uint64_t *offset) * Returns: Error code, or the amount of data written */ -int +static int jwrite(int fd, char *file, void *buf, uint64_t size, uint64_t *offset) { struct gfs_ioctl gi; @@ -314,39 +314,25 @@ read_rgrps(int fs_fd) */ static void -write_a_block(uint64_t where, struct gfs_rgrp *rg) +write_a_block(int fd, struct gfs_rgrp *rg) { char buffer[4096]; - uint64_t fsoffset = where * (uint64_t) fs_sb.sb_bsize; - int fd = open(device, O_RDWR); - struct gfs_meta_header mh; - mh.mh_magic = GFS_MAGIC; - mh.mh_type = GFS_METATYPE_RB; - mh.mh_format = GFS_FORMAT_RB; - if (fd < 0) { - perror(device); - exit(EXIT_FAILURE); - } - if (where < fssize) { - fprintf(stderr, - "Sanity check failed: Caught trying to write to live filesystem!\n"); - exit(EXIT_FAILURE); - } - memset(buffer, 0, 4096); + memset(buffer, 0, fs_sb.sb_bsize); if (rg) gfs_rgrp_out(rg, buffer); - else + else { + struct gfs_meta_header mh; + + mh.mh_magic = GFS_MAGIC; + mh.mh_type = GFS_METATYPE_RB; + mh.mh_format = GFS_FORMAT_RB; gfs_meta_header_out(&mh, buffer); - if (lseek(fd, fsoffset, SEEK_SET) != fsoffset) { - perror(device); - exit(EXIT_FAILURE); } if (write(fd, buffer, fs_sb.sb_bsize) != fs_sb.sb_bsize) { perror("write_zero_block"); exit(EXIT_FAILURE); } - close(fd); } /** @@ -359,16 +345,24 @@ write_a_block(uint64_t where, struct gfs_rgrp *rg) */ static void -write_whole_rgrp(struct rglist_entry *rgl) +write_whole_rgrp(int fd, struct rglist_entry *rgl) { 
uint32_t l; uint32_t nzb = rgl->ri.ri_length; - uint64_t addr = rgl->ri.ri_addr; + uint64_t fsoffset = rgl->ri.ri_addr * (uint64_t) fs_sb.sb_bsize; - write_a_block(addr++, &rgl->rg); + if (fsoffset < fssize) { + fprintf(stderr, + "Sanity check failed: Caught trying to write to live filesystem!\n"); + exit(EXIT_FAILURE); + } + if (lseek(fd, fsoffset, SEEK_SET) != fsoffset) { + perror(device); + exit(EXIT_FAILURE); + } + write_a_block(fd, &rgl->rg); for (l = 1; l < nzb; l++) - write_a_block(addr++, NULL); - sync(); + write_a_block(fd, NULL); } /** @@ -415,9 +409,18 @@ write_rindex(int fs_fd) { osi_list_t *tmp, *head; struct rglist_entry *rgl; - char buffer[sizeof(struct gfs_rindex)]; - uint64_t offset; - + char *buffer; + int b, data_per_blk, data_per_page, firstblock; + int chunks_this_page, page_freebytes, data_this_page; + long page_size; + uint64_t offset, size; + + page_size = sysconf(_SC_PAGESIZE); + buffer = malloc(page_size * 2); + if (!buffer) { + fprintf(stderr, "Error: out of memory.\n"); + exit(EXIT_FAILURE); + } offset = get_length(fs_fd, "rindex"); /* @@ -425,23 +428,63 @@ write_rindex(int fs_fd) * If things mess up here, it could be very difficult to put right */ tmp = head = &rglist_new; + firstblock = 1; + data_per_blk = fs_sb.sb_bsize - sizeof(struct gfs_meta_header); + data_per_page = (page_size / fs_sb.sb_bsize) * data_per_blk; for (;;) { - tmp = tmp->next; - if (tmp == head) - break; - rgl = osi_list_entry(tmp, struct rglist_entry, list); - gfs_rindex_out(&rgl->ri, buffer); - if (jwrite(fs_fd, "rindex", buffer, - sizeof(struct gfs_rindex), &offset) != - sizeof(struct gfs_rindex)) { + size = 0; + /* We used to write new rindex entries out one by one. + However, that can be very slow, especially if there's a + load on the file system. That's because each write is + done with the glock sync option and the journal will need + to be synced before the glock is freed up for the next + write. (The rindex file is journaled data). 
If the block + size matches the page size, the syncs happen much faster. + If the block size is smaller, it takes a huge amount of + time to get all the blocks to settle within the page. + + What I'm trying to do here is optimize our writes so + that multiple blocks may be allocated during a single + write request, all under the same glock, in order to + get a page-full of data written. This makes the syncing + go much faster. The exception is the very first write + (see note below). */ + data_this_page = offset % data_per_page; + page_freebytes = page_size - data_this_page; + if (page_freebytes < sizeof(struct gfs_rindex)) + page_freebytes = data_per_page; + chunks_this_page = (page_freebytes / + sizeof(struct gfs_rindex)) + 1; + for (b = 0; b < chunks_this_page; b++) { + tmp = tmp->next; + if (tmp == head) + break; + rgl = osi_list_entry(tmp, struct rglist_entry, list); + gfs_rindex_out(&rgl->ri, buffer + size); + size += sizeof(struct gfs_rindex); + /* Write the first block on its own. This minimizes + the chance of running out of blocks on a nearly + full file system. Once that first block is written, + the file system should have more free blocks to + allocate in chunks. */ + if (firstblock) { + firstblock = 0; + break; + } + } + if (size && + jwrite(fs_fd, "rindex", buffer, size, &offset) != size) { perror("write: rindex"); fprintf(stderr, "Aborting...\n"); exit(EXIT_FAILURE); } + if (tmp == head) + break; } /* * This is the end of the critical section */ + free(buffer); } /** @@ -459,15 +502,22 @@ write_rgrps(int fs_fd) { osi_list_t *tmp, *head; struct rglist_entry *rgl; + int fd = open(device, O_RDWR); + if (fd < 0) { + perror(device); + exit(EXIT_FAILURE); + } tmp = head = &rglist_new; for (;;) { tmp = tmp->next; if (tmp == head) break; rgl = osi_list_entry(tmp, struct rglist_entry, list); - write_whole_rgrp(rgl); + write_whole_rgrp(fd, rgl); } + sync(); + close(fd); sync(); sync();
reply other threads:[~2009-04-07 22:21 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20090407222026.CE1AF1201B6@lists.fedorahosted.org \ --to=rpeterso@fedoraproject.org \ --cc=cluster-cvs-relay@redhat.com \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).