public inbox for cluster-cvs@sourceware.org
help / color / mirror / Atom feed
* Cluster Project branch, master, updated. cluster-2.99.03-16-gb5fb9a7
@ 2008-06-06 12:44 fabbione
  0 siblings, 0 replies; only message in thread
From: fabbione @ 2008-06-06 12:44 UTC (permalink / raw)
  To: cluster-cvs, cluster-devel

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "Cluster Project".

http://sources.redhat.com/git/gitweb.cgi?p=cluster.git;a=commitdiff;h=b5fb9a7b7245174d1d33b1c2f70e864331c4a97e

The branch, master has been updated
       via  b5fb9a7b7245174d1d33b1c2f70e864331c4a97e (commit)
      from  99c9d71e90fee71157e863f0296d9f10c8571696 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit b5fb9a7b7245174d1d33b1c2f70e864331c4a97e
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
Date:   Fri Jun 6 14:43:11 2008 +0200

    [MISC] Tree cleanup
    
    Remove all dead code that has not been updated in ages
    or doesn't build and nobody knows what it is.
    
    This can be restored at a later stage from stable2 branch
    if required.
    
    Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>

-----------------------------------------------------------------------

Summary of changes:
 cmirror-kernel/src/dm-clog-tfr.c        |   83 -
 cmirror-kernel/src/dm-clog-tfr.h        |   40 -
 cmirror-kernel/src/dm-clog.c            |  624 -------
 cmirror/Makefile                        |   14 -
 csnap-kernel/Makefile                   |   14 -
 csnap-kernel/patches/2.6.15/00001.patch |   16 -
 csnap-kernel/patches/2.6.15/00002.patch |   32 -
 csnap-kernel/patches/2.6.15/00003.patch |   30 -
 csnap-kernel/patches/2.6.9/00001.patch  |   16 -
 csnap-kernel/patches/2.6.9/00002.patch  |   32 -
 csnap-kernel/patches/2.6.9/00003.patch  |   30 -
 csnap-kernel/src/Makefile               |   69 -
 csnap-kernel/src/dm-csnap.c             | 1147 ------------
 csnap-kernel/src/dm-csnap.h             |   70 -
 csnap/COPYING                           |  340 ----
 csnap/Makefile                          |   15 -
 csnap/README                            |   67 -
 csnap/doc/cluster.snapshot.design.html  | 1467 ---------------
 csnap/doc/csnap.ps                      | 2994 -------------------------------
 csnap/patches/csnap-2.6.7-2.4.26        |  195 --
 csnap/patches/csnap-2.6.8.1             | 1321 --------------
 csnap/src/Makefile                      |   44 -
 csnap/src/agent.c                       |  359 ----
 csnap/src/buffer.c                      |  268 ---
 csnap/src/buffer.h                      |   60 -
 csnap/src/buffertest.c                  |   15 -
 csnap/src/create.c                      |   58 -
 csnap/src/csnap.c                       | 2623 ---------------------------
 csnap/src/csnap.h                       |   44 -
 csnap/src/list.h                        |   64 -
 csnap/src/sock.h                        |   55 -
 csnap/src/trace.h                       |    7 -
 csnap/tests/Makefile                    |   49 -
 csnap/tests/devpoke.c                   |   55 -
 csnap/tests/devspam.c                   |   83 -
 csnap/tests/testclient.c                |  185 --
 gfs2/debug/Makefile                     |   46 -
 gfs2/debug/basic.c                      |  458 -----
 gfs2/debug/basic.h                      |   26 -
 gfs2/debug/block_device.c               |  117 --
 gfs2/debug/block_device.h               |   14 -
 gfs2/debug/gfs2_debug.h                 |   83 -
 gfs2/debug/main.c                       |  179 --
 gfs2/debug/ondisk.c                     |   12 -
 gfs2/debug/readfile.c                   |  215 ---
 gfs2/debug/readfile.h                   |   14 -
 gfs2/debug/util.c                       |  334 ----
 gfs2/debug/util.h                       |   29 -
 48 files changed, 0 insertions(+), 14112 deletions(-)
 delete mode 100644 cmirror-kernel/src/dm-clog-tfr.c
 delete mode 100644 cmirror-kernel/src/dm-clog-tfr.h
 delete mode 100644 cmirror-kernel/src/dm-clog.c
 delete mode 100644 cmirror/Makefile
 delete mode 100644 csnap-kernel/Makefile
 delete mode 100644 csnap-kernel/patches/2.6.15/00001.patch
 delete mode 100644 csnap-kernel/patches/2.6.15/00002.patch
 delete mode 100644 csnap-kernel/patches/2.6.15/00003.patch
 delete mode 100644 csnap-kernel/patches/2.6.9/00001.patch
 delete mode 100644 csnap-kernel/patches/2.6.9/00002.patch
 delete mode 100644 csnap-kernel/patches/2.6.9/00003.patch
 delete mode 100644 csnap-kernel/src/Makefile
 delete mode 100644 csnap-kernel/src/dm-csnap.c
 delete mode 100644 csnap-kernel/src/dm-csnap.h
 delete mode 100644 csnap/COPYING
 delete mode 100644 csnap/Makefile
 delete mode 100644 csnap/README
 delete mode 100644 csnap/doc/cluster.snapshot.design.html
 delete mode 100644 csnap/doc/csnap.ps
 delete mode 100644 csnap/patches/csnap-2.6.7-2.4.26
 delete mode 100644 csnap/patches/csnap-2.6.8.1
 delete mode 100644 csnap/src/Makefile
 delete mode 100644 csnap/src/agent.c
 delete mode 100644 csnap/src/buffer.c
 delete mode 100644 csnap/src/buffer.h
 delete mode 100644 csnap/src/buffertest.c
 delete mode 100644 csnap/src/create.c
 delete mode 100644 csnap/src/csnap.c
 delete mode 100644 csnap/src/csnap.h
 delete mode 100644 csnap/src/list.h
 delete mode 100644 csnap/src/sock.h
 delete mode 100644 csnap/src/trace.h
 delete mode 100644 csnap/tests/Makefile
 delete mode 100644 csnap/tests/devpoke.c
 delete mode 100644 csnap/tests/devspam.c
 delete mode 100644 csnap/tests/testclient.c
 delete mode 100644 gfs2/debug/Makefile
 delete mode 100644 gfs2/debug/basic.c
 delete mode 100644 gfs2/debug/basic.h
 delete mode 100644 gfs2/debug/block_device.c
 delete mode 100644 gfs2/debug/block_device.h
 delete mode 100644 gfs2/debug/gfs2_debug.h
 delete mode 100644 gfs2/debug/main.c
 delete mode 100644 gfs2/debug/ondisk.c
 delete mode 100644 gfs2/debug/readfile.c
 delete mode 100644 gfs2/debug/readfile.h
 delete mode 100644 gfs2/debug/util.c
 delete mode 100644 gfs2/debug/util.h

diff --git a/cmirror-kernel/src/dm-clog-tfr.c b/cmirror-kernel/src/dm-clog-tfr.c
deleted file mode 100644
index b1fc8ad..0000000
--- a/cmirror-kernel/src/dm-clog-tfr.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2006 Red Hat, Inc.
- *
- * This file is released under the LGPL.
- */
-
-#include "dm-clog-tfr.h"
-
-/*
- * Pre-allocated nominal request area for speed
- */
-#define DM_CLOG_NOMINAL_REQUEST_SIZE 512
-static char nominal_request[DM_CLOG_NOMINAL_REQUEST_SIZE];
-
-static DECLARE_MUTEX(consult_server_lock);
-
-/*
- * dm_clog_consult_server
- * @uuid: log's uuid (must be MAX_NAME_LEN in size)
- * @request_type:
- * @data: data to tx to the server
- * @data_size: size of data in bytes
- * @rdata: place to put return data from server
- * @rdata_size: value-result (amount of space given/amount of space used)
- *
- * Only one process at a time can communicate with the server.
- * Possible error return values:
- *   +XXX:       Server-side error
- *   -XXX:       Client-side error
- *   -ENOSPC:    Not enough space in rdata
- *   -ENOMEM:    Unable to allocate memory to complete request
- *   -ESRCH:     Unable to contact server
- *   EIO:        Server unable to commit request
- *
- * Returns: 0 on success, otherwise failure
- */
-int dm_clog_consult_server(const char *uuid, int request_type,
-			   char *data, int data_size,
-			   char *rdata, int *rdata_size)
-{
-	int r = 0;
-	struct clog_tfr *tfr = (struct clog_tfr *)nominal_request;
-
-	mutex_lock(&consult_server_lock);
-	if (data_size > (DM_CLOG_NOMINAL_REQUEST_SIZE - sizeof(*tfr)))
-		/* FIXME: is kmalloc sufficient if we need this much space? */
-		tfr = kmalloc(data_size + sizeof(*tfr), GFP_KERNEL);
-
-	if (!tfr)
-		return -ENOMEM;
-
-	memcpy(tfr->uuid, uuid, MAX_NAME_LEN);
-	tfr->request_type = request_type;
-	tfr->data_size = data_size;
-
-	/*
-	 * FIXME: Send to server
-	 */
-
-	if (rdata) {
-		/* FIXME: receive from server */
-		if (tfr->error) {
-			r = tfr->error;
-		} else if (tfr->data_size > *rdata_size) {
-			r = -ENOSPC;
-		} else {
-			*rdata_size = tfr->data_size;
-			memcpy(rdata, tft->data, tfr->data_size);
-		}
-		/* FIXME:  If using netlink, we may wish to ack back */
-	} else {
-		/*
-		 * FIXME: If we are using netlink, we may want an
-		 * ack from the server to know that it got the
-		 * request.  (Ack is implicit if we are receiving
-		 * data.)
-		 */
-	}
-	r = ENOSYS;
-
-	mutex_unlock(&consult_server_lock);
-	return r;
-}
diff --git a/cmirror-kernel/src/dm-clog-tfr.h b/cmirror-kernel/src/dm-clog-tfr.h
deleted file mode 100644
index 87d21ad..0000000
--- a/cmirror-kernel/src/dm-clog-tfr.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2006 Red Hat, Inc.
- *
- * This file is released under the LGPL.
- */
-
-#ifndef __DM_CLOG_TFR_H__
-
-#define DM_CLOG_CTR                    1
-#define DM_CLOG_DTR                    2
-#define DM_CLOG_PRESUSPEND             3
-#define DM_CLOG_POSTSUSPEND            4
-#define DM_CLOG_RESUME                 5
-#define DM_CLOG_GET_REGION_SIZE        6
-#define DM_CLOG_IS_CLEAN               7
-#define DM_CLOG_IS_REMOTE_RECOVERING   8
-#define DM_CLOG_IN_SYNC                9
-#define DM_CLOG_FLUSH                 10
-#define DM_CLOG_MARK_REGION           11
-#define DM_CLOG_CLEAR_REGION          12
-#define DM_CLOG_GET_RESYNC_WORK       13
-#define DM_CLOG_SET_REGION_SYNC       14
-#define DM_CLOG_GET_SYNC_COUNT        15
-#define DM_CLOG_STATUS                16
-#define DM_CLOG_GET_FAILURE_RESPONSE  17
-
-struct clog_tfr {
-	char uuid[MAX_NAME_LEN];
-	int error;               /* Used by server to inform of errors */
-	int request_type;
-	int data_size;
-	char data[0];
-};
-
-
-int dm_clog_consult_server(const char *uuid, int request_type,
-			   char *data, int data_size,
-			   char *rdata, int *rdata_size);
-
-#endif /* __DM_CLOG_TFR_H__ */
diff --git a/cmirror-kernel/src/dm-clog.c b/cmirror-kernel/src/dm-clog.c
deleted file mode 100644
index b21098a..0000000
--- a/cmirror-kernel/src/dm-clog.c
+++ /dev/null
@@ -1,624 +0,0 @@
-/*
- * Copyright (C) 2006 Red Hat, Inc.
- *
- * This file is released under the LGPL.
- */
-
-#include "dm-clog-tfr.h"
-
-struct flush_entry {
-	int type;
-	region_t region;
-	struct list_head list;
-};
-
-struct log_c {
-	struct dm_target *ti;
-	uint32_t region_size;
-	region_t region_count;
-	int failure_response;
-	char uuid[MAX_NAME_LEN];
-
-	spinlock_t flush_lock;
-	struct list_head flush_list;  /* only for clear and mark requests */
-};
-
-static mempool_t *flush_entry_pool = NULL;
-
-static void *flush_entry_alloc(int gfp_mask, void *pool_data)
-{
-	return kmalloc(sizeof(struct flush_entry), gfp_mask);
-}
-
-static void flush_entry_free(void *element, void *pool_data)
-{
-	kfree(element);
-}
-
-static int cluster_ctr(struct dirty_log *log, struct dm_target *ti,
-		       unsigned int argc, char **argv, int disk_log)
-{
-	int i;
-	int r = 0;
-	int failure_response = FR_NONBLOCK;
-	struct log_c *lc = NULL;
-	uint32_t region_size;
-	region_t region_count;
-
-	/* Already checked argument count */
-
-	/* Check for block_on_error.  It must be present. */
-	for (i = 1; i < argc; i++) {
-		if (!strcmp(argv[i], "block_on_error"))
-			failure_response = FR_BLOCK;
-	}
-	if (failure_response != FR_BLOCK) {
-		DMWARN("Required \"block_on_error\" argument not supplied.");
-		return -EINVAL;
-	}
-
-	if (sscanf(argv[0], SECTOR_FORMAT, &region_size) != 1) {
-		DMWARN("Invalid region size string");
-		return -EINVAL;
-	}
-
-	region_count = dm_sector_div_up(ti->len, region_size);
-
-	lc = kmalloc(sizeof(*lc), GFP_KERNEL);
-	if (!lc) {
-		DMWARN("Unable to allocate cluster log context.");
-		return -ENOMEM;
-	}
-	lc->ti = ti;
-	lc->region_size = region_size;
-	lc->region_count = region_count;
-
-	/* FIXME: Send table string to server */
-
-fail:
-	if (lc)
-		kfree(lc);
-	
-	return -ENOSYS;
-}
-
-/*
- * cluster_core_ctr
- * @log
- * @ti
- * @argc
- * @argv
- *
- * argv contains:
- *   <region_size> <uuid> [[no]sync] "block_on_error"
- *
- * Returns: 0 on success, -XXX on failure
- */
-static int cluster_core_ctr(struct dirty_log *log, struct dm_target *ti,
-			    unsigned int argc, char **argv)
-{
-	int i;
-	if ((argc < 3) || (argc > 4)) {
-		DMERR("Too %s arguments to clustered_core mirror log type.",
-		      (argc < 3) ? "few" : "many");
-		DMERR("  %d arguments supplied:", argc);
-		for (i = 0; i < argc; i++)
-			DMERR("    %s", argv[i]);
-		return -EINVAL;
-	}
-
-	return cluster_ctr(log, ti, argc, argv, 0);
-}
-
-
-/*
- * cluster_core_ctr
- * @log
- * @ti
- * @argc
- * @argv
- *
- * argv contains:
- *   <disk> <region_size> <uuid> [[no]sync] "block_on_error"
- *
- * Returns: 0 on success, -XXX on failure
- */
-static int cluster_disk_ctr(struct dirty_log *log, struct dm_target *ti,
-			    unsigned int argc, char **argv)
-{
-	int i;
-	if ((argc < 4) || (argc > 5)) {
-		DMERR("Too %s arguments to clustered_disk mirror log type.",
-		      (argc < 4) ? "few" : "many");
-		DMERR("  %d arguments supplied:", argc);
-		for (i = 0; i < argc; i++)
-			DMERR("    %s", argv[i]);
-		return -EINVAL;
-	}
-
-	return cluster_ctr(log, ti, argc, argv, 1);
-}
-
-/*
- * cluster_dtr
- * @log
- */
-static void cluster_dtr(struct dirty_log *log)
-{
-	int r;
-	struct log_c *lc = (struct log_c *)log->context;
-
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_DTR,
-				   NULL, 0,
-				   NULL, NULL);
-
-	/* FIXME: What do we do on failure? */
-	kfree(lc);
-
-	return;
-}
-
-/*
- * cluster_presuspend
- * @log
- */
-static int cluster_presuspend(struct dirty_log *log)
-{
-	int r;
-	struct log_c *lc = (struct log_c *)log->context;
-
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_PRESUSPEND,
-				   NULL, 0,
-				   NULL, NULL);
-
-	return (r > 0) ? -r : r;
-}
-
-/*
- * cluster_postsuspend
- * @log
- */
-static int cluster_postsuspend(struct dirty_log *log)
-{
-	int r;
-	struct log_c *lc = (struct log_c *)log->context;
-
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_POSTSUSPEND,
-				   NULL, 0,
-				   NULL, NULL);
-
-	return (r > 0) ? -r : r;
-}
-
-/*
- * cluster_resume
- * @log
- */
-static int cluster_resume(struct dirty_log *log)
-{
-	int r;
-	struct log_c *lc = (struct log_c *)log->context;
-
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_RESUME,
-				   NULL, 0,
-				   NULL, NULL);
-
-	return (r > 0) ? -r : r;
-}
-
-/*
- * cluster_get_region_size
- * @log
- *
- * Only called during mirror construction, ok to block.
- *
- * Returns: region size (doesn't fail)
- */
-static uint32_t cluster_get_region_size(struct dirty_log *log)
-{
-	struct log_c *lc = (struct log_c *)log->context;
-
-	return lc->region_size;
-}
-
-/*
- * cluster_is_clean
- * @log
- * @region
- *
- * Check whether a region is clean.  If there is any sort of
- * failure when consulting the server, we return not clean.
- *
- * Returns: 1 if clean, 0 otherwise
- */
-static int cluster_is_clean(struct dirty_log *log, region_t region)
-{
-	int r;
-	int is_clean;
-	int rdata_size;
-	struct log_c *lc = (struct log_c *)log->context;
-
-	rdata_size = sizeof(is_clean);
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_IS_CLEAN,
-				   (char *)&region, sizeof(region),
-				   (char *)&is_clean, &rdata_size);
-
-	return (r) ? 0 : is_clean;
-}
-
-/*
- * cluster_is_remote_recovering
- * @log
- * @region
- *
- * Check whether a region is being resync'ed on a remote node.
- * If there is any sort of failure when consulting the server,
- * we assume that the region is being remotely recovered.
- *
- * Returns: 1 if remote recovering, 0 otherwise
- */
-static int cluster_is_remote_recovering(struct dirty_log *log, region_t region)
-{
-	int r;
-	int is_recovering;
-	int rdata_size;
-	struct log_c *lc = (struct log_c *)log->context;
-
-	rdata_size = sizeof(is_recovering);
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_IS_REMOTE_RECOVERING,
-				   (char *)&region, sizeof(region),
-				   (char *)&is_recovering, &rdata_size);
-
-	return (r) ? 1 : is_recovering;
-}
-
-/*
- * cluster_in_sync
- * @log
- * @region
- * @can_block: if set, return immediately
- *
- * Check if the region is in-sync.  If there is any sort
- * of failure when consulting the server, we assume that
- * the region is not in sync.
- *
- * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
- */
-static int cluster_in_sync(struct dirty_log *log, region_t region, int can_block)
-{
-	int r;
-	int in_sync;
-	int rdata_size;
-	struct log_c *lc = (struct log_c *)log->context;
-
-	if (!can_block)
-		return -EWOULDBLOCK;
-
-	rdata_size = sizeof(in_sync);
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_IN_SYNC,
-				   (char *)&region, sizeof(region),
-				   (char *)&in_sync, &rdata_size);
-
-	return (r) ? 0 : in_sync;
-}
-
-/*
- * cluster_flush
- * @log
- *
- * This function is ok to block.
- * The flush happens in two stages.  First, it sends all
- * clear/mark requests that are on the list.  Then it
- * tells the server to commit them.  This gives the
- * server a chance to optimise the commit to the cluster
- * and/or disk, instead of doing it for every request.
- *
- * Additionally, we could implement another thread that
- * sends the requests up to the server - reducing the
- * load on flush.  Then the flush would have less in
- * the list and be responsible for the finishing commit.
- *
- * Returns: 0 on success, < 0 on failure
- */
-static int cluster_flush(struct dirty_log *log)
-{
-	int r = 0;
-	int flags;
-	region_t region;
-	struct log_c *lc = (struct log_c *)log->context;
-	struct list_head flush_list;
-	struct flush_entry *fe, *tmp_fe;
-
-	spin_lock_irqsave(&lc->flush_lock, flags);
-	flush_list = lc->flush_list;
-	spin_unlock_irqrestore(&lc->flush_lock, flags);
-
-	/*
-	 * FIXME: Count up requests, group request types,
-	 * allocate memory to stick all requests in and
-	 * send to server in one go.  Failing the allocation,
-	 * do it one by one.
-	 */
-
-	list_for_each_entry(fe, &flush_list, list) {
-		r = dm_clog_consult_server(lc->uuid, fe->type,
-					   (char *)&fe->region,
-					   sizeof(fe->region),
-					   NULL, NULL);
-		if (r) {
-			r = (r > 0) ? -r : r;
-			goto fail;
-		}
-	}
-
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_FLUSH,
-				   NULL, 0, NULL, NULL);
-	if (r)
-		r = (r > 0) ? -r : r;
-
-fail:
-	list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) {
-		list_del(&fe->list);
-		mempool_free(fe, flush_entry_pool);
-	}
-
-	r = -EIO;
-
-	return r;
-}
-
-/*
- * cluster_mark_region
- * @log
- * @region
- *
- * This function should avoid blocking unless absolutely required.
- * (Memory allocation is valid for blocking.)
- */
-static void cluster_mark_region(struct dirty_log *log, region_t region)
-{
-	int flags;
-	struct log_c *lc = (struct log_c *)log->context;
-	struct flush_entry *fe;
-
-	/* Wait for an allocation, but _never_ fail */
-	fe = mempool_alloc(flush_enrty_pool, GFP_KERNEL);
-	BUG_ON(!fe);
-
-	spin_lock_irqsave(&lc->flush_lock, flags);
-	fe->type = DM_CLOG_MARK_REGION;
-	fe->region = region;
-	list_add(&fe->list, &lc->flush_list);
-	spin_unlock_irqrestore(&lc->flush_lock, flags);
-		
-	return;
-}
-
-/*
- * cluster_clear_region
- * @log
- * @region
- *
- * This function must not block.
- * So, the alloc can't block.  In the worst case, it is ok to
- * fail.  It would simply mean we can't clear the region.
- * Does nothing to current sync context, but does mean
- * the region will be re-sync'ed on a reload of the mirror
- * even though it is in-sync.
- */
-static void cluster_clear_region(struct dirty_log *log, region_t region)
-{
-	int flags;
-	struct log_c *lc = (struct log_c *)log->context;
-	struct flush_entry *fe;
-
-	fe = mempool_alloc(flush_enrty_pool, GFP_ATOMIC);
-	if (!fe) {
-		DMERR("Failed to allocate memory to clear region.");
-		return;
-	}
-	spin_lock_irqsave(&lc->flush_lock, flags);
-	fe->type = DM_CLOG_CLEAR_REGION;
-	fe->region = region;
-	list_add(&fe->list, &lc->flush_list);
-	spin_unlock_irqrestore(&lc->flush_lock, flags);
-	
-	return;
-}
-
-/*
- * cluster_get_resync_work
- * @log
- * @region
- *
- * Get a region that needs recovery.  It is valid to return
- * an error for this function.
- *
- * Returns: 1 if region filled, 0 if no work, <0 on error
- */
-static int cluster_get_resync_work(struct dirty_log *log, region_t *region)
-{
-	int r;
-	int rdata_size;
-	struct log_c *lc = (struct log_c *)log->context;
-	struct { int i; region_t r; } pkg;
-
-	rdata_size = sizeof(pkg);
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_GET_RESYNC_WORK,
-				   NULL, 0,
-				   &pkg, &rdata_size);
-
-	r = (r > 0) ? -r : r;
-
-	*region = pkg.r;
-
-	return (r) ? r : pkg.i;
-}
-
-/*
- * cluster_set_region_sync
- * @log
- * @region
- * @in_sync
- *
- * Set the sync status of a given region.  This function
- * must not fail.
- */
-static void cluster_set_region_sync(struct dirty_log *log,
-				    region_t region, int in_sync)
-{
-	int r;
-	struct log_c *lc = (struct log_c *)log->context;
-	struct { region_t r; int i; } pkg;
-
-	pkg.r = region;
-	pkg.i = in_sync;
-
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_SET_REGION_SYNC,
-				   &pkg, sizeof(pkg),
-				   NULL, NULL);
-
-	/* FIXME: It would be nice to be able to report failures */
-	return;
-}
-
-/*
- * cluster_get_sync_count
- * @log
- *
- * If there is any sort of failure when consulting the server,
- * we assume that the sync count is zero.
- *
- * Returns: sync count on success, 0 on failure
- */
-static region_t cluster_get_sync_count(struct dirty_log *log)
-{
-	int r;
-	int rdata_size;
-	region_t sync_count;
-	struct log_c *lc = (struct log_c *)log->context;
-
-	rdata_size = sizeof(sync_count);
-	r = dm_clog_consult_server(lc->uuid, DM_CLOG_GET_SYNC_COUNT,
-				   NULL, 0,
-				   (char *)&sync_count, &rdata_size);
-
-	return (r) ? 0 : sync_count;
-}
-
-/*
- * cluster_status
- * @log
- * @status_type
- * @result
- * @maxlen
- *
- * Returns: amount of space consumed
- */
-static int cluster_status(struct dirty_log *log, status_type_t status_type,
-			  char *result, unsigned int maxlen)
-{
-	int r;
-	unsigned int sz = maxlen;
-	struct log_c *lc = (struct log_c *)log->context;
-
-	switch(status) {
-	case STATUSTYPE_INFO:
-		r = dm_clog_consult_server(lc->uuid, DM_CLOG_STATUS_INFO,
-					   NULL, 0,
-					   result, &sz);
-		break;
-	case STATUSTYPE_TABLE:
-		r = dm_clog_consult_server(lc->uuid, DM_CLOG_STATUS_INFO,
-					   NULL, 0,
-					   result, &sz);
-		break;
-	}
-	return (r) ? 0: sz;
-}
-
-status int cluster_get_failure_response(struct dirty_log *log)
-{
-	struct log_c *lc = (struct log_c *)log->context;
-
-	return lc->failure_response;
-}
-
-static struct dirty_log_type _clustered_core_type = {
-	.name = "clustered_core",
-	.module = THIS_MODULE,
-	.ctr = cluster_core_ctr,
-	.dtr = cluster_dtr,
-	.presuspend = cluster_presuspend,
-	.postsuspend = cluster_postsuspend,
-	.resume = cluster_resume,
-	.get_region_size = cluster_get_region_size,
-	.is_clean = cluster_is_clean,
-	.is_remote_recovering = cluster_is_remote_recovering,
-	.in_sync = cluster_in_sync,
-	.flush = cluster_flush,
-	.mark_region = cluster_mark_region,
-	.clear_region = cluster_clear_region,
-	.get_resync_work = cluster_get_resync_work,
-	.set_region_sync = cluster_set_region_sync,
-	.get_sync_count = cluster_get_sync_count,
-	.status = cluster_status,
-	.get_failure_response = cluster_get_failure_response,
-};
-
-static struct dirty_log_type _clustered_disk_type = {
-	.name = "clustered_disk",
-	.module = THIS_MODULE,
-	.ctr = cluster_disk_ctr,
-	.dtr = cluster_dtr,
-	.presuspend = cluster_presuspend,
-	.postsuspend = cluster_postsuspend,
-	.resume = cluster_resume,
-	.get_region_size = cluster_get_region_size,
-	.is_clean = cluster_is_clean,
-	.is_remote_recovering = cluster_is_remote_recovering,
-	.in_sync = cluster_in_sync,
-	.flush = cluster_flush,
-	.mark_region = cluster_mark_region,
-	.clear_region = cluster_clear_region,
-	.get_resync_work = cluster_get_resync_work,
-	.set_region_sync = cluster_set_region_sync,
-	.get_sync_count = cluster_get_sync_count,
-	.status = cluster_status,
-	.get_failure_response = cluster_get_failure_response,
-};
-
-static int __init cluster_dirty_log_init(void)
-{
-	int r = 0;
-
-	flush_entry_pool = mempool_create(100, flush_entry_alloc,
-					  flush_entry_free, NULL);
-
-	if (!flush_entry_pool) {
-		DMERR("Unable to create flush_entry_pool:  No memory.");
-		return -ENOMEM;
-	}
-
-	r = dm_register_dirty_log_type(&_clustered_core_type);
-	if (r) {
-		DMWARN("Couldn't register clustered_core dirty log type");
-		return r;
-	}
-
-	r = dm_register_dirty_log_type(&_clustered_disk_type);
-	if (r) {
-		DMWARN("Couldn't register clustered_disk dirty log type");
-		dm_unregister_dirty_log_type(&_clustered_core_type);
-		return r;
-	}
-
-	return r;
-}
-
-static void __exit cluster_dirty_log_exit(void)
-{
-	dm_unregister_dirty_log_type(&_clustered_disk_type);
-	dm_unregister_dirty_log_type(&_clustered_core_type);
-	return;
-}
diff --git a/cmirror/Makefile b/cmirror/Makefile
deleted file mode 100644
index 1cb3553..0000000
--- a/cmirror/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-all:
-	${MAKE} -C src all
-
-clean:
-	${MAKE} -C src clean
-
-install: all
-	${MAKE} -C src install
-
-uninstall:
-	${MAKE} -C src uninstall
-
-distclean: clean
-	rm -f make/defines.mk
diff --git a/csnap-kernel/Makefile b/csnap-kernel/Makefile
deleted file mode 100644
index 1cb3553..0000000
--- a/csnap-kernel/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-all:
-	${MAKE} -C src all
-
-clean:
-	${MAKE} -C src clean
-
-install: all
-	${MAKE} -C src install
-
-uninstall:
-	${MAKE} -C src uninstall
-
-distclean: clean
-	rm -f make/defines.mk
diff --git a/csnap-kernel/patches/2.6.15/00001.patch b/csnap-kernel/patches/2.6.15/00001.patch
deleted file mode 100644
index 9797a36..0000000
--- a/csnap-kernel/patches/2.6.15/00001.patch
+++ /dev/null
@@ -1,16 +0,0 @@
-diff -urpN linux-orig/net/socket.c linux-patched/net/socket.c
---- linux-orig/net/socket.c	2006-01-15 00:16:02.000000000 -0600
-+++ linux-patched/net/socket.c	2006-03-03 16:53:11.000000000 -0600
-@@ -2084,6 +2084,12 @@ void socket_seq_show(struct seq_file *se
- }
- #endif /* CONFIG_PROC_FS */
- 
-+/* Cluster devices need these, or better: kernel interfaces */
-+
-+EXPORT_SYMBOL_GPL(sys_connect);
-+EXPORT_SYMBOL_GPL(sys_recvmsg);
-+EXPORT_SYMBOL_GPL(sys_socket);
-+
- /* ABI emulation layers need these two */
- EXPORT_SYMBOL(move_addr_to_kernel);
- EXPORT_SYMBOL(move_addr_to_user);
diff --git a/csnap-kernel/patches/2.6.15/00002.patch b/csnap-kernel/patches/2.6.15/00002.patch
deleted file mode 100644
index 49d7b2d..0000000
--- a/csnap-kernel/patches/2.6.15/00002.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-diff -urpN linux-orig/fs/super.c linux-patched/fs/super.c
---- linux-orig/fs/super.c	2006-01-15 00:16:02.000000000 -0600
-+++ linux-patched/fs/super.c	2006-03-03 16:54:39.000000000 -0600
-@@ -53,7 +53,7 @@ DEFINE_SPINLOCK(sb_lock);
-  *	Allocates and initializes a new &struct super_block.  alloc_super()
-  *	returns a pointer new superblock or %NULL if allocation had failed.
-  */
--static struct super_block *alloc_super(void)
-+struct super_block *alloc_super(void)
- {
- 	struct super_block *s = kmalloc(sizeof(struct super_block),  GFP_USER);
- 	static struct super_operations default_op;
-@@ -91,6 +91,8 @@ out:
- 	return s;
- }
- 
-+EXPORT_SYMBOL(alloc_super);
-+
- /**
-  *	destroy_super	-	frees a superblock
-  *	@s: superblock to free
-diff -urpN linux-orig/include/linux/fs.h linux-patched/include/linux/fs.h
---- linux-orig/include/linux/fs.h	2006-01-15 00:16:02.000000000 -0600
-+++ linux-patched/include/linux/fs.h	2006-03-03 16:54:39.000000000 -0600
-@@ -1236,6 +1236,7 @@ void generic_shutdown_super(struct super
- void kill_block_super(struct super_block *sb);
- void kill_anon_super(struct super_block *sb);
- void kill_litter_super(struct super_block *sb);
-+struct super_block *alloc_super(void);
- void deactivate_super(struct super_block *sb);
- int set_anon_super(struct super_block *s, void *data);
- struct super_block *sget(struct file_system_type *type,
diff --git a/csnap-kernel/patches/2.6.15/00003.patch b/csnap-kernel/patches/2.6.15/00003.patch
deleted file mode 100644
index ef08ead..0000000
--- a/csnap-kernel/patches/2.6.15/00003.patch
+++ /dev/null
@@ -1,30 +0,0 @@
-diff -urpN linux-orig/drivers/md/Kconfig linux-patched/drivers/md/Kconfig
---- linux-orig/drivers/md/Kconfig	2006-01-15 00:16:02.000000000 -0600
-+++ linux-patched/drivers/md/Kconfig	2006-03-03 17:12:50.000000000 -0600
-@@ -236,5 +236,15 @@ config DM_MULTIPATH_EMC
- 	---help---
- 	  Multipath support for EMC CX/AX series hardware.
- 
-+config DM_CSNAP
-+	tristate "Cluster snapshot target support"
-+	depends on BLK_DEV_DM && EXPERIMENTAL
-+	---help---
-+	  This device-mapper target allows you to create a virtual device
-+	  that can take snapshots of an underlying device.  This device
-+	  can be accessed simultaneously by multiple nodes of a cluster.
-+
-+	  If unsure, say N.
-+
- endmenu
- 
-diff -urpN linux-orig/drivers/md/Makefile linux-patched/drivers/md/Makefile
---- linux-orig/drivers/md/Makefile	2006-01-15 00:16:02.000000000 -0600
-+++ linux-patched/drivers/md/Makefile	2006-03-03 17:12:50.000000000 -0600
-@@ -37,6 +37,7 @@ obj-$(CONFIG_DM_MULTIPATH_EMC)	+= dm-emc
- obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
- obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o
- obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
-+obj-$(CONFIG_DM_CSNAP)		+= dm-csnap.o
- 
- quiet_cmd_unroll = UNROLL  $@
-       cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
diff --git a/csnap-kernel/patches/2.6.9/00001.patch b/csnap-kernel/patches/2.6.9/00001.patch
deleted file mode 100644
index cff246e..0000000
--- a/csnap-kernel/patches/2.6.9/00001.patch
+++ /dev/null
@@ -1,16 +0,0 @@
-diff -urpN linux-orig/net/socket.c linux-patched/net/socket.c
---- linux-orig/net/socket.c	2006-03-03 14:28:29.000000000 -0600
-+++ linux-patched/net/socket.c	2006-03-03 15:24:16.000000000 -0600
-@@ -2155,6 +2155,12 @@ void socket_seq_show(struct seq_file *se
- 
- EXPORT_SYMBOL_GPL(sys_recvmsg);
- 
-+/* Cluster devices need these, or better: kernel interfaces */
-+
-+EXPORT_SYMBOL_GPL(sys_connect);
-+EXPORT_SYMBOL_GPL(sys_recvmsg);
-+EXPORT_SYMBOL_GPL(sys_socket);
-+
- /* ABI emulation layers need these two */
- EXPORT_SYMBOL(move_addr_to_kernel);
- EXPORT_SYMBOL(move_addr_to_user);
diff --git a/csnap-kernel/patches/2.6.9/00002.patch b/csnap-kernel/patches/2.6.9/00002.patch
deleted file mode 100644
index 54fa125..0000000
--- a/csnap-kernel/patches/2.6.9/00002.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-diff -urpN linux-orig/fs/super.c linux-patched/fs/super.c
---- linux-orig/fs/super.c	2006-03-03 14:28:28.000000000 -0600
-+++ linux-patched/fs/super.c	2006-03-03 15:25:20.000000000 -0600
-@@ -51,7 +51,7 @@ spinlock_t sb_lock = SPIN_LOCK_UNLOCKED;
-  *	Allocates and initializes a new &struct super_block.  alloc_super()
-  *	returns a pointer new superblock or %NULL if allocation had failed.
-  */
--static struct super_block *alloc_super(void)
-+struct super_block *alloc_super(void)
- {
- 	struct super_block *s = kmalloc(sizeof(struct super_block),  GFP_USER);
- 	static struct super_operations default_op;
-@@ -87,6 +87,8 @@ out:
- 	return s;
- }
- 
-+EXPORT_SYMBOL(alloc_super);
-+
- /**
-  *	destroy_super	-	frees a superblock
-  *	@s: superblock to free
-diff -urpN linux-orig/include/linux/fs.h linux-patched/include/linux/fs.h
---- linux-orig/include/linux/fs.h	2006-03-03 14:28:29.000000000 -0600
-+++ linux-patched/include/linux/fs.h	2006-03-03 15:25:20.000000000 -0600
-@@ -1171,6 +1171,7 @@ void generic_shutdown_super(struct super
- void kill_block_super(struct super_block *sb);
- void kill_anon_super(struct super_block *sb);
- void kill_litter_super(struct super_block *sb);
-+struct super_block *alloc_super(void);
- void deactivate_super(struct super_block *sb);
- int set_anon_super(struct super_block *s, void *data);
- struct super_block *sget(struct file_system_type *type,
diff --git a/csnap-kernel/patches/2.6.9/00003.patch b/csnap-kernel/patches/2.6.9/00003.patch
deleted file mode 100644
index 8ebc0ee..0000000
--- a/csnap-kernel/patches/2.6.9/00003.patch
+++ /dev/null
@@ -1,30 +0,0 @@
-diff -urpN linux-orig/drivers/md/Kconfig linux-patched/drivers/md/Kconfig
---- linux-orig/drivers/md/Kconfig	2006-03-03 14:28:20.000000000 -0600
-+++ linux-patched/drivers/md/Kconfig	2006-03-03 15:27:52.000000000 -0600
-@@ -230,5 +230,15 @@ config DM_MULTIPATH_EMC
- 	---help---
- 	  Multipath support for EMC CX/AX series hardware.
- 
-+config DM_CSNAP
-+	tristate "Cluster snapshot target support"
-+	depends on BLK_DEV_DM && EXPERIMENTAL
-+	---help---
-+	  This device-mapper target allows you to create a virtual device
-+	  that can take snapshots of an underlying device.  This device
-+	  can be accessed simultaneously by multiple nodes of a cluster.
-+
-+	  If unsure, say N.
-+
- endmenu
- 
-diff -urpN linux-orig/drivers/md/Makefile linux-patched/drivers/md/Makefile
---- linux-orig/drivers/md/Makefile	2006-03-03 14:28:29.000000000 -0600
-+++ linux-patched/drivers/md/Makefile	2006-03-03 15:27:52.000000000 -0600
-@@ -35,6 +35,7 @@ obj-$(CONFIG_DM_MULTIPATH_EMC)	+= dm-emc
- obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
- obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o
- obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
-+obj-$(CONFIG_DM_CSNAP)		+= dm-csnap.o
- 
- quiet_cmd_unroll = UNROLL  $@
-       cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
diff --git a/csnap-kernel/src/Makefile b/csnap-kernel/src/Makefile
deleted file mode 100644
index 6554655..0000000
--- a/csnap-kernel/src/Makefile
+++ /dev/null
@@ -1,69 +0,0 @@
-###############################################################################
-###############################################################################
-##
-##  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-##  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
-##
-##  This copyrighted material is made available to anyone wishing to use,
-##  modify, copy, or redistribute it subject to the terms and conditions
-##  of the GNU General Public License v.2.
-##
-###############################################################################
-###############################################################################
-
-# Build, install and kernel-patch generation rules for the dm-csnap module.
-#
-# Run directly, "all" re-invokes make inside the kernel build tree
-# (${KERNEL_BUILD}) with USING_KBUILD=yes; in that second pass only the
-# kbuild variables (obj-m, EXTRA_CFLAGS) matter.
-
-top_srcdir = ..
-ifndef USING_KBUILD
-include ${top_srcdir}/make/defines.mk
-UNINSTALL=${top_srcdir}/scripts/uninstall.pl
-endif
-
-linux_orig = ${top_srcdir}/patches/linux-orig
-linux_patched = ${top_srcdir}/patches/linux-patched
-
-
-PATCH_TARGET = dm-csnap.patch
-
-PWD := $(shell pwd)
-
-# dm-csnap.ko is built from the single source dm-csnap.c, which kbuild
-# derives from obj-m by itself; no <module>-objs list is needed.
-obj-m := dm-csnap.o
-
-EXTRA_CFLAGS += -I$(obj) -Idrivers/md
-
-.PHONY: all install uninstall clean patches pre post add
-
-all:
-	${MAKE} -C ${KERNEL_BUILD} M=${PWD} modules USING_KBUILD=yes
-
-install: all
-	install -d ${incdir}/linux
-	install dm-csnap.h ${incdir}/linux
-	install -d ${module_dir}/drivers/md
-	install dm-csnap.ko ${module_dir}/drivers/md
-
-uninstall:
-	${UNINSTALL} dm-csnap.ko ${module_dir}/drivers/md
-	${UNINSTALL} dm-csnap.h ${incdir}/linux
-
-# Only remove build products: "*.o *.ko" rather than every file whose
-# name happens to end in "o".
-clean:
-	rm -rf linux *.mod.c .dm-csnap.ko.cmd \
-		.tmp_versions *.o *.ko .*.o.cmd *~
-
-
-# Generate ${PATCH_TARGET}: copy the pristine tree, drop our sources in,
-# then diff the two trees.
-patches: pre add post
-
-pre:
-	@if [ ! -d ${linux_orig} ] ; then \
-		echo "No linux source directory (${linux_orig})" ; \
-		exit 1; \
-	fi
-	mkdir -p ${linux_patched}
-	rsync -a --delete ${linux_orig}/ ${linux_patched}/
-
-# diff exits non-zero when the trees differ, which is the expected case,
-# so the subshell always reports success.
-post:
-	( cd ${top_srcdir}/patches ; diff -urN linux-orig linux-patched > ${PATCH_TARGET} ; exit 0 )
-
-# The Kconfig/Makefile hunks for DM_CSNAP live in drivers/md, so the
-# source has to be placed there too.
-add:
-	cp dm-csnap.c ${linux_patched}/drivers/md/
-	cp dm-csnap.h ${linux_patched}/include/linux/
diff --git a/csnap-kernel/src/dm-csnap.c b/csnap-kernel/src/dm-csnap.c
deleted file mode 100644
index 1f9758a..0000000
--- a/csnap-kernel/src/dm-csnap.c
+++ /dev/null
@@ -1,1147 +0,0 @@
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/pagemap.h>
-#include <linux/file.h>
-#include <linux/syscalls.h> // recvmsg
-#include <linux/socket.h>
-#include <linux/un.h>
-#include <net/sock.h>
-#include <asm/uaccess.h>
-#include <linux/bio.h>
-#include "dm.h"
-#include "dm-csnap.h"
-
-#define BREAK BUG()
-#define warn(string, args...) do { printk("%s: " string "\n", __func__, ##args); } while (0)
-#define error(string, args...) do { warn(string, ##args); BREAK; } while (0)
-#define assert(expr) do { if (!(expr)) error("Assertion " #expr " failed!\n"); } while (0)
-#define trace_on(args) args
-#define trace_off(args)
-
-#define trace trace_off
-
-/*
- * To do:
- *
- * - variable length bios
- * - unique cache
- * - receive chunk size
- * - make pending and hook a union
- * - get rid of multiple ranges per message misfeature
- * - rationalize sector vs chunk usage in messages
- * - detect message id wrap
- * - detect message timeout
- */
-
-/* Useful gizmos */
-
-/*
- * Push or pull exactly @count bytes through @file using the aio method
- * @op, looping over short transfers.  @mode (FMODE_READ or FMODE_WRITE)
- * must be permitted by file->f_mode.
- *
- * Returns 0 when everything was transferred, -EBADF if @file was not
- * opened for @mode, -EINVAL if @op is NULL, -EPIPE if @op reports a
- * zero-length transfer, or the negative value @op returned.
- */
-static int rwpipe(struct file *file, const void *buffer, unsigned int count,
-	ssize_t (*op)(struct kiocb *, const char *, size_t, loff_t), int mode)
-{
-	struct kiocb iocb;
-	mm_segment_t saved_fs;
-	int status = 0;
-	int done;
-
-	trace_off(warn("%s %i bytes", mode == FMODE_READ? "read": "write", count);)
-	if (!(file->f_mode & mode))
-		return -EBADF;
-	if (!op)
-		return -EINVAL;
-
-	init_sync_kiocb(&iocb, file); // new in 2.5 (hmm)
-	iocb.ki_pos = file->f_pos;
-
-	/* @buffer is a kernel address, so widen the address limit meanwhile */
-	saved_fs = get_fs();
-	set_fs(get_ds());
-	for (; count; count -= done, buffer += done) {
-		done = (*op)(&iocb, buffer, count, iocb.ki_pos);
-		if (done < 0) {
-			status = done;
-			break;
-		}
-		if (done == 0) {
-			status = -EPIPE;
-			break;
-		}
-		BUG_ON(done > count);
-	}
-	set_fs(saved_fs);
-
-	file->f_pos = iocb.ki_pos;
-	return status;
-}
-
-/* Read exactly @count bytes from @file into @buffer; see rwpipe() for the result. */
-static inline int readpipe(struct file *file, void *buffer, unsigned int count)
-{
-	/* the void * cast presumably hides a prototype mismatch with rwpipe's op - confirm */
-	void *read_op = (void *)file->f_op->aio_read;
-
-	return rwpipe(file, buffer, count, read_op, FMODE_READ);
-}
-
-/* Write exactly @count bytes from @buffer to @file; see rwpipe() for the result. */
-static inline int writepipe(struct file *file, void *buffer, unsigned int count)
-{
-	const int mode = FMODE_WRITE;
-
-	return rwpipe(file, buffer, count, file->f_op->aio_write, mode);
-}
-
-/*
- * outbead - build a message on the stack and send it in one go.
- * @SOCK:   struct file * to write to
- * @CODE:   value stored as the first member of the message head
- * @STRUCT: type of the message body; its size becomes the head's second member
- * @VALUES: initializers for the body (may be empty)
- *
- * Head and body are laid out back to back in one PACKED struct, so a
- * single writepipe() call sends both.  The statement expression
- * evaluates to writepipe()'s return value (0 on success).
- */
-#define outbead(SOCK, CODE, STRUCT, VALUES...) ({ \
-	struct { struct head head; STRUCT body; } PACKED message = \
-		{ { CODE, sizeof(STRUCT) }, { VALUES } }; \
-	writepipe(SOCK, &message, sizeof(message)); })
-
-/*
- * This gets the job done but it sucks as an internal interface: there
- * is no reason to deal with fds at all, we just want to receive the
- * (struct file *), we do not want to have to wrap the socket in a
- * fd just to call recv_fd, and user space pointer for the (bogus) data
- * payload is just silly.  Never mind the danger of triggering some
- * weirdo signal handling cruft deep in the socket layer.  This kind of
- * posturing - lathering layers of cruft upon cruft - is the stuff
- * Windows is made of, Linux is not supposed to be like that.  Fixing
- * this requires delving into the SCM_RIGHTS path deep inside sys_recvmsg
- * and breaking out the part that actually does the work, to be a usable
- * internal interface.  Put it on the list of things to do.
- */
-/*
- * recv_fd - receive one file descriptor passed as SCM_RIGHTS ancillary data.
- * @sock:  descriptor number of the socket to receive on
- * @bogus: buffer for the ordinary data payload, which the caller ignores
- * @len:   in: size of @bogus; out: number of payload bytes received
- *
- * Returns the received descriptor number (>= 0) on success.  On failure
- * returns a negative errno: whatever sys_recvmsg() reported, -EPIPE if
- * it returned no data at all, -ENODATA if no control message was
- * attached, or -EBADMSG if the control message is not exactly one
- * SCM_RIGHTS descriptor.
- */
-static int recv_fd(int sock, char *bogus, unsigned *len)
-{
-	char payload[CMSG_SPACE(sizeof(int))];
-	struct msghdr msg = {
-		.msg_control = payload,
-		.msg_controllen = sizeof(payload),
-		.msg_iov = &(struct iovec){ .iov_base = bogus, .iov_len = *len },
-		.msg_iovlen = 1,
-	};
-	mm_segment_t oldseg = get_fs();
-	struct cmsghdr *cmsg;
-	int result;
-
-	/* msg and its buffers are kernel memory, so lift the address limit */
-	set_fs(get_ds());
-	result = sys_recvmsg(sock, &msg, 0);
-	set_fs(oldseg);
-
-	if (result < 0)
-		return result;
-	/*
-	 * Zero must not be passed through: callers only test for < 0 and
-	 * would take it for descriptor 0.  Report it like rwpipe() reports
-	 * a zero-length transfer.
-	 */
-	if (result == 0)
-		return -EPIPE;
-	if (!(cmsg = CMSG_FIRSTHDR(&msg)))
-		return -ENODATA;
-	if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)) ||
-		cmsg->cmsg_level != SOL_SOCKET ||
-		cmsg->cmsg_type != SCM_RIGHTS)
-		return -EBADMSG;
-
-	*len = result;
-	return *((int *)CMSG_DATA(cmsg));
-}
-
-/* Invoke the unplug hook of @dev's request queue, if the queue has one. */
-static void kick(struct block_device *dev)
-{
-	request_queue_t *queue = bdev_get_queue(dev);
-
-	if (!queue->unplug_fn)
-		return;
-	queue->unplug_fn(queue);
-}
-
-/* ...Useful gizmos */
-
-/* Chunk number: a sector number shifted right by snapinfo.chunkshift */
-typedef u64 chunk_t;
-
-#define SECTOR_SHIFT 9			/* log2 of the sector size in bytes */
-/* snapinfo.flags: three masks plus one bit *number* (note the difference) */
-#define IS_SNAP_FLAG (1 << 0)		/* mask: client is a snapshot, see is_snapshot() */
-#define REPORT_BIT 1			/* bit number, used with test_and_set_bit() */
-#define RECOVER_FLAG (1 << 2)		/* mask: worker must go through recovery */
-#define FINISH_FLAG (1 << 3)		/* mask: target is being destroyed */
-#define NUM_BUCKETS 64			/* size of the snapinfo.pending hash table */
-#define MASK_BUCKETS (NUM_BUCKETS - 1)	/* id -> bucket mask, see hash_pending() */
-#define ID_BITS 16			/* query ids wrap after this many bits */
-
-/* Per-target client state, reached through dm_target->private. */
-struct snapinfo {
-	u64 id;				/* set by SET_IDENTITY, sent back in IDENTIFY */
-	unsigned long flags;		/* IS_SNAP_FLAG, REPORT_BIT, RECOVER_FLAG, FINISH_FLAG */
-	unsigned chunksize_bits;	/* log2 of the chunk size in bytes */
-	unsigned chunkshift;		/* sector >> chunkshift gives the chunk number */
-//	sector_t len;
-	int snap, nextid;		/* snap: sent in IDENTIFY; nextid: id for the next query */
-	u32 *shared_bitmap; // !!! get rid of this, use the inode cache
-	struct inode  *inode; /* the cache */
-	struct dm_dev *orgdev;		/* origin device */
-	struct dm_dev *snapdev;		/* snapshot store device */
-	struct file *sock;		/* server connection, installed by control() */
-	struct file *control_socket;	/* connection read by control() */
-	struct semaphore server_in_sem;	/* upped once sock has been received */
-	struct semaphore server_out_sem; /* held around each send on sock by worker() */
-	struct semaphore more_work_sem;	/* counts events queued for worker() */
-	struct semaphore recover_sem;	/* gates the worker's recovery path */
-	struct semaphore exit1_sem;	/* held by worker() while it runs */
-	struct semaphore exit2_sem;	/* held by incoming() while it runs */
-	struct semaphore exit3_sem;	/* held by control() while it runs */
-	struct list_head pending[NUM_BUCKETS]; /* queries sent to the server, hashed by id */
-	struct list_head queries;	/* queries waiting to be sent by worker() */
-	struct list_head releases;	/* finished origin reads whose lock must be released */
-	struct list_head locked;	/* origin reads hooked by replied_rw() */
-	spinlock_t pending_lock;	/* taken around pending[] and queries */
-	spinlock_t end_io_lock;		/* taken around releases and hook state */
-	int dont_switch_lists;		/* set while upload_locks() walks locked */
-};
-
-/* Returns 1 if IS_SNAP_FLAG is set in @info->flags, otherwise 0. */
-static inline int is_snapshot(struct snapinfo *info)
-{
-	return (info->flags & IS_SNAP_FLAG) != 0;
-}
-
-/* Returns 1 until FINISH_FLAG has been set in @info->flags. */
-static inline int running(struct snapinfo *info)
-{
-	return (info->flags & FINISH_FLAG) == 0;
-}
-
-/* Returns 1 while neither FINISH_FLAG nor RECOVER_FLAG is set. */
-static inline int worker_running(struct snapinfo *info)
-{
-	const unsigned long stop_mask = FINISH_FLAG|RECOVER_FLAG;
-
-	return (info->flags & stop_mask) == 0;
-}
-
-/*
- * Flag a server connection failure and push the worker into recovery.
- * Only the first caller acts; once REPORT_BIT is set, later calls
- * return immediately until the worker clears it again.
- */
-static void report_error(struct snapinfo *info)
-{
-	if (test_and_set_bit(REPORT_BIT, &info->flags))
-		return;
-	/* wake the worker so that it notices the flag set below */
-	up(&info->more_work_sem);
-	/* recover_sem stays taken here; control() ups it after reconnecting */
-	down(&info->recover_sem);
-	info->flags |= RECOVER_FLAG;
-}
-
-/* Static caches, shared by all csnap instances */
-
-static kmem_cache_t *pending_cache;
-static kmem_cache_t *end_io_cache;
-static struct super_block *snapshot_super;
-
-/* We cache query results because we are greedy about speed */
-
-#ifdef CACHE
-/*
- * Look up the cache slot for @chunk in @mapping.  The cache is an array
- * of u64 entries spread over page-cache pages, PAGE_SIZE / sizeof(u64)
- * entries per page; a page that is not yet up to date is zero-filled.
- *
- * Returns a pointer to the slot and stores the page in *@p, or returns
- * NULL if no page could be found or created.
- *
- * NOTE(review): only built when CACHE is defined.  Page_Uptodate() and
- * the u32 page_index look like leftovers from an older kernel API -
- * confirm before enabling CACHE.
- */
-static u64 *snap_map_cachep(struct address_space *mapping, chunk_t chunk, struct page **p)
-{
-	u32 page_index;
-	u32 page_pos;
-	struct page *page;
-	u64 *exceptions;
-
-	page_index = chunk / (PAGE_SIZE / sizeof(u64));
-	page_pos = chunk % (PAGE_SIZE / sizeof(u64));
-
-	page = find_or_create_page(mapping, page_index, GFP_KERNEL);
-	if (page) {
-		/* Clean page if it's a new one */
-		if (!Page_Uptodate(page)) {
-			memset(page_address(page), 0, PAGE_SIZE);
-			SetPageUptodate(page);
-		}
-
-		exceptions = page_address(page);
-		*p = page;
-		return &exceptions[page_pos];
-	}
-	return NULL;
-}
-
-/* Returns bit @chunk of @info->shared_bitmap (32 bits per word). */
-static inline int get_unshared_bit(struct snapinfo *info, chunk_t chunk)
-{
-	u32 word = info->shared_bitmap[chunk >> 5];
-
-	return (word >> (chunk & 31)) & 1;
-}
-
-/*
- * Sets bit @chunk of @info->shared_bitmap (32 bits per word).  The mask
- * is built from an unsigned 1 so that bit 31 does not shift into the
- * sign bit of an int.
- */
-static inline void set_unshared_bit(struct snapinfo *info, chunk_t chunk)
-{
-	info->shared_bitmap[chunk >> 5] |= 1U << (chunk & 31);
-}
-#endif
-
-/* Hash table matches up query replies to pending requests */
-
-/* One query to the server, allocated by csnap_map() and freed by replied_rw(). */
-struct pending {
-	unsigned id;		/* query id; selects the bucket via hash_pending() */
-	u64 chunk;		/* first chunk covered by the bio */
-	unsigned chunks;	/* number of chunks; csnap_map() always sets 1 */
-	struct bio *bio;	/* the deferred request */
-	struct list_head list;	/* link in snapinfo.queries or a snapinfo.pending bucket */
-};
-
-/*
- * Debug dump: print every entry of the pending hash table as id:chunk,
- * then the chunk of every query still queued for sending, each followed
- * by an entry count.  Holds pending_lock for the whole dump.
- */
-static void show_pending(struct snapinfo *info)
-{
-	unsigned i, total = 0;
-
-	spin_lock(&info->pending_lock);
-	warn("Pending server queries...");
-	for (i = 0; i < NUM_BUCKETS; i++) {
-		struct list_head *list;
-		list_for_each(list, info->pending + i) {
-			struct pending *pending = list_entry(list, struct pending, list);
-			/* total is never reset here, so only the first entry found gets a bucket label */
-			if (!total)
-				printk("[%u]: ", i);
-			printk("%u:%Lx ", pending->id, pending->chunk);
-			total++;
-		}
-	}
-	printk("(%u)\n", total);
-	if (!list_empty(&info->queries)) {
-		struct list_head *list;
-		total = 0;
-		warn("Queued queries...");
-		list_for_each(list, &info->queries) {
-			struct pending *pending = list_entry(list, struct pending, list);
-			printk("%Lx ", pending->chunk);
-			total++;
-		}
-		printk("(%u)\n", total);
-	}
-	spin_unlock(&info->pending_lock);
-}
-
-/* Maps a query id to its bucket index in snapinfo.pending. */
-static inline unsigned hash_pending(unsigned id)
-{
-	const unsigned bucket = id & MASK_BUCKETS;
-
-	return bucket;
-}
-
-/* Ah, now it gets interesting.  Called in interrupt context */
-
-/*
- * Bookkeeping for one snapshot read that was sent to the origin device:
- * remembers the bio's original completion callback while
- * snapshot_read_end_io() is installed in its place.
- */
-struct hook {
-	struct snapinfo *info;
-	sector_t sector;	/* bio's sector when hooked; converted to a chunk for the release */
-	/* needed only for end_io, make it a union */
-	bio_end_io_t *old_end_io;	/* set to NULL once the read has completed */
-	void *old_private;
-	/* needed after end_io, for release, make it a union */
-	struct list_head list;	/* link in snapinfo.locked, later snapinfo.releases */
-};
-
-/*
- * Completion callback installed by replied_rw() on snapshot reads that
- * go to the origin.  Restores the bio's original callback and private
- * data, queues the hook for release by the worker, wakes the worker,
- * and then chains to the original callback, returning its result.
- */
-static int snapshot_read_end_io(struct bio *bio, unsigned int done, int error)
-{
-	struct hook *hook = bio->bi_private;
-	struct snapinfo *info = hook->info;
-
-	trace(warn("sector %Lx", (long long)hook->sector);)
-	spin_lock(&info->end_io_lock);
-	bio->bi_end_io = hook->old_end_io;
-	bio->bi_private = hook->old_private;
-	/* NULL marks the hook as completed for upload_locks() */
-	hook->old_end_io = NULL;
-	/* while upload_locks() walks the locked list it moves completed hooks itself */
-	if (info->dont_switch_lists == 0)
-		list_move(&hook->list, &info->releases);
-	spin_unlock(&info->end_io_lock);
-	up(&info->more_work_sem);
-
-	/* hook is not touched past this point; the worker may free it */
-	return bio->bi_end_io(bio, done, error);
-}
-
-/* This is the part that does all the work. */
-
-/*
- * replied_rw - act on a server reply to one of our read/write queries.
- * @target: the device mapper target
- * @body:   reply body; body->count ranges follow, all for query body->id
- * @length: length of @body in bytes (not checked yet, see the !!! below)
- * @rw:     READ or WRITE, as implied by the message code
- * @snap:   nonzero if the IO goes to the snapshot store, zero for origin
- *
- * For each range, the matching pending query is looked up by id and
- * removed from the hash, its bio is remapped (snapshot store) or hooked
- * (snapshot read from origin) and then submitted.
- *
- * Returns 0 on success, or -1 if no pending query matches or the reply
- * does not cover the number of chunks that was asked for.
- */
-int replied_rw(struct dm_target *target, struct rw_request *body, unsigned length, int rw, int snap)
-{
-	struct snapinfo *info = target->private;
-	struct chunk_range *p = body->ranges;
-	unsigned shift = info->chunksize_bits - SECTOR_SHIFT, mask = (1 << shift) - 1;
-	int i, j, submitted = 0;
-
-	trace(show_pending(info);)
-	trace(warn("id = %u, %u ranges, %s %s", body->id, body->count,
-		rw == READ? "read from": "write to", snap? "snapshot": "origin");)
-
-	for (i = 0; i < body->count; i++) { // !!! check for length overrun
-		unsigned chunks = p->chunks, id = body->id;
-		struct list_head *list, *bucket = info->pending + hash_pending(id);
-		struct pending *pending;
-		struct bio *bio;
-
-		trace(warn("[%Lx/%x]", p->chunk, chunks);)
-		assert(chunks == 1);
-
-		spin_lock(&info->pending_lock);
-		list_for_each(list, bucket)
-			if ((pending = list_entry(list, struct pending, list))->id == id)
-				goto found;
-		warn("Can't find pending rw for chunk %u:%Lx", id, p->chunk);
-		spin_unlock(&info->pending_lock);
-		return -1;
-found:
-		list_del(&pending->list);
-		spin_unlock(&info->pending_lock);
-
-		bio = pending->bio;
-		trace(warn("Handle pending IO sector %Lx", (long long)bio->bi_sector);)
-
-		if (chunks != pending->chunks) {
-			/* expected is what we asked for, got is what the server sent */
-			warn("Message mismatch, expected %x got %x", pending->chunks, chunks);
-			kmem_cache_free(pending_cache, pending);
-			bio_io_error(bio, bio->bi_size);
-			return -1;
-		}
-
-		++p;
-		if (snap) {
-			/* the snapshot store addresses follow the range header */
-			chunk_t *p2 = (chunk_t *)p;
-			for (j = 0; j < chunks; j++) {
-				u64 physical = (*p2++ << shift) + (bio->bi_sector & mask);
-				trace(warn("logical %Lx = physical %Lx", (u64)bio->bi_sector, physical));
-				bio->bi_bdev = info->snapdev->bdev;
-				bio->bi_sector = physical;
-			}
-			p = (struct chunk_range *)p2;
-		} else if (rw == READ) {
-			/* snapshot read from origin */
-			unsigned long irqflags;
-			struct hook *hook;
-			trace(warn("hook end_io for %Lx", (long long)bio->bi_sector));
-			hook = kmem_cache_alloc(end_io_cache, GFP_KERNEL|__GFP_NOFAIL); // !!! union with pending
-			*hook = (struct hook){
-				.info = info,
-				.sector = bio->bi_sector,
-				.old_end_io = bio->bi_end_io,
-				.old_private = bio->bi_private };
-			bio->bi_end_io = snapshot_read_end_io;
-			bio->bi_private = hook;
-			/*
-			 * Completions of earlier reads move their hooks off
-			 * this list under end_io_lock, so adding to it needs
-			 * the same lock.
-			 */
-			spin_lock_irqsave(&info->end_io_lock, irqflags);
-			list_add(&hook->list, &info->locked);
-			spin_unlock_irqrestore(&info->end_io_lock, irqflags);
-		}
-
-		generic_make_request(bio);
-		submitted++;
-#ifdef CACHE
-		/* NOTE(review): no 'chunk' is declared in this function and p
-		 * has already been advanced - this cannot build as it stands */
-		for (j = 0; j < p->chunks; j++)
-			set_unshared_bit(info, chunk + j);
-#endif
-		kmem_cache_free(pending_cache, pending);
-	}
-	if (submitted){
-		kick(info->orgdev->bdev);
-		kick(info->snapdev->bdev);
-	}
-	return 0;
-}
-
-/*
- * There happen to be four flavors of server replies to rw queries, two
- * write and two read, but the symmetry ends there.  Only one flavor
- * (write) is for origin IO, because origin reads do not need global
- * synchronization.  The remaining three flavors are for snapshot IO.
- * Snapshot writes are always to the snapshot store, so there is only
- * one flavor.  On the other hand, snapshot reads can be from either
- * the origin or the snapshot store.  Only the server can know which.
- * Either or both kinds of snapshot read reply are possible for a given
- * query, which is where things get nasty.  These two kinds of replies
- * can be interleaved arbitrarily along the original read request, and
- * just to add a little more spice, the server may not send back the
- * results for an entire query in one message (it may decide to service
- * other queries first, or reply about the 'easiest' chunks first). The
- * client has to match up all these reply fragments to the original
- * request and decide what to do.  Such bizarre fragmentation of the
- * incoming request is unavoidable, it results from write access
- * patterns to the origin.  We just have to grin and deal with it.  So
- * without further ado, here is how the various reply flavors work:
- *
- * - Origin write replies just have logical ranges, since origin physical 
- *   address is the same as logical.
- *
- * - Snapshot read replies come back in two separate messages, one for
- *   the origin reads (if any) and one for the snapstore reads (if any),
- *   the latter includes snapstore addresses.  Origin reads are globally
- *   locked by the server, so we must send release messages on
- *   completion.
- *
- * - Snapshot writes are always to the snapstore, so snapstore write
- *   replies always include snapstore addresses.
- *
- * We know whether we're supposed to be a snapshot or origin client,
- * but we only use that knowledge as a sanity check.  The message codes
- * tell us explicitly whether the IO target is origin or snapstore.
- */
-
-/*
- * For now, we just block on incoming message traffic, so this daemon
- * can't do any other useful work.  It could if we used nonblocking pipe
- * IO but we have been too lazy to implement it so far.  So we have one
- * more daemon than we really need, and maybe we will get energetic one
- * day soon and get rid of it.
- *
- * When it comes time to destroy things, the daemon has to be kicked
- * out of its blocking wait, if it is in one, which it probably is.  We
- * do that by shutting down the socket.  This unblocks the waiters and
- * feeds them errors.  Does this work for all flavors of sockets?  I
- * don't know.  It obviously should, but we've seen some pretty silly
- * limitations in our long life, so nothing would surprise us at this
- * point.
- */
-/*
- * Thread body: read replies from the server socket and dispatch them.
- *
- * Asks for a server connection over the control socket, waits until
- * control() has installed info->sock, then loops reading one message
- * (head, then body) at a time.  Query replies are handed to
- * replied_rw().  On a socket error the worker is pushed into recovery
- * and a new connection is requested.  Holds exit2_sem while running.
- * Always returns 0.
- */
-static int incoming(struct dm_target *target)
-{
-	struct snapinfo *info = target->private;
-	struct messagebuf message; // !!! have a buffer in the target->info
-	struct file *sock;
-	struct task_struct *task = current;
-	int err, length;
-
-	strcpy(task->comm, "csnap-client");
-	down(&info->exit2_sem);
-	trace(warn("Client thread started, pid=%i", current->pid);)
-connect:
-	trace(warn("Request socket connection");)
-	outbead(info->control_socket, NEED_SERVER, struct { });
-	trace(warn("Wait for socket connection");)
-	/* upped by control() once info->sock is set, and by csnap_destroy() */
-	down(&info->server_in_sem);
-	trace(warn("got socket %p", info->sock);)
-	sock = info->sock;
-
-	while (running(info)) { // stop on module exit
-		int rw, to_snap;
-
-		trace(warn("wait message");)
-		if ((err = readpipe(sock, &message.head, sizeof(message.head))))
-			goto socket_error;
-		length = message.head.length;
-		/* the body is read into message.body, so bound it first */
-		if (length > maxbody)
-			goto message_too_long;
-		trace(warn("%x/%u", message.head.code, length);)
-		if ((err = readpipe(sock, &message.body, length)))
-			goto socket_error;
-	
-		/* the message code alone says which device and direction the reply is for */
-		switch (message.head.code) {
-		case REPLY_ORIGIN_WRITE:
-			rw = WRITE;
-			to_snap = 0;
-			break;
-
-		case REPLY_SNAPSHOT_WRITE:
-			rw = WRITE;
-			to_snap = 1;
-			break;
-
-		case REPLY_SNAPSHOT_READ_ORIGIN:
-			rw = READ;
-			to_snap = 0;
-			break;
-
-		case REPLY_SNAPSHOT_READ:
-			rw = READ;
-			to_snap = 1;
-			break;
-
-		case REPLY_IDENTIFY:
-			trace(warn("identify succeeded");)
-			/* from here on the worker may send on the server socket */
-			up(&info->server_out_sem);
-			outbead(info->control_socket, REPLY_CONNECT_SERVER, struct { });
-			continue;
-
-		default: 
-			warn("Unknown message %x", message.head.code);
-			continue;
-		}
-		if (length < sizeof(struct rw_request))
-			goto message_too_short;
-
-		/* NOTE(review): the result of replied_rw() is ignored here */
-		replied_rw(target, (void *)message.body, length, rw, to_snap);
-	}
-out:
-	up(&info->exit2_sem); /* !!! will crash if module unloaded before ret executes */
-	warn("%s exiting", task->comm);
-	return 0;
-message_too_long:
-	warn("message %x too long (%u bytes)", message.head.code, message.head.length);
-	goto out;
-message_too_short:
-	warn("message %x too short (%u bytes)", message.head.code, message.head.length);
-	goto out;
-socket_error:
-	warn("socket error %i", err);
-	/* csnap_destroy() shuts the socket down on purpose; that is not an error */
-	if (!running(info))
-		goto out;
-
-	warn("halt worker");
-	report_error(info);
-	goto connect;
-}
-
-/*
- * Here is our nonblocking worker daemon.  It handles all events other
- * than incoming socket traffic.  At the moment, its only job is to
- * send read release messages that can't be sent directly from the read
- * end_io function, which executes in interrupt context.  But soon its
- * duties will be expanded to include submitting IO that was blocked
- * because no server pipe is connected yet, or something broke the
- * pipe.  It may also have to resubmit some server queries, if the
- * server dies for some reason and a new one is incarnated to take its
- * place.  We also want to check for timed-out queries here.  Sure, we
- * have heartbeating in the cluster, but why not have the guy who knows
- * what to expect do the checking?  When we do detect timeouts, we will
- * punt the complaint upstairs using some interface that hasn't been
- * invented yet, because nobody has thought too deeply about what you
- * need to do, to detect faults really quickly and reliably.
- *
- * We throttle this daemon using a counting semaphore: each up on the
- * semaphore causes the daemon to loop through its polling sequence
- * once.  So we make sure we up the daemon's semaphore every time we
- * queue an event.  The daemon may well process more than one event per
- * cycle (we want that, actually, because then it can do some, e.g.,
- * message batching if it wants to) and will therefore end up looping
- * a few times without doing any work.  This is harmless, and much much
- * less nasty than missing an event.  When there are no pending events,
- * the daemon sleeps peacefully.  Killing the daemon is easy, we just
- * pull down the running flag and up the work semaphore, which causes
- * our faithful worker to drop out the bottom.
- */
-/*
- * Tell a (new) server which origin chunks this client still has read
- * locks on.  Called by the worker during recovery for snapshot clients.
- *
- * Steps: discard every queued release; send one UPLOAD_LOCK message for
- * each read on the locked list that has not completed yet, freeing
- * those that have; send FINISH_UPLOAD_LOCK; finally move reads that
- * completed in the meantime to the releases list.  While
- * dont_switch_lists is set, snapshot_read_end_io() leaves the locked
- * list alone so that it can be walked here.
- *
- * NOTE(review): the results of the outbead() sends are not checked.
- */
-void upload_locks(struct snapinfo *info)
-{
-	unsigned long irqflags;
-	struct hook *hook;
-	struct list_head *entry, *tmp;
-
-	spin_lock_irqsave(&info->end_io_lock, irqflags);
-	info->dont_switch_lists = 1;
-	/* drop queued releases without sending them */
-	while(!list_empty(&info->releases)){
-		entry = info->releases.prev;
-		hook = list_entry(entry, struct hook, list);
-		list_del(entry);
-		kmem_cache_free(end_io_cache, hook);
-	}
-	spin_unlock_irqrestore(&info->end_io_lock, irqflags);
-	list_for_each_safe(entry, tmp, &info->locked){
-		chunk_t chunk;
-
-		hook = list_entry(entry, struct hook, list);
-		spin_lock_irqsave(&info->end_io_lock, irqflags);
-		/* old_end_io == NULL: the read has already completed, nothing to upload */
-		if (hook->old_end_io == NULL){
-			list_del(entry);
-			kmem_cache_free(end_io_cache, hook);
-			spin_unlock_irqrestore(&info->end_io_lock, irqflags);
-			continue;
-		}
-		spin_unlock_irqrestore(&info->end_io_lock, irqflags);
-		chunk = hook->sector >> info->chunkshift;
-		outbead(info->sock, UPLOAD_LOCK, struct rw_request1, .count = 1, .ranges[0].chunk = chunk, .ranges[0].chunks = 1);
-	}
-	outbead(info->sock, FINISH_UPLOAD_LOCK, struct {});
-	spin_lock_irqsave(&info->end_io_lock, irqflags);
-	/* reads that completed during the upload now need a normal release */
-	list_for_each_safe(entry, tmp, &info->locked){
-		hook = list_entry(entry, struct hook, list);
-		if (hook->old_end_io == NULL)
-			list_move(&hook->list, &info->releases);
-	}
-	info->dont_switch_lists = 0;
-	spin_unlock_irqrestore(&info->end_io_lock, irqflags);
-}
-
-/*
- * Move every query from the pending hash table back onto the queries
- * list so that the worker sends it again, upping more_work_sem once per
- * query moved.
- */
-static void requeue_queries(struct snapinfo *info)
-{
-	struct list_head *bucket;
-	struct list_head *end = info->pending + NUM_BUCKETS;
-
-	trace(show_pending(info);)
-	spin_lock(&info->pending_lock);
-	warn("");
-	for (bucket = info->pending; bucket != end; bucket++) {
-		/* always take the head; list_move() unlinks it from the bucket */
-		while (!list_empty(bucket)) {
-			struct list_head *first = bucket->next;
-			struct pending *query = list_entry(first, struct pending, list);
-			trace_on(warn("requeue %u:%Lx", query->id, query->chunk);)
-
-			list_move(first, &info->queries);
-			up(&info->more_work_sem);
-		}
-	}
-	spin_unlock(&info->pending_lock);
-	trace(show_pending(info);)
-}
-
-/*
- * Thread body: send queued messages to the server.
- *
- * Each pass of the main loop waits on more_work_sem, then sends one
- * query message per entry on the queries list (moving the entry into
- * the pending hash first) and one FINISH_SNAPSHOT_READ message per
- * entry on the releases list.  Each send is bracketed by
- * server_out_sem.
- *
- * The recover path waits on recover_sem, uploads read locks (snapshot
- * clients only), requeues all pending queries, clears RECOVER_FLAG and
- * REPORT_BIT and re-enters the main loop.  It is taken once at startup,
- * after a send error, and when RECOVER_FLAG stops the main loop.
- * Holds exit1_sem while running.  Always returns 0.
- */
-static int worker(struct dm_target *target)
-{
-	struct snapinfo *info = target->private;
-	struct task_struct *task = current;
-	int err;
-
-	strcpy(task->comm, "csnap-worker");
-	trace(warn("Worker thread started, pid=%i", current->pid);)
-	down(&info->exit1_sem);
-	goto recover; /* just for now we'll always upload locks, even on fresh start */
-restart:
-	while (worker_running(info)) {
-		unsigned long irqflags;
-		down(&info->more_work_sem);
-
-		/* Send message for each pending request. */
-		spin_lock(&info->pending_lock);
-		while (!list_empty(&info->queries) && worker_running(info)) {
-			struct list_head *entry = info->queries.prev;
-			struct pending *pending = list_entry(entry, struct pending, list);
-
-			/* hash it before sending so that the reply can find it */
-			list_del(entry);
-			list_add(&pending->list, info->pending + hash_pending(pending->id));
-			spin_unlock(&info->pending_lock);
-			trace(show_pending(info);)
-
-			down(&info->server_out_sem);
-			trace(warn("Server query [%Lx/%x]", pending->chunk, pending->chunks);)
-			/* on failure this jumps out with server_out_sem still held */
-			if ((err = outbead(info->sock,
-				bio_data_dir(pending->bio) == WRITE? QUERY_WRITE: QUERY_SNAPSHOT_READ,
-				struct rw_request1,
-					.id = pending->id, .count = 1,
-					.ranges[0].chunk = pending->chunk,
-					.ranges[0].chunks = pending->chunks)))
-				goto report;
-			up(&info->server_out_sem);
-			spin_lock(&info->pending_lock);
-		}
-		spin_unlock(&info->pending_lock);
-
-		/* Send message for each pending read release. */
-		spin_lock_irqsave(&info->end_io_lock, irqflags);
-		while (!list_empty(&info->releases) && worker_running(info)) {
-			struct list_head *entry = info->releases.prev;
-			struct hook *hook = list_entry(entry, struct hook, list);
-			chunk_t chunk = hook->sector >> info->chunkshift;
-
-			list_del(entry);
-			spin_unlock_irqrestore(&info->end_io_lock, irqflags);
-			trace(warn("release sector %Lx, chunk %Lx", (long long)hook->sector, chunk);)
-			/* chunk was computed above, the hook itself is no longer needed */
-			kmem_cache_free(end_io_cache, hook);
-			down(&info->server_out_sem);
-			/* on failure this jumps out with server_out_sem still held */
-			if ((err = outbead(info->sock, FINISH_SNAPSHOT_READ, struct rw_request1,
-				.count = 1, .ranges[0].chunk = chunk, .ranges[0].chunks = 1)))
-				goto report;
-			up(&info->server_out_sem);
-			spin_lock_irqsave(&info->end_io_lock, irqflags);
-		}
-		spin_unlock_irqrestore(&info->end_io_lock, irqflags);
-
-		trace(warn("Yowza! More work?");)
-	}
-	/* the loop ended because of RECOVER_FLAG or FINISH_FLAG; sort out which */
-	if ((info->flags & RECOVER_FLAG)) {
-		down(&info->server_out_sem);
-		up(&info->more_work_sem);
-		goto recover;
-	}
-finish:
-	up(&info->exit1_sem); /* !!! crashes if module unloaded before ret executes */
-	trace_on(warn("%s exiting", task->comm);)
-	return 0;
-
-report:
-	warn("worker socket error %i", err);
-	report_error(info);
-recover:
-	trace_on(warn("worker recovering");)
-	/* upped by control() after a (re)connect, and by csnap_destroy() */
-	down(&info->recover_sem);
-	if ((info->flags & FINISH_FLAG))
-		goto finish;
-	if (is_snapshot(info))
-		upload_locks(info);
-	requeue_queries(info);
-	trace_on(warn("worker resuming");)
-
-	info->flags &= ~(RECOVER_FLAG|(1 << REPORT_BIT));
-	up(&info->recover_sem);
-	goto restart;
-}
-
-/*
- * Yikes, a third daemon, that makes four including the user space
- * monitor.  This daemon proliferation is due to not using poll, which
- * we should fix at some point.  Or maybe we should wait for aio to
- * work properly for sockets, and use that instead.  Either way, we
- * can combine the two socket-waiting daemons into one, which will look
- * nicer in ps.  Practically speaking, it doesn't matter a whole lot
- * though, if we just stay lazy and have too many daemons.
- *
- * At least, combine this code with incoming, with just the switches
- * different.
- */
-/*
- * Thread body: read and act on messages from the control socket.
- *
- * SET_IDENTITY stores the client id.  CONNECT_SERVER receives the
- * server socket as a passed file descriptor, stores it in info->sock,
- * wakes incoming(), sends IDENTIFY to the server and releases the
- * worker's recovery path.  Unknown codes are logged and skipped.
- * The loop ends on FINISH_FLAG, a socket error or an oversized message.
- * Holds exit3_sem while running.  Always returns 0.
- */
-static int control(struct dm_target *target)
-{
-	struct task_struct *task = current;
-	struct snapinfo *info = target->private;
-	struct messagebuf message; // !!! have a buffer in the target->info
-	struct file *sock;
-	int err, length;
-
-	strcpy(task->comm, "csnap-control");
-	trace(warn("Control thread started, pid=%i", current->pid);)
-	sock = info->control_socket;
-	trace(warn("got socket %p", sock);)
-
-	down(&info->exit3_sem);
-	while (running(info)) {
-		trace(warn("wait message");)
-		if ((err = readpipe(sock, &message.head, sizeof(message.head))))
-			goto socket_error;
-		trace(warn("got message header code %x", message.head.code);)
-		length = message.head.length;
-		/* the body is read into message.body, so bound it first */
-		if (length > maxbody)
-			goto message_too_long;
-		trace(warn("%x/%u", message.head.code, length);)
-		if ((err = readpipe(sock, &message.body, length)))
-			goto socket_error;
-	
-		switch (message.head.code) {
-		case SET_IDENTITY:
-			info->id = ((struct set_id *)message.body)->id;
-			warn("id set: %Lu", info->id);
-			break;
-		case CONNECT_SERVER: {
-			unsigned len = 4;
-			char bogus[len];
-			int sock_fd = get_unused_fd(), fd;
-
-			if (sock_fd < 0) {
-				warn("Can't get fd, error %i", sock_fd);
-				break;
-			}
-			/* recv_fd() wants a descriptor number, so wrap the control socket in one */
-			/* NOTE(review): no extra reference on sock is taken for the
-			 * fd table here, and put_unused_fd() below does not look
-			 * like it clears the installed entry - confirm */
-			fd_install(sock_fd, sock);
-			if ((fd = recv_fd(sock_fd, bogus, &len)) < 0) {
-				warn("recv_fd failed, error %i", fd);
-				put_unused_fd(sock_fd);
-				break;
-			}
-			trace(warn("Received socket %i", fd);)
-			info->sock = fget(fd);
-			current->files->fdt->fd[fd] = NULL; /* this is sooo hokey */
-			put_unused_fd(sock_fd);
-			sys_close(fd);
-			/* let incoming() start reading from the new server socket */
-			up(&info->server_in_sem);
-			outbead(info->sock, IDENTIFY, struct identify, .id = info->id, .snap = info->snap);
-			up(&info->recover_sem); /* worker uploads locks now */
-			break;
-		}
-		default: 
-			warn("Unknown message %x", message.head.code);
-			continue;
-		}
-	}
-out:
-	up(&info->exit3_sem); /* !!! will crash if module unloaded before ret executes */
-	warn("%s exiting", task->comm);
-	return 0;
-message_too_long:
-	warn("message %x too long (%u bytes)", message.head.code, message.head.length);
-	goto out;
-socket_error:
-	warn("socket error %i", err);
-	goto out;
-}
-
-/*
- * This is the device mapper mapping method, which does one of three things:
- * (1) tells device mapper to go ahead and submit the request with a default
- * identity mapping (return 1) (2) tells device mapper to forget about the
- * request (return 0), goes off and does its own thing, or (3) on a bad
- * day, tells device mapper to fail the IO (return negative errnum).
- *
- * This is pretty simple: we just hand any origin reads back to device mapper
- * after filling in the origin device.  Then, we check the cache to see if
- * if conditions are right to map the request locally, otherwise we need help
- * from the server, so we remember the request in the pending hash and send
- * off the appropriate server query.
- *
- * To make this a little more interesting, our server connection may be broken
- * at the moment, or may not have been established yet, in which case we have
- * to defer the request until the server becomes available.
- */
-/*
- * csnap_map - device mapper map method.
- * @target:  the device mapper target
- * @bio:     the request to map
- * @context: per-request map info (unused)
- *
- * Returns 1 to let device mapper submit @bio as mapped here (origin
- * reads, and cache hits when CACHE is defined), 0 if the request was
- * queued for the worker to ask the server about, or a negative value to
- * fail the IO.
- */
-static int csnap_map(struct dm_target *target, struct bio *bio, union map_info *context)
-{
-	struct snapinfo *info = target->private;
-	struct pending *pending;
-	chunk_t chunk;
-
-	bio->bi_bdev = info->orgdev->bdev;
-	/* origin reads need no help from the server */
-	if (bio_data_dir(bio) == READ && !is_snapshot(info))
-		return 1;
-
-	chunk = bio->bi_sector >> info->chunkshift;
-	trace(warn("map %Lx/%x, chunk %Lx", (long long)bio->bi_sector, bio->bi_size, chunk);)
-	assert(bio->bi_size <= 1 << info->chunksize_bits);
-#ifdef CACHE
-	if (is_snapshot(info)) { // !!! use page cache for both
-		struct page *page;
-		u64 *exception = snap_map_cachep(info->inode->i_mapping, chunk, &page);
-	
-		if (!exception) {
-			printk("Failed to get a page for sector %ld\n", bio->bi_sector);
-			return -1;
-		}
-
-		u64 exp_chunk = *exception;
-		UnlockPage(page);
-		if (exp_chunk) {
-			bio->bi_sector = bio->bi_sector + ((exp_chunk - chunk) << info->chunkshift);
-			return 1;
-		}
-	} else {
-		if (info->shared_bitmap && get_unshared_bit(info, chunk))
-			return 1;
-	}
-#endif
-	pending = kmem_cache_alloc(pending_cache, GFP_NOIO|__GFP_NOFAIL);
-	*pending = (struct pending){ .bio = bio, .chunk = chunk, .chunks = 1 };
-	spin_lock(&info->pending_lock);
-	/*
-	 * Replies are matched to requests by id alone, so two requests
-	 * mapped at the same time must never get the same id: allocate it
-	 * under the lock that also protects the queue.  Ids wrap after
-	 * ID_BITS bits.
-	 */
-	pending->id = info->nextid;
-	info->nextid = (pending->id + 1) & ((1U << ID_BITS) - 1);
-	list_add(&pending->list, &info->queries);
-	spin_unlock(&info->pending_lock);
-	/* one up per queued query wakes the worker */
-	up(&info->more_work_sem);
-	return 0;
-}
-
-/*
- * Carefully crafted not to care about how far we got in the process
- * of instantiating our client.  As such, it serves both for error
- * abort and device unload destruction.  We have to scour our little
- * world for resources and give them all back, including any pending
- * requests, context structures and daemons.  The latter have to be
- * convinced to exit on demand, and we must be sure they have exited,
- * so we synchronize that with semaphores.  This isn't 100% foolproof;
- * there is still the possibility that the destructor could gain
- * control between the time a daemon ups its exit semaphore and when
- * it has actually returned to its caller.  In that case, the module
- * could be unloaded and the exiting thread will segfault.  This is
- * a basic flaw in Linux that I hope to get around to fixing at some
- * point, one way or another.
- */
-static int shutdown_socket(struct file *socket)
-{
-	struct socket *sock = SOCKET_I(socket->f_dentry->d_inode);
-	return sock->ops->shutdown(sock, RCV_SHUTDOWN);
-}
-
-static void csnap_destroy(struct dm_target *target)
-{
-	struct snapinfo *info = target->private;
-	int err; /* I have no mouth but I must scream */
-
-	trace(warn("%p", target);)
-	if (!info)
-		return;
-
-	/* Unblock helper threads */
-	info->flags |= FINISH_FLAG;
-	up(&info->server_in_sem); // unblock incoming thread
-	up(&info->server_out_sem); // unblock io request threads
-	up(&info->recover_sem); // unblock worker recovery
-
-	if (info->sock && (err = shutdown_socket(info->sock)))
-		warn("server socket shutdown error %i", err);
-	if (info->sock && (err = shutdown_socket(info->control_socket)))
-		warn("control socket shutdown error %i", err);
-
-	up(&info->more_work_sem);
-
-	// !!! wrong! the thread might be just starting, think about this some more
-	// ah, don't let csnap_destroy run while csnap_create is spawning threads
-	down(&info->exit1_sem);
-	warn("thread 1 exited");
-	down(&info->exit2_sem);
-	warn("thread 2 exited");
-	down(&info->exit3_sem);
-	warn("thread 3 exited");
-
-	if (info->sock)
-		fput(info->sock);
-	if (info->inode)
-		iput(info->inode);
-	if (info->shared_bitmap)
-		vfree(info->shared_bitmap);
-	if (info->snapdev)
-		dm_put_device(target, info->snapdev);
-	if (info->orgdev)
-		dm_put_device(target, info->orgdev);
-	kfree(info);
-}
-
-/*
- * Woohoo, we are going to instantiate a new cluster snapshot virtual
- * device, what fun.
- */
-static int get_control_socket(char *sockname)
-{
-	mm_segment_t oldseg = get_fs();
-	struct sockaddr_un addr = { .sun_family = AF_UNIX };
-	int addr_len = sizeof(addr) - sizeof(addr.sun_path) + strlen(sockname); // !!! check too long
-	int sock = sys_socket(AF_UNIX, SOCK_STREAM, 0), err = 0;
-
-	trace(warn("Connect to control socket %s", sockname);)
-	if (sock <= 0)
-		return sock;
-	strncpy(addr.sun_path, sockname, sizeof(addr.sun_path));
-	if (sockname[0] == '@')
-		addr.sun_path[0] = 0;
-
-	set_fs(get_ds());
-	while ((err = sys_connect(sock, (struct sockaddr *)&addr, addr_len)) == -ECONNREFUSED)
-		break;
-//		yield();
-	set_fs(oldseg);
-
-	return err? err: sock;
-}
-
-/*
- * Round up to nearest 2**k boundary
- * !!! lose this
- */
-static inline ulong round_up(ulong n, ulong size)
-{
-	return (n + size - 1) & ~(size - 1);
-}
-
-static int csnap_create(struct dm_target *target, unsigned argc, char **argv)
-{
-	u64 chunksize_bits = 12; // !!! when chunksize isn't always 4K, have to move all this to identify reply handler
-	struct snapinfo *info;
-	int err, i, snap, flags = 0;
-	char *error;
-#ifdef CACHE
-	unsigned bm_size;
-#endif
-
-	error = "csnap usage: orgdev snapdev sockname snapnum";
-	err = -EINVAL;
-	if (argc != 4)
-		goto eek;
-
-	snap = simple_strtol(argv[3], NULL, 0);
-	if (snap >= 0)
-		flags |= IS_SNAP_FLAG;
-
-	err = -ENOMEM;
-	error = "can't get kernel memory";
-	if (!(info = kmalloc(sizeof(struct snapinfo), GFP_KERNEL)))
-		goto eek;
-
-	*info = (struct snapinfo){ 
-		.flags = flags, .snap = snap,
-		.chunksize_bits = chunksize_bits,
-		.chunkshift = chunksize_bits - SECTOR_SHIFT};
-	target->private = info;
-	sema_init(&info->server_in_sem, 0);
-	sema_init(&info->server_out_sem, 0);
-	sema_init(&info->recover_sem, 0);
-	sema_init(&info->exit1_sem, 1);
-	sema_init(&info->exit2_sem, 1);
-	sema_init(&info->exit3_sem, 1);
-	sema_init(&info->more_work_sem, 0);
-	spin_lock_init(&info->pending_lock);
-	spin_lock_init(&info->end_io_lock);
-	INIT_LIST_HEAD(&info->queries);
-	INIT_LIST_HEAD(&info->releases);
-	INIT_LIST_HEAD(&info->locked);
-	for (i = 0; i < NUM_BUCKETS; i++)
-		INIT_LIST_HEAD(&info->pending[i]);
-
-	error = "Can't get snapshot device";
-	if ((err = dm_get_device(target, argv[0], 0, target->len, dm_table_get_mode(target->table), &info->snapdev)))
-		goto eek;
-	error = "Can't get origin device";
-	if ((err = dm_get_device(target, argv[1], 0, target->len, dm_table_get_mode(target->table), &info->orgdev)))
-		goto eek;
-	error = "Can't connect control socket";
-	if ((err = get_control_socket(argv[2])) < 0)
-		goto eek;
-	info->control_socket = fget(err);
-	sys_close(err);
-
-#ifdef CACHE
-	bm_size = round_up((target->len  + 7) >> (chunksize_bits + 3), sizeof(u32)); // !!! wrong
-	error = "Can't allocate bitmap for origin";
-	if (!(info->shared_bitmap = vmalloc(bm_size)))
-		goto eek;
-	memset(info->shared_bitmap, 0, bm_size);
-	if (!(info->inode = new_inode(snapshot_super)))
-		goto eek;
-#endif
-
-	error = "Can't start daemon";
-	if ((err = kernel_thread((void *)incoming, target, CLONE_KERNEL)) < 0)
-		goto eek;
-	if ((err = kernel_thread((void *)worker, target, CLONE_KERNEL)) < 0)
-		goto eek;
-	if ((err = kernel_thread((void *)control, target, CLONE_KERNEL)) < 0)
-		goto eek;
-	warn("Created snapshot device origin=%s snapstore=%s socket=%s snapshot=%i", argv[0], argv[1], argv[2], snap);
-	target->split_io = 1 << info->chunkshift; // !!! lose this as soon as possible
-	return 0;
-
-eek:	warn("Virtual device create error %i: %s!", err, error);
-	csnap_destroy(target);
-	target->error = error;
-	return err;
-
-	{ void *useme = show_pending; useme = useme; }
-}
-
-/* Is this actually useful?  It's really trying to be a message */
-
-static int csnap_status(struct dm_target *target, status_type_t type, char *result, unsigned int maxlen)
-{
-	char orgbuffer[32];
-	char snapbuffer[32];
-	struct snapinfo *info = target->private;
-
-	switch (type) {
-	case STATUSTYPE_INFO:
-		result[0] = '\0';
-		break;
-
-	case STATUSTYPE_TABLE:
-		format_dev_t(orgbuffer, info->orgdev->bdev->bd_dev);
-		format_dev_t(snapbuffer, info->snapdev->bdev->bd_dev);
-		snprintf(result, maxlen, "%s %s %u",
-			 orgbuffer, snapbuffer, 1 << info->chunksize_bits);
-		break;
-	}
-
-	return 0;
-}
-
-static struct target_type csnap = {
-	.name = "csnapshot",
-	.version = {0, 0, 0},
-	.module = THIS_MODULE,
-	.ctr = csnap_create,
-	.dtr = csnap_destroy,
-	.map = csnap_map,
-	.status = csnap_status,
-};
-
-int __init dm_csnap_init(void)
-{
-	int err = -ENOMEM;
-	char *what = "Cache create";
-	if (!(pending_cache = kmem_cache_create("csnap-pending",
-		sizeof(struct pending), __alignof__(struct pending), 0, NULL, NULL)))
-		goto bad1;
-	if (!(end_io_cache = kmem_cache_create("csnap-endio",
-		sizeof(struct hook), __alignof__(struct hook), 0, NULL, NULL)))
-		goto bad2;
-	what = "register";
-	if ((err = dm_register_target(&csnap)))
-		goto bad3;
-#ifdef CACHE
-	err = -ENOMEM;
-	what = "create snapshot superblock";
-	if (!(snapshot_super = alloc_super()))
-		goto bad4;
-#endif
-	return 0;
-
-#ifdef CACHE
-bad4:
-	dm_unregister_target(&csnap);
-#endif
-bad3:
-	kmem_cache_destroy(end_io_cache);
-bad2:
-	kmem_cache_destroy(pending_cache);
-bad1:
-	DMERR("%s failed\n", what);
-	return err;
-}
-
-void dm_csnap_exit(void)
-{
-	int err;
-	trace_on(warn(">>> module exit");)
-	if ((err = dm_unregister_target(&csnap)))
-		DMERR("Snapshot unregister failed %d", err);
-	if (pending_cache)
-		kmem_cache_destroy(pending_cache);
-	if (end_io_cache)
-		kmem_cache_destroy(end_io_cache);
-	kfree(snapshot_super);
-}
-
-module_init(dm_csnap_init);
-module_exit(dm_csnap_exit);
-
-MODULE_LICENSE("GPL");
-
diff --git a/csnap-kernel/src/dm-csnap.h b/csnap-kernel/src/dm-csnap.h
deleted file mode 100644
index ad9f965..0000000
--- a/csnap-kernel/src/dm-csnap.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#define PACKED __attribute__ ((packed))
-#define MAGIC  0xadbe
-
-struct head { uint32_t code; uint32_t length; } PACKED;
-
-enum csnap_codes
-{
-	REPLY_ERROR = 0xbead0000,
-	IDENTIFY,
-	REPLY_IDENTIFY,
-	QUERY_WRITE,
-	REPLY_ORIGIN_WRITE,
-	REPLY_SNAPSHOT_WRITE,
-	QUERY_SNAPSHOT_READ,
-	REPLY_SNAPSHOT_READ,
-	REPLY_SNAPSHOT_READ_ORIGIN,
-	FINISH_SNAPSHOT_READ,
-	CREATE_SNAPSHOT,
-	REPLY_CREATE_SNAPSHOT,
-	DELETE_SNAPSHOT,
-	REPLY_DELETE_SNAPSHOT,
-	DUMP_TREE,
-	INITIALIZE_SNAPSTORE,
-	NEED_SERVER,
-	CONNECT_SERVER,
-	REPLY_CONNECT_SERVER,
-	CONTROL_SOCKET,
-	SERVER_READY,
-	START_SERVER,
-	SHUTDOWN_SERVER,
-	SET_IDENTITY,
-	UPLOAD_LOCK,
-	FINISH_UPLOAD_LOCK,
-	NEED_CLIENTS,
-	UPLOAD_CLIENT_ID,
-	FINISH_UPLOAD_CLIENT_ID,
-	REMOVE_CLIENT_IDS,
-};
-
-struct match_id { uint64_t id; uint64_t mask; } PACKED;
-struct set_id { uint64_t id; } PACKED;
-struct identify { uint64_t id; int32_t snap; } PACKED;
-struct create_snapshot { uint32_t snap; } PACKED;
-
-typedef uint16_t shortcount; /* !!! what is this all about */
-
-struct rw_request
-{
-	uint16_t id;
-	shortcount count;
-	struct chunk_range
-	{
-		uint64_t chunk;
-		shortcount chunks;
-	} PACKED ranges[];
-} PACKED;
-
-/* !!! can there be only one flavor of me please */
-struct rw_request1
-{
-	uint16_t id;
-	shortcount count;
-	struct chunk_range PACKED ranges[1];
-} PACKED;
-
-/* decruft me... !!! */
-#define maxbody 500
-struct rwmessage { struct head head; struct rw_request body; };
-struct messagebuf { struct head head; char body[maxbody]; };
-/* ...decruft me */
diff --git a/csnap/COPYING b/csnap/COPYING
deleted file mode 100644
index 5b6e7c6..0000000
--- a/csnap/COPYING
+++ /dev/null
@@ -1,340 +0,0 @@
-		    GNU GENERAL PUBLIC LICENSE
-		       Version 2, June 1991
-
- Copyright (C) 1989, 1991 Free Software Foundation, Inc.
-                       59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-			    Preamble
-
-  The licenses for most software are designed to take away your
-freedom to share and change it.  By contrast, the GNU General Public
-License is intended to guarantee your freedom to share and change free
-software--to make sure the software is free for all its users.  This
-General Public License applies to most of the Free Software
-Foundation's software and to any other program whose authors commit to
-using it.  (Some other Free Software Foundation software is covered by
-the GNU Library General Public License instead.)  You can apply it to
-your programs, too.
-
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-this service if you wish), that you receive source code or can get it
-if you want it, that you can change the software or use pieces of it
-in new free programs; and that you know you can do these things.
-
-  To protect your rights, we need to make restrictions that forbid
-anyone to deny you these rights or to ask you to surrender the rights.
-These restrictions translate to certain responsibilities for you if you
-distribute copies of the software, or if you modify it.
-
-  For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must give the recipients all the rights that
-you have.  You must make sure that they, too, receive or can get the
-source code.  And you must show them these terms so they know their
-rights.
-
-  We protect your rights with two steps: (1) copyright the software, and
-(2) offer you this license which gives you legal permission to copy,
-distribute and/or modify the software.
-
-  Also, for each author's protection and ours, we want to make certain
-that everyone understands that there is no warranty for this free
-software.  If the software is modified by someone else and passed on, we
-want its recipients to know that what they have is not the original, so
-that any problems introduced by others will not reflect on the original
-authors' reputations.
-
-  Finally, any free program is threatened constantly by software
-patents.  We wish to avoid the danger that redistributors of a free
-program will individually obtain patent licenses, in effect making the
-program proprietary.  To prevent this, we have made it clear that any
-patent must be licensed for everyone's free use or not licensed at all.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.
-\f
-		    GNU GENERAL PUBLIC LICENSE
-   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-
-  0. This License applies to any program or other work which contains
-a notice placed by the copyright holder saying it may be distributed
-under the terms of this General Public License.  The "Program", below,
-refers to any such program or work, and a "work based on the Program"
-means either the Program or any derivative work under copyright law:
-that is to say, a work containing the Program or a portion of it,
-either verbatim or with modifications and/or translated into another
-language.  (Hereinafter, translation is included without limitation in
-the term "modification".)  Each licensee is addressed as "you".
-
-Activities other than copying, distribution and modification are not
-covered by this License; they are outside its scope.  The act of
-running the Program is not restricted, and the output from the Program
-is covered only if its contents constitute a work based on the
-Program (independent of having been made by running the Program).
-Whether that is true depends on what the Program does.
-
-  1. You may copy and distribute verbatim copies of the Program's
-source code as you receive it, in any medium, provided that you
-conspicuously and appropriately publish on each copy an appropriate
-copyright notice and disclaimer of warranty; keep intact all the
-notices that refer to this License and to the absence of any warranty;
-and give any other recipients of the Program a copy of this License
-along with the Program.
-
-You may charge a fee for the physical act of transferring a copy, and
-you may at your option offer warranty protection in exchange for a fee.
-
-  2. You may modify your copy or copies of the Program or any portion
-of it, thus forming a work based on the Program, and copy and
-distribute such modifications or work under the terms of Section 1
-above, provided that you also meet all of these conditions:
-
-    a) You must cause the modified files to carry prominent notices
-    stating that you changed the files and the date of any change.
-
-    b) You must cause any work that you distribute or publish, that in
-    whole or in part contains or is derived from the Program or any
-    part thereof, to be licensed as a whole at no charge to all third
-    parties under the terms of this License.
-
-    c) If the modified program normally reads commands interactively
-    when run, you must cause it, when started running for such
-    interactive use in the most ordinary way, to print or display an
-    announcement including an appropriate copyright notice and a
-    notice that there is no warranty (or else, saying that you provide
-    a warranty) and that users may redistribute the program under
-    these conditions, and telling the user how to view a copy of this
-    License.  (Exception: if the Program itself is interactive but
-    does not normally print such an announcement, your work based on
-    the Program is not required to print an announcement.)
-\f
-These requirements apply to the modified work as a whole.  If
-identifiable sections of that work are not derived from the Program,
-and can be reasonably considered independent and separate works in
-themselves, then this License, and its terms, do not apply to those
-sections when you distribute them as separate works.  But when you
-distribute the same sections as part of a whole which is a work based
-on the Program, the distribution of the whole must be on the terms of
-this License, whose permissions for other licensees extend to the
-entire whole, and thus to each and every part regardless of who wrote it.
-
-Thus, it is not the intent of this section to claim rights or contest
-your rights to work written entirely by you; rather, the intent is to
-exercise the right to control the distribution of derivative or
-collective works based on the Program.
-
-In addition, mere aggregation of another work not based on the Program
-with the Program (or with a work based on the Program) on a volume of
-a storage or distribution medium does not bring the other work under
-the scope of this License.
-
-  3. You may copy and distribute the Program (or a work based on it,
-under Section 2) in object code or executable form under the terms of
-Sections 1 and 2 above provided that you also do one of the following:
-
-    a) Accompany it with the complete corresponding machine-readable
-    source code, which must be distributed under the terms of Sections
-    1 and 2 above on a medium customarily used for software interchange; or,
-
-    b) Accompany it with a written offer, valid for at least three
-    years, to give any third party, for a charge no more than your
-    cost of physically performing source distribution, a complete
-    machine-readable copy of the corresponding source code, to be
-    distributed under the terms of Sections 1 and 2 above on a medium
-    customarily used for software interchange; or,
-
-    c) Accompany it with the information you received as to the offer
-    to distribute corresponding source code.  (This alternative is
-    allowed only for noncommercial distribution and only if you
-    received the program in object code or executable form with such
-    an offer, in accord with Subsection b above.)
-
-The source code for a work means the preferred form of the work for
-making modifications to it.  For an executable work, complete source
-code means all the source code for all modules it contains, plus any
-associated interface definition files, plus the scripts used to
-control compilation and installation of the executable.  However, as a
-special exception, the source code distributed need not include
-anything that is normally distributed (in either source or binary
-form) with the major components (compiler, kernel, and so on) of the
-operating system on which the executable runs, unless that component
-itself accompanies the executable.
-
-If distribution of executable or object code is made by offering
-access to copy from a designated place, then offering equivalent
-access to copy the source code from the same place counts as
-distribution of the source code, even though third parties are not
-compelled to copy the source along with the object code.
-\f
-  4. You may not copy, modify, sublicense, or distribute the Program
-except as expressly provided under this License.  Any attempt
-otherwise to copy, modify, sublicense or distribute the Program is
-void, and will automatically terminate your rights under this License.
-However, parties who have received copies, or rights, from you under
-this License will not have their licenses terminated so long as such
-parties remain in full compliance.
-
-  5. You are not required to accept this License, since you have not
-signed it.  However, nothing else grants you permission to modify or
-distribute the Program or its derivative works.  These actions are
-prohibited by law if you do not accept this License.  Therefore, by
-modifying or distributing the Program (or any work based on the
-Program), you indicate your acceptance of this License to do so, and
-all its terms and conditions for copying, distributing or modifying
-the Program or works based on it.
-
-  6. Each time you redistribute the Program (or any work based on the
-Program), the recipient automatically receives a license from the
-original licensor to copy, distribute or modify the Program subject to
-these terms and conditions.  You may not impose any further
-restrictions on the recipients' exercise of the rights granted herein.
-You are not responsible for enforcing compliance by third parties to
-this License.
-
-  7. If, as a consequence of a court judgment or allegation of patent
-infringement or for any other reason (not limited to patent issues),
-conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot
-distribute so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you
-may not distribute the Program at all.  For example, if a patent
-license would not permit royalty-free redistribution of the Program by
-all those who receive copies directly or indirectly through you, then
-the only way you could satisfy both it and this License would be to
-refrain entirely from distribution of the Program.
-
-If any portion of this section is held invalid or unenforceable under
-any particular circumstance, the balance of the section is intended to
-apply and the section as a whole is intended to apply in other
-circumstances.
-
-It is not the purpose of this section to induce you to infringe any
-patents or other property right claims or to contest validity of any
-such claims; this section has the sole purpose of protecting the
-integrity of the free software distribution system, which is
-implemented by public license practices.  Many people have made
-generous contributions to the wide range of software distributed
-through that system in reliance on consistent application of that
-system; it is up to the author/donor to decide if he or she is willing
-to distribute software through any other system and a licensee cannot
-impose that choice.
-
-This section is intended to make thoroughly clear what is believed to
-be a consequence of the rest of this License.
-\f
-  8. If the distribution and/or use of the Program is restricted in
-certain countries either by patents or by copyrighted interfaces, the
-original copyright holder who places the Program under this License
-may add an explicit geographical distribution limitation excluding
-those countries, so that distribution is permitted only in or among
-countries not thus excluded.  In such case, this License incorporates
-the limitation as if written in the body of this License.
-
-  9. The Free Software Foundation may publish revised and/or new versions
-of the General Public License from time to time.  Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
-Each version is given a distinguishing version number.  If the Program
-specifies a version number of this License which applies to it and "any
-later version", you have the option of following the terms and conditions
-either of that version or of any later version published by the Free
-Software Foundation.  If the Program does not specify a version number of
-this License, you may choose any version ever published by the Free Software
-Foundation.
-
-  10. If you wish to incorporate parts of the Program into other free
-programs whose distribution conditions are different, write to the author
-to ask for permission.  For software which is copyrighted by the Free
-Software Foundation, write to the Free Software Foundation; we sometimes
-make exceptions for this.  Our decision will be guided by the two goals
-of preserving the free status of all derivatives of our free software and
-of promoting the sharing and reuse of software generally.
-
-			    NO WARRANTY
-
-  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
-FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
-OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
-PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
-OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
-TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
-PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
-REPAIR OR CORRECTION.
-
-  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
-REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
-INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
-OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
-TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
-YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
-PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGES.
-
-		     END OF TERMS AND CONDITIONS
-\f
-	    How to Apply These Terms to Your New Programs
-
-  If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
-  To do so, attach the following notices to the program.  It is safest
-to attach them to the start of each source file to most effectively
-convey the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-    <one line to give the program's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-
-Also add information on how to contact you by electronic and paper mail.
-
-If the program is interactive, make it output a short notice like this
-when it starts in an interactive mode:
-
-    Gnomovision version 69, Copyright (C) year name of author
-    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
-    This is free software, and you are welcome to redistribute it
-    under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, the commands you use may
-be called something other than `show w' and `show c'; they could even be
-mouse-clicks or menu items--whatever suits your program.
-
-You should also get your employer (if you work as a programmer) or your
-school, if any, to sign a "copyright disclaimer" for the program, if
-necessary.  Here is a sample; alter the names:
-
-  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
-  `Gnomovision' (which makes passes at compilers) written by James Hacker.
-
-  <signature of Ty Coon>, 1 April 1989
-  Ty Coon, President of Vice
-
-This General Public License does not permit incorporating your program into
-proprietary programs.  If your program is a subroutine library, you may
-consider it more useful to permit linking proprietary applications with the
-library.  If this is what you want to do, use the GNU Library General
-Public License instead of this License.
diff --git a/csnap/Makefile b/csnap/Makefile
deleted file mode 100644
index a9098ad..0000000
--- a/csnap/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-all:
-	${MAKE} -C src all
-
-clean:
-	${MAKE} -C src clean
-	${MAKE} -C tests clean
-
-install: all
-	${MAKE} -C src install
-
-uninstall:
-	${MAKE} -C src uninstall
-
-distclean: clean
-	rm -f make/defines.mk
diff --git a/csnap/README b/csnap/README
deleted file mode 100644
index 6f7c050..0000000
--- a/csnap/README
+++ /dev/null
@@ -1,67 +0,0 @@
-To install:
-
-For 2.4:
-
-  - Start with a 2.4.26 tree from kernel.org
-
-  - Apply the 2.4.26 (pre) device mapper patch
-
-For 2.6:
-
-  - Get a 2.6.7 kernel.org tree, it already has device mapper
-
-For both:
-
-  - Make sure device mapper is enabled in the configuration
-
-  - Now:
-      cd drivers/md
-      tar -xzf csnap-2.4.26.tgz
-      cd csnap
-      make
-
-  - Apply the csnap-2.4.26 patch that you see in this directory.
-    From the root of the 2.4.26 tree:
-
-      patch -p1 <csnap-2.4.26
-
-  - Rebuild the kernel on your test machine, install and reboot
-
-To run tests, you need a couple of block devices named /dev/test-origin 
-and /dev/test-snapstore.  These can just be symlinks to test partitions 
-on some disk, or they can even be files (I think, but I haven't tested 
-this for a while).  Once you have the devices you are ready to go:
-
-      make test
-
-Check out what the test is actually doing by looking in the makefile.  
-Basically, it's:
-
-  - try to clean up from a previous test by killing the server and
-    removing the dm device
-
-  - recreate the snapshot store, so each test starts clean
-
-  - start the snapshot server
-
-  - create a snapshot or origin client
-
-  - run pokedev, which randomly or sequentially reads or writes a bunch
-    of 4K blocks on the snapshot or origin virtual device
-
-Watch the tracing output to see what happened.  The kernel driver 
-tracing defaults to on at the moment, so there should be lots of 
-console output for each IO transfer.
-
-Please let me know what I've forgotten in the above recipes, so that I 
-can incorporate improvements.  One thing: the makefile tests 
-arbitrarily bind the server to port 8080, chances are good you are 
-using this for something else.  I should think of a better default 
-port.  Anyway, just edit the makefile for now.
-
-To remove the test device mapper device:
-
-     make test9
-
-Note note note!  This source tree is in mid-hack!  It's pretty in places
-and ugly in others.
diff --git a/csnap/doc/cluster.snapshot.design.html b/csnap/doc/cluster.snapshot.design.html
deleted file mode 100644
index b64f954..0000000
--- a/csnap/doc/cluster.snapshot.design.html
+++ /dev/null
@@ -1,1467 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
-<head>
-  <title>Clustered Snapshot Design</title>
-</head>
-<body>
-<div style="text-align: center;"><big><big><big><br>
-Clustered Snapshots<br>
-Detailed Design<small><br>
-</small></big></big></big></div>
-<div style="text-align: center;"><big><big><big><small><br>
-Revised October 6, 2004<br>
-</small></big></big></big><big><big><small><br>
-Daniel
-Phillips, Red Hat Inc.</small></big></big><br>
-<big><big><big><small><br>
-</small></big></big></big>
-<div style="text-align: justify;">
-<ol id="mozToc">
-<!--mozToc h1 1 h2 2 h3 3 h4 4 h5 5 h6 6--><li><a href="#mozTocId615751">Design
-Overview</a>
-    <ol>
-      <li><a href="#mozTocId166506">Overview of Data
-structures</a></li>
-      <li><a href="#mozTocId399340">Snapshot Store Layout</a></li>
-      <li><a href="#mozTocId558118">Superblock and
-Metablock </a></li>
-    </ol>
-  </li>
-  <li><a href="#mozTocId412004">Client-Server Messages</a>
-    <ol>
-      <li><a href="#mozTocId518793">Synchronization via
-server messages </a></li>
-      <li><a href="#mozTocId692447">Message Sequences </a>
-        <ol>
-          <li><a href="#mozTocId226061">Sequences Initiated
-by an Origin Client </a></li>
-          <li><a href="#mozTocId674706">Sequences Initiated
-by a Snapshot Client </a></li>
-          <li><a href="#mozTocId170275">Sequences Initiated
-by the Snapshot Server </a></li>
-        </ol>
-      </li>
-    </ol>
-  </li>
-  <li><a href="#mozTocId691886">Server Operation </a>
-    <ol>
-      <li><a href="#mozTocId330842">Exception Tree Format </a>
-        <ol>
-          <li><a href="#mozTocId268045">Leaf nodes</a></li>
-          <li><a href="#mozTocId478694">Index nodes</a></li>
-        </ol>
-      </li>
-      <li><a href="#mozTocId326876">Journal</a></li>
-      <li><a href="#mozTocId852538">Durability </a></li>
-      <li><a href="#mozTocId244480">Chunk Allocation
-Bitmaps</a></li>
-      <li><a href="#mozTocId189826">Allocation Policy</a></li>
-      <li><a href="#mozTocId194475">Expanding the
-Snapshot Store </a></li>
-      <li><a href="#mozTocId2062">Locking</a></li>
-      <li><a href="#mozTocId750451">Snapshot Deletion </a></li>
-      <li><a href="#mozTocId63589">Server Statistics</a></li>
-    </ol>
-  </li>
-  <li><a href="#mozTocId149578">Client Operation</a></li>
-  <li><a href="#mozTocId986766">Integration with
-Cluster Infrastructure </a>
-    <ol>
-      <li><a href="#mozTocId291552">Server Start</a></li>
-      <li><a href="#mozTocId546783">Server Shutdown </a></li>
-      <li><a href="#mozTocId763101">Failure Recognition</a>
-        <ol>
-          <li><a href="#mozTocId909827">Snapshot Server Failure </a></li>
-          <li><a href="#mozTocId909827"> Cluster Manager
-Failure </a></li>
-        </ol>
-      </li>
-      <li><a href="#mozTocId711483">Server Restart</a></li>
-      <li><a href="#mozTocId407640">Client-Server
-Connections </a>
-        <ol>
-          <li><a href="#mozTocId423442">Initial connection</a></li>
-          <li><a href="#mozTocId755654">Disconnect</a></li>
-          <li><a href="#mozTocId840922">Reconnect</a></li>
-        </ol>
-      </li>
-    </ol>
-  </li>
-  <li><a href="#mozTocId373989">Performance
-Characteristics</a>
-    <ol>
-      <li><a href="#mozTocId206505">Effect of chunk size</a></li>
-      <li><a href="#mozTocId700589">Effect of metadata
-block size</a></li>
-      <li><a href="#mozTocId473026">Effect of Holding
-Multiple Snapshots</a></li>
-      <li><a href="#mozTocId217737">Assumptions</a></li>
-      <li><a href="#mozTocId495690">Origin Read
-Performance</a></li>
-      <li><a href="#mozTocId519978">Sequential Origin
-Write Performance</a></li>
-      <li><a href="#mozTocId245882">Random Origin Write
-Performance</a></li>
-      <li><a href="#mozTocId983353">Snapshot Read
-Performance</a></li>
-      <li><a href="#mozTocId713768">Snapshot Write
-Performance</a></li>
-      <li><a href="#mozTocId899929">Network Performance</a></li>
-      <li><a href="#mozTocId163638">Overall Performance</a></li>
-    </ol>
-  </li>
-  <li><a href="#mozTocId895406">Parallelizing the
-Architecture</a></li>
-  <li><a href="#mozTocId426297">Adaptation to Single Node Client</a></li>
-</ol>
-</div>
-<big><big><big><small></small></big></big></big></div>
-<h1><a class="mozTocH1" name="mozTocId615751"></a>Design Overview</h1>
-The required functionality of the clustered snapshot target, documented
-in detail elsewhere, is briefly reviewed here.&nbsp; This target
-implements a virtual block device that sits on top of some other block
-device, allowing the creation of multiple writable snapshots of the
-state of the underlying device.&nbsp; Each snapshot is also a virtual
-block device implemented by the target.&nbsp; Multiple snapshots and
-the origin device can be active simultaneously on multiple nodes of a
-cluster.&nbsp; These virtual devices must act like real devices, that
-is, they must be durable in the sense that once data is written to the
-origin or to a snapshot it will never be lost even in the event of a
-system crash or power outage.&nbsp; Performance of the virtual devices
-must not be too much less than the underlying device.&nbsp; To save
-space, snapshots must share data chunks with the origin volume and with
-each other as much as possible.<br>
-<br>
-This design implements a client-server architecture
-where almost everything interesting happens in the server.&nbsp; For a
-write request to the origin, the client sends a message to the snapshot
-server instructing it to ensure that the write will not interfere with
-any snapshot by copying data from the origin to snapshot store if
-necessary.&nbsp; The server signals that it has accomplished this by
-acknowledging the message, and the client proceeds to carry out the
-write.&nbsp; A snapshot client handles writes in a similar way with the
-difference that the server informs the client in its response which of
-the chunks the client wishes to write to are located in the snapshot
-store, and where.&nbsp; Snapshot reads require some special
-handling.&nbsp; The snapshot client sends a message to the server
-indicating which chunks it wishes to read.&nbsp; The server locks any
-of those chunks that lie on the origin and thus are at risk of being
-overwritten by a client simultaneously active on the origin.&nbsp; The
-server informs the snapshot client which chunks it has locked, and
-after reading them, the client sends a message to the server that the
-chunks may be unlocked.&nbsp; This interaction between client and
-server via messages provides all the synchronization necessary to
-ensure that multiple simultaneously active origin and snapshot clients
-do not interfere with each other, preserving the illusion that these
-virtual targets are real devices.<br>
-<br>
-There is a fair amount of mechanism behind the scenes in order for the
-server to carry out the
-work required by the above messages faithfully and efficiently.&nbsp;
-This mechanism is implemented in the metadata server, the main focus of
-this design document.<br>
-<br>
-The metadata server maintains the current state of all snapshots in a
-single disk-based btree.&nbsp; The btree
-permits
-a variable number of exceptions to be stored per chunk.&nbsp; Within
-btree leaf nodes, bitmaps are used to record the sharing of snapshot
-store data.&nbsp; Each bitmap is the same size as a logical address, 64
-bits, giving a maximum of 64 simultaneous snapshots.&nbsp; Btree nodes
-are operated on directly as primary data, as 64 bit alignment of
-objects within the nodes is considered desirable for efficiency and
-to support the stringent alignment requirements of some architectures.<br>
-<br>
-Free space within the snapshot store is tracked with
-bitmaps with a granularity of one bit per chunk.&nbsp; Metadata
-structures&nbsp; on the other hand may have a finer granularity than a
-chunk, typically 4K, which is the current limitation on the size of a
-kernel buffer.&nbsp; When possible, metadata allocations will be made
-chunksize / metadata_blocksize blocks at a time; where this is not
-possible, the system must track partially allocated chunks.&nbsp; The
-current plan is to remember only the most recent partially allocated
-chunk and to do metadata allocations from it until it is completely
-filled.&nbsp; (This simple approach limits the ability to optimize
-metadata layout by setting allocation goal, so this strategy needs to
-be looked at critically.)<br>
-<br>
-A journal is used for durable/atomic updating of snapshot
-metadata.<br>
-<br>
-&nbsp; - Changes to the superblock<br>
-&nbsp; - Changes to the metaroot<br>
-&nbsp; - Current state of the BTree<br>
-&nbsp; - State of all partially allocated chunks<br>
-<br>
-Each origin client caches a bitmap indicating which origin chunks are
-known not to be shared by any snapshot, and thus can be written without
-synchronization.&nbsp; Each snapshot client similarly caches a table of
-exception store addresses of chunks that are known not to be shared.<br>
-<h2><a class="mozTocH2" name="mozTocId166506"></a>Overview of Data
-structures</h2>
-This section describes the main data structures used by the server and
-client, to provide context for the detailed discussions below.<br>
-<br>
-Data structures used by the server are disk-resident and partially
-cached in memory.&nbsp; Client caches are memory-resident.<br>
-<ul>
-  <li>Server On-disk Structures<br>
-    <br>
-  </li>
-  <ul>
-    <li>Superblock and Metaroot<br>
-Static global data, e.g., chunk size; location of the root of the
-exception btree and journal; size of the origin volume and snapshot
-store; allocation bitmap stride and base<br>
-    </li>
-    <li>Metablock<br>
-Miscellaneous variable data, e.g., snapshot list;
-snapshot deletes in progress; list of chunks allocated for metadata but
-only partially
-used; freespace total [do we really need this?]<br>
-    </li>
-    <li>Journal<br>
-For atomic updating and durability, changes to the exception btree,
-allocation bitmaps and
-miscellaneous other structures are written first to the journal then to
-their final destinations.&nbsp; Without such measures, snapshot virtual
-volumes would be vulnerable to disk corruption in the event of
-unexpected system failure, which is undesirable and unlike a physical
-volume.<br>
-    </li>
-    <li>Allocation bitmaps<br>
-Free chunks in the snapshot store are tracked via bitmaps located just
-above the journal in the snapshot store, indexed via a fully-populated
-radix tree<br>
-    </li>
-    <li>Exception Btree<br>
-Indexed by chunk address; for each chunk address that has exceptions,
-stores a list of exceptions.&nbsp; Each exception is paired with a
-bitmap indicating which snapshots share the exception<br>
-      <br>
-    </li>
-  </ul>
-  <li>Client Cache<br>
-    <br>
-  </li>
-  <ul>
-    <li>Unique bitmap (for origin client)<br>
-Each one bit in this bitmap indicates that all snapshots have
-exceptions for the given chunk and so the client may write the chunk
-without interacting with the snapshot server.&nbsp; A zero means the chunk
-is shared or its status is unknown; in either case the client must ask the server<br>
-    </li>
-    <li>Unique exception map (for snapshot client)<br>
-Each exception in the table may be zero if its state is unknown or the
-chunk does not have an exception, or the sector address of an exception
-store chunk, with the low bit one if the chunk is known&nbsp; not to be
-shared by any other snapshot.</li>
-  </ul>
-</ul>
-[need a diagram]
-<h2><a class="mozTocH2" name="mozTocId399340"></a>Snapshot Store Layout</h2>
-From low to high disk
-address:<br>
-<br>
-[ superblock ]<br>
-[ fixed size journal ]<br>
-[ bitmaps, btree leaves and nodes ]<br>
-[ exceptions ]<br>
-<br>
-[need a diagram]<br>
-<br>
-The dividing line between metadata and exceptions is not fixed, but
-rather is determined by allocation policy.<span
- style="font-weight: bold;"><br>
-</span>
-<h2><a name="mozTocId558118" class="mozTocH2"></a>Superblock and
-Metablock<br>
-</h2>
-The first 4K block in the snapshot store is the superblock, containing
-global information for the volume set at creation time:<br>
-<ul>
-  <li>Version<br>
-  </li>
-  <li>Size of snapshot store</li>
-  <li>Metadata block size (4K)<br>
-  </li>
-  <li>Chunk size (binary multiple of metadata block size)<br>
-  </li>
-  <li>Sector address of the metablock</li>
-  <li>Sector address of root of allocation bitmap radix tree</li>
-  <li>Sector address of beginning of journal</li>
-  <li>Size of journal<br>
-  </li>
-  <li>...?<br>
-  </li>
-</ul>
-The metablock contains variable global information:<br>
-<ul>
-  <li>Status flags</li>
-  <ul>
-    <li>Journal clean bit<br>
-    </li>
-  </ul>
-  <li>Highwater mark of chunk allocation</li>
-  <li>Total free space<br>
-  </li>
-  <li>Sector address and allocated size of partially allocated chunk<br>
-  </li>
-  <li>Bitmask of currently active snapshots</li>
-  <li>Bitmask of snapshots in process of being deleted</li>
-  <li>list of sector addresses of roots of snapshot store btrees</li>
-  <li>...?</li>
-</ul>
-Metablock data items that change frequently such as the highwater mark,
-freespace and partial allocation are also recorded in the journal tag
-block each time a journal transaction is committed.&nbsp; Updates to
-the metablock are always journalled.<br>
-<h1><a class="mozTocH1" name="mozTocId412004"></a><span
- style="font-weight: bold;">Client-Server Messages</span></h1>
-<h2><a class="mozTocH2" name="mozTocId518793"></a>Synchronization via
-server messages<br>
-</h2>
-Some form of synchronization is required in order to make clustered
-origin volumes and snapshot devices act as though they are independent
-devices even when they share data.&nbsp; The obvious approach would be
-to use
-a cluster lock manager, with shared locks for reads and exclusive locks
-for writes.&nbsp; But once a client has taken a lock for a write to a
-given chunk it needs to find out from the metadata server whether the
-original data of the chunk needs to be preserved in the snapshot store
-in the case of a write to the origin or a new exception needs to be
-created in the case of a write to a snapshot.&nbsp; This generates more
-message traffic than necessary; it turns out that server messages alone
-are sufficient for synchronization.&nbsp; The following fact is helpful
-in reducing the number of messages required: if a chunk is known to be
-unshared then it can be freely written or read without global
-synchronization.&nbsp; Furthermore, once an origin chunk becomes
-unshared it can only become shared again by creating a new
-snapshot.&nbsp; And once a chunk shared by more than one snapshot
-becomes unshared it will remain unshared until the snapshot is
-deleted.&nbsp; This property means that once a client is told that a
-given chunk is unshared it can rely on that information for a long
-period,
-or more precisely, until a new snapshot is created.&nbsp; A consequence
-of this is that all origin clients must clear their bitmap cache each
-time a new snapshot is created; a server-initiated request/response
-sequence is provided for that purpose.&nbsp; (Snapshot clients on the
-other hand do not have to clear their cache because snapshot chunks
-known to be unique reside in the snapshot store, thus cannot become
-shared with a new snapshot.&nbsp; This is yet another reason why
-snapshot IO performance is expected to ultimately supersede origin IO
-performance.&nbsp; [show why snapshot creation doesn't interfere with
-outstanding snapshot reads])<br>
-<br>
-This design is therefore based on the principle that a client will
-send a request to the snapshot server for every chunk that it does not
-know is unshared.&nbsp; When the client receives the response it knows
-that all requested chunks are unshared.&nbsp; The server has either
-determined this by consulting the btree or made it so by creating new
-exceptions.&nbsp; The
-client then caches the information that the chunks are unshared (in a
-bitmap) to optimize the case of repeated accesses to the same
-chunk.&nbsp;&nbsp; Once a chunk is known to be unshared the client can
-proceed with a write operation to the chunk.<br>
-<br>
-Special synchronization is required for snapshot reads, to prevent an
-origin chunk that is read via a snapshot from being overwritten by a
-write via the origin.&nbsp; Locking is used here, however the locking
-is
-internal to the snapshot server, except that a snapshot client must
-send messages to the server to unlock chunks that the server has locked
-on behalf of the client (only for snapshot reads, not for writes or
-origin reads).&nbsp; Thus the snapshot server acts as a minimal lock
-manager in the case of snapshot reads.<br>
-<br>
-A complete enumeration of cases should clarify the above logic:<br>
-<ul>
-  <li><span style="font-weight: bold;">Origin write to unshared chunk</span><br>
-A write to an origin chunk that is not shared (i.e., all snapshots
-already have exceptions for the chunk) does not require any global
-synchronization; it is the responsibility of the higher level
-application to ensure that no other reads or writes race with this
-write.<br>
-    <br>
-  </li>
-  <li><span style="font-weight: bold;">Origin write to shared chunk</span><br>
-A write to an origin chunk that is shared requires global
-synchronization.&nbsp; This synchronization is accomplished by sending
-a message to the snapshot server which will ensure that the chunk is
-not shared, by examining the exception btree and creating a new
-exception if necessary.&nbsp; When the client receives the reply the
-chunk is guaranteed not to be shared, which reduces to the case above
-and the write can proceed.&nbsp; When the client doesn't know whether a
-chunk is shared or unshared it must ask the server, so "unknown" is
-treated the same as "shared" by the client; once the server responds
-the chunk is known to be unshared and the client can cache this
-information; the chunk can only become shared again if a new snapshot
-is set, in which case the client will discard any sharing information
-it has cached.<br>
-    <br>
-  </li>
-  <li><span style="font-weight: bold;">Origin read from shared or
-unshared chunk</span><br>
-Reads from the origin do not require any global synchronization because
-the higher level application has the responsibility of ensuring that
-these do not race with writes to the same volume.&nbsp; Snapshot writes
-do not collide with origin reads because the destination of a snapshot
-write is always the snapshot store.<br>
-    <br>
-  </li>
-  <li><span style="font-weight: bold;">Snapshot write to unshared chunk</span><br>
-A write to a snapshot logical chunk that is not shared does not require
-global synchronization, as for origin writes.<br>
-    <br>
-  </li>
-  <li><span style="font-weight: bold;">Snapshot write to shared chunk</span><br>
-A write to a snapshot chunk that is shared is similar to a write to the
-origin except that the snapshot server must also return a set of
-exception store addresses to the client, which the client caches.<br>
-    <br>
-  </li>
-  <li><span style="font-weight: bold;">Snapshot read from unshared chunk<br>
-    </span>A snapshot read from an unshared chunk does not require any
-global synchronization.<br>
-    <br>
-  </li>
-  <li><span style="font-weight: bold;">Snapshot read from shared chunk</span><br>
-A snapshot read of a shared chunk requires global synchronization to
-ensure that a write to the same chunk via the origin does not overwrite
-the data while it is being read.&nbsp; The snapshot server performs
-this synchronization by locking locally on the snapshot server between
-writers and readers of shared chunks, details below.&nbsp; Chunks that
-are locked for reading on a snapshot have to be unlocked after the read
-is complete, which requires an additional message from the client to
-the server. Similarly to writes to shared chunks, if the client doesn't
-know that a chunk is unshared it must contact the server.&nbsp; The
-server's response indicates which chunks are unshared and the client
-can cache this information.</li>
-</ul>
-Each chunk in the original message, once acknowledged, is guaranteed to
-be unique because the metadata server has either
-determined that each chunk is already unique or it has completed a copy
-to snapshot store to make it unique.&nbsp; [note:&nbsp; chunks are not
-necessarily acknowledged in the requested order]&nbsp; The only way a
-unique chunk
-can become shared is when a new snapshot is set, in fact, at the time a
-new snapshot is set all its chunks are shared with at least the
-origin.&nbsp; For this reason, setting a new snapshot requires that all
-origin clients discard their bitmaps.&nbsp; Thus, the server sends a
-"new snapshot" message to every client and the new snapshot does not
-become writeable until every client has acknowledged this message.<br>
-<h2><a class="mozTocH2" name="mozTocId692447"></a>Message Sequences<br>
-</h2>
-This section enumerates the messages in each synchronization sequence.<br>
-<h3><a class="mozTocH3" name="mozTocId226061"></a>Sequences Initiated
-by an Origin Client<br>
-</h3>
-<ul>
-  <li>Origin write</li>
-  <ul>
-    <li>Client sends unique request</li>
-    <ul>
-      <li>request gives chunk address range<br>
-      </li>
-    </ul>
-    <li>Server responds with initiate</li>
-    <ul>
-      <li>empty message, i.e., "write can proceed" </li>
-    </ul>
-    <ul>
-      <li>server has verified each chunk is unshared or created new
-exceptions as necessary<br>
-      </li>
-      <li>all chunks are now unique so unique cache can be updated for
-these chunks</li>
-    </ul>
-  </ul>
-</ul>
-<h3><a class="mozTocH3" name="mozTocId674706"></a>Sequences Initiated
-by a Snapshot Client<br>
-</h3>
-<ul>
-  <li>Snapshot write</li>
-  <ul>
-    <li>Client sends unique request</li>
-    <ul>
-      <li>request gives chunk address range<br>
-      </li>
-    </ul>
-    <li>Server responds with initiate</li>
-    <ul>
-      <li>response lists exception addresses, if any</li>
-      <li>after verifying each chunk is unshared, or creating new
-exceptions where not</li>
-    </ul>
-  </ul>
-</ul>
-<ul>
-  <li>Snapshot read</li>
-  <ul>
-    <li>client sends read lock request</li>
-    <ul>
-      <li>request chunk address range<br>
-      </li>
-    </ul>
-    <li>Server responds with initiate</li>
-    <ul>
-      <li>lists exception addresses for non-origin chunks</li>
-      <li>lists which chunks need to be unlocked because they are not
-unique</li>
-    </ul>
-    <li>Client sends unlock when done</li>
-    <ul>
-      <li>for non-unique chunks above<br>
-        <br>
-      </li>
-    </ul>
-  </ul>
-  <li>Snapshot create</li>
-  <ul>
-    <li>client sends snapshot create message</li>
-    <li>server sends snapshot create advice to each origin client</li>
-    <li>each origin client clears its bitmap cache and acknowledges
-create<br>
-    </li>
-    <li>server returns create acknowledge</li>
-    <li>(I'm not completely satisfied with this sequence)<br>
-    </li>
-  </ul>
-</ul>
-<h3><a class="mozTocH3" name="mozTocId170275"></a>Sequences Initiated
-by the Snapshot Server<br>
-</h3>
-[snapshot create]; cluster management messages; error messages;
-shutdown message; [fixme]<br>
-<h1><a class="mozTocH1" name="mozTocId691886"></a>Server Operation<br>
-</h1>
-<h2><a class="mozTocH2" name="mozTocId330842"></a>Exception Tree Format<br>
-</h2>
-Exceptions for all snapshots are stored in a single btree indexed by
-logical chunk address.&nbsp; For each chunk, a list of exceptions is
-stored.&nbsp; Each exception consists of a snapshot address and a
-bitmap indicating which snapshots share that exception.<br>
-<br>
-Rather than serving only as a disk format to be translated into some
-more efficient cache format, the btree is meant to be operated on
-directly by the snapshot server.&nbsp; To this end, data structures in
-the btree nodes and leafs are designed for direct memory access, e.g.,
-all numeric values are aligned according to their size.<br>
-<br>
-An attempt has been made to keep the btree compact by designing the
-node formats carefully, without going to extremes such as using a
-serial compressed encoding which is unpacked into a memory structure in
-order to be accessed.&nbsp; In other words, difficult tradeoffs have
-been made here between compactness, simplicity and efficiency.<br>
-<h3><a class="mozTocH3" name="mozTocId268045"></a>Leaf nodes</h3>
-Leaf block format is optimized for rapid lookup and efficient
-insertion.&nbsp; At the bottom of each leaf is a header and a directory
-map that grows up towards a table of exceptions, which grows
-down.&nbsp; Each entry in the directory map gives the logical chunk
-address
-relative to a base address stored in the header, and has a pointer to
-one of the exceptions in the table at the top of the block.&nbsp; The
-entries are stored in sorted order according to logical chunk address
-and the pointers increase monotonically.<br>
-<br>
-[need a diagram]<br>
-<br>
-Using relative addresses allows the map entries to be more
-compact.&nbsp; In the current prototype map entries consist of two 32
-bit numbers, however two 16 bit numbers might work just as well and
-save more space, although a 16 bit relative block number might be so
-small as to cause a noticeable increase in the number of leaf blocks if
-exceptions are distributed sparsely.&nbsp; With 32 bit map numbers, a
-single exception requires 24 bytes; with 16 bit map numbers that would
-fall to 20 bytes, a 16% savings.&nbsp; The final determination of which
-is best should probably be determined experimentally.<br>
-<br>
-The difference between each two pointers in the map gives the number of
-exceptions for the chunk.&nbsp; The last entry in the map is a sentinel
-and points at the top of the block (this could be designed out to save
-a few bytes).&nbsp; Each entry in the exception table has the 64 bit
-sector address of an exception in the snapshot store and a bitmap to
-indicate which snapshots share the exception.<br>
-<br>
-The basic operations to locate and determine sharing of exceptions are
-quite efficient.&nbsp; A binary search is used to locate the target
-chunk address in the map, if it is present.&nbsp; This yields a list of
-exceptions on which efficient bitwise operations can be performed to
-determine sharing. From the point of view of the origin, a logical
-chunk is shared unless all active snapshots have exceptions for that
-chunk.&nbsp; From the point of view of a snapshot, a logical chunk is
-shared if it has no exception (i.e., is shared with the origin) or it
-has the same snapshot store address as another snapshot.<br>
-<br>
-A slight drawback of this leaf format is that insertion requires memory
-moves
-in order to maintain the entries in sorted order, and the memory moves
-get longer as the leaf block fills up.&nbsp; For relatively small leaf
-blocks, i.e. 4K, it is probably not a problem.&nbsp; This will be
-determined experimentally.&nbsp; Other, equivalently efficient leaf
-formats are certainly possible, though perhaps they will not be as
-simple.<br>
-<br>
-A more serious drawback of this leaf format is that as the number of
-snapshots increases, update overhead of the btree will increase more or
-less linearly.&nbsp; It is thus desirable to adopt a variant leaf
-format at some point capable of encoding runs of adjacent exceptions
-efficiently.&nbsp; [variant leaf format needs to be provided for in
-per-leaf flags and in superblock flags, for backward
-compatibility.]&nbsp; This issue is treated at greater length in the
-performance section, below.&nbsp; In brief: this will not be a problem
-for reasonable numbers of simultaneous snapshots.<br>
-<h3><a class="mozTocH3" name="mozTocId478694"></a>Index nodes</h3>
-An index node contains a table of entries each of which consists of a
-64 bit logical chunk address key and a 64 bit sector address of a lower
-level index node or, at the lowest index level, a leaf.&nbsp; The
-entries are in sorted order by logical chunk address.&nbsp; Two
-successive keys bound the range of entries contained by the lower level
-node.<br>
-<br>
-To locate the leaf block in which exceptions, if any, are stored for a
-given logical address, we descend recursively from the root, doing a
-binary search on the address key in each block and descending
-recursively into the node referenced by the sector address lying
-between the two keys that bound the target key.<br>
-<br>
-We search all the way to a leaf node even if we are examining a region
-of the address space that is completely empty.&nbsp; For write requests
-this is not inefficient because we will immediately add an exception to
-the&nbsp; leaf node we found if one is not present.&nbsp; For read
-requests it's a little more work than necessary but we probably do not
-care since this only affects snapshot reads, and only by a small amount
-(origin reads do not involve the server).<br>
-<h2><a class="mozTocH2" name="mozTocId326876"></a>Journal</h2>
-Any altered metadata blocks, i.e., btree leaf and index nodes,
-allocation bitmaps, etc., are written to a journal before being written
-to their final destinations.&nbsp; This guarantees that the metadata
-can
-be restored reliably to the state of the most recently committed
-exception or other metadata change.<br>
-<br>
-The size and location of the journal are determined at the time the
-snapshot store is created and cannot be changed.<br>
-<br>
-Each journal transaction consists of an arbitrary number of data blocks
-followed by a journal tag block.&nbsp; The tag block carries a magic
-number allowing it to be identified as such for the purpose of journal
-replay, and a sequence number used to locate the starting point for
-journal replay. Any data block written to the journal that happens to
-have the same number at the same location must be escaped by writing a
-zero to that location in a copy of the data.&nbsp; The tag block
-carries a list of snapshot
-store sector addresses which are the final destinations of the data
-blocks.&nbsp; The low bit of the address carries a bit flag indicating
-that the data block was escaped and the magic number needs to be
-restored before the data block is finally written.&nbsp; The tag block
-carries other miscellaneous information such as partial usage status of
-a chunk recently allocated for metadata.<br>
-<h2><a class="mozTocH2" name="mozTocId852538"></a>Durability<br>
-</h2>
-Thanks to the journal, the entire state of the metadata server (with one
-exception, see below) is always completely recorded on disk at the time
-any write is acknowledged.&nbsp; Thus, if the metadata server should
-fail a new one can be started, read the metadata root and continue as
-if nothing had happened.<br>
-<br>
-The one exception to this is that locking state of snapshot read
-requests against origin writes is kept only in memory on the
-server.&nbsp; While it is enough to simply require that all outstanding
-reads on clients must complete before a newly started metadata server
-can resume processing requests, there could be cases where this would
-cause an unnecessary delay of several seconds on server restart where
-there is a heavy backlog of IO.&nbsp; Since it is easy,
-clients will be asked to upload any outstanding locked snapshot reads
-to the new metadata server before the server resumes processing
-requests.&nbsp; This should only take a few tens of milliseconds.&nbsp;
-The total latency of starting a new metadata server then should be
-measured in tens of milliseconds (though detecting that a server has
-failed could easily take much longer).<br>
-<br>
-[more details of journalling]<br>
-[for the kernel implementation, considerations for continued access to
-metadata blocks that are currently in the journal writeout]<br>
-<h2><a class="mozTocH2" name="mozTocId244480"></a>Chunk Allocation
-Bitmaps</h2>
-Freespace in the snapshot store is managed via bitmaps with a
-resolution of one bit per chunk.&nbsp; Each bitmap is one 4K block in
-size and maps 2**15 chunks.&nbsp; The bitmap blocks are indexed via a
-radix tree rooted in the header.&nbsp; Each radix tree node contains
-512&nbsp; 8-byte sector addresses.&nbsp; As a slight simplification
-this tree is always 3 levels deep, giving 2^27 * 2^15 =&nbsp; 4
-trillion chunks, or 16 petabytes volume size limit with a minimal 4K
-chunk size.&nbsp; It is always fully populated, i.e., the tree is
-created at the time the snapshot store is created and changed only if
-the snapshot store is expanded.&nbsp; The second lowest level of the
-bitmap index tree is
-loaded into memory when the volume is activated, this will be about 512
-KB per terabyte of snapshot store.<br>
-<br>
-Bitmaps are cached in buffers and accessed via getblk.&nbsp; A pointer
-is kept to the most recently accessed bitmap, i.e., it is not released
-until a different bitmap is accessed, which eliminates the majority of
-getblk lookups assuming reasonably good locality of allocation.&nbsp;
-Likewise, a pointer is kept to the most recently accessed index
-block.&nbsp; Since nearly all accesses to bitmaps are associated with
-changing the bitmap, the bitmaps are kept near the journal rather than
-being distributed throughout the snapshot store.&nbsp; This is purely a
-matter of allocation policy since the actual locations of bitmaps are
-determined by the radix tree.<br>
-<br>
-Since metadata is allocated in blocks but allocation granularity is
-chunks, some chunks allocated to metadata may be only partially
-full.&nbsp; To avoid leakage of this unallocated space on unexpected
-restart, any partial allocations are recorded in the journal tag
-block.&nbsp; As a side effect, this means that a few
-metadata blocks can be allocated before a bitmap needs to be modified,
-saving some journal bandwidth.<br>
-<h2><a class="mozTocH2" name="mozTocId189826"></a>Allocation Policy</h2>
-[coming soon]<br>
-<br>
-[see the performance section re why this is important]<br>
-<br>
-[Should there be hints re total free space in each region?&nbsp;
-Probably]<br>
-<h2><a class="mozTocH2" name="mozTocId194475"></a>Expanding the
-Snapshot Store<br>
-</h2>
-To expand the snapshot store, additional bitmaps and associated radix
-tree index blocks need to be allocated, hopefully not too far away from
-the journal.&nbsp; Besides updating the snapshot store size in the
-header, this is the only change that needs to be made (I think).<br>
-<h2><a class="mozTocH2" name="mozTocId2062"></a>Locking</h2>
-Synchronization via locking is only required between snapshot reads and
-origin writes.&nbsp; This locking takes place entirely within the
-server so no cluster lock manager
-is involved.&nbsp; (In fact the server is a lock manager for the
-limited case of snapshot reads.)&nbsp; The locks are simple, hashed
-locks.&nbsp; The cost of this locking will be one hash lookup per
-snapshot read or origin write of a shared chunk, plus the unlock
-messages.&nbsp; This locking is only required when snapshot and origin
-virtual devices are active at the same time; e.g., the server does not
-have to take any locks to service origin write requests if no snapshot
-device is active, even if snapshots are being held.<br>
-<br>
-The server takes a (local) lock for each shared chunk in the range of a
-client snapshot read request, if the chunk has no exception for that
-snapshot and therefore might collide with a write to the origin.&nbsp;
-Locked chunks are marked in the response.&nbsp; The client sends a
-message to release the lock after it has completed the read.&nbsp;
-Meanwhile, the server briefly locks each chunk of a client's write
-request after completing the copy to snapshot store and recording the
-new exception, but before allowing the actual write to proceed by
-replying to the client's request.&nbsp; This ensures that a contending
-read always completes before the write to the origin takes place or is
-initiated after the new exception has been recorded, thus directing the
-read to the snapshot store instead of the origin.<br>
-<h2><a class="mozTocH2" name="mozTocId750451"></a>Snapshot Deletion </h2>
-Because it packs together information for multiple
-snapshots in each leaf node, the exception btree is optimized for
-lookup and exception
-insertion as it should be.&nbsp; However, snapshot deletion is not as
-simple an operation&nbsp; as it would be if each snapshot had its own
-tree.&nbsp; (But if each snapshot had its own tree then exception
-creation time would increase with the number of snapshots, much more
-space would be used for multiple snapshots and keeping track of
-exception sharing would be less efficient.)&nbsp; In general, deleting
-a snapshot requires examining the entire btree and modifying each
-leaf&nbsp; block that contains an exception for the snapshot.&nbsp;
-This could amount to quite a lot of IO traffic and take a significant
-amount of time.&nbsp; The snapshot server will therefore simply log the
-status of the snapshot as "in process of deleting" and indicate
-completion immediately to the requesting client.&nbsp; The actual
-deletion will proceed in the background.&nbsp; When the deletion is
-finished, which could require tens of seconds for a large volume, the
-snapshot is logged as available for reuse.<br>
-<br>
-A possible optimization is to defer deletions until several snapshots
-can be deleted in one pass, which will require less time than deleting
-each individually.&nbsp; How much less depends on how common it is for
-exceptions of several snapshots being deleted to lie in the same btree
-node.&nbsp; Another possible optimization is to include in each index
-node a bitmap indicating which snapshots have exceptions in the subtree
-descending from that node so that entire subtrees can be skipped during
-the traversal if they do not need to be modified.<br>
-<br>
-A more aggressive and considerably more difficult optimization would
-involve introducing the concept of snapshot set generations and tagging
-each leaf block with the snapshot generation as of the most recent
-alteration.&nbsp; Then a snapshot could be deleted by creating a new
-generation that does not include the deleted snapshot.&nbsp; A leaf
-block tagged with an earlier generation would be seen as "stale" and
-would be modified when next encountered, to remap it to the current
-generation, removing exceptions belonging to deleted snapshots in the
-process.&nbsp; The complexity of this approach makes it unattractive,
-however if snapshot deletion performance turns out to be a problem it
-could turn out to be worth the effort.<br>
-<h2><a class="mozTocH2" name="mozTocId63589"></a>Server Statistics</h2>
-The current count of free chunks in the snapshot store is recorded as a
-64 bit value in the journal tag block.&nbsp; In the event of unexpected
-restart this value will be exact since it records the value as of the
-most recent commit, which is the state recovered by replaying the
-journal.<br>
-<h1><a class="mozTocH1" name="mozTocId149578"></a>Client Operation</h1>
-[this section was pasted in from the "barebones client specs" I gave to
-Patrick and needs rewriting]<br>
-<br>
-Client operation is simple: all information required to map an incoming
-request to its destination is obtained from the server, so the clients
-just need to implement some simple message handling and a cache, the
-latter not being essential for correct operations.<br>
-<br>
-Client initialization:<br>
-<br>
-&nbsp; Our target only needs to know three things from the outside
-world:<br>
-<br>
-&nbsp;&nbsp;&nbsp; - device<br>
-&nbsp;&nbsp;&nbsp; - socket<br>
-&nbsp;&nbsp;&nbsp; - chunk size<br>
-<br>
-&nbsp; Later, snapshot clients will need to be tied to the origin
-client in an<br>
-&nbsp; as yet unidentified way.<br>
-<br>
-&nbsp; On initialization the target starts a kernel thread to handle
-server<br>
-&nbsp; responses.<br>
-<br>
-Read/write request handling:<br>
-<br>
-&nbsp; - Each read request is identity-mapped and dm takes care of
-submitting it.<br>
-<br>
-&nbsp; - The target places each write request on a deferred list and
-sends a<br>
-&nbsp;&nbsp;&nbsp; "prepare write" request to the server.&nbsp; The
-prepare write message<br>
-&nbsp;&nbsp;&nbsp; contains a single range of chunk addresses (for now,
-later we will add<br>
-&nbsp;&nbsp;&nbsp; request batching) which the server will make
-unique.&nbsp; This range covers<br>
-&nbsp;&nbsp;&nbsp; the sector range of the corresponding deferred write
-request.<br>
-<br>
-&nbsp; - For each "prepare write" response received from the server the
-target<br>
-&nbsp;&nbsp;&nbsp; searches the deferred write list for a request with
-the indicated<br>
-&nbsp;&nbsp;&nbsp; chunk address, verifies that the chunk count
-matches, removes it from<br>
-&nbsp;&nbsp;&nbsp; the list and submits it.<br>
-<br>
-Other messages:<br>
-<br>
-&nbsp; - We don't need to handle any other messages for now.&nbsp;
-Later we will add<br>
-&nbsp;&nbsp;&nbsp; a variant for handling snapshot prepare write
-messages and the three-step<br>
-&nbsp;&nbsp;&nbsp; snapshot read messages.&nbsp; Later, there will be a
-snapshot creation message<br>
-&nbsp;&nbsp;&nbsp; that allows origin clients to discard their 'unique'
-cache.<br>
-<br>
-Message formats:<br>
-<br>
-&nbsp; - Messages are in network byte order, halfword aligned.<br>
-<br>
-&nbsp; - Message header:<br>
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; be_u16 magic;<br>
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; be_u16 msg_id;<br>
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; be_u32 msg_len;<br>
-<br>
-Prepare write message:<br>
-<br>
-&nbsp; header;<br>
-&nbsp; be_u16 num_ranges; /* always 1 for now */<br>
-&nbsp; be_u64 chunk_address;<br>
-&nbsp; be_u16 chunk_count;<br>
-<br>
-Prepare write response:<br>
-&nbsp; header;<br>
-&nbsp; be_u16 num_ranges; /* always 1 for now */<br>
-&nbsp; be_u64 chunk_address;<br>
-&nbsp; be_u16 chunk_count;<br>
-<br>
-- Matching chunk addresses to blocked writes<br>
-<br>
-- Caching in client is most of it, and it's optional<br>
-<h1><a class="mozTocH1" name="mozTocId986766"></a>Integration with
-Cluster Infrastructure</h1>
-A snapshot client's entire interface with cluster infrastructure takes
-place over a named socket connection to a user space daemon, the csnap
-"agent".&nbsp; This agent is a fairly lightweight program with a simple
-interface that is intended to be customized to a particular operating
-environment.<br>
-<br>
-At present, the only traffic between the device mapper target and the
-user space agent is a three message sequence:<br>
-<ol>
-  <li>Client requests a&nbsp; server connection</li>
-  <li>Agent responds by opening and passing a server connection to the
-client</li>
-  <li>Client confirms that it successfully established communication
-with the server. </li>
-</ol>
-For consistency, these messages are all in the same format as messages
-between the client and server, except for 2. which additionally uses a
-file descriptor passing interface operating over the local socket (see
-below).<br>
-<br>
-The general scheme for snapshot server failover is:<br>
-<ul>
-  <li>The client loses its connection</li>
-  <li>The client tells the userspace service daemon that it lost its
-connection</li>
-  <li>The service daemon contacts rgmanager, or service manager, or
-some definitive source, and asks, "where is the server?"</li>
-  <li>If there is a server, then the client simply tries to reconnect
-to the existing server.</li>
-  <li>If not, then rgmanager will start one, somewhere.</li>
-  <li>After successfully connecting to a new snapshot server, the
-service daemon passes the connection to the client<br>
-  </li>
-</ul>
-(Thanks for expressing this succinctly, Ben)<br>
-<br>
-If a newly started server has to recover any global state from a
-client, then it must interface with the connection manager of the
-cluster infrastructure to be sure that all clients possessing global
-state that were connected to the previous server incarnation have
-either reconnected or left the cluster.&nbsp;&nbsp; [This interface is
-under construction]<br>
-<br>
-A pleasant property of this cluster snapshot design is that, by adding
-an additional message to the snapshot client read protocol, the need to
-recover global state is eliminated, and so is the need to devise a
-connection manager interface protocol.&nbsp; However, the connection
-manager interface protocol in itself is an interesting and useful
-design exercise, so the current plan is to implement both options.<br>
-<h2>Server Infrastructure Interface</h2>
-[under construction]<br>
-<br>
-- needs something to start it<br>
-&nbsp;&nbsp; - which tells it<br>
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; - local name of origin device<br>
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; - local name of snapshot store<br>
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; - port to bind to<br>
-- that something needs to know only one is started<br>
-<h2>Client Infrastructure Interface<br>
-</h2>
-<h3>Client Connection to Server <br>
-</h3>
-A snapshot or origin client takes the form of a device mapper target
-module, which is a standard kernel module that registers itself with
-device mapper at the time it is initialized, either when the module is
-inserted, or at kernel bootstrap time if the module is built into the
-kernel.&nbsp; The csnap module registers a set of methods with device
-mapper, one of which is the "ctr" (constructor) method.<br>
-<br>
-Device mapper can be directed to instantiate a csnap virtual device
-directly via the device mapper ioctl interface (ioctl on the
-/dev/mapper/control device) or indirectly via libdevmapper, or by
-higher level utilities such as dmsetup, LVM2 or EVMS, which use
-libdevmapper, which in turn uses the ioctl interface.&nbsp; To
-instantiate
-the device, device mapper invokes the csnap constructor, passing it a
-set of ascii parameters in much the same form as a C main
-routine.&nbsp; The csnap target expects the
-following parameters:<br>
-<ul>
-  <li>Snapshot number (-1 = origin)</li>
-  <li>Name of the origin physical device<br>
-  </li>
-  <li>Name of the snapshot store physical device<br>
-  </li>
-  <li>Name of a control socket</li>
-</ul>
-The csnap target opens the two "physical" devices (which may themselves
-be virtual devices) and the control socket.&nbsp; If everything is in
-order, initialization completes by sending a message over the control
-socket to request a snapshot server connection.&nbsp; The constructor
-method returns success and device mapper dutifully enters the name of
-the virtual device into the /dev/mapper directory.<br>
-<br>
-At this point the device exists and may receive IO requests, but any
-request received before a server connection is established will merely
-be queued for later processing.&nbsp; A user space agent
-establishes the server connection and passes it to the csnap target
-over the named control socket.&nbsp; To handle failover,
-the agent monitors the control socket connection for
-further server connection requests from the csnap target.<br>
-<br>
-Eventually, the csnap target will receive a server connection over its
-control socket via a fd-passing interface.&nbsp; It then identifies
-itself to the server, and supplies the number of the snapshot it wishes
-to access.&nbsp; The server responds with an error if that snapshot
-does not exist (do we want to have an option to force snapshot creation
-on device initialization, or do we always want to do
-this?).&nbsp;&nbsp; If the server indicates that it is satisfied with
-its new client, the csnap target sends a success message over its
-control socket and proceeds to process any queued IO requests.
-<h3>Client Reconnect<br>
-</h3>
-The client may lose its server connection for one of several reasons:<br>
-<ol>
-  <li>The server fails</li>
-  <li>The connection fails</li>
-  <li>The client sends an erroneous message and the server closes the
-connection</li>
-  <li>The client detects an erroneous response from the server or does
-not receive a response</li>
-</ol>
-All these cases are handled the same way: by sending an error message
-over the control socket, then requesting a new connection as for
-initialization.&nbsp; When (if) the target receives a new connection,
-any outstanding requests will be retried starting from the beginning of
-the relevant protocol.&nbsp; In the case of snapshot reads or writes,
-the original query and the actual IO have to be retried, in case a
-newly instantiated snapshot server has allowed some other client to
-overwrite the target data blocks.<br>
-<br>
-The client cannot know the difference between cases 1 and 2, and in any
-case, it handles the two the same way.&nbsp; The external control
-program will try to determine whether only the client's connection has failed,
-and reestablish it, or if the server has failed, in which case a new
-one has to be started.&nbsp; It will rely on a higher level service in
-the cluster for these determinations, the end result of which should be
-a new server connection, or an unrecoverable error.&nbsp; In cases 3
-and 4, the external control program may decide not to supply a new
-connection but to place the client into an error state instead, so that
-all pending and future IO requests will be failed.<br>
-<br>
-There are two flavors of snapshot read protocol: the "3 message" and
-the "4 message" protocol.&nbsp; The latter provides a confirmation that
-a client's unlock message has been processed by the server, so the
-client knows that a newly instantiated server has not allowed any other
-client to overwrite the data it was reading.&nbsp; In contrast, if the
-3 message protocol is used, there is no confirmation of an unlock and
-the client must upload the list of read locks it thinks it holds to the
-new server before the new server will reply to origin write queries on
-behalf of any client.&nbsp; So with the 4 message protocol there is no
-special recovery work for the client to do on server failover, whereas
-with the 3 message protocol, a list of read locks may have to be
-uploaded.<br>
-<br>
-The 4 message protocol is clearly the more robust of the two, though
-only the 3 message protocol is currently implemented.<br>
-<h3><a class="mozTocH2" name="mozTocId763101"></a>Client-side Error
-Detection</h3>
-Currently, the client only detects errors in server reply message
-syntax, which it reports to its user space agent.&nbsp; Traditionally,
-detection of liveness of a server is handled by a cluster heartbeating
-system.&nbsp; However, there are some errors that such a heartbeating
-system will not detect, such as a stuck request, which the server
-receives but never responds to, or a case where the client's connection
-to the server breaks while the heartbeat system's does not.&nbsp; For
-robustness, it is planned to incorporate additional timing-based error
-detection into the client, including:<br>
-<ul>
-  <li>Stuck request detection</li>
-  <li>Liveness of server connection</li>
-</ul>
-Because the client is designed to fail over transparently, it is
-acceptable to make a pessimistic determination of failure, where a
-response that is simply slow is treated as if it will never
-arrive.&nbsp; In either case, the client will report the error to its
-user space agent&nbsp; and wait for instructions, meanwhile attempting
-to continue processing IO requests.&nbsp; The agent in turn will pass
-the error on to a higher level: to a cluster, to a human operator, or
-both.<br>
-<br>
-A stuck request will typically indicate a client or server
-bug.&nbsp;&nbsp; In such a case, it is highly desirable to provide
-accurate fault-isolation information to a central cluster manager as
-opposed to silently leaving a user application stuck in a kernel
-D-state.<br>
-<h3><a name="mozTocId909827" class="mozTocH2"></a> User Space Agent
-Failure</h3>
-The user space agent is a local process and is considered reliable, so
-no failover action is attempted if it fails or if the client's connection
-to the agent breaks.&nbsp; The client will respond by failing all
-outstanding and future IO requests.&nbsp; This situation could only
-arise from a bug, administrator error or similar.&nbsp; To recover, the
-device mapper device needs to be removed and recreated.<br>
-<br>
-One possible exception to this might be a feature to permit live
-upgrade of the agent.&nbsp; The administrator kills the agent and starts
-a new, improved agent, while the client attempts to reconnect to the
-agent in a polling loop.&nbsp; Though it is not clear at this point why
-anyone would need such a feature, it would be easy to provide.<br>
-<h1><a class="mozTocH1" name="mozTocId373989"></a>Performance
-Characteristics</h1>
-<h2><a class="mozTocH2" name="mozTocId206505"></a>Effect of chunk size</h2>
-Larger chunk size will help performance for sequential and hurt for
-random write loads.&nbsp; The total size of metadata reduces linearly
-with the chunk size, saving space, IO bandwidth and seeking. &nbsp; On
-the other hand, larger chunks increase internal fragmentation of the
-snapshot store, especially for sparse, random access loads, and the
-overhead of metadata updating is supposed to be small in relation to
-data transfers.&nbsp; Therefore it is hoped that the performance and
-metadata size cost of small chunk sizes will be outweighed by reduced
-internal fragmentation, saving space in the snapshot store.&nbsp; This
-remains to be tested in practice.<br>
-<h2><a class="mozTocH2" name="mozTocId700589"></a>Effect of metadata
-block size</h2>
-Larger metadata blocks will improve performance somewhat on largely
-serial write loads due to requiring a smaller number of larger IOs,
-especially if the snapshot metadata is fragmented.&nbsp; However, for
-the time being Linux does not support IO buffers larger than physical
-page size, so it is expected that metadata block size will not increase
-until this issue is addressed, at least for a kernel implementation of
-the snapshot metadata server.&nbsp;&nbsp; For compatibility with the
-expected kernel metadata server, the user space implementation will use
-4K blocks.<br>
-<br>
-It is thought that communication overhead and server load will not be
-significant performance factors, due to these being highly
-optimized.&nbsp;
-Contention on large clusters with parallel loads should not be a
-significant factor either, since a single server should be able to
-handle the traffic of many nodes of similar power to itself.&nbsp; The
-exception to this is copy-out overhead which could easily saturate a
-server's bus; a simple solution is available: farm out the copy-out
-traffic to lightly-loaded nodes as necessary.<br>
-<h2><a class="mozTocH2" name="mozTocId473026"></a>Effect of Holding
-Multiple Snapshots</h2>
-The more snapshots that are held, the more btree leaf nodes will be
-required to hold them.&nbsp; Journalling the extra btree leaves to disk
-consumes IO bandwidth, causes more seeking and generates cache
-pressure.&nbsp; Reading in the extra btree nodes increases
-latency.&nbsp; However, because exceptions for all snapshots are stored
-adjacent in the btree, the overhead is not as large as if a separate
-map had to be updated on disk for each snapshot.&nbsp; Importantly, the
-process of determining whether a given chunk is shared never requires
-more than a single leaf node to be examined.<br>
-<br>
-Sharing bitmaps are used within leaf nodes to avoid having to enter any
-given snapshot store address more than once into the node, and also
-perform the function of specifying which snapshot uses a given
-snapshot store address.&nbsp;&nbsp; The worst case arises when a given
-logical chunk is written at least once after every snapshot.&nbsp; Then
-the leaf node entries for that chunk have a bitmap and a snapshot store
-address for every snapshot.&nbsp;&nbsp;&nbsp; Since leaf nodes are
-expected to be 50% full in the initial implementation, we can end up
-with one exception stored in each leaf node.&nbsp; Then the number of
-btree nodes that have to be journalled is equal to the number of chunks
-written.&nbsp; The journalled node has to be written twice, once to the
-journal and once to its true destination.&nbsp; So the worst case is a
-factor of 3 degradation in write performance due to btree updating
-alone.&nbsp; To ameliorate such degradation it would be wise to use a
-larger chunk size when large numbers of snapshots are expected.<br>
-<br>
-The worst case degradation above can be tempered somewhat by improving
-the btree update algorithm to use a b+tree algorithm, which guarantees
-2/3rds leaf fullness, enough to hold two exceptions instead of
-one.&nbsp; Larger metadata blocks will help reduce seeking overhead,
-when they become practical.&nbsp; Eventually though, the best
-strategy is to introduce variant leaf node formats that optimize for
-the many-snapshots case by representing ranges of snapshot store chunks
-compactly, especially where the snapshot store chunks are allocated
-sequentially, which is something we want to achieve anyway.<br>
-<br>
-In brief, the metadata update component of origin and snapshot write
-performance will degrade linearly with the number of
-snapshots held, but with a much shallower slope than if snapshot store
-data were not shared and
-metadata were not grouped together by logical address.&nbsp; In the
-latter case, copy-out overhead would increase directly with number of
-snapshots.&nbsp;&nbsp; Exception
-table update overhead would increase rapidly as well, though the exact
-rate is harder to characterize because it depends on the chunk sharing
-patterns.<br>
-<br>
-With the maximum number of snapshots held (64) the new design should
-perform better than the old one
-by a factor of thirty or more.&nbsp; Furthermore, some fairly
-straightforward improvements to the btree leaf format can make the
-slope much shallower, to the point where the overhead of holding 64
-snapshots may be hard to notice.<br>
-<br>
-With a single snapshot held, the new design will not perform quite as well
-as the existing device-mapper design, but only because the existing
-design does not provide durable recording of snapshot store
-updates.&nbsp; In any case, the overhead of the durable snapshot
-recording is expected to be only about 2% worst-case overhead vs raw
-writing, far less than the 200% worst-case overhead of copy-outs when a
-single snapshot is held, and shrinks roughly linearly with the chunk
-size (extra seeking in the metadata region makes this relationship
-slightly more complex).&nbsp; So by using a 256K chunk size, metadata
-update can most likely be held to a few percent of first-time write
-overhead even when the maximum number of snapshots are held.<br>
-<h2><a class="mozTocH2" name="mozTocId217737"></a>Assumptions</h2>
-Performance estimates below are based on the assumption that the
-smallest chunk size (4K) is used.&nbsp;&nbsp; Each new exception uses
-20 bytes (exception store address, sharing bitmap and directory entry)
-so each btree leaf node holds a maximum of about 200 exceptions.&nbsp;
-Due to splitting, leaf nodes are normally not full.&nbsp; In fact worst
-case fullness of 50% is expected for the early implementations, so leaf
-nodes will hold about 100 exceptions each.<br>
-<br>
-The performance estimates here assume asynchronous IO, which for user
-space is not yet a practical possibility in Linux, therefore a kernel
-implementation is assumed.&nbsp; The initial implementation however is
-in user space; without asynchronous IO the user space implementation
-will not perform as well as a kernel implementation.&nbsp; It is
-expected that both implementations will be developed and maintained;
-that the user implementation will be available first; that a kernel
-implementation will supersede it in performance; and that the user
-space implementation will eventually pull even with the kernel
-implementation by taking advantage of newly available asynchronous IO
-and user space locking facilities.<br>
-<h2><a class="mozTocH2" name="mozTocId495690"></a>Origin Read
-Performance</h2>
-Origin reads are passed straight through to the underlying
-volume.&nbsp; Since the overhead of the device mapper handling is
-insignificant, origin read performance is essentially unchanged<br>
-<h2><a class="mozTocH2" name="mozTocId519978"></a>Sequential Origin
-Write Performance</h2>
-Origin write throughput is affected mainly by the frequency of chunk
-copy-outs and metadata update overhead.&nbsp;&nbsp; Copy-outs require
-reading and writing, requiring a minimum of 200% additional bandwidth
-vs raw write and additional seeking as well, especially for the
-single-spindle case where the origin volume and snapshot store will be
-far apart.&nbsp; Throughput is improved at the expense of latency by
-batching the copy-out reads and copy-out writes, which happens
-naturally with asynchronous IO.&nbsp; There will thus be fewer long
-seeks between the origin and snapshot store.<br>
-<br>
-Worst case origin write performance is obtained when the snapshot store
-is created with the smallest possible chunk size (4K) and the load
-requires a copy-out for every chunk write.&nbsp;&nbsp; Such a load is
-easy to generate, for example by setting a snapshot and then
-immediately unpacking an archive into the volume.&nbsp; Required IO
-bandwidth will triple, seeking between the origin and snapshot store
-will increase, and metadata updating will increase.&nbsp; Writing in
-this case should be largely linear and batching amortizes the seeking
-overhead, so the dominant effect is expected to be the increased IO
-bandwidth.&nbsp; For this load we should expect to see a 3 times
-slowdown versus raw volume access.&nbsp; Fragmentation of the snapshot
-store could make this considerably worse, perhaps by another factor of
-three.<br>
-<br>
-Since such a load is easy to generate it is worrisome.&nbsp; It is
-possible that in the long run, general performance for a snapshot
-volume could become better than for the origin, see below.<br>
-<br>
-Fragmentation of the snapshot store will introduce additional seeking
-and rotational latency penalties.&nbsp; Reducing such fragmentation by
-clever snapshot store allocation policy will yield significant
-performance gains, however such allocation policy improvements require
-considerable time to develop.&nbsp; A highly fragmented snapshot store
-could aggravate worst case write performance by an additional factor of a
-few hundred percent.<br>
-<br>
-[what about latency]<br>
-<h2><a class="mozTocH2" name="mozTocId245882"></a>Random Origin Write
-Performance</h2>
-A load that consists of 100% single-sector writes distributed randomly
-over the entire volume immediately after setting a snapshot will cause
-copy-out bandwidth to be much more than 200% of raw write bandwidth,
-and will also cause a great deal of additional seeking.&nbsp; Metadata
-overhead will also increase significantly since typically only a single
-chunk on each leaf node will be updated each time the node is
-journalled to disk; rotational latency will increase significantly
-during metadata access.&nbsp; Performance under this random load will
-typically be dominated by seeking rather than bandwidth.&nbsp; Analysis
-is complex, however I will speculate now that the performance of the
-snapshotted volume could degrade by a factor of 3 to 4 versus the raw
-volume due to additional seeking and rotational latency for copy-outs
-and metadata updating.<br>
-<br>
-Fragmentation of the snapshot store can and should be addressed over
-time.&nbsp; For origin writes, nothing can be done about the
-copy-out
-overhead.&nbsp;&nbsp; Snapshot writes on the other hand do not incur
-copy-out
-overhead.&nbsp; They do incur seeking and rotational penalties due to
-fragmentation in the snapshot store, but so do origin writes.&nbsp;
-Furthermore snapshot reads also suffer from fragmentation penalties
-whereas origin reads do not.&nbsp; Very good snapshot store layout
-optimization could reduce both the penalty for snapshot reading and
-writing, in which case&nbsp; general performance on a snapshot volume
-could
-be better than on a snapshotted origin volume.&nbsp; Whether this can
-be
-realized in practice remains to be seen.<br>
-<br>
-[what about latency]<br>
-<h2><a class="mozTocH2" name="mozTocId983353"></a>Snapshot Read
-Performance</h2>
-Unlike origin reads, snapshot read throughput is affected by snapshot
-store fragmentation.&nbsp; Snapshot read latency is increased by the
-requirement of locking against origin writes.&nbsp; Readahead results
-in a kind of lockahead, so under loads where readahead is effective,
-increased snapshot read latency will not hurt read throughput.&nbsp;
-The predominant visible effect is expected to be read
-fragmentation.&nbsp; With large chunk sizes, e.g., 256K and up,
-moderate fragmentation should cause only slight degradation in snapshot
-read performance.&nbsp; However, without special attention to snapshot
-store allocation policy, fragmentation can be expected to be fairly
-severe, so snapshot read performance is not expected to be stellar in
-early implementations.&nbsp; Fortunately, since the main purpose of
-reading from a snapshot is to back it up or restore a few files, some
-read performance degradation is acceptable and is unlikely to be
-noticed.<br>
-<br>
-In the long run it is desirable to improve snapshot read performance
-by controlling snapshot store fragmentation as much as possible, in
-order to take advantage of the inherently superior performance of&nbsp;
-snapshot writing versus origin writing.<br>
-<h2><a class="mozTocH2" name="mozTocId713768"></a>Snapshot Write
-Performance</h2>
-Snapshot writes do not require copy-outs; if an origin chunk or shared
-snapshot store chunk needs to be written, the logical chunk is first
-remapped to a new chunk in the snapshot store.&nbsp; With some tweaking
-of the message protocol, writing to the chunk could proceed as soon as
-the new allocation is known, in parallel with the logging of the new
-exception.&nbsp; So snapshot writes are inherently quite efficient.<br>
-<br>
-Snapshot write overhead comes from metadata update overhead and
-snapshot store fragmentation.&nbsp; The former is supposed to be small,
-on the order of a few percent.&nbsp; The latter could be very large,
-and probably will be in initial implementation, perhaps on the order of
-a factor of 10.&nbsp; Larger chunk sizes will reduce this seeking
-overhead, roughly linearly with the chunk size.&nbsp; Careful layout
-optimization could conceivably reduce this to a few percent, even with
-small chunks.&nbsp; We shall see.<br>
-<h2><a class="mozTocH2" name="mozTocId899929"></a>Network Performance</h2>
-The amount of message data needed for each chunk is small, especially
-since the message format is designed from the outset to handle ranges
-of chunks and multiple ranges in each message.&nbsp; Except for
-snapshot reads, each message&nbsp; sequence is only two messages long
-(note: approximately.&nbsp; Server responses do not correspond exactly
-to requests; e.g., any unshared chunks can be acknowledged
-immediately).&nbsp; Message traffic is expected to be less than 1% of
-disk array traffic.&nbsp; Assuming that the general purpose network
-interconnect and storage array interconnect have similar bandwidth,
-this is where the expectation that this architecture will scale
-linearly to about 100 clients comes from.<br>
-<br>
-[details]<br>
-<h2><a name="mozTocId163638" class="mozTocH2"></a>Overall Performance</h2>
-It is expected that typical usage of a snapshotted origin volume will
-show only slight reduction of performance versus the raw origin volume,
-due to reading being more common than writing.&nbsp; Rewriting chunks
-is
-optimized by the client's bitmap cache, which is compact and probably
-capable of caching all the hot spots of a volume, even for large
-volumes.&nbsp; So rewriting should show no visible degradation.&nbsp;
-The
-performance of fresh writes to snapshotted chunks will degrade
-significantly, due to copy-out bandwidth, and to snapshot store
-fragmentation, the latter being subject to optimization while the
-former is unavoidable.&nbsp; In general, more frequent snapshots cause
-more
-fresh writes, with the frequency of fresh writes peaking just after the
-snapshot and declining over time, till the next snapshot.<br>
-<br>
-So: what will be the balance of fresh writes vs reads and
-rewrites?&nbsp;
-How frequently will we see the balance shift for a short
-time in the direction of the worst case?&nbsp; How bad is the worst
-case?&nbsp;&nbsp;
-How likely is it that the user will notice the shifts in write
-performance?&nbsp;&nbsp;&nbsp; These all await measurement under live
-loads.&nbsp; However
-at this point I will speculate that even a relatively&nbsp; early
-implementation will show average performance degradation versus a raw
-volume of less than ten percent, and that, at worst, performance
-degradation will be limited to a factor of four or so just after a
-snapshot.&nbsp; For many users, and particularly enterprise users, the
-benefits of snapshotting will outweigh the performance loss: it is easy
-to buy bandwidth, not as easy to buy live backup
-capability.&nbsp;&nbsp; For
-others, the unavoidable performance degradation of origin writing will
-make snapshotting unattractive enough to discourage its use.&nbsp;
-Eventually we may be able to satisfy this group as well, by improving
-snapshot store allocation policy to the point where the origin can be
-made optional and all IO take place in the snapshot store.<br>
-<br>
-The pessimism in this section should be tempered by observing that in
-many respects, performance is expected to be good:<br>
-<ul>
-  <li>Large number of snapshots can be held without affecting
-performance much</li>
-  <li>Snapshot store utilization is good</li>
-  <li>Network traffic is minimal</li>
-  <li>Rewrites are highly optimized</li>
-</ul>
-In other words, if you need snapshots then this implementation is
-likely to deliver good performance versus alternatives.&nbsp;&nbsp;
-Plus there is
-a clear path forward to achieving near-optimal performance, by working
-towards a system where the snapshot store can be used effectively
-alone, with no origin volume.
-<h1><a class="mozTocH1" name="mozTocId895406"></a>Parallelizing the
-Architecture</h1>
-Normally the first question I am asked about this clustered snapshot
-design is "why isn't it symmetric"?&nbsp; The answer: because a) it
-doesn't have to be in order to perform well on today's typical clusters
-and b) distributing a tree structure across independent caches is a
-complex, error-prone process, and introduces overhead of its own.&nbsp;
-At some point, however, the single node server architecture will become
-a bottleneck, so I discuss parallelizing strategies here.<br>
-<br>
-The easiest thing we can do, and with the strongest immediate effect is
-to have the server distribute the copy-out work to underused
-nodes.&nbsp; This will take significant IO bandwidth load off the
-server's bus at the expense of a little messaging latency.&nbsp; By
-doing this, a single server can likely scale to handle a hundred or so
-busy nodes of similar power to itself: the real bottleneck will
-probably be the storage array.&nbsp; A user who can afford to upgrade
-the storage array to handle even larger numbers of clients can likely
-afford to upgrade the snapshot server as well.<br>
-<br>
-At some point, perhaps two or three hundred clients, the snapshot
-server
-becomes a bottleneck again.&nbsp; Further scaling is easily achieved by
-dividing up the work between a number of snapshot servers, by logical
-address range.&nbsp; Each snapshot server maintains a separate btree in
-a distinct range of logical addresses and operates its own
-journal.&nbsp; Care must be taken that allocation bitmaps are divided
-up cleanly; this is not hard (e.g., even if a logical address range
-boundary lies in the middle of a bitmap block, the boundary bitmap can
-be replicated between two nodes, with logic to prevent allocation
-outside the boundary - needed anyway for error checking).&nbsp; Shared
-metadata such as the current snapshot list, superblock, etc., is
-updated using a RW locking strategy (i.e., using a DLM).&nbsp; Assuming
-that workload is distributed relatively evenly across the logical
-address range, this simple parallelization strategy will serve up to a
-thousand clients or so, and the disk will once again be the bottleneck.<br>
-<br>
-[It might be best to add the metadata hooks for this range division now
-since we know it's needed eventually]<br>
-<br>
-If we want to scale to far larger numbers of clients&nbsp; we probably
-have to bite the bullet and distribute the btrees and allocation
-bitmaps.&nbsp; However I do not think this problem is imminent; there
-is plenty of time to think about it.<br>
-<h1><a class="mozTocH1" name="mozTocId426297"></a>Adaptation to Single
-Node Client</h1>
-The current device-mapper snapshot design suffers from a number of
-drawbacks, chiefly<br>
-<ul>
-  <li>copy-out overhead increases linearly with number of snapshots held</li>
-  <li>snapshot state is not recorded durably, but only at shutdown</li>
-  <li>all metadata is held in memory, creating excessive cache pressure
-for large volumes</li>
-</ul>
-It is therefore expected that this design for clustered snapshots will
-be adapted sooner rather than later for use with the normal,
-non-clustered machines that constitute the vast majority of Linux
-installs.&nbsp; How best to do that?<br>
-<br>
-The message-based synchronization described above may not be the
-optimal solution for an entirely local implementation.&nbsp; But then,
-with some tweaking it just might be.&nbsp; Currently I am considering
-the wisdom of adapting the clustered snapshot target for local use by
-replacing the socket messaging with a lightweight ring buffer messaging
-facility that presents the same interface to the rest of the
-target.&nbsp; The obvious alternative is to incorporate the server
-operations directly into the client.&nbsp; It's clear which is easier,
-but which is better?&nbsp; [feedback invited]<br>
-</body>
-</html>
diff --git a/csnap/doc/csnap.ps b/csnap/doc/csnap.ps
deleted file mode 100644
index a31d7b3..0000000
--- a/csnap/doc/csnap.ps
+++ /dev/null
@@ -1,2994 +0,0 @@
-%!PS-Adobe-2.0
-%%Creator: dvips(k) 5.92b Copyright 2002 Radical Eye Software
-%%Title: csnap.dvi
-%%Pages: 20
-%%PageOrder: Ascend
-%%BoundingBox: 0 0 612 792
-%%DocumentFonts: CMSY10
-%%EndComments
-%DVIPSWebPage: (www.radicaleye.com)
-%DVIPSCommandLine: dvips -t letter -o csnap.ps csnap.dvi
-%DVIPSParameters: dpi=600, compressed
-%DVIPSSource:  TeX output 2004.09.03:1751
-%%BeginProcSet: texc.pro
-%!
-/TeXDict 300 dict def TeXDict begin/N{def}def/B{bind def}N/S{exch}N/X{S
-N}B/A{dup}B/TR{translate}N/isls false N/vsize 11 72 mul N/hsize 8.5 72
-mul N/landplus90{false}def/@rigin{isls{[0 landplus90{1 -1}{-1 1}ifelse 0
-0 0]concat}if 72 Resolution div 72 VResolution div neg scale isls{
-landplus90{VResolution 72 div vsize mul 0 exch}{Resolution -72 div hsize
-mul 0}ifelse TR}if Resolution VResolution vsize -72 div 1 add mul TR[
-matrix currentmatrix{A A round sub abs 0.00001 lt{round}if}forall round
-exch round exch]setmatrix}N/@landscape{/isls true N}B/@manualfeed{
-statusdict/manualfeed true put}B/@copies{/#copies X}B/FMat[1 0 0 -1 0 0]
-N/FBB[0 0 0 0]N/nn 0 N/IEn 0 N/ctr 0 N/df-tail{/nn 8 dict N nn begin
-/FontType 3 N/FontMatrix fntrx N/FontBBox FBB N string/base X array
-/BitMaps X/BuildChar{CharBuilder}N/Encoding IEn N end A{/foo setfont}2
-array copy cvx N load 0 nn put/ctr 0 N[}B/sf 0 N/df{/sf 1 N/fntrx FMat N
-df-tail}B/dfs{div/sf X/fntrx[sf 0 0 sf neg 0 0]N df-tail}B/E{pop nn A
-definefont setfont}B/Cw{Cd A length 5 sub get}B/Ch{Cd A length 4 sub get
-}B/Cx{128 Cd A length 3 sub get sub}B/Cy{Cd A length 2 sub get 127 sub}
-B/Cdx{Cd A length 1 sub get}B/Ci{Cd A type/stringtype ne{ctr get/ctr ctr
-1 add N}if}B/id 0 N/rw 0 N/rc 0 N/gp 0 N/cp 0 N/G 0 N/CharBuilder{save 3
-1 roll S A/base get 2 index get S/BitMaps get S get/Cd X pop/ctr 0 N Cdx
-0 Cx Cy Ch sub Cx Cw add Cy setcachedevice Cw Ch true[1 0 0 -1 -.1 Cx
-sub Cy .1 sub]/id Ci N/rw Cw 7 add 8 idiv string N/rc 0 N/gp 0 N/cp 0 N{
-rc 0 ne{rc 1 sub/rc X rw}{G}ifelse}imagemask restore}B/G{{id gp get/gp
-gp 1 add N A 18 mod S 18 idiv pl S get exec}loop}B/adv{cp add/cp X}B
-/chg{rw cp id gp 4 index getinterval putinterval A gp add/gp X adv}B/nd{
-/cp 0 N rw exit}B/lsh{rw cp 2 copy get A 0 eq{pop 1}{A 255 eq{pop 254}{
-A A add 255 and S 1 and or}ifelse}ifelse put 1 adv}B/rsh{rw cp 2 copy
-get A 0 eq{pop 128}{A 255 eq{pop 127}{A 2 idiv S 128 and or}ifelse}
-ifelse put 1 adv}B/clr{rw cp 2 index string putinterval adv}B/set{rw cp
-fillstr 0 4 index getinterval putinterval adv}B/fillstr 18 string 0 1 17
-{2 copy 255 put pop}for N/pl[{adv 1 chg}{adv 1 chg nd}{1 add chg}{1 add
-chg nd}{adv lsh}{adv lsh nd}{adv rsh}{adv rsh nd}{1 add adv}{/rc X nd}{
-1 add set}{1 add clr}{adv 2 chg}{adv 2 chg nd}{pop nd}]A{bind pop}
-forall N/D{/cc X A type/stringtype ne{]}if nn/base get cc ctr put nn
-/BitMaps get S ctr S sf 1 ne{A A length 1 sub A 2 index S get sf div put
-}if put/ctr ctr 1 add N}B/I{cc 1 add D}B/bop{userdict/bop-hook known{
-bop-hook}if/SI save N @rigin 0 0 moveto/V matrix currentmatrix A 1 get A
-mul exch 0 get A mul add .99 lt{/QV}{/RV}ifelse load def pop pop}N/eop{
-SI restore userdict/eop-hook known{eop-hook}if showpage}N/@start{
-userdict/start-hook known{start-hook}if pop/VResolution X/Resolution X
-1000 div/DVImag X/IEn 256 array N 2 string 0 1 255{IEn S A 360 add 36 4
-index cvrs cvn put}for pop 65781.76 div/vsize X 65781.76 div/hsize X}N
-/p{show}N/RMat[1 0 0 -1 0 0]N/BDot 260 string N/Rx 0 N/Ry 0 N/V{}B/RV/v{
-/Ry X/Rx X V}B statusdict begin/product where{pop false[(Display)(NeXT)
-(LaserWriter 16/600)]{A length product length le{A length product exch 0
-exch getinterval eq{pop true exit}if}{pop}ifelse}forall}{false}ifelse
-end{{gsave TR -.1 .1 TR 1 1 scale Rx Ry false RMat{BDot}imagemask
-grestore}}{{gsave TR -.1 .1 TR Rx Ry scale 1 1 false RMat{BDot}
-imagemask grestore}}ifelse B/QV{gsave newpath transform round exch round
-exch itransform moveto Rx 0 rlineto 0 Ry neg rlineto Rx neg 0 rlineto
-fill grestore}B/a{moveto}B/delta 0 N/tail{A/delta X 0 rmoveto}B/M{S p
-delta add tail}B/b{S p tail}B/c{-4 M}B/d{-3 M}B/e{-2 M}B/f{-1 M}B/g{0 M}
-B/h{1 M}B/i{2 M}B/j{3 M}B/k{4 M}B/w{0 rmoveto}B/l{p -4 w}B/m{p -3 w}B/n{
-p -2 w}B/o{p -1 w}B/q{p 1 w}B/r{p 2 w}B/s{p 3 w}B/t{p 4 w}B/x{0 S
-rmoveto}B/y{3 2 roll p a}B/bos{/SS save N}B/eos{SS restore}B end
-
-%%EndProcSet
-%%BeginProcSet: bbad153f.enc
-% Thomas Esser, Dec 2002. public domain
-%
-% Encoding for:
-%     cmsy10 cmsy5 cmsy6 cmsy7 cmsy8 cmsy9
-%
-/TeXbbad153fEncoding [
-/minus /periodcentered /multiply /asteriskmath /divide /diamondmath
-/plusminus /minusplus /circleplus /circleminus /circlemultiply
-/circledivide /circledot /circlecopyrt /openbullet /bullet
-/equivasymptotic /equivalence /reflexsubset /reflexsuperset /lessequal
-/greaterequal /precedesequal /followsequal /similar /approxequal
-/propersubset /propersuperset /lessmuch /greatermuch /precedes /follows
-/arrowleft /arrowright /arrowup /arrowdown /arrowboth /arrownortheast
-/arrowsoutheast /similarequal /arrowdblleft /arrowdblright /arrowdblup
-/arrowdbldown /arrowdblboth /arrownorthwest /arrowsouthwest /proportional
-/prime /infinity /element /owner /triangle /triangleinv /negationslash
-/mapsto /universal /existential /logicalnot /emptyset /Rfractur /Ifractur
-/latticetop /perpendicular /aleph /A /B /C /D /E /F /G /H /I /J /K
-/L /M /N /O /P /Q /R /S /T /U /V /W /X /Y /Z /union /intersection
-/unionmulti /logicaland /logicalor /turnstileleft /turnstileright
-/floorleft /floorright /ceilingleft /ceilingright /braceleft /braceright
-/angbracketleft /angbracketright /bar /bardbl /arrowbothv /arrowdblbothv
-/backslash /wreathproduct /radical /coproduct /nabla /integral
-/unionsq /intersectionsq /subsetsqequal /supersetsqequal /section
-/dagger /daggerdbl /paragraph /club /diamond /heart /spade /arrowleft
-/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
-/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
-/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
-/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
-/minus /periodcentered /multiply /asteriskmath /divide /diamondmath
-/plusminus /minusplus /circleplus /circleminus /.notdef /.notdef
-/circlemultiply /circledivide /circledot /circlecopyrt /openbullet
-/bullet /equivasymptotic /equivalence /reflexsubset /reflexsuperset
-/lessequal /greaterequal /precedesequal /followsequal /similar
-/approxequal /propersubset /propersuperset /lessmuch /greatermuch
-/precedes /follows /arrowleft /spade /.notdef /.notdef /.notdef /.notdef
-/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
-/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
-/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
-/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
-/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
-/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
-/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef
-] def
-
-%%EndProcSet
-%%BeginProcSet: texps.pro
-%!
-TeXDict begin/rf{findfont dup length 1 add dict begin{1 index/FID ne 2
-index/UniqueID ne and{def}{pop pop}ifelse}forall[1 index 0 6 -1 roll
-exec 0 exch 5 -1 roll VResolution Resolution div mul neg 0 0]FontType 0
-ne{/Metrics exch def dict begin Encoding{exch dup type/integertype ne{
-pop pop 1 sub dup 0 le{pop}{[}ifelse}{FontMatrix 0 get div Metrics 0 get
-div def}ifelse}forall Metrics/Metrics currentdict end def}{{1 index type
-/nametype eq{exit}if exch pop}loop}ifelse[2 index currentdict end
-definefont 3 -1 roll makefont/setfont cvx]cvx def}def/ObliqueSlant{dup
-sin S cos div neg}B/SlantFont{4 index mul add}def/ExtendFont{3 -1 roll
-mul exch}def/ReEncodeFont{CharStrings rcheck{/Encoding false def dup[
-exch{dup CharStrings exch known not{pop/.notdef/Encoding true def}if}
-forall Encoding{]exch pop}{cleartomark}ifelse}if/Encoding exch def}def
-end
-
-%%EndProcSet
-%%BeginFont: CMSY10
-%!PS-AdobeFont-1.1: CMSY10 1.0
-%%CreationDate: 1991 Aug 15 07:20:57
-% Copyright (C) 1997 American Mathematical Society. All Rights Reserved.
-11 dict begin
-/FontInfo 7 dict dup begin
-/version (1.0) readonly def
-/Notice (Copyright (C) 1997 American Mathematical Society. All Rights Reserved) readonly def
-/FullName (CMSY10) readonly def
-/FamilyName (Computer Modern) readonly def
-/Weight (Medium) readonly def
-/ItalicAngle -14.035 def
-/isFixedPitch false def
-end readonly def
-/FontName /CMSY10 def
-/PaintType 0 def
-/FontType 1 def
-/FontMatrix [0.001 0 0 0.001 0 0] readonly def
-/Encoding 256 array
-0 1 255 {1 index exch /.notdef put} for
-dup 0 /.notdef put
-readonly def
-/FontBBox{-29 -960 1116 775}readonly def
-/UniqueID 5000820 def
-currentdict end
-currentfile eexec
-D9D66F633B846A97B686A97E45A3D0AA052F09F9C8ADE9D907C058B87E9B6964
-7D53359E51216774A4EAA1E2B58EC3176BD1184A633B951372B4198D4E8C5EF4
-A213ACB58AA0A658908035BF2ED8531779838A960DFE2B27EA49C37156989C85
-E21B3ABF72E39A89232CD9F4237FC80C9E64E8425AA3BEF7DED60B122A52922A
-221A37D9A807DD01161779DDE7D31FF2B87F97C73D63EECDDA4C49501773468A
-27D1663E0B62F461F6E40A5D6676D1D12B51E641C1D4E8E2771864FC104F8CBF
-5B78EC1D88228725F1C453A678F58A7E1B7BD7CA700717D288EB8DA1F57C4F09
-0ABF1D42C5DDD0C384C7E22F8F8047BE1D4C1CC8E33368FB1AC82B4E96146730
-DE3302B2E6B819CB6AE455B1AF3187FFE8071AA57EF8A6616B9CB7941D44EC7A
-71A7BB3DF755178D7D2E4BB69859EFA4BBC30BD6BB1531133FD4D9438FF99F09
-4ECC068A324D75B5F696B8688EEB2F17E5ED34CCD6D047A4E3806D000C199D7C
-515DB70A8D4F6146FE068DC1E5DE8BC5703711DA090312BA3FC00A08C453C609
-C627A8AC5158CF7CDD95058BF2B70796EB09F833A6DD557560244C58DCDF257B
-36F96AF73BCDFAD79FC5AE97F7E4CB643BE6125DC257BE825C1FCE80EA9886AF
-87B49B6FCF3AB57CB1960E83AD525404EDE0E99FB350CB662C8CA46B4D161320
-892A7D2FB2A57C42B874D5DE96C17F8A0FEC855E62DD37AD6088597E91527E1C
-0EA9A3FB7AE720C62543ED75FF6DFA01E434F2841851CCD780F15A1EBE417E52
-0F753BEF7ADFFAA9173C4776936AD55854BC82CDB3327374E540A4A0A27B6AB0
-0E9C1B155C72BB
-0000000000000000000000000000000000000000000000000000000000000000
-0000000000000000000000000000000000000000000000000000000000000000
-0000000000000000000000000000000000000000000000000000000000000000
-0000000000000000000000000000000000000000000000000000000000000000
-0000000000000000000000000000000000000000000000000000000000000000
-0000000000000000000000000000000000000000000000000000000000000000
-0000000000000000000000000000000000000000000000000000000000000000
-0000000000000000000000000000000000000000000000000000000000000000
-cleartomark
-%%EndFont 
-TeXDict begin 40258431 52099146 1000 600 600 (csnap.dvi)
-@start
-%DVIPSBitmapFont: Fa ecrm0800 8 28
-/Fa 28 123 df<123C127EB4FCA21380A2127F123D1201A312031300A25A1206120E5A5A
-5A126009157AAD14>39 D<B512C0A412047F9018>45 D<123C127E12FFA4127E123C0808
-7A8714>I<4A7E4A7EA34A7EA24A7EA3EC1BF81419A2EC30FCA2EC70FEEC607EA24A7EA3
-49486C7EA2010380EC000FA201066D7EA3496D7EA2011FB57EA29038180001496D7EA349
-147EA201E0147F4980A20001ED1F801203000716C0D80FF0EC3FE0D8FFFC0103B5FCA230
-2F7EAE35>65 D<007FB712F8A29039000FC003007C150000701638A200601618A200E016
-1CA248160CA5C71500B3A94A7E011FB512E0A22E2D7EAC33>84 D<13FF000713C0380F01
-F0381C00F8003F137C80A2143F001E7FC7FCA4EB07FF137F3801FE1FEA07F0EA1FC0EA3F
-80EA7F00127E00FE14065AA3143F7E007E137F007FEBEF8C391F83C7FC390FFF03F83901
-FC01E01F207D9E23>97 D<EA07C012FFA2120F1207AC14FE9038C7FF809038CF03E09038
-DC01F09038F8007C49137E49133E497F1680A2150F16C0A9ED1F80A216005D6D133E6D5B
-01B05B9038BC01F090380E07E0390607FF80260001FCC7FC222F7EAD27>I<EB1FE0EB7F
-FC3801F01E3803E0073907C01F80EA0F80EA1F005A003EEB0F00007E90C7FCA2127C12FC
-A9127EA215C07E6C130101801380380FC0033907E007003801F03E38007FF8EB1FC01A20
-7E9E1F>I<15F8141FA214011400ACEB0FE0EB7FF83801F81E3803E0073807C003380F80
-01EA1F00481300123E127EA25AA9127C127EA2003E13017EEB8003000F13073903E00EFC
-3A01F03CFFC038007FF090391FC0F800222F7EAD27>I<EB1F80EBFFF03803E0783807C0
-3E380F801E381F001FEC0F80123E007E130715C0127C12FCA3B6FCA200FCC8FCA5127EA2
-003E14C0123F6C1301390F80038001C013003803E00F3801F03C38007FF8EB1FC01A207E
-9E1F>I<EB03F0EB0FFCEB3E1EEB7C3F13F8EA01F0A23803E00C1400AAB512E0A23803E0
-00B3A6487E387FFF80A2182F7FAE16>I<013F13F89038FFC3FE3903E1FF1E3807807C00
-0F140C391F003E00A2003E7FA76C133EA26C6C5A00071378380FE1F0380CFFC0D81C3FC7
-FC90C8FCA3121E121F380FFFF814FF6C14C04814F0391E0007F848130048147C12F84814
-3CA46C147C007C14F86CEB01F06CEB03E03907E01F803901FFFE0038003FF01F2D7E9D23
->I<EA07C012FFA2120F1207AC14FE9038C3FF809038C703E09038DE01F013F8496C7EA2
-5BA25BB2486C487E3AFFFE1FFFC0A2222E7EAD27>I<EA0780EA0FC0EA1FE0A4EA0FC0EA
-0780C7FCA8EA07C012FFA2120F1207B3A5EA0FE0EAFFFCA20E2E7EAD14>I<EA07C012FF
-A2120F1207B3B3A3EA0FE0EAFFFEA20F2E7EAD14>108 D<2607C07FEB07F03BFFC3FFC0
-3FFC903AC783F0783F3C0FCE01F8E01F803B07DC00F9C00F01F8D9FF8013C04990387F00
-0749137EA249137CB2486C01FEEB0FE03CFFFE0FFFE0FFFEA2371E7E9D3C>I<3807C0FE
-39FFC3FF809038C703E0390FDE01F0EA07F8496C7EA25BA25BB2486C487E3AFFFE1FFFC0
-A2221E7E9D27>I<EB1FE0EB7FF83801F03E3803C00F3907800780390F0003C04814E000
-3EEB01F0A248EB00F8A300FC14FCA9007C14F8A26CEB01F0A26CEB03E0A2390F8007C039
-07C00F803901F03E0038007FF8EB1FE01E207E9E23>I<3807C0FE39FFC7FF809038CF03
-E0390FDC01F03907F800FC49137E49133E49133FED1F80A3ED0FC0A8151F1680A2ED3F00
-A26D137E6D137C5D9038FC01F09038CE07E09038C7FF80D9C1FCC7FC01C0C8FCA9487EEA
-FFFEA2222B7E9D27>I<90380FE01890387FF8383801F81C3903E00E783807C007390F80
-03F8001F1301EA3F00A2007E1300A212FE5AA8127EA36C13017EEB8003380FC0073803E0
-0E3801F03C38007FF0EB1FC090C7FCA94A7E91381FFFC0A2222B7E9D25>I<380781F038
-FF87FCEB9E7EEA0F98EA07B813B0EBF03CEBE000A35BB1487EB5FCA2171E7E9D1B>I<38
-01FE183807FFB8381E01F8EA3C00481378481338A21418A27E7EB41300EA7FF06CB4FC6C
-13C06C13F0000113F838001FFC130138C0007E143EA26C131EA27EA26C133CA26C137838
-FF01F038E3FFC000C0130017207E9E1C>I<1360A413E0A312011203A21207121FB512F0
-A23803E000AF1418A714383801F03014703800F860EB3FE0EB0F80152A7FA81B>I<D807
-C013F800FF131FA2000F130100071300B21401A314033803E007EC0EFC3A01F81CFFC038
-007FF890391FE0F800221F7E9D27>I<3AFFFC01FFC0A23A0FE0007E000007147C153800
-03143015706C6C1360A26C6C5BA390387C0180A26D48C7FCA2EB3F07EB1F06A2EB0F8CA2
-14DCEB07D8A2EB03F0A36D5AA26D5A221E7F9C25>I<3BFFFC3FFE07FFA23B0FE003F001
-F801C09038E000F00007010114E0812603E00314C0A2913807F8012701F006781380A290
-39F80E7C030000D90C3C1300A290397C181E06A2151F6D486C5AA2168C90391F600798A2
-16D890390FC003F0A36D486C5AA36DC75A301E7F9C33>I<3AFFFC01FFC0A23A0FE0007E
-000007147C1538000314306D137000011460A26C6C5BA2EBFC01017C5BEB7E03013E90C7
-FCA2EB1F06A2148EEB0F8CA2EB07D8A2EB03F0A36D5AA26D5AA2495AA2130391C8FC1278
-EAFC06A25B131CEA7838EA7070EA3FE0EA0F80222B7F9C25>121
-D<003FB51280A2EB003F003C14000038137E00305BEA700100605B495A495A130F00005B
-495A49C7FC5B137E9038FC0180EA01F8120313F03807E003EA0FC0001F1400138048485A
-007E5B00FE133FB6FCA2191D7E9C1F>I E
-%EndDVIPSBitmapFont
-%DVIPSBitmapFont: Fb ecrm0600 6 2
-/Fb 2 51 df<13E01201120712FF12F91201B3A7487EB512C0A212217AA01E>49
-D<EA01FC3807FF80381C0FC0383003E0386001F0EB00F812F86C13FCA2147C1278003013
-FCC7FC14F8A2EB01F0EB03E014C0EB0780EB0F00131E13385B5B3801C00CEA0380380600
-185A5A383FFFF85AB512F0A216217CA01E>I E
-%EndDVIPSBitmapFont
-%DVIPSBitmapFont: Fc ecrm0700 7 2
-/Fc 2 51 df<13381378EA01F8121F12FE12E01200B3AB487EB512F8A215267BA521>49
-D<13FF000313E0380E03F0381800F848137C48137E00787F12FC6CEB1F80A4127CC7FC15
-005C143E147E147C5C495A495A5C495A010EC7FC5B5B903870018013E0EA018039030003
-0012065A001FB5FC5A485BB5FCA219267DA521>I E
-%EndDVIPSBitmapFont
-%DVIPSBitmapFont: Fd ecbx1000 10 35
-/Fd 35 122 df<913A03FF8007FE027F9039F07FFF800103B500FDB512E0010F903A00FF
-FE0FF0D93FF8ECF81F90267FE0019038F03FF849485A4816E014804816C00200ED1FF081
-F007C06F91C7FCA8B912E0A4000390C701C0C7FCB3ABB5D8FC3FEBFF80A43D3A7EB938>
-27 D<B61280A819087F9620>45 D<ED03E04B7EA24B7EA34B7EA24B7EA34B7EA292B57E
-A34A8015F302038015E1A202078015C0020F80ED807FA2021F80ED003F4A80023E131FA2
-027E80027C7F02FC814A7FA20101824A7F49B77EA3498202C0C7FC010F824A147FA2011F
-8291C8123F4982013E151FA2017E82017C8101FE83B500F80107B61280A4413A7DB948>
-65 D<B812C017FC17FF18C028007FF000037F04007F717E717E171F84A2717EA74D5AA2
-60173F4D5A4D5A4C13C0040F5B91B600FCC7FCA2EFFF8002F0C713F0EF3FF8717E717E71
-7E19807113C0A319E0A719C0A25F4D138019005FEF7FFE4C485AB912F018C095C7FC17F0
-3B397DB844>I<DB3FFCEB01C00203B5EAC003021FECF00791B6EAFC0F01039039FC00FF
-3F4901C0EB1FFFD91FFEC77E49481403D97FF080494880485B48177F4849153F4890C9FC
-181F485A180F123F5B1807127FA24993C7FC12FFAD127F7FF003C0123FA27F001F1707A2
-6C6C1780180F6C6D16006C6D5D6C173E6C6D157ED97FF85D6D6C4A5A6DB44A5A010701C0
-EB0FE06D01FCEBFF80010090B548C7FC021F14F8020314E09126003FFEC8FC3A3B7BB945
->I<B87E17F817FF18C028007FF8000713F09338007FF8EF1FFE717E050313807113C0A2
-7113E0F07FF0A2F03FF8A219FC181FA219FEA419FFAC19FEA419FC183FA219F8187F19F0
-F0FFE0A24D13C04D13804D1300EF1FFEEF7FFC933807FFF0B912C095C7FC17FC17804039
-7DB849>I<B912F0A426007FF8C7FCEF1FF8170717031701A21700A21878A3043C137C18
-3CA41800167CA216FC150391B5FCA4ECF8031500167CA2163C180FA3181EA293C7FCA218
-3EA2183C187CA218FCA2EF01F81703170F173FEE01FFB9FC18F0A338397DB83F>I<B612
-FCA439007FF800B3B3ADB612FCA41E397DB824>73 D<B7FCA426007FF8C9FCB3ACEF0780
-A5170F1800A35FA25FA25F5F5E5EEE0FFE167FB8FCA431397DB839>76
-D<B500F80403B512F06E5EA26E5ED8007FF1E000A2D97BFF161EA201796D5DA201786D5D
-A26E6C5DA36E6C4A5AA26E6C4A5AA26E6C4A5AA26E6C4A5AA26E6C141EA36E6D5BA26E6D
-5BA26F6C5BA26F6C485AA36F6C485AA26F6C485AA26F6C48C7FCA2923803FF1EA36F13BC
-A26F13F8A2705AA2705AA213FCB500FC6D4848B612F0A2EE0F80EE070054397DB85B>I<
-EDFFF8020FEBFF80027F14F0903A01FFC01FFC010790380007FFD91FFC010113C0D93FF0
-6D6C7E49486E7E49486E7E48496E7E48834890C86C7EA248486F1380A248486F13C0A200
-3F18E0A348486F13F0A400FF18F8AC007F18F06D5DA3003F18E0A26D5D001F18C0A26C6C
-4B13806C18006E5C6C6D4A5A6C5F6C6D4A5A6D6C4A5AD93FFC49485A6DB401075B0107D9
-C01F90C7FC010190B512FC6D6C14F0020F1480020001F8C8FC3D3B7BB948>79
-D<D907FF130E013FEBE01E90B5EAF83E0003ECFE7E3A07FC01FFFE390FF0001F4848130F
-48481303491301007F140090C8FC167E5A163EA27F161E7F7F6D91C7FC13FC387FFFE014
-FEECFFF06C14FE6F7E6C816C15F06C816C81C681133F010F801301D9000F1480EC007F03
-0F13C01503818100F0157FA3163FA27E17807E167F6C16007E6D14FE01E0495A01F81303
-9039FF801FF800FC90B512E0D8F83F5CD8F00749C7FC39E0007FF02A3B7BB935>83
-D<B600FC011FB512C0A426007FF8C8381FC000725AB3B3181F013F94C7FC8060011F163E
-6D6C157E187C6D6C15FC6D6D495A6D6DEB07F06D01F0EB1FE0DA7FFEEBFFC0021FB6C8FC
-02075C020014F0030F1380423A7DB849>85 D<EB3FFE0003B512E0000F14F8391FF00FFE
-003FEB03FF6D6C7F6E7FA26F7EA26C5A6C5AEA0380C8FCA2EC3FFF010FB5FC137F3901FF
-F87F00071380380FFE00EA3FF85B485A12FF5BA415FF6D5A127F263FF00713F83B1FFC1F
-BFFFC0390FFFFE1F0003EBF80F39003FE0032A257DA42E>97 D<13FFB5FCA412077EAF4A
-B47E020F13F0023F13FC9138FE03FFDAF00013804AEB7FC00280EB3FE091C713F0EE1FF8
-A217FC160FA217FEAA17FCA3EE1FF8A217F06E133F6EEB7FE06E14C0903AFDF001FF8090
-3AF8FC07FE009039F03FFFF8D9E00F13E0D9C00390C7FC2F3A7EB935>I<903801FFC001
-0F13FC017F13FFD9FF8013802603FE0013C048485AEA0FF8121F13F0123F6E13804848EB
-7F00151C92C7FC12FFA9127FA27F123FED01E06C7E15036C6CEB07C06C6C14806C6C131F
-C69038C07E006DB45A010F13F00101138023257DA42A>I<EE7F80ED7FFFA4150381AF90
-3801FF81010F13F1013F13FD9038FFC07F0003EB001FD807FC1307000F8048487F5B123F
-A2485AA312FFAA127FA27F123FA26C6C5B000F5C6C6C5B6C6C4913C02701FF80FD13FE39
-007FFFF9011F13E1010113012F3A7DB935>I<903803FF80011F13F0017F13FC3901FF83
-FE3A03FE007F804848133F484814C0001FEC1FE05B003FEC0FF0A2485A16F8150712FFA2
-90B6FCA301E0C8FCA4127FA36C7E1678121F6C6C14F86D14F000071403D801FFEB0FE06C
-9038C07FC06DB51200010F13FC010113E025257DA42C>I<EC1FF0903801FFFC010713FF
-90391FF87F8090383FE0FFD9FFC113C0A2481381A24813016E1380A2ED3E0092C7FCA8B6
-FCA4000390C8FCB3ABB512FEA4223A7DB91D>I<161FD907FEEBFFC090387FFFE348B6EA
-EFE02607FE07138F260FF801131F48486C138F003F15CF4990387FC7C0EEC000007F81A6
-003F5DA26D13FF001F5D6C6C4890C7FC3907FE07FE48B512F86D13E0261E07FEC8FC90CA
-FCA2123E123F7F6C7E90B512F8EDFF8016E06C15F86C816C815A001F81393FC0000F48C8
-138048157F5A163FA36C157F6C16006D5C6C6C495AD81FF0EB07FCD807FEEB3FF00001B6
-12C06C6C91C7FC010713F02B377DA530>I<13FFB5FCA412077EAFED7FC0913803FFF802
-0F13FE91381F03FFDA3C01138014784A7E4A14C05CA25CA291C7FCB3A3B5D8FC3F13FFA4
-303A7DB935>I<EA01F0EA07FC487EA2487EA56C5AA26C5AEA01F0C8FCA913FF127FA412
-077EB3A9B512F8A4153B7DBA1B>I<13FFB5FCA412077EAF92380FFFE0A4923803FC0016
-F0ED0FE0ED1F804BC7FC157E5DEC03F8EC07E04A5A141FEC7FE04A7E8181A2ECCFFEEC0F
-FF496C7F806E7F6E7F82157F6F7E6F7E82150F82B5D8F83F13F8A42D3A7EB932>107
-D<13FFB5FCA412077EB3B3ACB512FCA4163A7DB91B>I<01FED97FE0EB0FFC00FF902601
-FFFC90383FFF80020701FF90B512E0DA1F81903983F03FF0DA3C00903887801F000749DA
-CF007F00034914DE6D48D97FFC6D7E4A5CA24A5CA291C75BB3A3B5D8FC1FB50083B512F0
-A44C257DA451>I<01FEEB7FC000FF903803FFF8020F13FE91381F03FFDA3C0113800007
-13780003497E6D4814C05CA25CA291C7FCB3A3B5D8FC3F13FFA430257DA435>I<903801
-FFC0010F13F8017F13FFD9FF807F3A03FE003FE048486D7E48486D7E48486D7EA2003F81
-491303007F81A300FF1680A9007F1600A3003F5D6D1307001F5DA26C6C495A6C6C495A6C
-6C495A6C6C6CB45A6C6CB5C7FC011F13FC010113C029257DA430>I<9039FF01FF80B500
-0F13F0023F13FC9138FE07FFDAF00113800003496C13C00280EB7FE091C713F0EE3FF8A2
-EE1FFCA3EE0FFEAA17FC161FA217F8163F17F06E137F6E14E06EEBFFC0DAF00313809139
-FC07FE0091383FFFF8020F13E0020390C7FC91C9FCACB512FCA42F357EA435>I<9038FE
-03F000FFEB0FFEEC3FFF91387C7F809138F8FFC000075B6C6C5A5CA29138807F80ED3F00
-150C92C7FC91C8FCB3A2B512FEA422257EA427>114 D<90383FF0383903FFFEF8000F13
-FF381FC00F383F0003007E1301007C130012FC15787E7E6D130013FCEBFFE06C13FCECFF
-806C14C06C14F06C14F81203C614FC131F9038007FFE140700F0130114007E157E7E157C
-6C14FC6C14F8EB80019038F007F090B512C000F8140038E01FF81F257DA426>I<130FA5
-5BA45BA25B5BA25A1207001FEBFFE0B6FCA3000390C7FCB21578A815F86CEB80F014816C
-EBC3E090383FFFC06D1380903803FE001D357EB425>I<01FFEC3FC0B5EB3FFFA4000714
-016C80B3A35DA25DA26C5C6E4813E06CD9C03E13FF90387FFFFC011F13F0010313803025
-7DA435>I<B539F001FFF8A4000390C7EA1F00161E6E133E6C153C6E137C6C15786E13F8
-017F5CECF001013F5C14F8011F495AA2ECFC07010F5CECFE0F010791C7FC6E5A6D131E15
-BE6D13BC15FC6D5BA36E5AA26E5AA26E5AA26E5AA22D257EA432>I<B539F01FFFF0A400
-0390398003F8006C01C013E06C1407D97FE05B6D6C485A6E48C7FC90381FFC3E010F5B90
-3807FEFC6D6C5A5D6D5B6D5B6E7E6E7E814A7EA24A7E903801F3FFD903E37FD907C17FEB
-0FC049486C7E4A6C7E013E80496D7E49130F00016E7EB590383FFFF8A42D257EA432>
-120 D<B539F001FFF8A4000390C7EA1F00161E6E133E6C153C6E137C6C15786E13F8017F
-5CECF001013F5C14F8011F495AA2ECFC07010F5CECFE0F010791C7FC6E5A6D131E15BE6D
-13BC15FC6D5BA36E5AA26E5AA26E5AA26E5AA292C8FCA25C141E003F133E387F803C38FF
-C07C147814F8EBC1F0EBC3E06C485A387D1F80D83FFFC9FCEA1FFCEA07F02D357EA432>
-I E
-%EndDVIPSBitmapFont
-/Fe 254[28 1[{ TeXbbad153fEncoding ReEncodeFont }1 99.6264
-/CMSY10 rf
-%DVIPSBitmapFont: Ff ecrm1000 10 83
-/Ff 83 123 df<486C1360000314E039070001C0000EEB038048EB070000181306003813
-0E0030130C0070131C00601318A200E01338481330A400CEEB338039FF803FE001C013F0
-A3007F131FA2393F800FE0390E0003801C1981B91C>16 D<001C1307007FEB1FC039FF80
-3FE0A201C013F0A3007F131F001CEB073000001300A400011470491360A2000314E090C7
-12C048130100061480000E130348EB070048130E485B006013181C1980B91C>I<DA0FF8
-13FC91397FFF07FF903B01F807DF83C0903A07E001FF0F903B1F8007FE1FE090393F000F
-FC137E16F85B9338F007804848010790C7FC1503ACB812F8A32801F80003F0C7FCB3AB48
-6C497E267FFFE0B512F0A3333B7FBA30>27 D<EC0FF8EC7FFE903901F80780903907E001
-C090391F8000E090383F0007017E497EA25BA2485A6F5AED018092C8FCA9ED03F0B7FCA3
-3901F8000F1503B3AA486C497E267FFFE0B512C0A32A3B7FBA2E>I<EC0FFC91387FFF70
-903901F803F0903807E00790381F800FEB3F00137EA25B150748481303ADB7FCA33901F8
-0003B3AB486C497E267FFFE0B512C0A32A3B7FBA2E>I<DA0FF0EB1FF0DA7FFEEBFFFC90
-3B01F80F83F00F903C07E001CFC00380903C1F8000FF0001C090273F0007FE130F017E49
-48497EA2495CA248485C03076E5A03030203C7FC95C8FCA9F007E0BAFCA33C01F80003F0
-001F1807B3AA486C496C497E267FFFE0B500C1B51280A3413B7FBA45>I<DA0FF8EB1FF8
-DA7FFF9038FFFEE0903B01F80783F007903B07E001CFC00F903B1F8007FF001F4948485A
-017E5CA2495C180F48486D4813071503ACBAFCA33C01F80003F00007B3AB486C496C497E
-267FFFE0B500C1B51280A3413B7FBA45>I<007C137C00FE13FEEAFF01A3EAFE00A7007E
-13FC007C137CA8003C137800381338A700181330171E77BA2A>34
-D<017C166048B416F02607C3801401260F81C01403D900E04A5A001E01784A5A003E6D14
-1F003C013FEC7F80007C90271BE003FFC7FC0218B512BF007891381FFC3E00F8011CC75A
-020C14FC5F4C5A16035F4C5A160F5F4CC8FC021C5B00780118133E007C5D16FC003C0138
-5B003E90383001F0001EEB70036C01E05B903981C007C03907C3800F2601FF005BD8007C
-49C9FC90C748EB07C0033EEB1FF04BEB3C3803FCEBF81C4B497E913A01F001E006020301
-03130703E0497E912607C0071480020F15011580DA1F00018013C04A010F1300143E5C14
-FC5C495A13035C495A130F4A0107130149C701C013805B013E1603490203140001FC6F5A
-49020113064848913800F00E0003705A49ED3C3849ED1FF06C48ED07C03A437BBD45>37
-D<121C127FEAFF80A213C0A3127F121C1200A412011380A2120313005A1206120E5A5A5A
-12600A1979B917>39 D<146014E0EB01C0EB0380EB0700130E131E5B5BA25B485AA2485A
-A212075B120F90C7FCA25A121EA2123EA35AA65AB2127CA67EA3121EA2121F7EA27F1207
-7F1203A26C7EA26C7E1378A27F7F130E7FEB0380EB01C0EB00E01460135278BD20>I<12
-C07E12707E7E7E120F6C7E6C7EA26C7E6C7EA21378A2137C133C133E131EA2131F7FA214
-80A3EB07C0A6EB03E0B2EB07C0A6EB0F80A31400A25B131EA2133E133C137C1378A25BA2
-485A485AA2485A48C7FC120E5A5A5A5A5A13527CBD20>I<EB0380497EA7397803803C00
-FC147E00FE14FE397F8383FC393FC387F8390FE38FE03903FBBF803900FFFE00EB3FF8EB
-0FE0A2EB3FF8EBFFFE3903FBBF80390FE38FE0393FC387F8397F8383FC39FE0380FE00FC
-147E0078143C390007C000A76D5A1F247BBD2A>I<1530B3A8B912FCA2C80030C8FCB3A8
-36367BAF41>I<121C127FEAFF80A213C0A3127F121C1200A412011380A2120313005A12
-06120E5A5A5A12600A19798817>I<B512FCA516057F941C>I<121C127FEAFF80A5EA7F00
-121C0909798817>I<1506A2150E150CA2151C151815381530A215701560A215E015C0A2
-14011580A2140315005C1406A2140E140CA2141C1418A214381430A21470146014E05CA2
-13015CA2130391C7FCA25B1306A2130E130C131C1318A213381330A213701360A213E05B
-A212015B120390C8FCA25A1206A2120E120CA2121C1218A21238123012701260A212E05A
-A21F537BBD2A>I<EB03F8EB1FFF90387E0FC09038F803E03901E000F048481378000714
-7C48487FA248C77EA2481580A3007EEC0FC0A500FE15E0B3007E15C0A4007F141F6C1580
-A36C1500A26C6C133EA26C6C5B6C6C5BEBF0013900F803E090387E0FC0D91FFFC7FCEB03
-F823397DB62A>I<EB01C013031307131F13FFB5FCA2131F1200B3B3A7497E007FB512F0
-A31C3779B62A>I<EB0FF0EB7FFE48B57E3903E03FE0390F000FF0001E6D7E001C6D7E48
-6D7E5A6E7E126012FE6CEC7F807FA56CC7FC121CC8FCEDFF00A25D14015D14035D4A5A4A
-5A5D4A5A4AC7FC147E5C495A14E0495A495A49C8FC011EEB01805B5B4913034848140048
-5A485A90C75A48B6FC5A5A485CB6FCA321377CB62A>I<EB07F8EB3FFF90B512C03901F8
-0FF03903C007F848486C7E390E0001FEEA0F80391FE000FF7FA56C5A6C5AC7485AA25D14
-035D4A5A5DEC0F80027FC7FCEB1FFCECFF809038000FE06E7EEC01FC816E7EED7F80A216
-C0A2153F16E0A2121EEA7F80A2487EA316C0157F491480007EC7FC0070ECFF006C495A12
-1E390F8003F83907F00FF00001B512C06C6C90C7FCEB0FF823397DB62A>I<1538A21578
-15F8A2140114031407A2140F141F141B14331473146314C313011483EB03031307130613
-0C131C131813301370136013C01201EA038013005A120E120C5A123812305A12E0B712F8
-A3C73803F800AA4A7E0103B512F8A325387EB72A>I<0006140CD80780133C9038F003F8
-90B5FC5D5D158092C7FC14FC38067FE090C9FCAAEB07F8EB1FFE9038780F809038E007E0
-3907C003F0496C7E130000066D7E81C8FC8181A21680A4121C127F5A7FA390C713005D12
-FC00605C12704A5A6C5C6C1303001E495A6C6C485A3907E03F800001B5C7FC38007FFCEB
-1FE021397CB62A>I<EC3FC0903801FFF0010713FC90380FE03E90383F800790387E001F
-49EB3F804848137F485A12075B000FEC3F0049131E001F91C7FC5B123FA3127F90C9FCEB
-01FC903807FF8039FF1E07E090383801F0496C7E01607F01E0137E497F16805BED1FC0A3
-90C713E0A57EA47F123F16C0A2001FEC3F807F000F15006D5B000714FE6C6C5B6C6C485A
-3900FE07F090387FFFC0011F90C7FCEB03FC23397DB62A>I<12301238123E003FB612E0
-A316C05A168016000070C712060060140E5D5D00E014304814705D5DC712014A5A4AC7FC
-1406140E5CA25C1478147014F05C1301A213035C1307A2130FA3131F5CA2133FA5137FA9
-6DC8FC131E233A7BB72A>I<EB03F8EB1FFF017F13C09038FC07F03901E001F83903C000
-7C4848133C90C7123E48141E000E141F001E80A3121FA26D5B6D131E7FD80FF85B6D137C
-01FF13786C6D5A6CEBE3E0ECF780C601FFC7FC6D5A6D6C7E010F13E0013F7F01F97F3901
-E07FFE48486C7E380F800F48486C1380001E010113C0487F007C143F0078EC1FE0150F00
-F81407481403A21501A36C15C0A200781403007C15806C14076CEC0F006C6C131ED807E0
-137C3903F803F0C6B55A013F1380D907FCC7FC23397DB62A>I<EB03F8EB1FFF017F13C0
-3901FC07E048486C7E3907E001F8000F6D7E4848137E5B003F80A248C71380A25AED1FC0
-A516E0A56C143FA36C7E157F121F6C6C13FF6C6C13DF000313013901F0039F3900FC0F1F
-D93FFC13C0EB07F090C7FCA2153F1680A216005D120F486C137E486C5BA24A5A4A5A4948
-5A381F000F001CEB1F80260F807FC7FC3807FFFE000113F838003FC023397DB62A>I<12
-1C127FEAFF80A5EA7F00121CC7FCB2121C127FEAFF80A5EA7F00121C092479A317>I<12
-1C127FEAFF80A5EA7F00121CC7FCB2121C127FEAFF80A213C0A3127F121C1200A4120113
-80A2120313005A1206120E5A5A5A12600A3479A317>I<007FB812F8B912FCCCFCB0B912
-FC6C17F836147B9E41>61 D<EB3FE03801FFFE3907C03F80390E000FC0003CEB07F00030
-1303007014F8007C130100FE14FC7EA4127E003CEB03F8C7FCEC07F0A2EC0FE0EC1F80EC
-3F00147E147C5C495A5C495A5CA249C7FCA31306AA90C8FCA8130EEB3F80497EA56D5A01
-0EC7FC1E3B7CBA27>63 D<1538A3157CA315FEA34A7EA34A6C7EA202077FEC063FA2020E
-7FEC0C1FA2021C7FEC180FA202387FEC3007A202707FEC6003A202C07F1501A2D901807F
-81A249C77F167FA20106810107B6FCA24981010CC7121FA2496E7EA3496E7EA3496E7EA2
-13E0707E1201486C81D80FFC02071380B56C90B512FEA3373C7DBB3E>65
-D<B712E016FC16FF0001903980007FC06C90C7EA1FE0707E707E707EA2707EA283A75F16
-035F4C5A4C5A4C5A4C5AEEFF8091B500FCC7FCA291C7EA7F80EE1FE0EE07F0707E707E83
-707EA21880177F18C0A7188017FFA24C13005F16034C5AEE1FF8486DEB7FF0B812C094C7
-FC16F832397DB83B>I<913A01FF800180020FEBE003027F13F8903A01FF807E07903A03
-FC000F0FD90FF0EB039F4948EB01DFD93F80EB00FF49C8127F01FE153F12014848151F48
-48150FA248481507A2485A1703123F5B007F1601A35B00FF93C7FCAD127F6DED0180A312
-3F7F001F160318006C7E5F6C7E17066C6C150E6C6C5D00001618017F15386D6C5CD91FE0
-5C6D6CEB03C0D903FCEB0F80902701FF803FC7FC9039007FFFFC020F13F002011380313D
-7BBA3C>I<B712C016F816FE000190398001FF806C90C7EA3FE0EE0FF0EE03F8707E707E
-177FA2EF3F8018C0171F18E0170F18F0A3EF07F8A418FCAC18F8A4EF0FF0A218E0A2171F
-18C0EF3F80A2EF7F0017FE4C5A4C5AEE0FF0EE3FE0486DEBFF80B8C7FC16F816C036397D
-B83F>I<B812FEA3000190388000076C90C8FC173F838383A383A31880170116C0A394C7
-FCA31501A21503150F91B5FCA3EC000F15031501A21500A21860A318E093C712C0A41701
-A3EF0380A21707A2170F173F177F486D903807FF00B9FCA333397EB838>I<B812F8A300
-01903880001F6C90C71201EE00FC177C173C171CA2170CA4170E1706A2ED0180A21700A4
-1503A21507151F91B5FCA3EC001F15071503A21501A692C8FCAD4813C0B612C0A32F397D
-B836>I<DBFF8013C0020FEBF001023F13FC9139FF803F03903A03FC000787D90FF0EB03
-CF4948EB00EF4948147F4948143F49C8121F485A4848150F48481507A248481503A2485A
-1701123F5B007F1600A448481600AB93B6FCA26C7E9338007FE0EF3FC0A2123F7F121FA2
-6C7EA26C7EA26C7E6C7E6C6C157F6D7E6D6C14FF6D6C14EFD90FF8EB03C7D903FEEB0783
-903A00FFC03F0191393FFFFC00020F01F0130002001380383D7CBA41>I<B648B512FEA3
-0001902680000313006C90C76C5AB3A491B6FCA391C71201B3A6486D497EB648B512FEA3
-37397DB83E>I<B612C0A3C6EBC0006D5AB3B3AD497EB612C0A31A397EB81E>I<013FB512
-E0A39039001FFC00EC07F8B3B3A3123FEA7F80EAFFC0A44A5A1380D87F005B0070131F6C
-5C6C495A6C49C7FC380781FC3801FFF038007F80233B7DB82B>I<B649B5FCA300010180
-9038007FF06C90C8EA3F80053EC7FC173C17385F5F4C5A4C5A4CC8FC160E5E5E5E5E4B5A
-ED0780030EC9FC5D153E157E15FF5C4A7F4A6C7E140E4A6C7E4A6C7E14704A6C7E4A6C7E
-14804A6C7E6F7EA26F7F707EA2707E707EA2707EA2707E707EA2707E707F8484486D497F
-B6011FEBFF80A339397DB841>I<B612E0A3000101C0C8FC6C90C9FCB3AD1718A5173817
-30A31770A317F0A216011603160FEE1FE0486D13FFB8FCA32D397DB834>I<B5933807FF
-F86E5DA20001F0FC002600DFC0ED1BF8A2D9CFE01533A3D9C7F01563A3D9C3F815C3A2D9
-C1FCEC0183A3D9C0FEEC0303A2027F1406A36E6C130CA36E6C1318A26E6C1330A36E6C13
-60A26E6C13C0A3913901FC0180A3913900FE0300A2ED7F06A3ED3F8CA2ED1FD8A3ED0FF0
-A3486C6D5A487ED80FFC6D48497EB500C00203B512F8A2ED018045397DB84C>I<B59138
-07FFFE8080C69238007FE06EEC1F80D9DFF0EC0F001706EBCFF8EBC7FCA2EBC3FEEBC1FF
-A201C07F6E7EA26E7E6E7E81140F6E7E8114036E7E168080ED7FC016E0153FED1FF0ED0F
-F8A2ED07FCED03FEA2ED01FF6F1386A2EE7FC6EE3FE6A2EE1FF6EE0FFEA216071603A216
-011600A2177E486C153E487ED80FFC151EB500C0140EA2170637397DB83E>I<EC03FF02
-1F13E09138FE01FC903901F8007ED907E0EB1F8049486D7ED93F80EB07F049C76C7E01FE
-6E7E48486E7E49157E0003167F4848ED3F80A24848ED1FC0A2001F17E049150F003F17F0
-A3007F17F8491507A300FF17FCAC007F17F86D150FA3003F17F0A26C6CED1FE0A36C6CED
-3FC0000717806D157F000317006C6C15FEA26C6C4A5A017F4A5A6D6C495A6D6C495AD907
-E0EB1F80D903F8017FC7FC903900FE01FC91381FFFE0020390C8FC363D7BBA41>I<B712
-C016FC16FF0001D9800013C06C90C7EA1FE0707EEE03F883707EA2707EA21880A71800A2
-4C5AA24C5A5FEE0FF04C5AEEFF8091B548C7FC16F091CAFCB3A5487FB6FCA331397EB838
->I<EC03FF021F13E09138FE01FC903901F8007ED907E0EB1F8049486D7ED93F80EB07F0
-49C76C7E01FE6E7E48486E7EA24848157F0007178049153F000F17C049151F001F17E0A2
-4848ED0FF0A3007F17F8A2491507A200FF17FCAC007F17F8A26D150FA2003F17F0A26C6C
-ED1FE0A36C6CED3FC00007027C14804AB4FC3C03F80383807F003B01FC0701C0FEEC0E00
-2600FE0CEBE1FC017FEC63F8D93F8CEB77F0D91FCCEB3FE0D907EE14806DB449C7FC0100
-D981FC130CEC1FFF0203131C91C7001E131C161F183CEF807CEFC0F8EE0FFFA318F08218
-E07013C07013809338007E00364B7BBA41>I<B612FEEDFFE016F8000190388007FE6C90
-C76C7EEE3FC0707E707E707EA2707EA283A65FA24C5AA24C5A4C5AEE3F8004FFC8FCED07
-FC91B512E05E9138000FF0ED03F8ED00FE82707E707EA2161F83A583A6F00180A217F816
-0F1803486D01071400B66D6C5A04011306933800FE0ECAEA3FFCEF07F0393B7DB83D>I<
-D90FF813C090383FFE0190B512813903F807E33907E000F74848137F4848133F48C7121F
-003E140F007E1407A2007C140312FC1501A36C1400A37E6D14006C7E7F13F86CB47E6C13
-F8ECFF806C14E06C14F86C14FEC680013F1480010714C0EB007F020713E0EC007FED3FF0
-151F150FED07F8A200C01403A21501A37EA216F07E15036C15E06C14076C15C06C140F6D
-EB1F80D8FBF0EB3F00D8F0FE13FE39E03FFFF8010F13E0D8C00190C7FC253D7CBA2E>I<
-003FB812E0A3D9C003EB001F273E0001FE130348EE01F00078160000701770A300601730
-A400E01738481718A4C71600B3B0913807FF80011FB612E0A335397DB83C>I<B6903807
-FFFEA3000101809038007FE06C90C8EA1F80EF0F001706B3B2170E6D150C80171C133F17
-186D6C14385F6D6C14F06D6C5C6D6C495A6D6CEB07806D6C49C7FC91387F807E91381FFF
-F8020713E09138007F80373B7DB83E>I<B500FC91387FFF80A30003018091380FFC006C
-90C8EA07E0715A6C705A6E1403017F93C7FCA280013F1506A26E140E011F150C80010F5D
-A28001075DA26E147001031560A26D6C5CA2806D4A5AA2ED8003027F91C8FCA291383FC0
-06A215E0021F5BA2EDF01C020F1318A26E6C5AA215FC02035BA2EDFEE002015BA26E6C5A
-A36FC9FCA3153EA2151CA3393B7EB83E>I<B5D8FC07B5D8F001B5FCA30007902780001F
-FEC7EA1FF86C48C7D80FF8EC07E000010307ED03C01B807F6C6F6C1500A26E5F017F6E6C
-1406A280013F4A6C5CA280011F4A6D5BEE067FA26D6C010E6D5BEE0C3FA26D6C011C6D5B
-EE181FA26D6C6F5BEE300FA26D6C6F485AEE6007A26D6C4CC7FC9338C003FCA203805D91
-3B7F818001FE06A203C1150EDA3FC3C7EAFF0CA203E3151CDA1FE6EC7F98A215F6DA0FFC
-EC3FF0A302075E4B141FA202035E4B140FA202015E4B1407A2020093C8FC4B80503B7EB8
-55>I<1303EB0FC0497E497EEB7CF83801F87E3803E01F3907C00F80391F0003E0003CEB
-00F048147800E0141C48140C1E0D76B333>94 D<007FB81280B912C0A26C178032047970
-41>I<EB1FE0EBFFFC3803E03F3907000F80390F8007E0486C6C7E13E06E7EA26E7E6C5A
-6C5AC8FCA4147FEB07FFEB3FE0EBFE00EA03F8EA0FF0EA1FC0123F485A90C7FC160C12FE
-A31401A26C13036CEB077C903980063E18383FC01E3A0FE0781FF03A03FFF00FE03A007F
-8007C026277DA52A>97 D<EA03F012FFA3120F1203B0EC1FE0EC7FF89038F1E03E9039F3
-801F809039F7000FC001FEEB07E049EB03F049EB01F85BED00FCA216FEA2167E167FAA16
-7E16FEA216FC15016D14F8ED03F07F01EEEB07E001C6EB0FC09039C7801F00903881E07E
-903800FFF8C7EA1FC0283B7EB92E>I<EB03FC90381FFF8090387E03E03901F800704848
-13F83907E001FC380FC003A2EA1F80123F90380001F848EB00F01500A2127E12FEAA127E
-127FA26C14067F001F140E6D130C000F141C6C6C13386C6C13706C6C13E039007C07C090
-381FFF00EB07F81F277DA525>I<ED0FC0EC03FFA3EC003F150FB0EB03F8EB1FFF90387E
-078F9038F801EF3903F0007F4848133F4848131FA24848130F123F90C7FC5AA2127E12FE
-AA127E127FA27EA26C6C131FA26C6C133F6C6C137F6C6CEBEFF03A01F801CFFF39007C07
-8F90381FFE0FD907F813C0283B7DB92E>I<EB07F8EB1FFF90387C0FC03901F803E03903
-F001F0D807E013F8380FC0004848137CA248C7127E153E5A153F127E12FEA3B7FCA248C8
-FCA5127EA2127FA26C14037F001F14076C6C13060007140E6D131CD801F013386C6C1370
-90387E03E090381FFF80903803FC0020277EA525>I<147E903803FF8090380FC1E0EB1F
-8790383F0FF0137EA213FCA23901F803C091C7FCADB512FCA3D801F8C7FCB3AB487E387F
-FFF8A31C3B7FBA19>I<ED03F090390FF00FF890393FFC3C3C9039F81F707C3901F00FE0
-3903E007C03A07C003E010000FECF000A248486C7EA86C6C485AA200075C6C6C485A6D48
-5A6D48C7FC38073FFC38060FF0000EC9FCA4120FA213C06CB512C015F86C14FE6CECFF80
-4815C03A0F80007FE048C7EA0FF0003E140348140116F8481400A56C1401007C15F06CEC
-03E0003F1407D80F80EB0F80D807E0EB3F003901FC01FC39007FFFF0010790C7FC26387E
-A52A>I<EA03F012FFA3120F1203B0EC0FF0EC3FFCECF03F9039F1C01F809039F3800FC0
-EBF70013FE496D7EA25BA35BB3A3486C497EB500C1B51280A3293A7EB92E>I<EA0380EA
-0FE0487EA56C5AEA0380C8FCAAEA03F012FFA312071203B3AA487EB512C0A312387EB717
->I<EB01C0EB07F0EB0FF8A5EB07F0EB01C090C7FCAAEB01F813FFA313071301B3B3A212
-3C127E00FF13F01303A214E038FE07C0127C383C0F00EA0FFEEA03F8154984B719>I<EA
-03F012FFA3120F1203B1913801FFFCA39138007FC01600157C15705D4A5A4A5A4AC7FC14
-1E1438147814FC13F1EBF3FEEBF73F01FE7FEBF81F496C7E8114076E7E6E7E811400157E
-157F811680ED1FC0486CEB3FF0B500C0B5FCA3283A7EB92C>I<EA03F012FFA3120F1203
-B3B3AD487EB512C0A3123A7EB917>I<2703F00FF0EB1FE000FFD93FFCEB7FF8913AF03F
-01E07E903BF1C01F83803F3D0FF3800FC7001F802603F70013CE01FE14DC49D907F8EB0F
-C0A2495CA3495CB3A3486C496CEB1FE0B500C1B50083B5FCA340257EA445>I<3903F00F
-F000FFEB3FFCECF03F9039F1C01F803A0FF3800FC03803F70013FE496D7EA25BA35BB3A3
-486C497EB500C1B51280A329257EA42E>I<EB03FE90380FFF8090383E03E09038F800F8
-4848137C48487F48487F4848EB0F80001F15C090C712074815E0A2007EEC03F0A400FE15
-F8A9007E15F0A2007F14076C15E0A26C6CEB0FC0000F15806D131F6C6CEB3F006C6C137E
-C66C13F890387E03F090381FFFC0D903FEC7FC25277EA52A>I<3903F01FE000FFEB7FF8
-9038F1E07E9039F3801F803A07F7000FC0D803FEEB07E049EB03F04914F849130116FC15
-0016FEA3167FAA16FEA3ED01FCA26DEB03F816F06D13076DEB0FE001F614C09039F7803F
-009038F1E07E9038F0FFF8EC1FC091C8FCAB487EB512C0A328357EA42E>I<D903F813C0
-90381FFE0190387E07819038FC01C33903F000E3000714774848133749133F001F141F48
-5A150F48C7FCA312FEAA127FA37E6D131F121F6D133F120F6C6C137F6C6C13EF3901F801
-CF39007E078F90381FFE0FEB07F890C7FCABED1FE00203B5FCA328357DA42C>I<3807E0
-1F00FFEB7FC09038E1E3E09038E387F0380FE707EA03E613EE9038EC03E09038FC008049
-1300A45BB3A2487EB512F0A31C257EA421>I<EBFF03000313E7380F80FF381E003F487F
-487F00707F12F0A2807EA27EB490C7FCEA7FE013FF6C13E06C13F86C7F00037FC67F0107
-1380EB007F141F00C0EB0FC01407A26C1303A37E15806C13077EEC0F00B4131E38F3C07C
-38E1FFF038C03F801A277DA521>I<1318A51338A31378A313F8120112031207001FB5FC
-B6FCA2D801F8C7FCB215C0A93800FC011580EB7C03017E13006D5AEB0FFEEB01F81A347F
-B220>I<D803F0EB07E000FFEB01FFA3000FEB001F00031407B3A4150FA3151F12016D13
-3F0000EC77F86D9038E7FF8090383F03C790381FFF87903A03FC07E00029267EA42E>I<
-B538803FFEA33A0FF8000FF06C48EB07E00003EC03C06D148000011500A26C6C1306A26D
-130E017E130CA26D5BA2EC8038011F1330A26D6C5AA214E001075BA2903803F180A3D901
-FBC7FCA214FF6D5AA2147CA31438A227257EA32C>I<B53A1FFFE03FFEA3260FF8009038
-000FF86C48017EEB03E018C00003023EEB0180A26C6C013FEB0300A36C6CEC8006156FA2
-017E9038EFC00C15C7171CD93F01EBE01815830281EBF038D91F831430150102C3EBF870
-90260FC6001360A2D907E66D5A02EC137CA2D903FCEB7F804A133FA2010192C7FC4A7FA2
-0100141E4A130E0260130C37257EA33C>I<B538807FFFA33A03FE003FF00001EC1F8000
-0092C7FC017E131C6D13186D6C5AECC070010F5B6D6C5AECF180EB03FB6DB4C8FC6D5AA2
-147F804A7E8114CF903801C7E090380383F090380703F8EB0601496C7E011C137E49137F
-01787F496D7E486C80000FEC3FF0D8FFFE90B51280A329247FA32C>I<B538803FFEA33A
-0FF8000FF06C48EB07C00003EC03806C7E16007F00001406A2017E5BA2137F6D5BA26D6C
-5AA2ECC070010F1360A26D6C5AA214F101035BA2D901FBC7FCA214FF6D5AA2147CA31438
-A21430A214701460A25CA2EA7C0100FE5B130391C8FC1306EAFC0EEA701C6C5AEA1FF0EA
-0FC027357EA32C>I<003FB512FCA2EB8003D83E0013F8003CEB07F00038EB0FE0123000
-70EB1FC0EC3F800060137F150014FE495AA2C6485A495AA2495A495A495AA290387F0006
-13FEA2485A485A0007140E5B4848130C4848131CA24848133C48C7127C48EB03FC90B5FC
-A21F247EA325>I E
-%EndDVIPSBitmapFont
-%DVIPSBitmapFont: Fg ecbx1200 12 59
-/Fg 59 123 df<DB0FFFEB03FF4AB5D8C03F13C0020F02F1B512E0027F91B612F0902701
-FFF8039038FE1FF849018002F813FC010F4948EBF03F49484913E0495A4A15C0495AF11F
-F801FF16804A6DEC07E070EC018096C7FCABBA12F0A5C69026E000030180C7FCB3B0007F
-D9FFC1B67EA546467EC541>27 D<B612F8A91D097F9A25>45 D<EA07C0EA1FF0EA3FF8EA
-7FFCEAFFFEA7EA7FFCEA3FF8EA1FF0EA07C00F0F788E1F>I<EC3FF849B5FC010F14E001
-3F14F890397FF01FFC9039FFC007FE4890380001FF48486D1380000716C049147F000F16
-E049143F001F16F0A2003F16F8A249141F007F16FCA600FF16FEB3A3007F16FCA56C6CEC
-3FF8A3001F16F0A2000F16E06D147F000716C06D14FF6C6C4913806C6D4813006C6D485A
-90397FF01FFC6DB55A010F14E0010314809026003FF8C7FC2F427CC038>48
-D<EC03C01407141F147FEB03FF133FB6FCA413C3EA0003B3B3ADB712FCA5264177C038>
-I<ECFFE0010F13FE013F6D7E90B612E0000315F82607FC0313FE3A0FE0007FFFD81F806D
-138048C7000F13C0488001C015E001F07F00FF6E13F07F17F881A46C5A6C5A6C5AC9FC17
-F05DA217E05D17C04B13804B1300A2ED1FFC4B5A5E4B5A4B5A4A90C7FC4A5A4A5AEC0FF0
-4A5AEC3F804AC7127814FE495A494814F8D907E014F0495A495A49C8FC017C1401491403
-48B7FC4816E05A5A5A5A5AB8FC17C0A42D417BC038>I<ECFFF0010713FF011F14C0017F
-14F049C66C7ED803F8EB3FFED807E06D7E81D80FF86D138013FE001F16C07FA66C5A6C48
-15806C485BC814005D5E4B5A4B5A4B5A4A5B020F1380902607FFFEC7FC15F815FF16C090
-C713F0ED3FFCED0FFEEEFF80816F13C017E0A26F13F0A217F8A3EA0FC0EA3FF0487EA248
-7EA217F0A25D17E06C5A494913C05BD83F80491380D81FF0491300D80FFEEBFFFE6CB612
-F800015D6C6C14C0011F49C7FC010113E02D427BC038>I<163FA25E5E5D5DA25D5D5D5D
-A25D92B5FCEC01F7EC03E7140715C7EC0F87EC1F07143E147E147C14F8EB01F0EB03E013
-0714C0EB0F80EB1F00133E5BA25B485A485A485A120F5B48C7FC123E5A12FCB91280A5C8
-000F90C7FCAC027FB61280A531417DC038>I<0007150301E0143F01FFEB07FF91B6FC5E
-5E5E5E5E16804BC7FC5D15E092C8FC01C0C9FCAAEC3FF001C1B5FC01C714C001DF14F090
-39FFE03FFC9138000FFE01FC6D7E01F06D13804915C0497F6C4815E0C8FC6F13F0A317F8
-A4EA0F80EA3FE0487E12FF7FA317F05B5D6C4815E05B007EC74813C0123E003F4A1380D8
-1FC0491300D80FF0495AD807FEEBFFFC6CB612F0C65D013F1480010F01FCC7FC010113C0
-2D427BC038>I<4AB47E021F13F0027F13FC49B6FC01079038807F8090390FFC001FD93F
-F014C04948137F4948EBFFE048495A5A1400485A120FA248486D13C0EE7F80EE1E00003F
-92C7FCA25B127FA2EC07FC91381FFF8000FF017F13E091B512F89039F9F01FFC9039FBC0
-07FE9039FF8003FF17804A6C13C05B6F13E0A24915F0A317F85BA4127FA5123FA217F07F
-121FA2000F4A13E0A26C6C15C06D4913806C018014006C6D485A6C9038E01FFC6DB55A01
-1F5C010714C0010191C7FC9038003FF02D427BC038>I<121E121F13FC90B712FEA45A17
-FC17F817F017E017C0A2481680007EC8EA3F00007C157E5E00785D15014B5A00F84A5A48
-4A5A5E151FC848C7FC157E5DA24A5A14035D14074A5AA2141F5D143FA2147F5D14FFA25B
-A35B92C8FCA35BA55BAA6D5A6D5A6D5A2F447AC238>I<EC7FF00103B5FC010F14C0013F
-14F090397F801FFC3A01FC0003FE48486D7E497F4848EC7F80163F484815C0A2001F151F
-A27FA27F7F01FE143F6D158002C0137F02F014006C01FC5B6E485A6C9038FF83FCEDE7F8
-6CECFFE06C5D6C92C7FC6D14C06D80010F14F882013F8090B7FC48013F14802607FC0F14
-C0260FF80314E04848C6FC496D13F0003F141F48481307496D13F8150000FF157F90C812
-3F161F160FA21607A36D15F0127F160F6D15E06C6C141F6DEC3FC06C6CEC7F80D80FFE90
-3801FF003A07FFC00FFE6C90B55AC615F0013F14C0010F91C7FC010013F02D427BC038>
-I<EC7FF0903807FFFE011F6D7E017F14E09039FFE03FF0489038800FF848496C7E484880
-48486D7E001F80003F1680A2484815C08117E0A212FF17F0A617F8A45D127FA3003F5CA2
-6C7E5D6C6C5B12076C6C131E6CEBC07C6CEBFFF8013F5B010F01C013F00101130090C8FC
-A217E05DA2EA03C0D80FF015C0487E486C491380A217004B5A150F5E49495A6C48495A01
-C0EBFFE0260FF0035B6CB65A6C4AC7FC6C14F86C6C13E0D907FEC8FC2D427BC038>I<90
-3807FFC0013F13FC48B612804815E0260FF80013F0D81FC0EB3FF848C7EA1FFC4815FE01
-C0130F486C14FF7FA66C485B6C4814FE000FC7FCC8EA3FFCED7FF8EDFFF04A13E04A1380
-1600EC07FC4A5A5D4A5A5D4A5A92C7FCA2147E147CA31478AA91C8FCA814F8EB03FE497E
-497FA2497FA56D5BA26D90C7FC6D5AEB00F828467AC535>63 D<EE1F80A24C7EA24C7EA3
-4C7EA24B7FA34B7FA24B7FA34B7F169F031F80161F82033F80ED3E07037E80157C8203FC
-804B7E02018115F0820203814B137F0207815D173F020F814B7F021F8292C77EA24A8202
-3E80027E82027FB7FCA291B87EA2498302F0C8FCA20103834A157F0107834A153FA24948
-8284011F8491C97E4984133E017E82B6020FB612F0A54C457CC455>65
-D<B9FC18F018FE727E19E026003FFCC700077F05017F716C7E727E727EA2721380A37213
-C0A74E1380A24E1300A24E5A4E5A4E5A4D5B05075B94B5128091B700FCC7FC18F018FF19
-E002FCC7000113F8716C7EF01FFE727E7213801AC07213E0A27213F0A31AF8A71AF0A260
-1AE0604E13C0604E138095B5120005075BBA12F86119C04EC7FC18E045447CC350>I<DC
-FFF01470031F01FF14F04AB6EAE0010207EDF803023FEDFE0791B539E001FF0F4949C7EA
-3F9F010701F0EC0FFF4901C0804990C87E4948814948814948167F4849163F4849161F5A
-4A160F485B19074890CAFC19035A5BA2007F1801A34994C7FC12FFAE127F7F1AF0A2123F
-A27F6C18011AE06C7F19036C6D17C06E16077E6C6DEE0F806C6DEE1F006D6C5E6D6C167E
-6D6C6C5D6D6D4A5A6D01F0EC07F0010101FEEC1FE06D903AFFF001FF80023F90B6C7FC02
-0715FC020115F0DA001F1480030001F8C8FC44467AC451>I<B9FC18F018FE727E19E026
-003FFEC7001F13F805017F9438003FFF060F7F727F727F727F84737E737EA2737EA2737E
-A21B80A2851BC0A51BE0AD1BC0A51B8061A21B006162193F624F5A19FF624E5B06075B4E
-5B063F90C7FC4DB45A050F13F8BA5A19C04EC8FC18F095C9FC4B447CC356>I<BA12F8A4
-85D8001F90C71201EF003F180F180318011800A2197E193EA3191EA21778A285A405F890
-C7FCA316011603161F92B5FCA5ED001F160316011600A2F101E01778A2F103C0A494C7FC
-1907A21A80A2190FA2191FA2193FF17F0061601807181F4DB5FCBBFC61A443447DC34A>
-I<BA1280A419C026003FFEC7121F1701EF007F183F181F180F180719E01803A31801A3EE
-01E0F000F0A419001603A31607160F167F91B6FCA59138FE007F160F16071603A31601A6
-93C9FCAFB712F0A53C447CC346>I<DCFFF01470031F01FF14F04AB6EAE0010207EDF803
-023FEDFE0791B539E001FF0F4949C7EA3F9F010701F0EC0FFF4901C0804990C87E494881
-4948814948167F4849163F4849161F5A4A160F485B19074890CAFC19035A5BA2007F1801
-A34994C8FC12FFAD057FB612F0127F7FA3003FDC0001EBF000A27F7EA26C7FA26C7F807E
-6C7F6C7F6D7E6D6C5D6D6C7E6D6D5C6D01F05C010101FE143F6D903AFFF001FF9F023F90
-B6120F0207EDFC030201EDF000DA001F02C01330030001FCC9FC4C467AC458>I<B7D880
-03B612FEA526003FFEC9EBF800B3A791B9FCA54AC9FCB3AAB7D88003B612FEA54F447CC3
-58>I<0107B7FCA590C7001F1300B3B3A9EA1FE0487E487EA2487EA44B5AA26C48495A49
-5C6C4813FF6C48485B260FFC0713C06CB65A6C4AC7FCC66C13F8010F138030457DC33A>
-74 D<B712F0A526003FFECAFCB3B1F00780A4180F1900A460A360A2187EA218FE170117
-031707171F177FEE03FFB95AA539447CC343>76 D<B500FE067FB512806E95B6FCA26F5E
-A2D8003F50C7FC013D6DEE03DFA2013C6DEE079FA26E6CEE0F1FA26E6C161EA26E6C163C
-A36E6C1678A26E6C16F0A26E6DEC01E0A26E6DEC03C0A36E6DEC0780A26F6CEC0F00A26F
-6C141EA26F6C5CA36F6C5CA26F6C5CA26F6D485AA26F6D485AA26F6D485AA3706C48C7FC
-A293383FF81EA2706C5AA2706C5AA3706C5AA2705BA2705BA2705BA2B6057FB6128071C7
-FCA2173E171C61447CC36A>I<B64BB512FE8181A281D8003F6D91C7EA780081013D7F81
-133C6E7E6E7F6E7F6E7F6E7F82806E7F6E7F6F7E6F7F83816F7F6F7F6F7F6F7F6F7F8382
-707F707F707F707F8482707F707F717E7113807113C019E0837113F07113F87113FC7113
-FE19FF847213F884848484A28484197F193F191FA2190F1907B61603190119001A78A24F
-447CC358>I<923807FFC092B512FE0207ECFFC0021F15F091267FFE0013FC902601FFF0
-EB1FFF01070180010313C04990C76C7FD91FFC6E6C7E49486F7E49486F7E01FF8348496F
-7E48496F1380A248496F13C0A24890C96C13E0A24819F04982003F19F8A3007F19FC4917
-7FA400FF19FEAD007F19FC6D17FFA3003F19F8A26D5E6C19F0A26E5D6C19E0A26C6D4B13
-C06C19806E5D6C6D4B13006C6D4B5A6D6C4B5A6D6C4B5A6D6C4A5B6D01C001075B6D01F0
-011F5B010101FE90B5C7FC6D90B65A023F15F8020715C002004AC8FC030713C047467AC4
-54>I<B9FC18F018FE727E19E0D8001F90C7000F7F05017F716C7E727E727E721380A21A
-C084A21AE0A91AC0A24E1380A21A00604E5A4E5A4D485A050F5B92B712C096C7FC18FC18
-C092CBFCB3A7B712E0A543447DC34D>I<923807FFC092B512FE0207ECFFC0021F15F091
-267FFE0013FC902601FFF0EB1FFF010701C0010713C04990C700017F49486E7F49486F7E
-49486F7E49486F7E48496F7E48496F1380A248496F13C0A24819E091C97E4819F0A24848
-7013F8A3007F19FCA249177FA300FF19FEAD007F19FCA36D17FF003F19F8A3001F19F06D
-5EA26C19E06E01FE5B6C912603FF8014C06C6D486D4813804B13E06C9028E01F83F00F13
-006C903BF01E00F81FFE90267FF83E90387C3FFC90263FFC3C6D485AD91FFE91381EFFF0
-D90FFF021F5B6D01FE5D010194C7FC6D6D6CB45A023F90B512F8020703E0130202006F13
-07030713C792C7EA07F8716C130F72131F9538FF80FF96B5FC7114FEA3831AFCA27213F8
-1AF0847213E07213C0721300F001FC48587AC454>I<B812F8EFFFC018F818FE727ED800
-1F90C7003F13E005037F05007F727E727E727EA28684A286A762A24E90C7FCA24E5A6118
-7F943801FFF005075B053F138092B7C8FC18F818E018F892C77FEF3FFF050F7F717F717F
-A2717FA2717FA785A61B0F85A2187F73131F72141EB700E06DEB803E72EBE0FC72EBFFF8
-060114F0726C13E0CC0007138050457DC354>I<DAFFE0131C010701FE133C013F9038FF
-807C90B6EAE0FC4815F9489038801FFF3907FC00014848EB007F4848143F4848140F4914
-07007F15035B1601160012FF177CA27FA26D153C7F7F6D92C7FC6C7EEBFFE014FE6CEBFF
-F015FF6C15E016FC6C816C6F7E6C826C826C6C81011F810107811300020F80140003077F
-ED007F82040F1380828212F082A282A27EA218007EA26C5D6C5E6D14036D5D6D140701F8
-4A5A01FFEC3FF002F8EBFFE0486CB65AD8FC1F92C7FCD8F80714FC48C614F04801071380
-31467AC43E>I<003FBA12E0A59026FE000FEB8003D87FE09338003FF049171F90C71607
-A2007E1803007C1801A300781800A400F819F8481978A5C81700B3B3A20107B8FCA54543
-7CC24E>I<B76C010FB512F8A526003FFEC93803E000B3B3A9011F17076280190F6D606F
-151F6D95C7FC6D6D5D197E6D6D5D6D6D1403DA7FFC4A5A6EB4EC3FF0020F9039F003FFE0
-6E90B61280020193C8FC6E6C14FC030F14E09226007FFEC9FC4D457CC356>I<B600FE01
-7FB691B512FEA526007FFCC8D83FFEC9EA7C006E82013F701778807415F86D705F6F7014
-016D705FA26F7014036D64814E6D14076D646F70140F6D041E94C7FCA26F023E6D5C6DDC
-3C7F151E81027F037C6D5CF0783F6F70147C023F4B6C1578A26F01016F13F86E4B6C5D16
-806E02036F485A4E7E04C0EEE0036E4A486C5DA2DCE00FEDF0076E4B6C5D16F06E4A6F48
-C8FC051E7F04F8705A6E4A027F131EA2DCFC7CEDFE3E037F0178023F133C04FE16FF033F
-01F85E4D8004FF17F86F496E5BA36F496E5BA26F604D80A26F90C86C5BA36F486F90C9FC
-A26F48167EA30478163C6F457EC374>87 D<903801FFE0011F13FE017F6D7E48B612E03A
-03FE007FF84848EB1FFC6D6D7E486C6D7EA26F7FA36F7F6C5A6C5AEA00F090C7FCA40203
-B5FC91B6FC1307013F13F19038FFFC01000313E0481380381FFE00485A5B127F5B12FF5B
-A35DA26D5B6C6C5B4B13F0D83FFE013EEBFFC03A1FFF80FC7F0007EBFFF86CECE01FC66C
-EB8007D90FFCC9FC322F7DAD36>97 D<EB7FC0B5FCA512037EB1ED0FF892B57E02C314E0
-02CF14F89139DFC03FFC9139FF000FFE02FCEB03FF4A6D13804A15C04A6D13E05CEF7FF0
-A218F8173FA318FCAC18F8A2177F18F0A3EFFFE06E15C06E5B6E491380027C491300496C
-495A903AFC1FC07FFC496CB512F0D9F00314C049C691C7FCC8EA1FF036467DC43E>I<EC
-3FFC49B512C0010F14F0013F14FC90397FF003FE9039FFC001FF0003495A48494813805B
-120F485AA2485A6F1300007F6E5AED00784991C7FCA212FFAC6C7EA3123F6DEC03C0A26C
-6C1407000F16806D140F6C6DEB1F006C6D133E6C01F05B3A007FFC03F86DB55A010F14C0
-010391C7FC9038003FF82A2F7CAD32>I<EE03FEED07FFA5ED001F160FB1EC3FE0903803
-FFFC010FEBFF8F013F14CF9039FFF807FF48EBC00148903880007F4890C7123F4848141F
-49140F121F485AA3127F5BA212FFAC127FA37F123FA26C6C141FA26C6C143F0007157F6C
-6C91B5FC6CD9C00314FC6C9038F01FEF6DB5128F011FEBFE0F010713F89026007FC0EBF8
-0036467CC43E>I<EC3FF80103B57E010F14E0013F8090397FF83FF89039FFC007FC4849
-6C7E48496C7E48486D1380485A001FED7FC05B003FED3FE0A2127F5B17F0161F12FFA290
-B7FCA401F0C9FCA5127FA27FA2123F17F06C7E16016C6C15E06C6C14036C6DEB07C06C6D
-EB0F806C01F0EB3F0090397FFE01FE011FB55A010714F0010114C09026001FFEC7FC2C2F
-7DAD33>I<EDFF80020F13E0027F13F049B512F849EB8FFC90390FFE0FFE90381FFC1F14
-F8133FEB7FF0A2ED0FFCEBFFE0ED03F0ED00C01600ABB612F8A5C601E0C7FCB3B0007FEB
-FFE0A527467DC522>I<DAFFE0137E010F9039FE03FF80013FEBFF8F90B812C048D9C07F
-133F489038001FF84848EB0FFC4848903907FE1F80001F9238FF0F00496D90C7FCA2003F
-82A8001F93C7FCA26D5B000F5D6C6C495A6C6C495A6C9038C07FF04890B55A1680D8078F
-49C8FC018013E0000F90CAFCA47F7F7F90B612C016FC6CEDFF8017E06C826C16FC7E0003
-82000F82D81FF0C77ED83FC014074848020113808248C9FC177FA46D15FF007F17006D5C
-6C6C4A5A6C6C4A5AD80FFEEC3FF83B07FFC001FFF0000190B612C06C6C92C7FC010F14F8
-D9007F90C8FC32427DAC38>I<EB7FC0B5FCA512037EB1ED07FE92383FFF8092B512E002
-C114F89139C7F03FFC9138CF801F9139DF000FFE14DE14FC4A6D7E5CA25CA35CB3A7B600
-83B512FEA537457CC43E>I<137C48B4FC4813804813C0A24813E0A56C13C0A26C13806C
-1300EA007C90C7FCAAEB7FC0EA7FFFA512037EB3AFB6FCA518467CC520>I<EB7FC0B5FC
-A512037EB293387FFFE0A593380FE0004C5A4CC7FC167E5EED03F8ED07E04B5A4B5A037F
-C8FC15FEECC1FCECC3FE14C7ECDFFF91B57E82A202F97F02E17F02C07FEC807F6F7E826F
-7E816F7F836F7F816F7F83707E163FB60003B512F8A535457DC43B>107
-D<EB7FC0B5FCA512037EB3B3B3A3B61280A519457CC420>I<90277F8007FEEC0FFCB590
-263FFFC090387FFF8092B5D8F001B512E002816E4880913D87F01FFC0FE03FF8913D8FC0
-0FFE1F801FFC0003D99F009026FF3E007F6C019E6D013C130F02BC5D02F86D496D7EA24A
-5D4A5DA34A5DB3A7B60081B60003B512FEA5572D7CAC5E>I<90397F8007FEB590383FFF
-8092B512E0028114F8913987F03FFC91388F801F000390399F000FFE6C139E14BC02F86D
-7E5CA25CA35CB3A7B60083B512FEA5372D7CAC3E>I<EC1FFC49B512C0010714F0011F14
-FC90397FF80FFF9026FFC0017F48496C7F4848C7EA3FE000078248486E7E49140F001F82
-A2003F82491407007F82A400FF1780AA007F1700A46C6C4A5AA2001F5E6D141F000F5E6C
-6C4A5AA26C6C6CEBFFE06C6D485B27007FF80F90C7FC6DB55A010F14F8010114C0902600
-1FFCC8FC312F7DAD38>I<90397FC00FF8B590B57E02C314E002CF14F89139DFC03FFC91
-39FF001FFE000301FCEB07FF6C496D13804A15C04A6D13E05C7013F0A2EF7FF8A4EF3FFC
-ACEF7FF8A318F017FFA24C13E06E15C06E5B6E4913806E4913006E495A9139DFC07FFC02
-CFB512F002C314C002C091C7FCED1FF092C9FCADB67EA536407DAC3E>I<DA3FE0131E90
-2603FFFC133E010F01FF137E013F1480903AFFF80FE0FE489038E003F148EBC001489038
-8000FB4890C7127F49143F001F151F485A160F5B127FA3485AAC6C7EA46C7EA26C6C141F
-163F6C6C147F6C15FF6C6D5A6C9038E003EF6C9038F01FCF6DB5128F011FEBFE0F010313
-F89038007FC091C7FCAD0307B512FCA536407CAC3B>I<90387F807FB53881FFE0028313
-F0028F13F8ED8FFC91389F1FFE000313BE6C13BC14F8A214F0ED0FFC9138E007F8ED01E0
-92C7FCA35CB3A5B612E0A5272D7DAC2E>I<90391FFC038090B51287000314FF120F381F
-F003383FC00049133F48C7121F127E00FE140FA215077EA27F01E090C7FC13FE387FFFF0
-14FF6C14C015F06C14FC6C800003806C15806C7E010F14C0EB003F020313E0140000F014
-3FA26C141F150FA27EA26C15C06C141FA26DEB3F8001E0EB7F009038F803FE90B55A00FC
-5CD8F03F13E026E007FEC7FC232F7CAD2C>I<EB01E0A51303A41307A2130FA2131FA213
-3F137F13FF1203000F90B51280B7FCA4C601E0C7FCB3A3ED01E0A9150302F013C0137F15
-0790393FF80F8090391FFC1F006DB5FC6D13FC01015B9038003FE023407EBE2C>I<D97F
-C049B4FCB50103B5FCA50003EC000F6C81B3A85EA25EA25E7E6E491380017FD901F713FE
-9138F807E76DB512C7010F1407010313FE9026007FF0EBFC00372E7CAC3E>I<B6903803
-FFFCA5000101E09038003E006C163C80017F5D8017F8013F5D6E1301011F5D6E1303010F
-5D6E13076D5DED800F6D92C7FC15C05E6DEBE01E163E6D143CEDF07C027F1378EDF8F802
-3F5B15FD021F5B15FF6E5BA36E5BA26E90C8FCA26E5AA26E5AA21578362C7EAB3B>I<B5
-D8FE1FB539801FFFF0A500019027C0003FE0C7EA7C007114786E17F86C6F6C5C6E160101
-7F6E6C5CA26E011F1403013F6F5C6E013F1407011F6F5CA26E0179140F010F048090C7FC
-6E01F95C6D02F0EBC01E15806D902681E07F5B18E003C3157C6D9139C03FF07815E76DDA
-801F5B18F803FF14F96E9039000FFDE018FF6E486D5BA36E486D5BA26E486D90C8FCA24B
-7F02075DA26E48147C4B143C4C2C7EAB51>I<B500FE90383FFFF0A5C601F0903803E000
-6D6C495A6D6C495A011F4AC7FC6E5B6D6C137E6DEB807C6D6D5A6DEBC1F0EDE3E06DEBF7
-C06EB45A806E90C8FC5D6E7E6E7F6E7FA24A7F4A7F8291381F3FFCEC3E1F027C7F4A6C7E
-49486C7F01036D7F49487E02C08049486C7F49C76C7E013E6E7E017E141FB500E090B512
-FCA5362C7EAB3B>I<B6903803FFFCA5000101E09038003E006C163C80017F5D8017F801
-3F5D6E1301011F5D6E1303010F5D6E13076D5DED800F6D92C7FC15C05E6DEBE01E163E6D
-143CEDF07C027F1378EDF8F8023F5B15FD021F5B15FF6E5BA36E5BA26E90C8FCA26E5AA2
-6E5AA21578A215F85D14015D001F1303D83F805B387FC007D8FFE05B140F92C9FC5C143E
-495A387FC1F8EB07F06CB45A6C5B000790CAFCEA01FC36407EAB3B>I<001FB71280A490
-26FC001F130001E0495A5B49495A90C7485A48495B123E4A5B4A5B003C495BA24A90C7FC
-4A5A4A5AC7FC4A5A495B495BA2495B499038800780491300A2495A4948130F49481400A2
-485B48495B485BA248495B4890C75A48485C15034848EB1FFEB7FCA4292C7DAB32>I
-E
-%EndDVIPSBitmapFont
-%DVIPSBitmapFont: Fh ecbx1440 14.4 44
-/Fh 44 123 df<913803FF80023F13F849B6FC010715C04901017F903A3FFC007FF8D97F
-F0EB1FFC49486D7E48496D7E4A7F4817804890C76C13C0A248486E13E0A2001F17F0A300
-3F17F8A249157FA2007F17FCA600FF17FEB3A5007F17FCA6003F17F86D15FFA3001F17F0
-A3000F17E06D5C6C17C0A26C6D4913806C17006E5B6C6D495A6D6C495AD93FFCEB7FF890
-3A0FFF01FFE06D90B55A010192C7FCD9003F13F802031380374F7BCD42>48
-D<151E153E15FE1403140F147FEB07FF0003B5FCB6FCA3EBF87FEAFC00C7FCB3B3B3A600
-7FB712FCA52E4E76CD42>I<EC1FFE49B512F0010F14FC013FECFF804915E02701FF803F
-7F2703FC000713FCD807F001017F48486D7FD81F806E138048C87E7013C0D87FE016E001
-F8806D16F000FF817F7013F8A56C5AA26C5A6C5AEA0380C914F05EA218E05E18C05E1880
-4C13005F4C5A4C5A5F4B5B4B5B4B5B94C7FCED0FFC4B5A4B5AED7FC04B5A4A90C8FCEC03
-FC4A5A4A4814F84A5A4A5A4AC8FC02FEEC01F0495A495A495A5CD90F80140349C8FC013E
-1507017FB7FC90B812E05A5A5A5A5A5A5AB9FC18C0A4354E7ACD42>I<913807FFC0027F
-13FC0103B67E010F15E090261FF80313F890267FC0007F01FEC7EA3FFE48488148486E13
-8013FE486C6C6D13C0804817E080A66C5B18C06C5B6C90C75AD80038168090C8FC4C1300
-A24C5A5F4C5A4B5B4B13C0030F5BDB7FFEC7FC91387FFFF816C016FCEEFF80DA000313E0
-9238007FF8EE3FFE707E70138018C07013E018F07013F8A218FC82A218FEA3EA03C0EA0F
-F0EA3FFC487EA2B5FCA218FCA25E18F8A26C4816F0495C4916E0D83FE04A13C06C485CD8
-0FF04A1380D807FE91387FFE003B03FFE003FFFC6C90B65A6C6C15E0010F92C7FC010114
-FCD9001F1380374F7BCD42>I<17FC1601A216031607160FA2161F163F167FA216FF5D5D
-A25D5D5D167F153E157E15FC15F8EC01F01403EC07E015C0EC0F80141FEC3F00143E5C14
-FC495A5C495A1307495A5C49C7FC5B137E137C5B1201485A5B485A120F485A90C8FC123E
-127E5ABA1280A5C901FCC7FCAF021FB71280A5394F7CCE42>I<486C150601F0153E01FE
-EC01FED9FFF0133F91B65A5F5F5F5F5F94C7FC16FC5E16E093C8FC15FC01F0138091CAFC
-AC913807FF80023F13F891B512FE01F36E7E9026FFFC0113E09139E0007FF891C76C7E49
-6E7E01F86E7E5B7013804916C0C9FC18E08218F0A418F8A31203EA0FE0EA3FF8487EA212
-FF7FA218F0A25B5E6C4816E05B01C016C06CC85A18806C6C4A13007FD80FF04A5A6C6CEC
-FFFCD803FE4913F02701FFE00F5B6C6CB612806D92C7FC010F14F8010114C09026003FFC
-C8FC354F7ACD42>I<ED07FE92B512C0020314F0021F14FC91397FFC01FE9139FFE0007F
-01030180EB3F804990C7121F4948EC7FC0494814FF4948010313E0495A49485B5A485BA2
-485BA2486F13C091C7FC4803001300177E94C7FC5AA25B127FA2ED3FF04AB5FC020714C0
-00FF4914F091391F807FF891393E001FFE02786D7E4A6D13807013C06D5A4A6D13E018F0
-5C7013F8A291C813FCA44916FEA3127FA6123FA37F6C17FCA36C17F85E7E6E15F06C17E0
-6C6D5B6E15C06C4B13806D6C491300D93FFC495A6DB4EBFFFC010790B512F06D5D010015
-80021F01FCC7FC020313C0374F7BCD42>I<121F7F7FEBFF8091B8FCA45A18FE18FC18F8
-18F0A218E018C018804817000180C8123E007EC9127E5F007C4B5A4C5A5F16074C5A484B
-5A4CC7FC167E167CC912FC4B5A4B5AA24B5A150F4B5AA24B5AA24BC8FC5DA25C5D1403A2
-14075D140FA3141FA2143FA34A5AA414FFA65BAB6D5B6E5A6E5A6E5A385279D042>I<91
-3803FFC0023F13FC49B67E010715E090260FFC0013F8D93FE0EB1FFCD97F80EB07FE49C7
-6C7E496E1380484880000317C049157F120718E0173F120FA27FA27F7F6E147F02E015C0
-8002FC14FF6C01FF15806F481300EDE0036C9138F807FE6F485A6C9138FF1FF06CEDFFE0
-17806D4AC7FC7F010F6E7E6D81010115F06D81010315FE010F81D93FF71580D97FC115C0
-2701FF807F14E048EB001F48486D14F04848010314F848481300496E13FC003F151F4914
-07007F6F13FE491400177F00FF163F49151F170F1707A21703A218FCA27F127F6DED07F8
-A26C6CED0FF07F6C6CED1FE06C6CED3FC06C6CEDFF806C01C0010313006C01FCEB3FFE6C
-6CB612F8011F15E001071580010002FCC7FC020F13C0374F7BCD42>I<913807FF80027F
-13F849B512FE01076E7E90261FFE0113E0903A7FF8003FF049486D7E48496D7E48496D7E
-484980486F138091C7FC486F13C05A18E0485A18F0A27013F812FFA318FCA618FEA35E12
-7FA4003F5DA26C7E5E7E6C6D5B161E6C7F6C6D5B6C6C6C13F890393FFC03F06DB55A0107
-4A13FC01001400EC1FF891C8FCA218F85EA301FC16F0487E2607FF8015E05E486D15C0A2
-4C1380A24C13005F4A131F6C4B5A49C7485A494A5A6C48495B6D01075B2701FF803F90C7
-FC6C90B512FC013F5C6D14C0010791C8FC9038007FF0374F7BCD42>I<173FA24D7EA34D
-7EA24C7FA34C7FA24C7FA34C7FA24C7FA34C7F163E83047E80EE7C3F04FC8016F8830301
-814C7E03038116E0830307814C7E030F81168083031F811600834B81033E80037E82157C
-8403FC824B800201835D840203834B800207835D92B8FC4A83A34A8392C9FC4A83143E85
-027E84027C8202FC845C850101854A820103855C850107854A82A2494884D93FF082B600
-F0020FB712C0A55A547CD363>65 D<B912FEF0FFF019FE737E1AE0D8000F01C0C7001F7F
-06037F727F726C7E867313807313C0A27313E0A37313F0A94F13E0A34F13C01B80614F13
-00624F5A06035B4E13E0063F5B92B8C7FC19F8A2F1FF8003C0C7001F13E0060113F89538
-007FFE737E070F13C01BE07313F0851BF87313FCA27313FEA31BFFA91BFEA2611BFCA261
-4F13F81BF0614F13E0077F13C04EB51280060FEBFE00BB5A1AF01AC04FC7FC19C050527B
-D15D>I<932603FFF01407047F01FF140F0307B600E0131F033F03F8133F92B700FE137F
-02039126C003FF13FF020F01F8C7EA3FC1023F01C0EC0FE391B5C80003B5FC4901FC8149
-49814901E082011F498249498292CA7E4948834948835A4A83485B4885A24849187FA248
-5B1B3FA2485B1B1FA25AA21B0091CDFCA2B5FCAE7EA280A36C1A1FA36C7FA21B3F6C7F1B
-3E6C7F1B7E6C6D187C6C1AFC6E18F86C19016D6CEF03F06D7E6FEE07E06D6DEE0FC00107
-6DEE1F806D01F8EE3F006D6D16FE6D01FF4B5A023F01C0EC07F8020F01FCEC3FF0020390
-3AFFC001FFC0020091B6C7FC033F15FC030715F0DB007F1480040301F0C8FC505479D25F
->I<BAFC19F819FF1AE01AF8D8000701F0C7001F13FE06017FDE003F13C0070F7F07037F
-737F737F747E747E747F86747F8886888688A2747FA3881B7FA288A51D80AF1D00A564A2
-1BFF64A3505BA2505BA2505BA2505B505B99C7FC505A1A7F4F485A4F13F0070F5B073F5B
-4EB55A061F49C8FCBB12F81AE097C9FC19F896CAFC59527CD165>I<BB12FCA5D8000701
-F0C7000F7F1800191F190F19071903190119001A7E1A7F86A386A51B80DD03E0130FA497
-C7FCA31707A3170F171F173FEE01FF92B6FCA5EDF001EE003F171F170F1707A31703A794
-CAFCB3A2B812F0A549527CD153>70 D<B8D88007B712FCA5D8000701F0C9003FEB8000B3
-AE92BAFCA503F0C9123FB3B1B8D88007B712FCA55E527CD167>72
-D<B81280A5D8000701F0C7FCB3B3B3B2B81280A529527DD130>I<B912FCF0FFE019FE73
-7E1AE0D8000F01E0C7003F7F060313FC06007F737E7313807313C07313E0851BF0A21BF8
-85A21BFCA91BF8A3611BF0A21BE04F13C0614F13804F13004F5A060713F8063F5B92B812
-C097C7FC19F8198003E0CBFCB3AEB712FEA54E527CD15A>80 D<DA0FFE141C91B500F013
-3C010702FC137C011F02FF13FC017F15C19026FFF00113E148903980001FFB4890C7EA07
-FFD807FC14014848804848153F171F4848150FA2007F1607491503A2170112FFA217007F
-A26D167CA27F7F6D93C7FC6C7E14C014F8ECFF806C14F8EDFFC06C15FC6CEDFF8017F06C
-16FC6C826C707E6C836D82011F8201078213016D6C81020781EC007F030380ED003F0403
-14801600173F837113C0838312F883A3837EA319807EA26C5E19007F6D4B5A7F6D4B5A01
-FC4B5A6D151FD9FFC04A5AD97FF8ECFFE028FE1FFF80075B010790B6C7FCD8FC0115FC48
-6C6C14F048010F14C0489026007FFCC8FC3A5479D249>83 D<003FBB12FCA59126C0007F
-EB000301FCC7ED003FD87FF0F00FFE49180749180349180190C81600A2007E1A7EA3007C
-1A3EA500FC1A3F481A1FA6C91700B3B3AC49B912C0A550517BD05B>I<B700FE4AB612F0
-A5D8000F01E0CA387FC000091FC7FCB3B3B26D611B3E811B7E6D197CA26D6D17FC636D6D
-1601027F4D5A6F4C5A023F170F6E6C4C5A6E6D4B5A6E01E003FFC8FC6E01F8EC03FE0200
-01FEEC1FFC923B7FFFE001FFF8031F90B612E00307168003004BC9FC041F14F0040091CA
-FC5C537CD165>I<B700F8017FB600FC49B612E0A5D8001F01C0C8001F01E0C9EBC0000E
-1FC7FC6F6F606D73163E6F81207E6D73167C6F8120FC6D735E6F6F17016D735E616F1B03
-6D735E616F1B076E4C6E5D7015BF1F0F6E041F6E5D70031F161F6E9AC8FC073F8070DA3E
-0F5E6E73143E197E70DA7C07167E6E04FC6E147C704A7E1FFC6E03016F5C704A7E6E515A
-060381704A6C15036E735C1807704A6D14076F07805B7148487F1E0F6F021F04C05B05C0
-90C77E1E1F6F4A04E090C9FCDDE03E6E5C6FF1F03E187EDDF07C6E147E6FF1F87C18FC71
-486E14FC6F01F9715ADDFDF0801DFD6F01FFEFFFF04E806F62A24E817061A24E81706195
-C97EA27096CAFC4D82040F60A24D1607040760A24D16030403604D160104016083537ED1
-88>87 D<EC3FFE0107B512E0011F14FC017F14FF2701FFC00F13C02703FE00037F486C01
-007F6E6D7E486D80707EA2707EA3707F6C5B6C90C7FC6C5AC9FCA60307B5FC0203B6FC14
-7F0103B7FC011FEBF00F017F1300EBFFFC000313F04813C0485B4890C7FC5A5B485AF081
-F012FF5BA35EA26D5C127F6D5C003F03F713C36DD901E314E06CD9C00714FF00079026F0
-1F8114C06C90B5C61480C602FC6D1300011F01F0EB3FFC01010180EB07F03C387CB642>
-97 D<913803FFE0023F13FE91B67E010315E0010F9038003FF8D93FFCEB07FC4948497E
-4948131F4849497E485B485BA24890C7FC5A5B003F6F5A705A705A007F92C8FC5BA312FF
-AD127F7FA3123F7F6CEE0F80A26C6D141F18006C6D5C6C6D143E6C6D147E6C6D5C6D6C49
-5A6DB4EB07F0010F9038C01FE06D90B5128001014AC7FCD9003F13F80203138031387CB6
-3A>99 D<943803FF80040FB5FCA5EE003F170FB3A4913803FF80023F13F849B512FE0107
-ECFF8F011F9038C03FEF90273FFE0007B5FCD97FF8130149487F48498048498048498048
-8291C8FC5A5B123FA2127F5BA312FFAD127FA37F123FA3121F7F6C5E6C6D5C5F6C6D91B5
-FC6C6D5B6C6D4914E0D97FFCD90FEFEBFF80D91FFFEB7F8F010790B5120F010114FC6D6C
-13E00207010049C7FC41547CD249>I<913807FF80027F13F849B512FE01076E7E011F01
-0313E0903A3FFC007FF0D97FF06D7E49486D7E4849130F48496D7E48824890C77E188048
-5A82003F17C0A3485A18E082A212FFA290B8FCA401FCCAFCA6127FA37F123FA2EF03E06C
-7E17076C17C06C6D140F18806C6D141F6C6DEC3F006C6D147ED97FFC495AD91FFFEB07F8
-6D9038E03FF0010390B512C001005D023F01FCC7FC020113E033387CB63C>I<ED1FF891
-3803FFFE020FEBFF80023F14C09139FFF83FE001039038E0FFF049138049010113F85BEB
-3FFEA2EB7FFCA26F13F0495AEE7FE0EE1F8093C7FCAEB712C0A5C601F8C8FCB3B3A7B612
-FEA52D547CD328>I<DA1FFE14FE49B539E007FF80010FDAFC1F13C0013FDAFF7F13E090
-267FF807EBFF072701FFE001EBF07F48497E484990387FF83F91C7003F14C048EEFC1F48
-9338FE070049021F90C7FCA2003F82A9001F5EA26D143F6C5E6C5E6E137F6C6D495A6C6D
-485B6CD9F80713804890B6C8FCD803EF14FC01C114E02707C01FFEC9FC49CBFCA2487EA3
-7FA27F13FC90B612FE6CEDFFF017FCEFFF806C8318F06C836C837F48B87E1207D80FFCC7
-00037F4848EC003F4848150F48486F138083485A83A56D5D007F18006D5D003F5F6C6C4B
-5A01FE153FD807FFED7FF06C01C049485AC601FC011F1380013FB648C7FC010F15F80101
-15C0D9000F01F8C8FC3B4F7CB542>I<EB3FF8B5FCA51203C6FCB3A4EE1FFC93B57E0303
-14E0030F14F892391FC07FFC92397E003FFE03F86D7EECF9F04B6D7FECFBC0ECFF8092C7
-6C7FA25CA25CA45CB3ACB6D8F807B612C0A542537CD249>I<133FEBFFC0487F487FA248
-7FA66C5BA26C5B6C5B013FC7FC90C8FCAEEB1FF8B5FCA512017EB3B3A6B612F0A51C547C
-D324>I<EB3FF8B5FCA51203C6FCB3A50407B512F0A59339007FF000EF3FC04D5A4DC7FC
-EE01FC4C5AEE0FF04C5A4C5A4CC8FC16FEED03FC4B5A4B5A4B5A4B7E4B7EECF9FF02FB7F
-91B57EA28203BF7F031F7F14FE4A6C7FDAF0077F6F7FA26F7F6F7F167F83707F707FA270
-7F707F707FA2707F707F84B6D8F00F14FEA53F537DD245>107 D<EB3FF8B5FCA51203C6
-FCB3B3B3B1B612F8A51D537CD224>I<D93FF0D91FF84AB47EB591B56C010F13F8030302
-E0013F13FE030F6E90B6FCDB3F809027F803F80F7F922A7E007FFC07E0077F000302F890
-283FFE0F80037FC6D9F1F0011F49487EDAF3E0DAFF3E814B153CDAF7805D92C76C496D7F
-14FF4A5EA24A5EA34A5EB3ADB6D8F80FB66CB612F8A565367BB56E>I<D93FF0EB1FFCB5
-91B57E030314E0030F14F892391FC07FFC92397E003FFE000302F86D7EC6EBF1F04B6D7F
-ECF3C0ECF78092C76C7F14FF5CA25CA45CB3ACB6D8F807B612C0A542367CB549>I<9138
-01FFC0023F13FE91B67E010315E0010F018013F8903A3FFC001FFED97FF0EB07FF49486D
-7F48496D7F48496D7F91C8127F4883488349153F001F83A2003F8349151FA2007F83A400
-FF1880AC007F1800A3003F5F6D153FA2001F5FA26C6C4B5AA26C6D4A5A6C5F6C6D495B6C
-6D495B6D6C4990C7FCD93FFCEB1FFE6DB46CB45A010790B512F0010115C0D9003F49C8FC
-020313E039387CB642>I<D93FF8EB7FF0B50107B5FC031F14C0037F14F09126F9FF0013
-FCDAFFF8EB3FFF000302E0010F7FC602806D7F92C76C7F4A824A804A6E7F85187F85A218
-3F85A4721380AD4E1300A44E5AA26118FF616E5C616E4A5B6E4A5B6F495B03E04990C7FC
-6FEB7FFE913AF9FE01FFF802F8B65A033F14C0030749C8FC030013E093CAFCB1B612F8A5
-414D7DB549>I<90393FF001FCB590380FFF804B13E0037F13F09238FE1FF89138F1F83F
-00019138F07FFC6CEBF3E015C0ECF780A2ECFF00EE3FF84AEB1FF0EE0FE093C7FC5CA45C
-B3ABB612FEA52E367DB535>114 D<903903FFC00E011FEBFC1E90B6127E000315FE3907
-FE003FD80FF0130F4848130348481301491300127F90C8127EA248153EA27FA27F01F091
-C7FC13FCEBFF806C13FEECFFF06C14FE6F7E6C15E06C816C15FC6C81C681133F010F1580
-1301D9000F14C0EC003F030713E0150100F880167F6C153FA2161F7EA217C07E6D143F17
-807F6DEC7F0001F85C6DEB03FE9039FF801FFC486CB512F0D8F81F14C0D8F00791C7FC39
-E0007FF02B387CB634>I<147CA614FCA41301A31303A21307A2130F131F133F137F13FF
-1203000F90B512FEB7FCA426007FFCC8FCB3A9EE0F80ABEE1F006D7EA2011F143E806D6D
-5A6DEBC1F86DEBFFF001005C023F1380DA03FEC7FC294D7ECB33>I<D93FF8913801FFC0
-B50207B5FCA50003ED001FC61607B3AE5FA35FA25F137F5F6D6C14F7DC01E713F06D6CD9
-07C7EBFFC0903A0FFF801F876D90B51207010114FC6D6C13F0020701C091C7FC42377CB5
-49>I<B600E090381FFFFCA5000101F8C7000113006CEE007C6E15FC017F5E8017016D6C
-5D17036D5E6F13076D5E6F130FA26D6D5C171F6D93C7FC6F5B6D153E6F137E6D157C8117
-FC027F5CEDFE01023F5CEDFF036E5C168316876E5C16CF6E5C16FF6E91C8FCA36E5BA26E
-5BA26F5AA36F5AA26F5AA26F5AA23E367DB445>I<B600E1B6D8800FB5FCA500019026F0
-000301C0C7EA3FE06E6D6DEC0F806CF21F00A26E6D6D5C017F193E6E147F72147E013F19
-7C6E6F14FC6D6117FF6F6E13016D4A5E03C06E13036D615E03E001E7EB80076D02075E03
-F001C313C06D4E5A160F03F80181EBE01F6D96C7FC6F48C6FC735A027F49153EDBFE3E90
-387FF87E023F177C167EDBFF7C90383FFCFC6E01FC5D4CEB1FFF6E5FA24C7F6E5F4C7F6E
-5FA24C7F6E5F4C7FA26E94C8FC93C8FC6F5DA2033E157C58367DB45F>I<B600E090381F
-FFFCA5000101F8C7000113006CEE007C6E15FC017F5E6E1401013F5E8017036D6D5C1707
-6D5E6F130F6D5E6F131F6D93C7FC815F6D6D133E177E6D157C6F13FC027F5C811601DA3F
-FF5B16036E5C16876E5C16CF6E5C16EF16FF6E91C8FCA26E5BA26E5BA26F5AA36F5AA26F
-5AA26F5AA35E151F93C9FC5D153E157ED81FC0137C487E486C13FC486C5B14015D4A5A14
-074A5A6C48485A4948CAFC495A383F81FC6CB45A6C5B000313C0C648CBFC3E4D7DB445>
-121 D<003FB712FEA4DA000113FC01F815F801E05B494913F04915E090C75A4B13C0007E
-4A1380A24B13004B5A007C5D5C4A5B5E5C4A5BC75C5C4A5B93C7FC5C4A5A495B5D5B4949
-131F5D5B495B5D49153F4990C7123E5C13FF485B4849147EA2484914FE485B4A13014815
-034849130791C7EA1FFC48EC01FFB8FCA430357CB43A>I E
-%EndDVIPSBitmapFont
-%DVIPSBitmapFont: Fi ecrm0900 9 36
-/Fi 36 123 df<EC1FE0ECFFFC903803F01E90390FC00780EB1F8090393F000FC0017E13
-1F5BA2485AED0F8092C7FCA9ED0FC0B7FCA33901F8001F150FB3A6486CEB1FE0267FFFC1
-B5FCA328357FB42B>28 D<123C127EB4FCA21380A2127F123D1201A412031300A25A1206
-120E120C121C5A5A126009177A8715>44 D<123C127E12FFA4127E123C08087A8715>46
-D<123C127E12FFA4127E123C1200B0123C127E12FFA4127E123C08207A9F15>58
-D<123C127EB4FCA4127E123CC7FCB0123C127EB4FCA21380A2127F123D1201A412031300
-A25A1206120E120C121C5A5A1260092F7A9F15>I<B77E16F016FE3A01FE0001FF000091
-38003FC0EE0FE0707E707E707E707E177E177FEF3F80A2EF1FC0A3EF0FE0A418F0AA18E0
-A3171F18C0A21880173F18005F17FE5F4C5AEE07F04C5AEE3FC000014AB45AB748C7FC16
-F8168034337EB23B>68 D<B81280A3D803FCC7FC0001151FEE07C01603A21601A21600A4
-1760150CA31700A2151CA2153C15FC90B5FCA3EBFC00153C151CA2150CA592C8FCAB487E
-B512FEA32B337DB232>70 D<B512FEA3000113006C5AB3B3A7487EB512FEA317337EB21C
->73 D<B512FEA3D803FEC9FC6C5AB3A9EE0180A416031700A45EA25E5E5E5E16FE000314
-07B7FCA329337DB230>76 D<D8FFFC923801FFF86D5DA20003EFFE00D801BFED06FCA3D9
-9F80140CA2D98FC01418A3D987E01430A2D983F01460A3D981F814C0A3D980FCEB0180A2
-027EEB0300A36E1306A26E6C5AA36E6C5AA36E6C5AA26E6C5AA36E6C5AA3913800FD80A2
-037FC7FCA3486C133ED80FF04B7EB5011C90387FFFF8A33D337CB246>I<007FB712FEA3
-90398007F001D87C00EC003E0078161E0070160EA20060160600E01607A3481603A6C715
-00B3AB4A7E011FB512FCA330337DB237>84 D<EB7F803803FFF0380F80FC381C003E003F
-133F6D6C7E6E7EA26E7EEA1F00C7FCA4EB01FF131FEBFF873803FC07EA0FF0EA1FC0EA3F
-80127F13004815C05AA3140FA26C131F6C133B3A3F8071F180391FC1E1FF2607FFC01300
-3900FE003C22237DA126>97 D<EA03F012FFA312071203AEEC3F80ECFFE09038F3C0F890
-38F7007E01FE7F49EB1F8049EB0FC05BED07E016F0A2150316F8AA16F0150716E0A2ED0F
-C07F6DEB1F8001ECEB3F0001CF137C90388381F8903801FFE0C76CC7FC25357EB32B>I<
-EB07F8EB3FFF9038FC07C03901F000E03903E003F03807C007120FEA1F80123F90380003
-E04890C7FCA2127E12FEAA127FA26C14187F001F14386D1330000F14706C6C13E03903F0
-01C03900FC0F8090383FFE00EB07F01D237EA122>I<153FEC0FFFA3EC007F81AEEB07F0
-EB3FFCEBFC0F3901F003BF3907E001FF48487E48487F8148C7FCA25A127E12FEAA127E12
-7FA27E6C6C5BA26C6C5B6C6C4813803A03F007BFFC3900F81E3FEB3FFCD90FE013002635
-7DB32B>I<EB0FE0EB7FFCEBF83F3903F00F80D807E013C0390FC007E0381F800315F0EA
-3F0014014814F8127EA212FEA2B6FCA248C8FCA5127E127FA26C1418A26C6C1338000F14
-306D13706C6C13E03901F003C03900FC0F00EB3FFEEB07F01D237EA122>I<EB01FCEB07
-FF90381F078090383E0FC0EB7C1F13FCEA01F8A20003EB070049C7FCACB512F0A3D803F0
-C7FCB3A7487E387FFFE0A31A357FB417>I<151F90391FC07F809039FFF8E3C03901F07F
-C73907E03F033A0FC01F83809039800F8000001F80EB00074880A66C5CEB800F000F5CEB
-C01F6C6C48C7FCEBF07C380EFFF8380C1FC0001CC9FCA3121EA2121F380FFFFEECFFC06C
-14F06C14FC4880381F0001003EEB007F4880ED1F8048140FA56C141F007C15006C143E6C
-5C390FC001F83903F007E0C6B51280D91FFCC7FC22337EA126>I<EA03F012FFA3120712
-03AEEC1FC0EC7FF09038F1E0FC9038F3807C9038F7007E13FE497FA25BA25BB3486CEB7F
-80B538C7FFFCA326347EB32B>I<EA0780EA0FC0EA1FE0A4EA0FC0EA0780C7FCAAEA07E0
-12FFA3120F1207B3A6EA0FF0B5FCA310337EB215>I<EA03F012FFA312071203AF913803
-FFE0A36E1300EC00F8EC01E05D4A5A020FC7FC141C5C5C14F0EBF3F8EBF7FC13FEEBFC7E
-EBF87F496C7E141F6E7E8114076E7E8114016E7E81486CEBFF80B500C313F0A324347EB3
-29>107 D<EA07E012FFA3120F1207B3B3A7EA0FF0B5FCA310347EB315>I<2703F01FE013
-FF00FF90267FF80313C0903BF1E07C0F03E0903BF3803E1C01F02807F7003F387FD803FE
-1470496D486C7EA2495CA2495CB3486C496C487EB53BC7FFFE3FFFF0A33C217EA041>I<
-3903F01FC000FFEB7FF09038F1E0FC9038F3807C3907F7007EEA03FE497FA25BA25BB348
-6CEB7F80B538C7FFFCA326217EA02B>I<EB07F0EB3FFE9038FC1F803901F007C03903C0
-01E000078048486C7E48C7127CA248147E003E143E007E143FA300FE1580A8007E1500A3
-6C147EA26C147C6D13FC6C6C485A00075C3903F007E03900FC1F80D93FFEC7FCEB07F021
-237EA126>I<3903F03F8000FFEBFFE09038F3C0F89038F7007ED807FE7F6C48EB1F8049
-14C049130F16E0ED07F0A3ED03F8A9150716F0A216E0150F16C06D131F6DEB3F80160001
-FF13FC9038F381F89038F1FFE0D9F07FC7FC91C8FCAA487EB512C0A325307EA02B>I<90
-3807F00390383FFC07EBFC0F3901F8038F3807E001000F14DF48486CB4FC497F123F90C7
-7E5AA25A5AA9127FA36C6C5B121F6D5B000F5B3907E003BF3903F0073F3800F81EEB3FF8
-EB0FE090C7FCAAED7F8091380FFFFCA326307DA029>I<3803E07C38FFE1FF9038E38F80
-9038E71FC0EA07EEEA03ECA29038FC0F8049C7FCA35BB2487EB512E0A31A217FA01E>I<
-EBFF06000713CE381F00FE003C133E48131E140E5A1406A27EA200FE90C7FC6C7EEA7FFC
-383FFFC014F0000F7F6C7FC67FEB0FFF1300EC3F8000C0131F140F6C1307A37E15006C5B
-6C130E6C5B38F7807838E1FFE038C07F8019237EA11E>I<1330A51370A313F0A21201A2
-12031207381FFFFEB5FCA23803F000AF1403A814073801F806A23800FC0EEB7E1CEB1FF8
-EB07E0182F7FAD1E>I<D803F0133F00FFEB0FFFA30007EB007F000380B35DA35D12016D
-4813800000903803BFFC90387E073FEB1FFED907F8130026227EA02B>I<B5EBFFF0A3D8
-0FF0EB3F800007EC1F000003140E150C6D131C00011418A26C6C5BA26D1370017E136013
-7F6D5BA290381F8180A214C3010F90C7FCA2EB07E6A214FE6D5AA26D5AA36D5AA2146024
-217E9F29>I<B53A1FFF81FFF0A33C07F801FC003F8001F049EB1E0000030100141C816C
-6C017C1318A26D017E1338000002FE1330A290267E01FF5B159F168090263F030F5BA216
-C0903A1F8607C180A202C613E390260FCC0390C7FCA2D907FC13F6ECF80116FE6D486C5A
-A36D481378A36D48133034217F9F37>I<B53801FFF8A32603FE0013806C48EB7C000000
-1478017E1370017F5B90383F81C090381F8380D90FC3C7FCEB07E614FE6D5A6D5A6D7E80
-805B9038039F809038071FC09038060FE0EB0C0790381C03F0496C7E01707FEBF0000001
-80000FECFF8026FFFC0313FCA326207F9F29>I<3A7FFF807FF8A33A07F8001FC00003EC
-0F800001EC070015066C6C5BA26D131C017E1318A26D5BA2EC8070011F1360ECC0E0010F
-5BA2903807E180A214F3010390C7FC14FBEB01FEA26D5AA31478A21430A25CA214E05CA2
-495A1278D8FC03C8FCA21306130EEA701CEA7838EA1FF0EA0FC025307F9F29>I<003FB5
-12F0A2EB000F003C14E00038EB1FC00030EB3F800070137F1500006013FE495A13035CC6
-485A495AA2495A495A49C7FC153013FE485A12035B48481370485A001F14604913E0485A
-387F000348130F90B5FCA21C207E9F22>I E
-%EndDVIPSBitmapFont
-%DVIPSBitmapFont: Fj ecbx0900 9 7
-/Fj 7 117 df<ED1F80A24B7EA24B7EA34B7EA24A7FA34A7FA24A7F15CFA2020F7F1587
-021F801503023F80EC3E01A2027E80EC7C0002FC804A137FA20101814A133F0103814A13
-1FA249B67EA24981A290271F8000077F91C77EA24982013E80017E82017C80A201FC8249
-157FB500F0013FB512F0A43C347DB343>65 D<EB7FFE0003B512E04814F8390FF00FFC39
-1FF803FF806E138016C0157F6C5A6C5AEA0180C8FCEC7FFF010FB5FC90B6FC0003EBF07F
-000F1300EA1FF8485A485A485A5BA315FF7F007F5B6D4813E03A3FF80FBFFF000FB5121F
-0003EBFC0F39007FE00728217EA02B>97 D<EA01FC12FFA4120F1207ADEC0FF8EC7FFF01
-FDB512C09039FFF01FF09138800FF84A6C7E496D7E496D7EA2178081A217C0A91780A25D
-1700A26D495A6D495A6E485A9039F7E03FF001E1B512C0D9C07F90C7FC9038801FF02A34
-7DB331>I<903807FF80013F13F090B512FC3903FE01FE4848487EEA0FF8EA1FF0EA3FE0
-A2007F6D5A496C5A153000FF91C7FCA9127F7FA2003FEC07807F6C6C130F000FEC1F00D8
-07FE133E3903FF80FCC6EBFFF8013F13E0010790C7FC21217DA027>I<3901F81F8000FF
-EB7FF0ECFFF89038F9E3FC9038FBC7FE380FFF876C1307A213FEEC03FCEC01F8EC006049
-1300B1B512F0A41F217EA024>114 D<9038FFE1C0000713FF5A383F803F387E000F1407
-5A14037EA26C6CC7FC13FCEBFFE06C13FC806CEBFF80000F14C06C14E0C6FC010F13F0EB
-007F140F00F0130714037EA26C14E06C13076CEB0FC09038C01F8090B5120000F913FC38
-E03FE01C217DA023>I<133CA5137CA313FCA21201A212031207001FB51280B6FCA3D807
-FCC7FCB0EC03C0A79038FE078012033901FF0F006C13FEEB3FFCEB0FF01A2F7EAE22>I
-E
-%EndDVIPSBitmapFont
-%DVIPSBitmapFont: Fk ecrm1200 12 20
-/Fk 20 117 df<14FF010713E090381F81F890383E007C01FC133F4848EB1F8049130F48
-48EB07C04848EB03E0A2000F15F0491301001F15F8A2003F15FCA390C8FC4815FEA54815
-FFB3A46C15FEA56D1301003F15FCA3001F15F8A26C6CEB03F0A36C6CEB07E0000315C06D
-130F6C6CEB1F806C6CEB3F00013E137C90381F81F8903807FFE0010090C7FC28447CC131
->48 D<EB03FE90381FFFC0017F13F03901F80FFC3903C001FE48486C7E000EC7EA7F8048
-EC3FC0ED1FE04815F00030140F007015F800601407126CB415FC7F7F1503A46C4813076C
-C7FCC8FC16F8A2150F16F0151F16E0A2ED3FC0ED7F8016005D5D4A5A4A5A4A5A5D4A5A4A
-5A4AC7FC147C5C5C495A495A495A49C7120C131E5B013814185B5B485A4848143848C812
-30000E1570001FB612F0A25A5AB712E0A326427BC131>50 D<49B4FC010F13E0013F13FC
-9038FE01FE3A01F0007F80D803C0EB3FC048C7EA1FE0120EED0FF0EA0FE0486C14F8A215
-077F5BA26C48130FEA03C0C813F0A3ED1FE0A2ED3FC01680ED7F0015FE4A5AEC03F0EC1F
-C0D90FFFC7FC15F090380001FCEC007FED3F80ED1FC0ED0FE016F0ED07F816FC150316FE
-A2150116FFA3121EEA7F80487EA416FE491303A2007EC713FC00701407003015F8003814
-0F6C15F06CEC1FE06C6CEB3FC0D803E0EB7F803A01FE01FE0039007FFFF8010F13E00101
-90C7FC28447CC131>I<ED0380A21507150FA2151F153FA2157F15FFA25CEC03BF153F14
-071406140C141C141814301470146014C013011480EB03005B13065B131C13185B137013
-6013E0485A5B120390C7FC1206120E120C5A123812305A12E0B812C0A3C8383F8000ADED
-FFE0027FEBFFC0A32A437DC231>I<B712FEEEFFE017F800019039C00007FE6C6C489038
-00FF80EF3FC0EF0FF0717E717EEF00FE8484F03F80F01FC0A2F00FE019F0180719F8A218
-0319FCA3F001FEA419FFAD19FEA3180319FCA319F8180719F0180F19E0A2F01FC0F03F80
-A2F07F0018FE4D5A4D5AEF0FF0EF3FE0EFFF8048486C010790C7FCB812FC17E04CC8FC40
-447CC34A>68 D<B712FCEEFFC017F800019039C0000FFC6C6C48EB01FF9338007F80EF1F
-E0170FEF07F018F8EF03FCA218FE1701A218FFA718FEA2170318FCA2EF07F818F0EF0FE0
-EF1FC0EF7F80933801FE00EE0FFC91B612F017800280C9FCB3AA3801FFE0B612C0A33844
-7CC342>80 D<49B41303010FEBE007013F13F89039FE00FE0FD801F8131FD807E0EB079F
-49EB03DF48486DB4FC48C8FC4881003E81127E82127C00FC81A282A37E82A27EA26C6C91
-C7FC7F7FEA3FF813FE381FFFE06C13FE6CEBFFE06C14FC6C14FF6C15C0013F14F0010F80
-010180D9001F7F14019138001FFF03031380816F13C0167F163F161F17E000C0150FA316
-07A37EA36C16C0160F7E17806C151F6C16006C5D6D147ED8FBC05CD8F9F0495AD8F07C49
-5A90393FC00FE0D8E00FB51280010149C7FC39C0003FF02B487BC536>83
-D<EB07FC90383FFF809038F80FE03903C003F048C66C7E000E6D7ED80FC0137E486C137F
-6D6D7EA36F7EA26C5AEA0380C8FCA4EC0FFF49B5FC90380FFE1FEB3FC0EBFF00EA03FC48
-5A485A485A485A127F5B176048C7FCA3153FA36D137F007F14EF6D9038C7E0C0003F1301
-3A1FE00783F13B07F81E03FF802701FFFC0113003A001FE0007C2B2E7CAC31>97
-D<EA01FC12FFA3120712031201B3EC03FC91380FFF8091383C07E091387001F89039FDE0
-007E02807F01FFEC1F8091C713C049EC0FE049140717F0A2EE03F8A217FCA2160117FEAB
-17FC1603A217F8A2EE07F0A26DEC0FE017C06D141F01FBEC3F80D9F380EB7E00D9E1C05B
-9039E0F001F89039C03C07E09039801FFF80C7D803FCC7FC2F467DC436>I<167FED3FFF
-A315018182B3EC7F80903803FFF090380FC07C90383F000E017E1307496D5AD803F87F48
-487F5B000F81485AA2485AA2127FA290C8FC5AAB7E7FA2123FA26C7EA2000F5D7F6C6C5B
-00035C6C6C9038077F806C6C010E13C0013F011C13FE90380FC0F8903803FFE09026007F
-0013002F467DC436>100 D<EB01FE903807FFC090381F03F090387E00FC49137E48487F
-485A4848EB1F80000F15C049130F121F484814E01507A2007F15F090C7FCA25AA390B6FC
-A290C9FCA67EA27FA2123F16306C7E1670000F15606D14E06C6C14C0000314016C6CEB03
-806C6CEB0700013E131E90381F80F8903803FFE0010090C7FC242E7DAC2B>I<EA01FC12
-FFA3120712031201B3EC01FE913807FFC091381E07F091383801F802707FECE000D9FDC0
-7F5C01FF147F91C7FCA25BA35BB3A8486CECFF80B5D8F83F13FEA32F457DC436>104
-D<EA01E0EA07F8A2487EA46C5AA2EA01E0C8FCADEA01FC12FFA3120712031201B3B0487E
-B512F8A315437DC21C>I<EA01FC12FFA3120712031201B3B3B3A5487EB512F8A315457D
-C41C>108 D<D801FC01FFEC1FE000FF010701E0EBFFFC913B0F03F801E07F913C3C01FC
-07803F800007903C7000FE0E001FC0000349D97E1C130F2601FDC0D97F38804A143001FF
-DA3FF06D7E91C75BA2495DA3495DB3A8486C4A6C497EB5D8F81FB50003B512E0A34B2C7D
-AB52>I<3901FC01FE00FF903807FFC091381E07F091383801F8000701707F0003EBE000
-2601FDC07F5C01FF147F91C7FCA25BA35BB3A8486CECFF80B5D8F83F13FEA32F2C7DAB36
->I<3901FC03FC00FF90380FFF8091383C07E091387001F83A07FDE000FE00010180137F
-01FFEC3F8091C7EA1FC04915E049140F17F0160717F8160317FCA3EE01FEABEE03FCA3EE
-07F8A217F0160F6D15E0EE1FC06D143F17806EEB7E00D9FDC05B9039FCF003F891383C0F
-E091381FFF80DA03FCC7FC91C9FCAE487EB512F8A32F3F7DAB36>112
-D<3903F803F000FFEB1FFCEC3C3EEC707F0007EBE0FF3803F9C000015B13FBEC007E153C
-01FF13005BA45BB3A748B4FCB512FEA3202C7DAB26>114 D<90383FE0183901FFFC3839
-07E01F78390F0003F8001E1301481300007C1478127800F81438A21518A27EA27E6C6C13
-006C7E13FC383FFFE06C13FC6C13FF6C14C06C14E0C614F0011F13F81300EC0FFC140300
-C0EB01FE1400157E7E153EA27EA36C143C6C147C15786C14F86CEB01F039F38003E039F1
-F00F8039E07FFE0038C00FF01F2E7DAC26>I<1306A5130EA4131EA3133E137EA213FE12
-011207001FB512F0B6FCA2C648C7FCB3A4150CAA017E131C017F1318A26D133890381F80
-30ECC070903807E0E0903801FFC09038007F001E3E7EBC26>I E
-%EndDVIPSBitmapFont
-%DVIPSBitmapFont: Fl ecrm1728 17.28 19
-/Fl 19 119 df<B912F018FF19E019F8C601FCC8EA3FFED93FF8ED07FF011F040113C072
-7F737E737E737E737E1907737EA2731380A21BC085A21BE0A91BC061A21B80611B004F5A
-190F624F5A4F5AF1FFE04E5B4E90C7FCF00FFEF07FF894381FFFC091B748C8FCF0FFC019
-FC02F8C8EA07FF060013C0F13FE0F11FF8737E737E737E7313807313C0F27FE0A2F23FF0
-A2F21FF8A21BFCA21A0F1BFEA91BFC1A1FA3F23FF8A2F27FF0A2F2FFE0614F13C01B8007
-0F13004F5A4F5AF1FFF8013F040313E0D9FFFC033F5BBBC7FC19FC19E04EC8FC4F6278E1
-5F>66 D<DDFFF015C0040F01FF1401047F14E00303B600F81303030F9038E003FEDB3FFC
-C7007F1307DBFFE0EC0FC002030180913803E00F4A48C83801F01F4A48ED0078DA3FF0EE
-3C3F4A48161E4A48EE0F7F4949EE07FF4990CA7E495A4948834948835C013F197F494818
-3F495A1B1F485B1B0F4890CCFCA248481907A2485A1B03121F5BA2123F1B015BA2127F98
-C7FCA35B12FFB0127F7FA4123FF301C07FA2121FA27F000F1A031C806C7EA26C7E1B076C
-6D1900636C7F1B0E6D6C181E6D6C181C011F193C6E606D7E6D6C606D6C4D5A6D6D4C5A6D
-6D16076E6C4C5A6E6C4CC7FCDA0FFC163E6E6C16FC6E6C6CEC01F0020001F0EC07E0DB3F
-FCEC3F8092280FFFE003FFC8FC030390B512FCDB007F14F0040F1480040001F8C9FC5266
-79E361>I<B912F018FF19E019F8C601FCC8EA7FFED93FF892380FFF80011F04017F9538
-007FF0F11FF8737EF103FE737E737F747E747E747E1A0F87747E1A0387747EA2741380A2
-F37FC0A21CE01B3FA21CF0A21B1F1CF8A31CFCA21B0FA41CFEAF1CFCA51B1F1CF8A4F33F
-F0A21CE0A21B7F1CC01BFF1C80A2501300A2505A505AA2505A505A505A505A1AFF4F5B4F
-90C7FCF107FCF11FF8F17FF0953801FFC0013F04075BD9FFFCDB7FFEC8FCBA12F819E096
-C9FC18F0576278E167>I<DA07FF1403023F01F05B49B512FC010702FF5B90260FFC0013
-C0D93FE090380FF01FD97F80EB03F801FEC86C5A4848157E4848ED1F7F48486F5A484881
-5B001F824981003F8290CAFC4883A2007E83A212FE84A384A27EA36D82A26C7EA26D93C7
-FC6C7E7F7F6C7E6D7E6C13E06C13FCECFFC06C14F86CECFF806C15F86DECFF80011F15E0
-6D15F8010315FE01006F7E021F81020181DA003F80030380DB003F7F04037FEE007FEF1F
-FF71138017037113C083A2F07FE0183FA2181F00E018F0180FA41807A27EA47E19E0180F
-7E19C07E6C171F19806D163F6D17006D5E6D16FE486C5E6D4B5AD8FC7F1503D91F80EC0F
-F026F80FE04A5AD907FCEC7F8029F001FFE003FFC7FC6D6CB512FC48011F14F0020314C0
-489026001FFEC8FC3C6679E34B>83 D<EC3FE0903803FFFE010F6D7E90393FC03FE09039
-7C000FF801F0EB03FC48486D7E48486D7E48486E7E48C86C7E7F01F06E7E487E6D6E7EA3
-707EA36C5AEA03E0C9FCA6167FED7FFF020FB5FC91387FF807903801FF80903807FC00EB
-1FF0EB7FC0495AD803FEC7FC485A120F5B485A485AA24848EE01C0A312FF5BA2160FA316
-1F6D141B007F153B16736D17806C6C9138E1FC03001FEC03C16C6C903A0780FE0700D807
-FE49486C5A2701FF807CEB7FFE6C6CB4486D5A011F01E06D5A010390C7EA07E03A4179BF
-43>97 D<4AB47E020F13F8023F13FE9139FF007F80D903FCEB07E0D907F0EB01F0D91FE0
-EB007849488049488049C87E48485D4915FF00034B138048485CA2485AA2485AA2003F6F
-130049EC007C94C7FC127FA35B12FFAD127F7FA4123F7FA2001FEE01C07F000F16036D16
-8012076C6C15076D160000015E6C6C151E6D6C5C6D6C5C6D6C5CD90FF8495AD903FCEB07
-C0903A00FF803F8091263FFFFEC7FC020F13F80201138032417BBF3C>99
-D<EC03FE91381FFFE091B512F8903901FE03FE903A07F0007F8049486D7ED93FC06D7E49
-C76C7E496E7E49140348488148481401000782491400000F8283485A1880123F49153FA2
-007F17C0A35BA212FF90B8FCA30180CAFCA9127F7FA3123FA27F121FEF01C06C7E17036C
-6C1680A26C6C15070001EE0F006D150E6C6C151E6D6C5C6D6C5C6D6C5CD907F0EB03E0D9
-03FC495A902700FF803FC7FC91383FFFFC020F13F00201138032417BBF3C>101
-D<EB03C0EA07FFB5FCA41201EA007FA2133FB3AAEE7FE0923803FFFC030F13FFDB3F0013
-C00378EB1FE04B6D7EDAC1C06D7EDAC3808002C7C7120302CE81170114DC14D802F86E7E
-5CA35CA35CB3B3496C4A7F496C4A7FB6D8F003B612C0A442647CE34B>104
-D<1378EA01FE487E487FA66C90C7FC6C5AEA007890C8FCB3A2EB0780EA0FFFB5FCA41203
-C6FCA2137FB3B3AC497E487FB61280A4195F7BDE25>I<EB03C0EA07FFB5FCA41201EA00
-7FA2133FB3AB0403B512F8A40400148094387FFC0018E06095C7FC177E5F17F04C5A4C5A
-4C5A4CC8FC163E5E5E4B5A4B5A4B5A4B5A151F4B7E4B7E15FF02C17F9138C3CFF8ECC787
-9138CF07FC9138FE03FEECFC0102F87F4A6C7F4A137F4A80707E161F83707E160783707E
-160183707F177F84717E171F84717E84A284496CEDFF80496C4A13E0B600F090B6FCA440
-647CE347>107 D<EB0780EA0FFFB5FCA41203C6FCA2137FB3B3B3B3AD497E487FB612C0
-A41A647BE325>I<D903C0EB7FE0D807FF903803FFFCB5010F13FFDB3F0013C00378EB1F
-E04B6D7E0001D9C1C06D7E27007FC3808002C7C71203D93FCE81170114DC14D802F86E7E
-5CA35CA35CB3B3496C4A7F496C4A7FB6D8F003B612C0A4423F7CBE4B>110
-D<4AB47E020F13F0027F13FE4AC67ED903F8EB1FC0D907E0EB07E0D91FC0EB03F849486D
-7E49C87E01FE157F49814848ED1F80000317C04848ED0FE0A24848ED07F0A2001F17F849
-1503003F17FCA3007F17FE491501A400FF17FFAC007F17FEA26D1503A3003F17FCA2001F
-17F86D1507A2000F17F06D150F000717E06C6CED1FC0A26C6CED3F806C6CED7F00017F15
-FE6D6C495A6D6C495A6D6C495AD903F8EB1FC06DB4EBFF806D6CB448C7FC020F13F00201
-138038417BBF43>I<D903C0EB7FC0D807FF903807FFFCB5011F13FFDB7F0013C003F8EB
-1FF0DAC3E0EB07F80001D9C7806D7E26007FCFC76C7E02DE6E7ED93FFC6F7E4A6F7E4A82
-181F4A82727E5C727EA2727EA3727EA41A8084AC4E1300A54E5AA2611807A24E5A6E5E18
-1F6E4B5A6E5E187F6E4B5A02DE4A90C7FC02CF4A5ADAC780495ADAC3C0EB0FF0DAC1F0EB
-3FE0913AC07E01FF806FB448C8FC030F13F80300138093CAFCB3A3497E497EB612F0A441
-5B7CBE4B>I<010FEB07F8D80FFFEB1FFEB590387FFF809238F81FC0913801E03F913903
-C07FE00003EB0780C6EB0F00140E6D5A0218EB3FC00238EB1F800230EB0600027090C7FC
-A2146014E0A25CA55CB3B0497E4813F0B612F8A42B3F7BBE34>114
-D<9138FFC003010FEBF807017FEBFE0F3A01FF003F9FD803F0EB07DF48486DB4FCD80F80
-1300001F8148C8FC003E81007E81127C00FC81A4827EA27E7F6C7E6D91C7FC13F8EA3FFE
-381FFFE06C13FF15F0000314FE6C6E7E6C6C14E0011F14F801078001008002077FDA003F
-13801507030113C0ED007F00E0ED3FE0161F17F06C150F1607A36C1503A37EA26C16E016
-077E17C06D140F6D15806D141FD8FDF0EC3F00D8F8F8147E017C495A3AF01F801FF06DB5
-12C0D8E00391C7FC39C0007FF02C417CBF35>I<1470A714F0A51301A31303A21307A213
-0FA2131F133F137F13FF1203000F90B6FCB8FCA326000FF0C8FCB3AEEE01C0AE6D6CEB03
-80A316076D6C14005E6D6C130E6D6C131E6E6C5A91383FE0F86EB45A020713C0020090C7
-FC2A597ED734>I<D903C0150FD807FFED1FFFB50203B5FCA40001ED0007D8007F1501A2
-013F81B3B25FA35FA35F011F15066E140E5F130F6E4A7F01075D6D6C494813E0D901FE49
-48EBFFC0903A00FFC01F8091393FFFFE00020F13F8020001C0EC800042407CBE4B>I<B6
-6C0103B512C0A4000101F8C8EBFC006C01E0ED3FF0017FEE1FC0013F5F96C7FC131F181E
-80010F161C8001075EA26E157801031670A26D6C5DA26E14016D5EA26F1303027F5D8102
-3F4AC8FCA26F5B021F140E81020F5CA26F133C02071438A26E6C5BA26F13F002015CA2ED
-FF016E5C168192387F8380A216C7033F90C9FCA2ED1FEEA216FE6F5AA36F5AA26F5AA36F
-5AA2423F7EBD47>I E
-%EndDVIPSBitmapFont
-end
-%%EndProlog
-%%BeginSetup
-%%Feature: *Resolution 600dpi
-TeXDict begin
-%%BeginPaperSize: Letter
-letter
-%%EndPaperSize
- end
-%%EndSetup
-%%Page: 1 1
-TeXDict begin 1 0 bop 1023 424 a Fl(Cluster)46 b(Snapshot)g(Blo)t(c)l
-(k)g(Device)1620 764 y Fk(Daniel)32 b(Phillips)1510 1059
-y(3rd)g(Septem)m(b)s(er)h(2004)1766 1367 y Fj(Abstract)323
-1548 y Fi(F)-6 b(or)26 b(sev)n(eral)i(y)n(ears)f(no)n(w,)g(Lin)n(ux)f
-(has)g(pro)n(vided)g(a)h(blo)r(c)n(k)g(lev)n(el)g(snapshot)f(facilit)n
-(y)-6 b(,)28 b(\034rst)f(as)g(part)g(of)g(the)f(logical)208
-1640 y(v)n(olume)c(manager,)j(and)f(more)f(recen)n(tly)h(as)g(part)g
-(of)h(the)e(Device)h(Mapp)r(er)g(virtual)g(blo)r(c)n(k)g(device)g
-(subsystem.)33 b(This)208 1731 y(w)n(ork)i(builds)g(on)g(the)g
-(original)i(concept)e(and)g(extends)f(it)h(to)h(op)r(erate)g(with)f
-(shared)g(storage)i(devices)e(accessed)208 1822 y(sim)n(ultaneously)27
-b(b)n(y)g(man)n(y)f(cluster)i(no)r(des)g(o)n(v)n(er)g(a)g(storage)h
-(net)n(w)n(ork.)41 b(In)27 b(the)h(pro)r(cess,)h(some)f(de\034ciencies)
-g(of)h(the)208 1914 y(existing)i(design)g(w)n(ere)g(corrected:)46
-b(when)30 b(m)n(ultiple)g(snapshots)i(are)f(held)f(sim)n(ultaneously)-6
-b(,)32 b(m)n(ultiple)e(cop)n(ying)h(of)208 2005 y(snapshotted)24
-b(data)g(is)h(eliminated;)g(memory)d(fo)r(otprin)n(t)j(is)g(reduced)f
-(and)f(made)h(indep)r(enden)n(t)f(of)i(v)n(olume)e(size;)j(the)208
-2096 y(requiremen)n(t)k(for)k(m)n(ultiple)d(separate)j(snapshot)e
-(store)h(v)n(olumes)f(is)h(eliminated;)j(and)c(a)h(single)h(Device)e
-(Mapp)r(er)208 2188 y(target)26 b(serv)n(es)g(for)g(b)r(oth)f(snapshot)
-h(and)f(origin)i(devices.)0 2558 y Fh(1)131 b(Bac)l(kground)0
-2856 y Fg(1.1)112 b(Wh)m(y)38 b(Snapshots?)0 3109 y Ff(In)32
-b(a)g(t)n(ypical)g(en)n(terprise)f(computing)h(en)n(vironmen)n(t,)g(a)g
-(\034le)h(system)e(is)i(relied)e(on)h(to)g(store,)h(organize,)e(k)n
-(eep)h(trac)n(k)f(of)0 3208 y(and)e(mak)n(e)f(a)n(v)-5
-b(ailable)28 b(the)h(\034les)g(generated)f(b)n(y)g(the)i(users.)40
-b(The)29 b(en)n(terprise)f(computing)h(system)g(ma)n(y)f(ha)n(v)n(e)g
-(h)n(undreds)0 3308 y(or)h(ev)n(en)h(thousands)f(of)i(users)e(and)h
-(the)g(\034le)h(system)f(is)g(required)f(to)h(supp)r(ort)g(large)f(n)n
-(um)n(b)r(ers)g(of)h(users)g(reading)e(from)0 3407 y(and)e(writing)g
-(to)g(the)h(disk)f(storage)f(space)g(managed)g(b)n(y)h(the)h(\034le)g
-(system.)36 b(In)26 b(to)r(da)n(y's)g(business)g(en)n(vironmen)n(t,)f
-(the)i(\034le)0 3507 y(system)i(ma)n(y)g(b)r(e)g(needed)h(around)e(the)
-i(clo)r(c)n(k.)41 b(In)29 b(man)n(y)g(systems,)g(the)h(\034le)f(system)
-g(m)n(ust)h(b)r(e)f(con)n(tin)n(uously)f(a)n(v)-5 b(ailable)0
-3607 y(and)27 b(cannot)g(b)r(e)h(sh)n(utdo)n(wn)f(temp)r(orarily)g
-(without)h(incurring)e(unacceptable)h(business)g(costs.)0
-3806 y(F)-7 b(ull-system)33 b(bac)n(kup)g(copies)g(can)g(b)r(e)h(used)f
-(to)g(restore)f(\034les)h(in)h(the)g(ev)n(en)n(t)f(of)g(system)g
-(failure,)i(acciden)n(tal)d(deletion,)0 3906 y(\034le)j(corruption,)g
-(unin)n(tended)g(edits,)h(etc.)58 b(Regularly)33 b(making)g(bac)n(kups)
-h(is)g(an)g(essen)n(tial)g(safet)n(y)g(measure)f(in)i(man)n(y)0
-4005 y(systems.)g(In)22 b(order)g(to)g(mak)n(e)g(a)g(bac)n(kup)g(of)h
-(a)f(\034le)h(system)f(using)g(traditional)g(cop)n(ying)f(tec)n
-(hniques,)j(it)f(is)f(necessary)f(that)0 4105 y(the)i(con)n(ten)n(ts)e
-(of)i(the)g(\034le)f(system)g(b)r(e)h(stable.)35 b(If)23
-b(the)f(\034le)h(system)f(is)g(liv)n(e,)h(then)g(c)n(hanges)e(ma)n(y)h
-(b)r(e)h(made)f(to)g(the)h(con)n(ten)n(ts,)0 4204 y(ev)n(en)30
-b(as)g(they)h(are)f(b)r(eing)g(copied.)46 b(Th)n(us,)31
-b(the)g(\034le)g(system)f(m)n(ust)h(b)r(e)g(\020o\033-line\021)37
-b(in)31 b(order)e(to)h(mak)n(e)g(an)g(e\033ectiv)n(e)h(bac)n(k)0
-4304 y(up.)44 b(In)30 b(a)g(t)n(ypical)f(en)n(terprise)g(en)n(vironmen)
-n(t,)g(this)i(pro)r(cess)d(ma)n(y)h(b)r(e)i(p)r(erformed)e(at)h(nigh)n
-(t)g(or)f(during)g(o\033)h(hours.)43 b(The)0 4404 y(\034le)25
-b(system)g(ma)n(y)g(b)r(e)g(una)n(v)-5 b(ailable)24 b(for)h(sev)n(eral)
-f(hours)g(eac)n(h)g(nigh)n(t)h(or)g(once)f(a)h(w)n(eek)g(as)f(the)i
-(con)n(ten)n(ts)e(of)h(the)h(\034le)f(system)0 4503 y(are)g(read)g(out)
-g(and)h(copied)g(to)f(tap)r(e.)37 b(The)25 b(end)h(result)g(is)g(a)f
-(ph)n(ysical)g(cop)n(y)g(of)h(the)g(original)e(data.)36
-b(As)26 b(en)n(terprises)e(gro)n(w)0 4603 y(ev)n(er)30
-b(larger,)g(and)h(the)g(stored)g(data)f(accum)n(ulates,)h(the)g(v)n
-(olume)g(of)g(data)f(that)i(has)e(to)h(b)r(e)h(copied)e(increases)g
-(and)h(the)0 4703 y(time)26 b(that)f(it)h(tak)n(es)e(to)h(mak)n(e)f(a)h
-(bac)n(kup)f(of)h(the)h(\034le)f(system)g(using)g(traditional)f(tec)n
-(hniques)h(is)g(no)g(longer)f(manageable.)0 4802 y(In)33
-b(some)f(en)n(vironmen)n(ts,)h(it)g(ma)n(y)f(tak)n(e)f(more)h(than)h(a)
-f(da)n(y)g(to)h(mak)n(e)f(a)g(bac)n(kup)g(cop)n(y)-7
-b(.)51 b(It)33 b(ma)n(y)f(b)r(e)h(unacceptable)f(to)0
-4902 y(tak)n(e)27 b(a)g(\034lesystem)g(o\037ine)h(for)f(suc)n(h)g(a)g
-(long)g(time.)0 5101 y(It)34 b(is)g(p)r(ossible)f(to)h(ac)n(hiev)n(e)e
-(the)i(e\033ect)h(of)e(susp)r(ending)h(and)f(making)g(a)h(cop)n(y)f(of)
-g(an)h(en)n(tire)f(\034lesystem,)i(but)g(without)0 5201
-y(in)n(terrupting)27 b(the)h(op)r(eration)f(of)h(the)g(\034lesystem.)37
-b(A)29 b('snapshot')e(is)g(a)h(virtually)f(instan)n(t)h(cop)n(y)e(of)i
-(a)g(de\034ned)g(collection)0 5300 y(of)d(data)f(created)g(at)g(a)h
-(particular)e(instan)n(t)i(in)g(time.)36 b(Bac)n(kups)23
-b(and)i(other)f(functions)h(can)f(b)r(e)h(p)r(erformed)g(in)g(a)f
-(leisurely)0 5400 y(w)n(a)n(y)i(on)i(a)f(snapshot)g(image)f(without)i
-(impacting)g(system)f(a)n(v)-5 b(ailabilit)n(y)e(.)p
-eop end
-%%Page: 2 2
-TeXDict begin 2 1 bop 0 83 a Ff(Bey)n(ond)38 b(online)g(bac)n(kup,)j
-(snapshots)d(ha)n(v)n(e)g(v)-5 b(arious)37 b(uses.)71
-b(F)-7 b(or)38 b(example,)j(one)d(migh)n(t)h(create)f(a)g(snapshot)g(b)
-r(efore)0 183 y(making)29 b(a)g(trial)g(install)g(of)g(some)g(new)g
-(system)h(soft)n(w)n(are.)40 b(If)30 b(the)g(install)f(is)g
-(unsuccessful,)h(the)g(trial)f(can)g(b)r(e)g(rev)n(erted)0
-282 y(b)n(y)e(cop)n(ying)g(the)h(snapshot)e(bac)n(k)h(to)g(the)h
-(origin)f(device.)0 614 y Fg(1.2)112 b(Wh)m(y)38 b(Blo)s(c)m(k-Lev)m
-(el)e(Snapshots?)0 867 y Ff(Snapshots)j(ma)n(y)g(b)r(e)h(implemen)n
-(ted)g(at)f(the)h(blo)r(c)n(k)f(device)g(lev)n(el,)j(where)d(the)h
-(disk)f(con)n(ten)n(ts)g(are)g(view)n(ed)g(as)g(mere)0
-967 y(blo)r(c)n(ks)31 b(of)h(data,)g(or)f(at)h(the)g(\034lesystem)g
-(lev)n(el,)g(where)g(the)g(\034lesystem)g(con)n(ten)n(ts)f(are)g(view)n
-(ed)g(as)g(ha)n(ving)g(a)g(particular)0 1066 y(organization.)61
-b(It)36 b(has)g(b)r(een)h(argued)e(that)i(the)f(latter)g(is)h(the)f(b)r
-(est)h(c)n(hoice,)h(since)e(without)h(explicit)f(kno)n(wledge)f(of)0
-1166 y(\034lesystem)24 b(structure)g(it)h(is)f(imp)r(ossible)g(to)g
-(kno)n(w)g(whic)n(h)g(blo)r(c)n(ks)g(are)f(free)h(and)g(whic)n(h)h(are)
-e(not.)36 b(In)24 b(the)h(absence)f(of)g(that)0 1266
-y(kno)n(wledge,)k(a)h(blo)r(c)n(k)g(lev)n(el)g(snapshot)f(sc)n(heme)h
-(will)g(sometimes)g(w)n(aste)f(time)i(preserving)e(snapshot)g(data)h
-(that)g(is)h(just)0 1365 y(\034lesystem)d(free)h(space.)0
-1565 y(On)f(the)h(other)f(hand,)h(it)g(is)f(m)n(uc)n(h)h(simpler)f(to)g
-(implemen)n(t)h(snapshots)f(at)g(the)h(blo)r(c)n(k)f(lev)n(el)h(than)f
-(to)h(incorp)r(orate)e(them)0 1664 y(in)n(to)i(the)h(design)e(of)h(a)g
-(p)r(ossibly)g(already-complex)e(\034lesystem.)38 b(F)-7
-b(urthermore,)28 b(a)f(blo)r(c)n(k)h(lev)n(el)g(snapshot)f(w)n(orks)g
-(for)g(all)0 1764 y(\034lesystems,)f(not)h(just)g(one)f(particular)f
-(\034lesystem.)36 b(And)27 b(\034nally)-7 b(,)26 b(w)n(e)g(already)f
-(ha)n(v)n(e)h(a)g(natural)f(w)n(a)n(y)g(of)i(represen)n(ting)d(a)0
-1863 y(snapshot)j(to)g(the)h(system:)37 b(as)27 b(a)g(virtual)g(blo)r
-(c)n(k)g(device.)0 2195 y Fg(1.3)112 b(Multiple)35 b(Snapshots)0
-2448 y Ff(Multiple)f(snapshots)e(of)h(the)g(same)g(\034lesystem)f(ma)n
-(y)h(b)r(e)g(main)n(tained)g(sim)n(ultaneously)-7 b(,)33
-b(eac)n(h)f(created)h(at)f(a)h(di\033eren)n(t)0 2548
-y(p)r(oin)n(t)27 b(in)g(time.)37 b(In)27 b(the)g(past,)g(this)g(has)f
-(b)r(een)h(implemen)n(ted)g(in)g(a)g(straigh)n(tforw)n(ard)c(w)n(a)n(y)
-j(b)n(y)g(giving)g(eac)n(h)g(snapshot)g(its)0 2648 y(o)n(wn)21
-b(\034xed-size)f(storage)f(area.)34 b(This)21 b(approac)n(h,)f(though)h
-(simple,)i(has)e(serious)f(dra)n(wbac)n(ks.)32 b(Eac)n(h)21
-b(time)h(a)f(blo)r(c)n(k)f(of)i(data)0 2747 y(is)29 b(c)n(hanged)e(on)i
-(the)g(\034lesystem,)g(it)g(m)n(ust)g(b)r(e)g(written)g(to)f(the)h
-(storage)e(for)h(eac)n(h)g(snapshot)g(to)h(whic)n(h)f(it)h(b)r(elongs.)
-40 b(The)0 2847 y(cost)25 b(of)f(these)h(m)n(ultiple)h(writes)e
-(increases)g(directly)h(with)g(the)g(n)n(um)n(b)r(er)g(of)g(snapshots)f
-(held.)36 b(In)25 b(practice,)g(p)r(erformance)0 2946
-y(degrades)33 b(noticeably)g(with)i(more)e(than)h(one)g(or)f(t)n(w)n(o)
-h(sim)n(ultaneous)f(snapshots.)55 b(Dep)r(ending)35 b(on)f(the)g
-(lifetime)h(of)f(a)0 3046 y(snapshot)28 b(and)g(the)h(n)n(um)n(b)r(er)g
-(of)f(c)n(hanges)g(made)g(to)g(the)i(original)d(\034le)h(system,)h(eac)
-n(h)f(snapshot)g(store)f(ma)n(y)h(need)h(to)g(b)r(e)0
-3146 y(as)j(large)g(as)g(the)i(original)d(disk.)53 b(In)33
-b(the)g(absence)g(of)f(some)h(clev)n(er)f(sc)n(heme)g(to)h(allo)r(cate)
-f(storage)f(on)i(demand,)h(these)0 3245 y(m)n(ultiple)27
-b(snapshot)f(stores)g(ma)n(y)g(b)r(e)h(largely)e(empt)n(y)-7
-b(.)37 b(In)27 b(an)n(y)f(ev)n(en)n(t,)h(it)g(is)f(probable)g(that)h(m)
-n(ultiple)g(redundan)n(t)g(copies)0 3345 y(of)h(data)f(will)g(b)r(e)h
-(stored.)0 3544 y(It)35 b(is)f(clear)f(that)h(to)g(a)n(v)n(oid)f(the)i
-(ab)r(o)n(v)n(e)d(problems,)j(data)f(m)n(ust)g(b)r(e)h(shared)e(b)r(et)
-n(w)n(een)h(m)n(ultiple)h(snapshots)e(whenev)n(er)0 3644
-y(p)r(ossible,)44 b(b)r(oth)e(to)f(reduce)f(cop)n(ying)g(time)i(and)f
-(to)f(sa)n(v)n(e)g(space.)77 b(But)41 b(this)g(sharing)f(requiremen)n
-(t)g(has)h(profound)0 3743 y(implications,)27 b(v)n(ersus)f(the)i
-(simple)g(existing)f(design:)138 4026 y Fe(\001)42 b
-Ff(W)-7 b(e)28 b(need)f(structures)g(to)g(k)n(eep)g(trac)n(k)g(of)g
-(whic)n(h)h(data)f(is)g(shared)g(b)n(y)g(whic)n(h)h(snapshots)138
-4192 y Fe(\001)42 b Ff(W)-7 b(e)28 b(need)f(a)g(space)g(allo)r(cation)g
-(sc)n(heme)g(for)g(the)h(snapshot)e(store)138 4358 y
-Fe(\001)42 b Ff(Both)27 b(of)g(the)h(ab)r(o)n(v)n(e)f(need)g(to)h(b)r
-(e)g(p)r(ersisten)n(t)f(and)g(durable)0 4640 y(I)e(elected)g(to)g(use)g
-(a)g(\034lesystem-st)n(yle)f(bitmap)h(allo)r(cation)f(sc)n(heme)h(to)g
-(manage)f(the)h(allo)r(cation)f(of)h(snapshot)f(store)h(data)0
-4740 y(and)i(metadata,)h(a)f(btree)g(to)h(k)n(eep)f(trac)n(k)f(of)i
-(snapshot)f(data)g(mem)n(b)r(ership,)h(and)f(a)g(journal)g(to)h(mak)n
-(e)f(those)g(structures)0 4839 y(durable.)36 b(The)26
-b(co)r(de)g(to)h(manage)e(a)h(snapshot)f(store)h(th)n(us)g(ends)g(up)h
-(lo)r(oking)e(lik)n(e)h(a)g(simple)h(\034lesystem,)f(a)g(far)g(cry)f
-(from)0 4939 y(the)j(existing)e(implemen)n(tation)i(of)f(snapshots,)f
-(and)h(ev)n(en)g(further)g(from)g(the)h(simplicit)n(y)f(of)g(a)g(ph)n
-(ysical)g(disk.)36 b(But)28 b(this)0 5039 y(is)f(the)h(price)g(that)f
-(m)n(ust)h(b)r(e)g(paid)f(to)h(solv)n(e)e(the)i(pressing)f(problems)f
-(of)i(the)g(existing)f(implemen)n(tation.)p eop end
-%%Page: 3 3
-TeXDict begin 3 2 bop 0 83 a Fg(1.4)112 b(Cluster)37
-b(Snapshots)0 336 y Ff(In)25 b(man)n(y)f(en)n(vironmen)n(ts,)g(storage)
-e(systems)i(are)g(clustered,)h(with)g(m)n(ultiple)g(serv)n(er-class)d
-(systems)i(accessing)f(the)i(same)0 436 y(storage)32
-b(device.)55 b(The)34 b(clien)n(ts)f(of)h(the)g(storage)e(systems)h
-(need)h(to)f(comm)n(unicate)g(amongst)g(themselv)n(es)g(in)h(order)e
-(to)0 535 y(use)e(the)h(storage)e(system)h(in)g(a)h(co)r(ordinated)e(w)
-n(a)n(y)g(to)h(a)n(v)n(oid)f(in)n(terlea)n(ving)g(their)h(c)n(hanges)f
-(in)i(a)f(nonsensical)f(w)n(a)n(y)-7 b(.)45 b(All)0 635
-y(mem)n(b)r(ers)19 b(of)h(a)f(cluster)g(m)n(ust)h(receiv)n(e)e(a)i
-(consisten)n(t)f(view)g(of)h(p)r(ossibly)f(m)n(ultiple)h(snapshots,)g
-(ev)n(en)g(while)f(other)h(mem)n(b)r(ers)0 734 y(of)28
-b(the)f(cluster)h(ma)n(y)e(b)r(e)i(sim)n(ultaneously)f(writing)g(to)g
-(the)h(same)f(snapshots.)0 934 y(Net)n(w)n(ork)f(comm)n(unication)h(is)
-g(required)g(b)r(et)n(w)n(een)g(cluster)h(mem)n(b)r(ers)f(for)g(three)g
-(distinct)h(purp)r(oses:)101 1216 y(1.)42 b(Disco)n(v)n(ering)25
-b(whic)n(h)j(ph)n(ysical)e(data)h(blo)r(c)n(ks)g(b)r(elong)g(to)h(a)f
-(giv)n(en)g(snapshot)101 1382 y(2.)42 b(Sync)n(hronizing)26
-b(access)g(to)h(blo)r(c)n(k)g(lev)n(el)h(data)f(shared)f(b)r(et)n(w)n
-(een)i(snapshot)e(and)i(origin)101 1548 y(3.)42 b(Sync)n(hronizing)26
-b(access)g(to)h(snapshot)g(metadata)0 1830 y(A)34 b(traditional)f
-(approac)n(h)f(w)n(ould)h(rely)g(on)h(a)f(distributed)h(lo)r(c)n(k)f
-(manager)f(for)i(sync)n(hronization,)f(augmen)n(ted)g(with)h(a)0
-1930 y(message)j(passing)h(proto)r(col)f(to)i(implemen)n(t)g(the)g
-(data)f(exc)n(hange)g(required)f(for)h(\(1\))h(ab)r(o)n(v)n(e.)69
-b(I)39 b(elected)g(instead)f(to)0 2030 y(adopt)27 b(a)g(pure)g
-(message-passing)e(approac)n(h)g(that)j(com)n(bines)f(sync)n
-(hronization)e(and)j(data)f(exc)n(hange)f(in)n(to)h(a)g(single)g(set)0
-2129 y(of)32 b(messages;)h(after)f(all,)h(a)f(distributed)g(lo)r(c)n(k)
-g(managers)e(implemen)n(ts)j(its)f(lo)r(c)n(ks,)h(in)f(the)h(end,)g
-(with)g(messages.)49 b(This)0 2229 y(eliminates)27 b(an)g(en)n(tire)g
-(la)n(y)n(er)f(of)h(net)n(w)n(ork)f(sync)n(hronization)f(messages,)h
-(and)h(results)g(in)g(a)g(compact,)g(e\036cien)n(t)g(proto)r(col)0
-2328 y(that)h(is)f(easy)g(to)g(v)n(erify)-7 b(.)0 2660
-y Fg(1.5)112 b(Durabilit)m(y)35 b(and)k(P)m(erformance)d(Goals)0
-2913 y Ff(As)20 b(general)f(purp)r(ose)g(blo)r(c)n(k)h(devices,)h
-(cluster)f(snapshots)f(need)h(to)g(op)r(erate)f(just)i(lik)n(e)f(real)f
-(blo)r(c)n(k)g(devices.)34 b(In)20 b(particular,)0 3013
-y(p)r(erformance)30 b(should)h(not)g(b)r(e)h(noticeably)e(degraded)g
-(and)h(data)f(written)i(to)f(a)f(snapshot)h(virtual)f(device)h(should)g
-(not)0 3113 y(b)r(e)f(at)g(an)n(y)f(greater)f(risk)h(than)h(data)f
-(written)h(to)g(a)f(real)g(disk.)43 b(The)30 b(latter)f(is)h(a)f(c)n
-(hallenging)g(requiremen)n(t)g(in)h(view)f(of)0 3212
-y(the)j(fact)g(that)h(a)e(real)g(disk)h(is)g(not)f(required)g(to)h
-(main)n(tain)g(complex)f(data)h(structures)f(to)g(do)h(its)g(w)n(ork.)
-49 b(The)32 b(cluster)0 3312 y(snapshot)24 b(on)g(the)h(other)f(hand,)h
-(m)n(ust)g(main)n(tain)f(complex)h(data)f(structures)g(durably)-7
-b(,)24 b(in)h(other)f(w)n(ords,)g(without)h(b)r(eing)0
-3411 y(sub)5 b(ject)30 b(to)g(corruption)f(in)h(the)g(ev)n(en)n(t)g(of)
-g(sudden)g(in)n(terruption)f(suc)n(h)h(as)f(a)h(p)r(o)n(w)n(er)f
-(failure.)43 b(W)-7 b(e)31 b(w)n(ould)e(not)h(exp)r(ect)g(a)0
-3511 y(real)g(disk)g(to)h(b)r(ecome)f(corrupted)g(in)g(that)h(case,)g
-(and)f(so)g(ha)n(v)n(e)g(the)h(same)f(high)g(exp)r(ectations)g(of)h(a)f
-(virtual)g(disk.)46 b(An)0 3611 y(additional)27 b(c)n(hallenge)f(is)i
-(to)f(ac)n(hiev)n(e)f(this)i(durabilit)n(y)f(without)h(degrading)e(p)r
-(erformance.)0 3810 y(The)d(main)g(tactic)g(brough)n(t)f(to)h(b)r(ear)g
-(on)f(this)i(problem)e(is)h(batc)n(hing)g(together)f(of)h(data)f
-(structure)h(up)r(dates.)35 b(A)24 b(lo)r(calized)0 3910
-y(group)e(of)h(writes)g(to)g(virtual)f(v)n(olume)h(data)f(should)h
-(result)g(in)g(a)g(batc)n(h)g(of)g(lo)r(calized)f(up)r(dates)i(to)f
-(the)g(v)n(olume)g(metadata,)0 4009 y(whic)n(h)28 b(can)g(b)r(e)h
-(transferred)e(to)h(disk)g(as)g(a)f(single)h(transaction.)38
-b(A)28 b(journal)g(is)g(used)g(to)h(mak)n(e)e(the)i(transaction)e
-(atomic)0 4109 y(and)g(durable.)37 b(The)27 b(batc)n(hing)g(mak)n(es)g
-(it)h(e\036cien)n(t.)0 4441 y Fg(1.6)112 b(Wh)m(y)38
-b(a)f(Clien)m(t-Serv)m(er)f(Arc)m(hitecture?)0 4694 y
-Ff(Question:)42 b(Ho)n(w)30 b(do)r(es)g(one)h(distribute)f(a)h
-(database)e(across)f(m)n(ultiple)j(cluster)g(no)r(des?)45
-b(Answ)n(er:)d(With)32 b(m)n(uc)n(h)e(blo)r(o)r(d,)0
-4793 y(sw)n(eat)c(and)g(tears.)35 b(This)27 b(I)f(did)h(not)f(wish)h
-(to)f(attempt.)37 b(Instead)26 b(I)g(decided)h(that)g(the)f(snapshot)g
-(store)f(database)h(w)n(ould)0 4893 y(b)r(e)34 b(cen)n(trally)f
-(managed)g(b)n(y)g(a)h(serv)n(er)e(pro)r(cess)h(running)g(on)h(one)f
-(of)h(the)g(cluster)g(no)r(des,)h(and)f(that)g(sync)n(hronization)0
-4993 y(and)27 b(information)g(distribution)g(w)n(ould)g(b)r(e)h(b)n(y)f
-(w)n(a)n(y)f(of)h(net)n(w)n(ork)f(messages,)g(taking)h(the)h(form)f(of)
-g(queries)f(and)i(replies.)0 5092 y(Since)c(the)g(query)f(proto)r(col)g
-(is)h(v)n(ery)e(e\036cien)n(t,)j(the)f(clien)n(t)g(serv)n(er)e(arc)n
-(hitecture)h(do)r(es)g(not)h(imp)r(ose)g(a)f(large)f(net)n(w)n(ork)h
-(load)0 5192 y(and)k(is)h(exp)r(ected)g(to)f(scale)g(readily)f(to)i
-(sev)n(eral)e(h)n(undred)h(clien)n(ts.)p eop end
-%%Page: 4 4
-TeXDict begin 4 3 bop 0 83 a Ff(F)-7 b(or)27 b(v)n(ery)g(large)g
-(clusters,)g(the)i(clien)n(t-serv)n(er)c(arc)n(hitecture)i(will)h(no)g
-(doubt)g(ev)n(en)n(tually)f(b)r(ecome)h(a)g(b)r(ottlenec)n(k.)38
-b(F)-7 b(ortu-)0 183 y(nately)29 b(there)g(remains)g(plen)n(t)n(y)g(of)
-g(time)h(to)f(solv)n(e)f(that)i(problem,)f(and)g(there)g(are)f(no)h
-(examples)g(y)n(et)g(of)g(\034lesystems)g(of)0 282 y(the)f(t)n(yp)r(e)g
-(that)g(could)f(b)r(ene\034t)h(from)f(a)h(snapshot)e(blo)r(c)n(k)h
-(device,)h(running)f(on)g(suc)n(h)g(large)g(clusters.)0
-614 y Fg(1.7)112 b(T)-9 b(erminology)0 867 y Ff(Here)27
-b(w)n(e)g(review)g(some)g(terminology)f(in)n(tro)r(duced)h(b)n(y)h(the)
-g(original)e(L)-9 b(VM)27 b(snapshot)g(implemen)n(tation:)0
-1066 y Fd(Origin)k(v)m(olume)0 1166 y Ff(One)k(of)g(t)n(w)n(o)f(blo)r
-(c)n(k)g(devices)h(underlying)f(a)h(virtual)f(snapshot)h(device.)59
-b(This)35 b(v)n(olume)f(is)h(mapp)r(ed)g(one-to-one)f(to)g(a)0
-1266 y(snapshot)j(origin)g(virtual)h(device.)67 b(The)39
-b(virtual)e(device)h(could)f(b)r(e)i(remo)n(v)n(ed)d(and)i(the)h
-(underlying)e(origin)g(v)n(olume)0 1365 y(accessed)26
-b(directly)-7 b(,)28 b(at)f(the)h(risk)f(of)g(losing)g(the)h(in)n
-(tegrit)n(y)f(of)g(an)n(y)g(snapshots)f(sharing)h(data)g(with)h(the)g
-(origin.)0 1565 y Fd(Snapshot)k(store)0 1664 y Ff(One)24
-b(of)g(t)n(w)n(o)g(blo)r(c)n(k)f(devices)h(underlying)g(a)g(virtual)f
-(snapshot)h(device.)35 b(This)24 b(v)n(olume)g(con)n(tains)f(data)h(c)n
-(h)n(unks)g(that)g(w)n(ere)0 1764 y(copied)31 b(from)f(the)i(origin)e
-(in)h(order)f(to)h(preserv)n(e)e(the)j(in)n(tegrit)n(y)e(of)h(snapshot)
-f(data,)h(or)g(w)n(ere)f(written)h(directly)g(to)g(the)0
-1863 y(snapshot)d(store)f(via)h(a)g(snapshot)f(virtual)h(device.)39
-b(It)29 b(also)e(con)n(tains)g(all)i(metadata)e(required)h(to)g(k)n
-(eep)g(trac)n(k)f(of)h(whic)n(h)0 1963 y(snapshot)f(store)f(c)n(h)n
-(unks)h(b)r(elong)g(to)h(whic)n(h)f(snapshots,)g(among)f(other)h
-(things.)0 2162 y Fd(Cop)m(y-out)0 2262 y Ff(The)h(act)f(of)g
-(preserving)f(a)h(c)n(h)n(unk)h(of)f(origin)f(data)i(b)n(y)f(cop)n
-(ying)f(it)i(to)g(snapshot)e(store.)0 2461 y Fd(Ch)m(unk)0
-2561 y Ff(The)i(gran)n(ularit)n(y)d(of)i(snapshot)g(cop)n(y-outs,)f(a)h
-(user-de\034nable)f(binary)h(m)n(ultiple)h(of)g(4K)f(blo)r(c)n(k)g
-(size.)0 2760 y Fd(Exceptio)p Ff(n)0 2860 y(A)h(c)n(h)n(unk)f(of)h
-(data)f(in)g(the)h(snapshot)f(store,)g(b)r(elonging)g(to)g(one)g(or)g
-(more)g(snapshots)0 3192 y Fg(1.8)112 b(History)0 3445
-y Ff(T)-7 b(o)38 b(the)g(b)r(est)g(of)g(m)n(y)g(kno)n(wledge,)h(the)f
-(concept)g(of)g(a)f(cop)n(y-on-write)f(blo)r(c)n(k)h(device)h(lev)n(el)
-f(snapshot)g(\034rst)h(arose)e(in)0 3544 y(discussions)c(b)r(et)n(w)n
-(een)g(Heinz)i(Mauelshagen)d(and)i(Mic)n(hael)f(Marxmeier,)h(in)g(1999)
-e(at)h(the)i(Cologne)d(Lin)n(ux)i(K)n(ongress)0 3644
-y(conference.)65 b(Heinz)37 b(created)f(a)h(functional)g(implemen)n
-(tation)g(as)g(part)f(of)i(his)f(Lin)n(ux)g(L)-9 b(VM)37
-b(pro)5 b(ject)36 b(shortly)g(after.)0 3743 y(In)g(2002,)h(Jo)r(e)e
-(Thorn)n(b)r(er)g(re\034ned)h(these)h(ideas)e(and)h(re-implemen)n(ted)g
-(the)g(snapshot)g(target)f(as)h(a)f(Device)h(Mapp)r(er)0
-3843 y(virtual)24 b(device.)36 b(I)25 b(b)r(egan)g(design)f(w)n(ork)g
-(for)g(a)h(cluster)g(snapshot)f(in)h(Septem)n(b)r(er,)h(2003.)34
-b(Once)24 b(a)h(design)f(w)n(as)g(a)n(v)-5 b(ailable,)0
-3943 y(a)37 b(protot)n(yp)r(e)f(k)n(ernel)h(clien)n(t)g(for)g(Lin)n(ux)
-g(2.4)g(w)n(as)f(co)r(ded)h(in)h(\034v)n(e)f(da)n(ys)f(b)n(y)h(P)n
-(atric)n(k)f(Caul\034eld)i(in)f(Decem)n(b)r(er,)j(2003.)0
-4042 y(Observing)31 b(that)h(a)g(simple)g(net)n(w)n(ork)f(ec)n(ho)g(is)
-h(a)g(go)r(o)r(d)g(sim)n(ulation)f(of)i(a)e(cluster)h(snapshot)g(serv)n
-(er)e(for)i(an)f(imp)r(ortan)n(t)0 4142 y(class)26 b(of)h(access)e
-(patterns,)i(P)n(atric)n(k)f(w)n(as)g(able)g(to)h(obtain)g(early)e(b)r
-(enc)n(hmark)h(n)n(um)n(b)r(ers)h(at)g(that)g(time.)37
-b(Encouraged)25 b(b)n(y)0 4242 y(the)30 b(excellen)n(t)f(p)r
-(erformance,)g(I)h(b)r(egan)f(implemen)n(tation)h(w)n(ork)e(on)i(a)f
-(user)g(space)g(snapshot)g(serv)n(er)f(early)g(in)i(the)g(new)0
-4341 y(y)n(ear,)c(bringing)h(it)h(to)f(a)g(testable)h(state)f(b)n(y)g
-(June.)0 4540 y(In)38 b(July)-7 b(,)40 b(the)f(protot)n(yp)r(e)e(k)n
-(ernel)g(clien)n(t)h(w)n(as)f(dev)n(elop)r(ed)g(in)n(to)h(a)f
-(functional)h(clien)n(t)g(for)f(the)i(snapshot)e(origin,)i(and)0
-4640 y(b)r(enc)n(hmarks)32 b(of)h(a)f(functional)h(virtual)g(origin)e
-(device)i(holding)g(a)f(snapshot)g(w)n(ere)g(obtained.)53
-b(These)32 b(con\034rmed)h(the)0 4740 y(predictions)26
-b(set)h(out)g(in)h(the)f(design)g(do)r(cumen)n(t,)g(nearly)f(a)g(y)n
-(ear)g(earlier.)35 b(By)27 b(Monda)n(y)f(August)h(16th,)f(snapshot)h
-(blo)r(c)n(k)0 4839 y(device)32 b(supp)r(ort)f(had)h(b)r(een)g(added)g
-(to)g(the)g(k)n(ernel)f(clien)n(t,)i(completing)f(the)g(basic)f(blo)r
-(c)n(k)h(device)f(functionalit)n(y)-7 b(.)50 b(Ben)0
-4939 y(Marzinski)27 b(p)r(orted)g(the)i(2.4)e(k)n(ernel)f(clien)n(t)i
-(to)g(Lin)n(ux)g(2.6)f(later)g(that)h(w)n(eek,)f(in)h(three)g(da)n(ys.)
-36 b(This)28 b(brings)f(us)h(to)g(to)r(da)n(y)-7 b(,)0
-5039 y(with)29 b(a)g(pre-alpha)e(cluster)i(snapshot)f(protot)n(yp)r(e)g
-(ready)g(to)h(b)r(e)g(released)f(on)g(the)i(\020release)d(early)-7
-b(,)28 b(release)f(often\021)36 b(plan.)0 5138 y(Sev)n(eral)d(more)h
-(mon)n(ths)h(of)g(hard)f(w)n(ork)g(is)g(required)g(to)h(complete)g(the)
-g(fault)g(tolerance)f(and)h(cluster)f(infrastructure)0
-5238 y(in)n(tegration,)26 b(whic)n(h)i(will)f(bring)g(this)h(cluster)f
-(snapshot)g(blo)r(c)n(k)g(device)h(to)f(a)g(usable)g(state.)p
-eop end
-%%Page: 5 5
-TeXDict begin 5 4 bop 0 83 a Fh(2)131 b(Ho)l(w)44 b(a)f(Blo)t(c)l(k)j
-(Device)e(Snapshot)g(W)-11 b(orks)0 364 y Ff(Blo)r(c)n(k)31
-b(lev)n(el)g(snapshots)f(in)i(Lin)n(ux)f(are)g(implemen)n(ted)h(using)f
-(a)g(cop)n(y-on-write)f(strategy)-7 b(.)47 b(When)32
-b(a)f(snapshot)g(is)g(\034rst)0 464 y(created,)c(it)g(is)g(exactly)g
-(the)h(same)e(as)h(the)h(origin)e(v)n(olume.)36 b(Afterw)n(ards,)26
-b(an)n(y)h(attempted)h(write)f(to)g(the)h(origin)e(v)n(olume)0
-564 y(w)n(ould)f(also)f(b)r(e)i(a)f(write)g(to)g(the)h(snapshot,)f
-(whic)n(h)g(w)n(ould)g(destro)n(y)f(the)h(in)n(tegrit)n(y)g(of)g(the)h
-(snapshot,)f(and)g(isn't)g(allo)n(w)n(ed.)0 663 y(Before)31
-b(the)h(write)g(is)g(allo)n(w)n(ed)e(to)i(pro)r(ceed,)g(the)h(data)e
-(that)h(w)n(ould)g(b)r(e)g(o)n(v)n(erwritten)e(is)i(copied)g(to)f(a)h
-(snapshot)f(store.)0 763 y(Snapshot)25 b(metadata)f(is)h(also)f
-(recorded)f(on)i(the)h(snapshot)e(store,)h(to)g(k)n(eep)f(trac)n(k)g
-(of)h(the)g(parts)g(of)g(the)g(origin)f(that)h(ha)n(v)n(e)0
-863 y(b)r(een)e(preserv)n(ed)f(in)h(the)h(snapshot)e(store.)34
-b(This)23 b(metadata)g(is)g(consulted)g(on)f(eac)n(h)h(write)f(access)g
-(to)h(determine)g(whether)0 962 y(or)28 b(not)g(origin)f(data)h(has)g
-(already)f(b)r(een)i(preserv)n(ed)d(in)j(the)g(snapshot)e(store,)h(and)
-g(th)n(us)h(is)f(no)g(longer)f(shared)h(with)h(the)0
-1062 y(origin.)0 1261 y(A)19 b(snapshot)g(is)g(presen)n(ted)f(to)h(the)
-h(system)f(as)f(a)h(virtual)f(blo)r(c)n(k)h(device.)34
-b(An)n(y)19 b(read)f(or)g(write)h(to)g(this)g(device)g(is)g(in)n
-(tercepted)0 1361 y(and)29 b(the)h(snapshot)e(store)g(metadata)h(is)g
-(consulted)g(to)g(determine)g(whether)g(the)h(target)e(of)h(the)h(IO)f
-(lies)g(on)g(the)g(origin)0 1460 y(\(i.e.,)d(hasn't)g(b)r(een)g
-(written)f(to)h(y)n(et)f(b)n(y)g(an)n(y)g(\034lesystem)g(moun)n(ted)g
-(on)g(the)h(origin)e(device\))i(or)e(is)i(in)g(the)f(snapshot)g(store.)
-0 1560 y(By)33 b(preserving)f(o)n(v)n(erwritten)g(data,)i(then)g
-(remapping)f(eac)n(h)g(access)f(to)h(the)h(virtual)f(snapshot)g(device)
-g(to)g(either)h(the)0 1660 y(origin)26 b(or)f(snapshot)h(store)g(as)g
-(appropriate,)f(a)h(view)h(of)g(a)f(snapshot)g(can)g(b)r(e)h
-(reconstructed)f(on)g(demand.)37 b(A)27 b(snapshot)0
-1759 y(view)g(can)h(b)r(e)g(accessed)e(through)h(its)h(virtual)f
-(snapshot)g(device)g(sim)n(ultaneously)g(with)h(the)g(origin)f(v)n
-(olume,)g(and)g(these)0 1859 y(t)n(w)n(o)h(virtual)h(devices)f(will)i
-(b)r(eha)n(v)n(e)e(as)g(if)i(they)f(w)n(ere)f(completely)h(indep)r
-(enden)n(t.)42 b(The)29 b(virtual)g(snapshot)f(device)h(tak)n(es)0
-1958 y(care)d(of)i(the)g(tric)n(ky)f(sync)n(hronization)e(required)i
-(to)g(mak)n(e)g(this)h(w)n(ork.)0 2158 y(W)-7 b(riting)31
-b(to)g(a)f(snapshot)g(through)g(its)h(virtual)f(device)h(is)g(allo)n(w)
-n(ed.)45 b(Data)31 b(to)f(b)r(e)i(o)n(v)n(erwritten)d(in)i(the)g
-(snapshot)f(store)0 2257 y(do)r(es)f(not)h(need)g(to)f(b)r(e)h(preserv)
-n(ed,)f(b)r(ecause)g(it)h(is)g(not)f(shared.)43 b(Ho)n(w)n(ev)n(er,)28
-b(if)i(a)f(write)h(to)f(the)h(snapshot)f(device)h(w)n(ould)0
-2357 y(o)n(v)n(erwrite)c(origin)i(data,)g(space)g(is)g(allo)r(cated)g
-(for)g(the)h(write)f(in)h(the)g(snapshot)e(store)h(instead)g(and)g(the)
-h(snapshot)f(store)0 2457 y(metadata)f(is)g(up)r(dated)h(to)g
-(re\035ect)f(that.)0 2656 y(Multiple)d(snapshots)e(can)h(b)r(e)h(held)g
-(against)e(an)h(origin)f(v)n(olume)h(sim)n(ultaneously)-7
-b(.)34 b(An)n(y)23 b(n)n(um)n(b)r(er)g(of)h(these)f(sim)n(ultaneous)0
-2756 y(snapshots)k(and)g(the)h(origin)e(can)h(b)r(e)h(accessed)f(as)g
-(writable)g(blo)r(c)n(k)g(devices,)g(also)f(sim)n(ultaneously)-7
-b(.)0 3126 y Fh(3)131 b(Cluster)45 b(Sync)l(hronization)0
-3407 y Ff(What)23 b(m)n(ust)g(b)r(e)g(done)f(to)h(turn)g(a)f(single-no)
-r(de)g(snapshot)f(device)i(in)n(to)f(a)h(cluster)f(snapshot)g(device?)
-35 b(Surprisingly)21 b(little.)0 3507 y(A)31 b(relativ)n(ely)e(thin)j
-(la)n(y)n(er)d(of)h(net)n(w)n(ork)g(sync)n(hronization)e(has)i(to)h(b)r
-(e)g(added,)g(and)g(some)f(fail-o)n(v)n(er)e(ho)r(oks.)46
-b(Blo)r(c)n(k)30 b(data)0 3607 y(itself)f(is)f(written)h(to)f(or)f
-(read)h(from)g(the)g(shared)g(blo)r(c)n(k)g(device)g(directly)g(b)n(y)g
-(the)g(cluster)g(snapshot)g(clien)n(t)g(in)h(the)g(same)0
-3706 y(w)n(a)n(y)i(as)h(if)g(the)h(blo)r(c)n(k)f(device)g(w)n(ere)f(lo)
-r(cal.)51 b(The)32 b(main)g(thing)h(that)f(is)g(new)h(for)e(a)h
-(cluster)g(snapshot)g(clien)n(t)g(is)g(that)h(it)0 3806
-y(m)n(ust)f(decide)f(for)g(eac)n(h)g(IO)g(request)g(if)h(global)f(sync)
-n(hronization)e(is)j(required,)f(and)h(query)e(the)i(snapshot)f(serv)n
-(er)f(o)n(v)n(er)0 3906 y(the)e(net)n(w)n(ork)e(if)i(it)g(is.)0
-4105 y(The)33 b(essen)n(tial)f(questions)g(that)h(a)f(snapshot)g(clien)
-n(t)g(m)n(ust)h(ask)f(are:)46 b(is)33 b(a)f(giv)n(en)g(logical)f(data)i
-(c)n(h)n(unk)f(shared)f(b)n(y)i(an)n(y)0 4204 y(snapshot,)h(or)e(is)h
-(it)g(unique?)53 b(Is)33 b(it)g(stored)g(on)f(the)i(origin)d(v)n
-(olume,)j(or)e(in)h(the)h(snapshot)e(store,)h(and)g(if)h(so,)f(at)g
-(what)0 4304 y(ph)n(ysical)28 b(address?)40 b(F)-7 b(or)29
-b(a)f(write,)i(either)f(to)g(the)g(origin)f(or)g(a)h(snapshot,)f(b)n(y)
-h(the)h(time)f(the)h(serv)n(er)d(replies)h(to)h(a)g(query)0
-4404 y(ab)r(out)f(a)f(giv)n(en)g(logical)g(c)n(h)n(unk)g(the)h(c)n(h)n
-(unk)g(is)g(alw)n(a)n(ys)e(unique,)i(b)r(ecause)f(the)h(serv)n(er)e
-(will)j(mak)n(e)e(it)h(so)f(if)h(necessary)-7 b(.)36
-b(T)-7 b(o)0 4503 y(mak)n(e)34 b(a)g(c)n(h)n(unk)h(unique,)h(the)g
-(serv)n(er)c(allo)r(cates)i(a)g(new)h(c)n(h)n(unk)f(in)h(snapshot)f
-(store,)i(copies)e(the)h(shared)f(data)g(there,)0 4603
-y(and)28 b(up)r(dates)f(the)i(snapshot)e(metadata.)36
-b(Ha)n(ving)27 b(ensured)g(uniqueness,)h(the)g(serv)n(er)e(replies)h
-(to)h(the)g(query)-7 b(,)27 b(the)h(clien)n(t)0 4703
-y(asso)r(ciates)i(the)j(reply)f(with)g(a)g(w)n(aiting)g(write)g
-(request,)h(and)f(\034nally)g(the)g(device-lev)n(el)f(write)h(pro)r
-(ceeds.)50 b(In)32 b(the)h(case)0 4802 y(of)27 b(a)g(snapshot)g(store)f
-(write,)i(the)f(reply)g(includes)h(the)g(ph)n(ysical)e(address)g(of)i
-(the)f(snapshot)g(store)f(c)n(h)n(unk,)h(whic)n(h)h(is)f(not)0
-4902 y(needed)h(in)g(the)f(origin)g(case)g(b)r(ecause)g(the)h(logical)e
-(and)h(ph)n(ysical)g(addresses)f(are)g(the)i(same.)0
-5101 y(With)j(this)f(simple)g(sc)n(heme,)g(w)n(e)f(see)h(that)g(it)g
-(is)g(nev)n(er)f(p)r(ossible)h(for)f(a)h(clien)n(t)f(no)r(de)h(to)g(o)n
-(v)n(erwrite)e(origin)h(data)g(that)h(is)0 5201 y(shared)d(b)n(y)g(a)g
-(snapshot,)g(or)f(a)i(shared)e(c)n(h)n(unk)h(in)h(the)g(snapshot)f
-(store.)0 5400 y(Reading)i(from)h(the)g(origin)f(is)h(ev)n(en)f
-(simpler:)41 b(no)30 b(sync)n(hronization)e(at)h(all)h(is)g(required)f
-(on)g(the)h(part)g(of)g(the)g(snapshot)p eop end
-%%Page: 6 6
-TeXDict begin 6 5 bop 0 83 a Ff(device.)53 b(This)33
-b(is)g(b)r(ecause)g(there)g(are)f(no)g(metadata)h(up)r(dates,)h(and)f
-(an)n(y)g(p)r(ossible)f(races)g(w)n(ould)h(also)f(b)r(e)h(races)f(on)g
-(a)0 183 y(ph)n(ysical)27 b(disk.)39 b(It)28 b(is)g(the)h(job)f(of)g(a)
-g(higher)g(lev)n(el)f(application)h(\(i.e.,)h(\034lesystem)f(or)f
-(database\))g(to)h(prev)n(en)n(t)g(suc)n(h)f(races,)0
-282 y(not)h(the)g(snapshot)e(device.)0 482 y(Reading)c(from)h(a)f
-(snapshot,)h(in)g(con)n(trast,)g(requires)e(the)j(most)e(in)n(tricate)g
-(sync)n(hronization)f(of)i(all.)35 b(The)23 b(problem)f(is,)i(if)f(a)0
-581 y(snapshot)g(read)g(references)f(an)i(origin)e(c)n(h)n(unk,)i
-(there)g(could)f(b)r(e)h(a)g(sim)n(ultaneous)f(origin)f(write)i(in)g
-(progress)d(to)j(the)g(same)0 681 y(c)n(h)n(unk.)36 b(If)28
-b(nothing)e(is)h(done)g(to)g(prev)n(en)n(t)f(it,)h(the)h(snapshot)e
-(data)g(could)h(b)r(e)g(o)n(v)n(erwritten)e(while)j(it)f(is)g(in)g(the)
-g(pro)r(cess)f(of)0 780 y(b)r(eing)g(read.)35 b(The)26
-b(snapshot)f(device)g(is)h(supp)r(osed)f(to)h(act)f(lik)n(e)g(a)h(ph)n
-(ysical)e(disk,)i(but)h(it)f(is)f(v)n(ery)g(unlik)n(e)g(a)h(ph)n
-(ysical)e(disk)0 880 y(for)k(data)g(to)g(sp)r(on)n(taneously)f(c)n
-(hange)g(while)i(b)r(eing)f(read.)39 b(There)28 b(is)g(no)g(help)h
-(from)f(cluster)g(\034lesystem)g(or)g(application)0 980
-y(la)n(y)n(ers)h(either,)i(b)r(ecause)f(they)h(do)f(not)h(kno)n(w)f(ab)
-r(out)h(the)g(p)r(ossible)f(in)n(teraction)g(of)g(origin)g(and)g
-(snapshot)g(and)h(cannot)0 1079 y(b)r(e)d(exp)r(ected)g(to.)0
-1279 y(The)39 b(solution)f(to)h(this)g(race)e(is)i(to)g(pro)n(vide)e
-(some)h(lo)r(c)n(king)g(b)r(et)n(w)n(een)h(origin)e(writes)i(and)f
-(snapshot)g(reads.)70 b(When)0 1378 y(a)38 b(snapshot)f(read)g
-(references)f(an)i(origin)f(c)n(h)n(unk,)j(the)e(serv)n(er)e(lo)r(c)n
-(ks)h(the)i(c)n(h)n(unk)e(in)n(ternally)g(b)r(efore)h(replying.)67
-b(The)0 1478 y(snapshot)29 b(clien)n(t)g(then)h(initiates)f(the)h
-(actual)f(read)f(and,)i(on)f(completion,)g(sends)g(a)g(message)f(to)h
-(the)h(serv)n(er)d(to)j(release)0 1577 y(the)i(read)e(lo)r(c)n(k.)48
-b(Snapshot)31 b(reads)f(from)h(the)g(origin)g(th)n(us)g(use)g(a)g
-(three)g(message)f(proto)r(col,)h(as)g(opp)r(osed)g(to)g(origin)f(or)0
-1677 y(snapshot)f(writes,)i(whic)n(h)f(use)g(t)n(w)n(o.)44
-b(This)30 b(also)f(means)g(that)i(there)f(are)f(t)n(w)n(o)g(p)r
-(ossible)h(forms)f(of)i(reply)e(to)h(a)g(snapshot)0 1777
-y(read)22 b(query)-7 b(,)23 b(dep)r(ending)h(on)e(whether)h(the)g
-(target)f(c)n(h)n(unk)h(resides)f(on)g(the)i(origin)d(or)i(in)g(the)g
-(snapshot)f(store.)34 b(The)23 b(latter)0 1876 y(\035a)n(v)n(or)i(m)n
-(ust)j(include)g(the)g(address)e(of)i(a)f(snapshot)g(store)f(c)n(h)n
-(unk,)h(but)i(do)r(es)e(not)g(require)g(an)n(y)g(\034nal)g(release)f
-(message.)0 2208 y Fg(3.1)112 b(Clien)m(t-side)36 b(Query)h(Cac)m(he)0
-2461 y Ff(An)c(in)n(teresting)e(fact)h(ab)r(out)g(the)h(shared)e(vs)h
-(unique)g(state)g(of)g(an)n(y)g(giv)n(en)f(origin)g(c)n(h)n(unk)h(is)g
-(that)g(it)h(is)f(a)g(latc)n(h:)46 b(once)0 2561 y(a)29
-b(c)n(h)n(unk)g(b)r(ecomes)g(unique,)h(it)g(will)g(sta)n(y)f(unique)g
-(un)n(til)h(a)f(new)h(snapshot)e(is)i(created,)f(at)g(whic)n(h)h(p)r
-(oin)n(t)f(all)h(c)n(h)n(unks)e(on)0 2660 y(the)g(origin)f(are)g
-(shared)g(with)h(the)g(new)g(snapshot.)37 b(This)28 b(means)g(that)g
-(an)f(origin)g(clien)n(t)h(can)g(cac)n(he)f(the)h(one-bit)f(result)0
-2760 y(of)34 b(an)n(y)f(write)g(query)-7 b(,)35 b(since)e(it)h(can)g(b)
-r(e)g(relied)f(on)h(to)f(sta)n(y)g(unc)n(hanged,)i(except)e(when)h(a)g
-(new)f(snapshot)g(is)h(created.)0 2860 y(The)27 b(serv)n(er)e
-(broadcasts)g(a)i(message)f(to)h(all)f(origin)g(clien)n(ts)h(in)h(that)
-f(case,)f(so)h(that)g(they)g(ma)n(y)g(clear)f(their)h(cac)n(hes.)35
-b(An)n(y)0 2959 y(uncac)n(hed)27 b(or)g(zero)f(bit)i(is)g(either)f
-(shared)g(or)f('don't)i(kno)n(w',)f(the)h(distinction)g(b)r(eing)f
-(immaterial.)0 3159 y(Snapshot)f(clien)n(ts)g(ma)n(y)g(also)g(cac)n(he)
-f(query)h(results,)g(the)h(cac)n(hed)f(v)-5 b(alue)26
-b(b)r(eing)h(a)f(snapshot)g(store)f(address)h(rather)f(than)0
-3258 y(a)35 b(bit.)59 b(This)35 b(cac)n(he)g(nev)n(er)f(needs)g(to)h(b)
-r(e)h(cleared)e(explicitly)-7 b(,)37 b(although)d(it)i(migh)n(t)f(b)r
-(e)g(partially)f(or)g(fully)i(cleared)e(in)0 3358 y(resp)r(onse)26
-b(to)i(memory)f(pressure.)0 3732 y Fh(4)131 b(The)44
-b(A)l(CID)h(T)-11 b(est)0 4013 y Ff(The)32 b(snapshot)e(store)h
-(metadata)f(is)i(a)f(simple)g(database,)h(and)f(as)g(suc)n(h,)h(should)
-f(satisfy)g(the)h(A)n(CID)g(test:)45 b(atomicit)n(y)-7
-b(,)0 4113 y(consistency)g(,)32 b(isolation)f(and)h(durabilit)n(y)-7
-b(.)50 b(W)-7 b(e)32 b(consider)f(eac)n(h)g(of)h(these)g(requiremen)n
-(ts)f(brie\035y)g(here,)i(to)f(see)f(ho)n(w)h(the)0 4213
-y(new)c(design)f(stac)n(ks)f(up.)0 4528 y Fd(A)m(tomicit)m(y)0
-4781 y Ff(A)34 b(write)g(query)f(that)i(c)n(hanges)d(the)i(state)g(of)g
-(the)g(on)g(disk)g(metadata)f(is)h(alw)n(a)n(ys)e(handled)i(en)n
-(tirely)f(within)i(a)f(single)0 4881 y(journal)27 b(transaction,)f
-(whic)n(h)h(commits)h(the)g(c)n(hange)e(to)i(disk)f(atomically)-7
-b(.)p eop end
-%%Page: 7 7
-TeXDict begin 7 6 bop 0 83 a Fd(Consistency)0 336 y Ff(Just)36
-b(as)f(with)i(a)f(ph)n(ysical)f(disk,)j(it)e(is)g(not)g(the)h(job)f(of)
-g(the)g(snapshot)f(device)h(to)g(mak)n(e)f(an)n(y)h(guaran)n(tee)e(ab)r
-(out)i(the)0 436 y(in)n(ternal)31 b(consistency)f(of)h(data)g(on)g(the)
-h(device,)g(only)e(that)i(what)f(w)n(as)f(written)i(is)f(what)g(will)h
-(b)r(e)f(read)g(bac)n(k,)g(for)g(an)n(y)0 535 y(and)24
-b(all)g(sim)n(ultaneously)g(accessed)f(snapshots)g(and)i(the)f(origin)g
-(device.)35 b(In)25 b(other)f(w)n(ords,)f(its)i(only)f(resp)r
-(onsibilit)n(y)g(here)0 635 y(is)j(to)h(k)n(eep)f(the)h(metadata)f
-(straigh)n(t.)0 950 y Fd(Isolation)0 1203 y Ff(The)j(protot)n(yp)r(e)g
-(implemen)n(tation)g(ac)n(hiev)n(es)f(transaction)g(isolation)g(b)n(y)h
-(the)g(simple)h(exp)r(edien)n(t)f(of)h(a)f(single-threaded)0
-1303 y(serv)n(er)25 b(that)i(handles)f(incoming)h(queries)e(one)i(at)f
-(a)h(time.)37 b(This)27 b(will)g(ev)n(en)n(tually)e(sho)n(w)h(up)h(as)f
-(a)h(b)r(ottlenec)n(k)f(and)h(more)0 1402 y(elab)r(orate)f(sync)n
-(hronization)g(will)i(b)r(e)g(needed.)0 1718 y Fd(Durabilit)m(y)0
-1971 y Ff(Thanks)35 b(to)g(the)h(journal,)h(the)f(en)n(tire)f(state)h
-(of)f(the)h(metadata)f(serv)n(er)f(\(with)i(on)g(exception,)h(see)e(b)r
-(elo)n(w\))h(is)f(alw)n(a)n(ys)0 2070 y(completely)24
-b(recorded)f(on)h(disk)g(at)g(the)h(time)g(an)n(y)e(write)h(is)g(ac)n
-(kno)n(wledged.)34 b(Th)n(us,)25 b(if)f(the)h(metadata)f(serv)n(er)e
-(should)i(fail)0 2170 y(a)j(new)h(one)f(can)g(b)r(e)h(started,)f(read)g
-(the)h(metadata)f(ro)r(ot)f(and)i(con)n(tin)n(ue)f(as)g(if)h(nothing)f
-(had)h(happ)r(ened.)0 2369 y(The)h(one)g(exception)f(to)h(this)g(is)g
-(that)h(lo)r(c)n(king)e(state)g(of)h(snapshot)f(read)h(requests)f
-(against)f(origin)h(writes)h(is)g(k)n(ept)g(only)0 2469
-y(in)f(memory)e(on)h(the)h(serv)n(er.)34 b(While)28 b(it)g(is)f(enough)
-g(to)g(simply)g(require)f(all)h(outstanding)g(reads)f(on)h(clien)n(ts)g
-(to)g(complete)0 2568 y(b)r(efore)35 b(a)f(newly)h(started)g(metadata)f
-(serv)n(er)f(can)i(resume)g(pro)r(cessing)e(requests,)j(there)f(could)g
-(b)r(e)g(cases)f(where)h(this)0 2668 y(w)n(ould)25 b(cause)g(an)h
-(unnecessary)e(dela)n(y)h(of)h(sev)n(eral)e(seconds)g(on)i(serv)n(er)e
-(restart)g(where)i(there)f(is)h(a)f(hea)n(vy)g(bac)n(klog)f(of)h(IO.)0
-2768 y(Since)h(it)h(is)f(easy)-7 b(,)25 b(clien)n(ts)h(will)h(b)r(e)f
-(ask)n(ed)f(to)h(upload)g(an)n(y)f(outstanding)h(lo)r(c)n(k)n(ed)f
-(snapshot)g(reads)g(to)h(the)h(new)f(metadata)0 2867
-y(serv)n(er)f(b)r(efore)h(the)h(serv)n(er)e(resumes)h(pro)r(cessing)f
-(requests.)35 b(This)27 b(should)f(only)h(tak)n(e)e(a)i(few)g(tens)f
-(of)h(milliseconds.)36 b(The)0 2967 y(total)c(latency)g(of)h(starting)f
-(a)g(new)g(metadata)g(serv)n(er)f(then)i(should)f(b)r(e)h(measured)f
-(in)h(tens)g(of)f(milliseconds)g(\(though)0 3066 y(detecting)c(that)g
-(a)f(serv)n(er)e(has)i(failed)h(could)f(easily)g(tak)n(e)g(m)n(uc)n(h)g
-(longer\).)0 3441 y Fh(5)131 b(Serv)l(er)45 b(Implemen)l(tation)f
-(Details)0 3739 y Fg(5.1)112 b(Exception)36 b(BT)-9 b(ree)37
-b(F)-9 b(ormat)0 3991 y Ff(Exceptions)28 b(for)f(all)g(snapshots)g(are)
-g(stored)g(in)h(a)f(single)g(btree)h(indexed)f(b)n(y)h(logical)e(c)n(h)
-n(unk)h(address.)37 b(F)-7 b(or)27 b(eac)n(h)g(c)n(h)n(unk,)0
-4091 y(a)g(list)h(of)f(exceptions)g(is)g(stored.)36 b(Eac)n(h)27
-b(exception)g(consists)f(of)i(a)f(snapshot)f(address)g(and)h(a)g
-(bitmap)h(sp)r(ecifying)f(whic)n(h)0 4191 y(snapshots)g(share)f(that)i
-(exception.)0 4390 y(The)h(btree)g(is)g(mean)n(t)g(to)f(b)r(e)i(op)r
-(erated)e(on)h(directly)g(b)n(y)f(the)i(snapshot)e(serv)n(er,)f(as)i
-(opp)r(osed)f(to)h(b)r(eing)g(translated)f(in)n(to)0
-4490 y(some)j(more)g(e\036cien)n(t)g(cac)n(he)g(format.)48
-b(T)-7 b(o)32 b(supp)r(ort)f(alignmen)n(t-restricted)f(arc)n
-(hitectures,)h(all)g(\034elds)h(in)g(btree)f(blo)r(c)n(ks)0
-4589 y(are)c(aligned)f(according)g(to)i(their)f(size.)1288
-4559 y Fc(1)0 4788 y Ff(An)22 b(attempt)f(has)g(b)r(een)g(made)g(to)g
-(k)n(eep)g(the)g(btree)g(compact)f(b)n(y)h(designing)f(the)i(no)r(de)f
-(formats)f(carefully)-7 b(,)22 b(without)f(going)0 4888
-y(to)k(extremes)f(suc)n(h)g(as)g(using)g(a)h(serial)e(compressed)h
-(enco)r(ding)g(whic)n(h)g(is)h(unpac)n(k)n(ed)f(in)n(to)g(a)h(memory)e
-(structure)i(in)f(order)0 4988 y(to)f(b)r(e)h(accessed.)35
-b(In)23 b(other)g(w)n(ords,)g(di\036cult)i(tradeo\033s)d(ha)n(v)n(e)h
-(b)r(een)h(made)f(here)g(b)r(et)n(w)n(een)g(compactness,)h(simplicit)n
-(y)f(and)0 5087 y(e\036ciency)-7 b(.)p 0 5157 1548 4
-v 92 5210 a Fb(1)127 5234 y Fa(This)30 b(p)r(osturing)i(ma)n(y)e(pro)n
-(v)n(e)i(unnecessary)h(if)d(the)i(compiler's)e(abilit)n(y)g(to)h
-(generate)j(alignmen)n(t-indep)r(enden)n(t)e(co)r(de)g(for)f(alignmen)n
-(t-)0 5312 y(restricted)26 b(arc)n(hitectures)h(pro)n(v)n(es)e
-(reliable.)p eop end
-%%Page: 8 8
-TeXDict begin 8 7 bop 0 83 a Fd(Leaf)33 b(no)s(des)0
-336 y Ff(Leaf)i(blo)r(c)n(k)g(format)g(is)g(optimized)h(for)f(rapid)f
-(lo)r(okup)h(and)h(e\036cien)n(t)f(insertion.)60 b(A)n(t)35
-b(the)h(b)r(ottom)g(of)f(eac)n(h)g(leaf)g(is)g(a)0 436
-y(header)25 b(and)i(a)e(directory)g(map)i(that)f(gro)n(ws)e(up)j(to)n
-(w)n(ards)d(a)i(table)g(of)h(exceptions,)f(whic)n(h)g(gro)n(ws)e(do)n
-(wn.)36 b(Eac)n(h)26 b(en)n(try)g(in)0 535 y(the)g(directory)f(map)h
-(giv)n(es)e(the)i(logical)f(c)n(h)n(unk)g(address)g(relativ)n(e)g(to)g
-(a)h(base)f(address)g(stored)g(in)h(the)g(header,)f(and)h(has)f(a)0
-635 y(p)r(oin)n(ter)i(to)h(one)f(of)g(the)h(exceptions)f(in)h(the)g
-(table)f(at)h(the)g(top)f(of)h(the)g(blo)r(c)n(k.)36
-b(The)28 b(en)n(tries)f(are)f(stored)h(in)h(sorted)e(order)0
-734 y(according)g(to)h(logical)f(c)n(h)n(unk)i(address)e(and)h(the)h(p)
-r(oin)n(ters)f(increase)f(monotonically)-7 b(.)0 934
-y(Using)28 b(relativ)n(e)f(addresses)f(allo)n(ws)h(the)h(map)g(en)n
-(tries)g(to)g(b)r(e)g(more)g(compact.)38 b(In)28 b(the)g(curren)n(t)g
-(protot)n(yp)r(e)f(map)h(en)n(tries)0 1033 y(consist)i(of)h(t)n(w)n(o)e
-(32)h(bit)h(n)n(um)n(b)r(ers,)g(ho)n(w)n(ev)n(er)e(t)n(w)n(o)g(16)h
-(bit)h(n)n(um)n(b)r(ers)f(migh)n(t)h(w)n(ork)e(just)i(as)f(w)n(ell)h
-(and)f(sa)n(v)n(e)f(more)h(space,)0 1133 y(although)e(a)f(16)h(bit)h
-(relativ)n(e)e(blo)r(c)n(k)h(n)n(um)n(b)r(er)g(migh)n(t)g(b)r(e)h(so)e
-(small)h(as)g(to)g(cause)g(a)f(noticeable)h(increase)f(in)i(the)f(n)n
-(um)n(b)r(er)0 1233 y(of)i(leaf)f(blo)r(c)n(ks)g(if)h(exceptions.are)e
-(distributed)i(sparsely)-7 b(.)41 b(With)31 b(32)e(bit)h(map)f(n)n(um)n
-(b)r(ers,)h(a)f(single)g(exception)g(requires)0 1332
-y(24)f(b)n(ytes;)h(with)h(16)e(bit)i(map)e(n)n(um)n(b)r(ers)h(that)g(w)
-n(ould)g(fall)g(to)f(20)h(b)n(ytes,)g(a)f(16\045)g(sa)n(vings.)40
-b(The)29 b(\034nal)g(determination)f(of)0 1432 y(whic)n(h)f(is)h(b)r
-(est)g(should)f(probably)g(b)r(e)h(determined)f(exp)r(erimen)n(tally)-7
-b(.)0 1631 y(The)32 b(di\033erence)g(b)r(et)n(w)n(een)g(eac)n(h)f(t)n
-(w)n(o)h(p)r(oin)n(ters)f(in)i(the)f(map)g(giv)n(es)f(the)i(n)n(um)n(b)
-r(er)e(of)h(exceptions)g(for)g(the)g(c)n(h)n(unk.)50
-b(The)0 1731 y(last)28 b(en)n(try)g(in)h(the)g(map)f(is)h(a)f(sen)n
-(tinel)h(and)f(p)r(oin)n(ts)g(at)h(the)g(top)f(of)h(the)g(blo)r(c)n(k)f
-(\(this)h(could)f(b)r(e)h(designed)f(out)h(to)g(sa)n(v)n(e)d(a)0
-1830 y(few)h(b)n(ytes\).)37 b(Eac)n(h)27 b(en)n(try)f(in)h(the)h
-(exception)e(table)h(has)g(the)g(64)f(bit)h(sector)f(address)g(of)h(an)
-g(exception)f(in)i(the)f(snapshot)0 1930 y(store)g(and)g(a)g(bitmap)h
-(to)f(indicate)h(whic)n(h)g(snapshots)e(share)g(the)i(exception.)0
-2129 y(The)k(basic)f(op)r(erations)g(to)h(lo)r(cate)f(and)h(determine)g
-(sharing)f(of)h(exceptions)f(are)g(e\036cien)n(t.)50
-b(A)33 b(binary)e(searc)n(h)f(is)i(used)0 2229 y(to)i(lo)r(cate)f(the)h
-(target)e(c)n(h)n(unk)i(address)e(in)i(the)g(map,)h(if)f(it)g(is)g
-(presen)n(t.)54 b(This)34 b(yields)f(a)g(list)h(of)g(exceptions)f(on)g
-(whic)n(h)0 2328 y(e\036cien)n(t)28 b(bit)n(wise)f(op)r(erations)g(can)
-g(b)r(e)h(p)r(erformed)g(to)f(determine)h(sharing.)36
-b(F)-7 b(rom)28 b(the)g(p)r(oin)n(t)g(of)f(view)h(of)g(the)g(origin,)e
-(a)0 2428 y(logical)i(c)n(h)n(unk)g(is)i(shared)e(unless)h(all)f(activ)
-n(e)h(snapshots)f(ha)n(v)n(e)g(exceptions)g(for)h(that)g(c)n(h)n(unk.)
-42 b(F)-7 b(rom)28 b(the)i(p)r(oin)n(t)f(of)g(view)0
-2528 y(of)g(a)g(snapshot,)h(a)f(logical)f(c)n(h)n(unk)h(is)g(shared)g
-(if)h(it)g(has)e(no)i(exception)f(\(i.e.,)h(is)g(shared)e(with)i(the)g
-(origin\))e(or)h(it)h(has)f(the)0 2627 y(same)e(snapshot)g(store)f
-(address)h(as)f(another)h(snapshot.)0 2827 y(A)36 b(sligh)n(t)f(dra)n
-(wbac)n(k)f(of)i(this)g(leaf)g(format)f(is)g(that)h(insertion)g
-(requires)e(memory)h(mo)n(v)n(es)f(in)i(order)f(to)g(main)n(tain)h(the)
-0 2926 y(en)n(tries)26 b(in)g(sorted)g(order,)f(and)i(the)f(memory)g
-(mo)n(v)n(es)f(get)h(longer)f(as)h(the)h(leaf)f(blo)r(c)n(k)g(\034lls)g
-(up.)37 b(F)-7 b(or)26 b(relativ)n(ely)f(small)h(leaf)0
-3026 y(blo)r(c)n(ks,)j(i.e.)41 b(4K,)28 b(it)i(is)e(probably)g(not)h(a)
-g(problem.)40 b(This)29 b(will)h(b)r(e)f(determined)g(exp)r(erimen)n
-(tally)-7 b(.)41 b(Other,)29 b(equiv)-5 b(alen)n(tly)0
-3125 y(e\036cien)n(t)28 b(leaf)f(formats)g(are)f(certainly)h(p)r
-(ossible,)g(though)h(p)r(erhaps)f(they)g(will)h(not)g(b)r(e)g(as)f
-(simple.)0 3325 y(A)g(more)g(serious)f(dra)n(wbac)n(k)f(of)i(this)g
-(leaf)g(format)g(is)g(that)g(as)g(the)g(n)n(um)n(b)r(er)g(of)g
-(snapshots)f(increases,)g(up)r(date)h(o)n(v)n(erhead)0
-3424 y(of)f(the)h(btree)f(increases)f(more)g(or)h(less)f(linearly)-7
-b(,)26 b(alb)r(eit)h(with)g(a)e(gen)n(tle)h(slop)r(e.)36
-b(Nonetheless,)27 b(it)f(migh)n(t)h(pro)n(v)n(e)d(desirable)0
-3524 y(to)j(adopt)h(a)f(v)-5 b(arian)n(t)26 b(leaf)i(format)f(at)g
-(some)g(p)r(oin)n(t)h(capable)f(of)g(enco)r(ding)g(runs)g(of)h(adjacen)
-n(t)f(exceptions)g(e\036cien)n(tly)-7 b(.)0 3839 y Fd(Index)32
-b(no)s(des)0 4092 y Ff(An)e(index)g(no)r(de)f(con)n(tains)g(a)g(table)g
-(of)h(en)n(tries)e(eac)n(h)h(of)h(whic)n(h)f(consists)g(of)g(a)g(64)g
-(bit)h(logical)e(c)n(h)n(unk)h(address)f(k)n(ey)h(and)0
-4192 y(a)h(64)g(bit)h(sector)f(address)f(of)i(a)f(lo)n(w)n(er)f(lev)n
-(el)h(index)h(no)r(de)g(or,)f(at)h(the)g(lo)n(w)n(est)e(index)i(lev)n
-(el,)g(a)f(leaf.)47 b(The)30 b(en)n(tries)g(are)g(in)0
-4291 y(sorted)j(order)g(b)n(y)h(logical)f(c)n(h)n(unk)h(address.)56
-b(T)-7 b(w)n(o)33 b(successiv)n(e)g(k)n(eys)g(b)r(ound)i(the)g(range)d
-(of)j(en)n(tries)e(con)n(tained)h(b)n(y)g(the)0 4391
-y(lo)n(w)n(er)26 b(lev)n(el)h(no)r(de.)0 4590 y(T)-7
-b(o)38 b(lo)r(cate)f(the)i(leaf)f(blo)r(c)n(k)f(in)h(whic)n(h)g
-(exceptions,)i(if)f(an)n(y)-7 b(,)40 b(are)d(stored)g(for)h(a)f(giv)n
-(en)g(logical)g(address,)j(w)n(e)d(descend)0 4690 y(recursiv)n(ely)22
-b(from)h(the)h(ro)r(ot,)g(doing)f(a)g(binary)g(searc)n(h)f(on)h(the)h
-(address)f(k)n(ey)g(in)g(eac)n(h)g(blo)r(c)n(k)g(and)h(descending)f
-(recursiv)n(ely)0 4790 y(in)n(to)k(the)h(no)r(de)g(referenced)f(b)n(y)g
-(the)h(sector)e(address)h(lying)g(b)r(et)n(w)n(een)g(the)h(t)n(w)n(o)f
-(k)n(eys)g(that)g(b)r(ound)h(the)g(target)f(k)n(ey)-7
-b(.)0 4989 y(W)g(e)22 b(searc)n(h)e(all)i(the)g(w)n(a)n(y)e(to)i(a)f
-(leaf)h(no)r(de)f(ev)n(en)h(if)g(w)n(e)f(are)g(examining)g(a)g(region)g
-(of)g(the)h(address)f(space)g(that)h(is)f(completely)0
-5088 y(empt)n(y)-7 b(.)47 b(F)-7 b(or)30 b(write)g(requests)g(this)h
-(is)g(not)g(ine\036cien)n(t)g(b)r(ecause)f(w)n(e)h(will)g(immediately)g
-(add)f(an)h(exception)f(to)h(the)g(leaf)0 5188 y(no)r(de)23
-b(w)n(e)f(found)h(if)g(one)f(is)h(not)g(presen)n(t.)34
-b(F)-7 b(or)22 b(read)g(requests)g(it's)g(a)h(little)g(more)f(w)n(ork)f
-(than)i(necessary)e(but)i(w)n(e)f(probably)0 5288 y(do)27
-b(not)g(care)f(since)h(this)h(only)f(a\033ects)g(snapshot)f(reads,)h
-(and)g(only)g(b)n(y)f(a)h(small)g(amoun)n(t)g(\(origin)f(reads)g(do)h
-(not)h(in)n(v)n(olv)n(e)0 5387 y(the)g(serv)n(er\).)p
-eop end
-%%Page: 9 9
-TeXDict begin 9 8 bop 0 83 a Fg(5.2)112 b(Journal)0 336
-y Ff(An)n(y)23 b(altered)f(metadata)h(blo)r(c)n(k,)g(i.e,)h(btree)f
-(leaf)g(and)g(index)g(no)r(des,)h(allo)r(cation)e(bitmaps,)i(etc,)g
-(are)e(written)h(to)g(a)g(journal)0 436 y(b)r(efore)j(b)r(eing)g
-(written)g(to)g(their)g(\034nal)g(destinations.)36 b(This)26
-b(guaran)n(tees)e(that)i(the)h(metadata)e(can)h(b)r(e)g(restored)f
-(reliably)0 535 y(to)i(the)h(state)g(of)f(the)h(most)g(recen)n(tly)e
-(committed)i(exception)g(or)e(other)h(metadata)g(c)n(hange.)0
-734 y(The)e(size)g(and)g(lo)r(cation)f(of)h(the)g(journal)g(are)f
-(determined)h(at)g(the)g(time)h(the)f(snapshot)f(store)g(is)h(created)f
-(and)h(cannot)g(b)r(e)0 834 y(c)n(hanged.)0 1033 y(Eac)n(h)33
-b(journal)g(transaction)f(consists)g(of)i(an)f(arbitrary)e(n)n(um)n(b)r
-(er)i(of)g(data)g(blo)r(c)n(ks)g(follo)n(w)n(ed)f(b)n(y)h(a)g(journal)g
-(tag)g(blo)r(c)n(k.)0 1133 y(The)27 b(tag)g(blo)r(c)n(k)f(carries)f(a)i
-(magic)f(n)n(um)n(b)r(er)h(allo)n(wing)f(it)h(to)g(b)r(e)h(iden)n
-(ti\034ed)f(as)g(suc)n(h)f(for)h(the)g(purp)r(ose)g(of)g(journal)f
-(repla)n(y)-7 b(,)0 1233 y(and)28 b(a)g(sequence)f(n)n(um)n(b)r(er)h
-(used)g(to)g(lo)r(cate)g(the)h(starting)e(p)r(oin)n(t)h(for)g(journal)f
-(repla)n(y)-7 b(.)38 b(An)n(y)28 b(data)g(blo)r(c)n(k)f(written)i(to)f
-(the)0 1332 y(journal)g(that)h(happ)r(ens)g(to)f(ha)n(v)n(e)g(the)h
-(same)f(n)n(um)n(b)r(er)g(at)h(the)g(same)f(lo)r(cation)g(m)n(ust)g(b)r
-(e)i(escap)r(ed)e(b)n(y)g(writing)g(a)h(zero)e(to)0 1432
-y(that)e(lo)r(cation)g(in)g(a)g(cop)n(y)f(of)h(the)g(data.)36
-b(The)25 b(tag)g(blo)r(c)n(k)f(carries)f(a)i(list)g(of)g(snapshot)g
-(store)f(sector)g(addresses)f(whic)n(h)i(are)0 1531 y(the)e(\034nal)f
-(destinations)f(of)i(the)f(data)g(blo)r(c)n(ks.)34 b(The)22
-b(lo)n(w)g(bit)h(of)f(the)g(address)f(carries)g(a)g(bit)i(\035ag)f
-(indicating)g(that)g(the)h(data)0 1631 y(blo)r(c)n(k)j(w)n(as)f(escap)r
-(ed)h(and)g(the)g(magic)g(n)n(um)n(b)r(er)g(needs)g(to)g(b)r(e)h
-(restored)d(b)r(efore)i(the)h(data)f(blo)r(c)n(k)f(is)h(\034nally)g
-(written.)37 b(The)0 1731 y(tag)26 b(blo)r(c)n(k)g(carries)f(other)h
-(miscellaneous)g(information)g(suc)n(h)g(as)g(partial)g(usage)f(status)
-i(of)f(a)h(c)n(h)n(unk)f(recen)n(tly)g(allo)r(cated)0
-1830 y(for)h(metadata.)0 2162 y Fg(5.3)112 b(Allo)s(cation)35
-b(Bitmaps)0 2415 y Ff(F)-7 b(ree)36 b(space)f(in)i(the)f(snapshot)g
-(store)f(is)h(managed)f(via)h(bitmaps)g(with)h(a)e(resolution)g(of)h
-(one)g(bit)h(p)r(er)f(c)n(h)n(unk.)62 b(Eac)n(h)0 2515
-y(bitmap)33 b(is)g(one)f(4K)g(blo)r(c)n(k)g(in)h(size)g(and)f(maps)h
-(2**15)d(c)n(h)n(unks.)52 b(The)32 b(bitmap)h(blo)r(c)n(ks)f(are)g
-(indexed)h(via)f(a)h(radix)e(tree)0 2614 y(ro)r(oted)c(in)i(the)f
-(header.)38 b(Eac)n(h)28 b(radix)f(tree)h(no)r(de)g(con)n(tains)f(512)g
-(8-b)n(yte)g(sector)g(addresses.)37 b(As)28 b(a)g(sligh)n(t)f
-(simpli\034cation)0 2714 y(this)i(tree)g(is)g(alw)n(a)n(ys)e(3)i(lev)n
-(els)f(deep,)h(giving)f(2^27)g(*)g(2^15)g(=)g(4)h(trillion)g(c)n(h)n
-(unks,)f(or)h(16)f(p)r(etab)n(ytes)g(v)n(olume)h(size)f(limit)0
-2814 y(with)d(a)g(minimal)g(4K)f(c)n(h)n(unk)g(size.)35
-b(It)26 b(is)e(alw)n(a)n(ys)f(fully)i(p)r(opulated,)h(i.e.,)f(the)h
-(tree)e(is)h(created)f(at)g(the)h(time)h(the)f(snapshot)0
-2913 y(store)i(is)g(created)g(and)g(c)n(hanged)f(only)h(if)i(the)e
-(snapshot)g(store)g(is)g(expanded.)37 b(The)27 b(second)g(lo)n(w)n(est)
-f(lev)n(el)h(of)h(the)g(bitmap)0 3013 y(index)h(tree)f(is)h(loaded)f
-(in)n(to)g(memory)g(when)h(the)g(v)n(olume)g(is)f(activ)-5
-b(ated,)29 b(this)g(will)g(b)r(e)g(ab)r(out)g(512)e(KB)i(p)r(er)f
-(terab)n(yte)g(of)0 3113 y(snapshot)f(store.)0 3312 y(Bitmaps)35
-b(are)f(cac)n(hed)h(in)h(bu\033ers)f(and)g(accessed)f(via)h(getblk.)60
-b(A)35 b(p)r(oin)n(ter)g(is)h(k)n(ept)f(to)g(the)h(most)f(recen)n(tly)f
-(accessed)0 3411 y(bitmap,)c(i.e.,)g(it)g(is)f(not)h(released)e(un)n
-(til)i(a)f(di\033eren)n(t)g(bitmap)h(is)f(accessed,)g(whic)n(h)g
-(eliminates)g(the)h(ma)5 b(jorit)n(y)28 b(of)h(getblk)0
-3511 y(lo)r(okups)i(assuming)g(reasonably)f(go)r(o)r(d)h(lo)r(calit)n
-(y)g(of)h(allo)r(cation.)49 b(Lik)n(ewise,)32 b(a)f(p)r(oin)n(ter)h(is)
-f(k)n(ept)h(to)g(the)g(most)g(recen)n(tly)0 3611 y(accessed)h(index)h
-(blo)r(c)n(k.)55 b(Since)34 b(nearly)f(all)h(accesses)e(to)i(bitmaps)g
-(are)f(asso)r(ciated)g(with)h(c)n(hanging)e(the)j(bitmap,)h(the)0
-3710 y(bitmaps)f(are)e(k)n(ept)i(near)f(the)h(journal)f(rather)f(than)i
-(b)r(eing)g(distributed)g(throughout)f(the)h(snapshot)e(store.)58
-b(This)34 b(is)0 3810 y(purely)27 b(a)f(matter)h(of)g(allo)r(cation)e
-(p)r(olicy)i(since)g(the)g(actual)g(lo)r(cations)f(of)g(bitmaps)h(are)f
-(determined)h(b)n(y)g(the)g(radix)f(tree.)0 4009 y(Since)20
-b(metadata)e(is)i(allo)r(cated)f(in)g(blo)r(c)n(ks)g(but)h(allo)r
-(cation)e(gran)n(ularit)n(y)f(is)j(c)n(h)n(unks,)g(some)f(c)n(h)n(unks)
-g(allo)r(cated)f(to)i(metadata)0 4109 y(ma)n(y)32 b(b)r(e)h(only)f
-(partially)f(full.)52 b(T)-7 b(o)32 b(a)n(v)n(oid)f(leak)-5
-b(age)31 b(of)i(this)f(unallo)r(cated)g(space)g(on)g(unexp)r(ected)h
-(restart,)f(an)n(y)g(partial)0 4208 y(allo)r(cations)d(are)g(recorded)g
-(in)i(the)f(journal)g(tag)f(blo)r(c)n(k.)45 b(As)30 b(a)g(side)g
-(e\033ect,)i(this)e(means)g(that)h(a)f(few)g(metadata)g(blo)r(c)n(ks)0
-4308 y(can)d(b)r(e)h(allo)r(cated)f(b)r(efore)g(a)g(bitmap)h(needs)g
-(to)f(b)r(e)h(mo)r(di\034ed,)g(sa)n(ving)e(some)h(journal)g(bandwidth.)
-0 4640 y Fg(5.4)112 b(Allo)s(cation)35 b(P)m(olicy)0
-4893 y Ff(The)c(protot)n(yp)r(e)e(implemen)n(tation)h(uses)h(a)f
-(simple,)h(wraparound)d(allo)r(cation)i(sc)n(heme)g(for)g(b)r(oth)g
-(snapshot)g(store)g(data)0 4993 y(and)k(metadata.)55
-b(If)35 b(snapshots)e(are)g(held)h(o)n(v)n(er)e(extended)j(p)r(erio)r
-(ds,)g(the)f(snapshot)g(store)f(will)h(b)r(ecome)g(fragmen)n(ted,)0
-5092 y(reducing)26 b(the)g(e\036ciency)h(of)f(IO)g(transfers.)35
-b(While)27 b(this)g(ma)n(y)e(not)i(b)r(e)g(a)f(serious)f(problem)g(for)
-h(bac)n(kup)g(applications,)g(it)0 5192 y(certainly)i(will)h(b)r(e)g(a)
-f(problem)g(for)g(a)g(user)g(that)h(wishes)g(to)f(use)h(a)f(snapshot)g
-(in)h(place)f(of)g(the)h(origin)f(device.)40 b(T)-7 b(o)28
-b(cater)0 5291 y(to)34 b(this)g(t)n(yp)r(e)g(of)f(usage,)i(a)e(more)g
-(sophisticated)g(allo)r(cator)f(will)i(b)r(e)g(needed.)56
-b(T)-7 b(o)33 b(facilitate)h(e\036cien)n(t)g(data)f(transfer,)0
-5391 y(exception)26 b(store)g(c)n(h)n(unks)g(that)i(are)d(logically)h
-(close)g(together)g(should)g(b)r(e)i(stored)e(ph)n(ysically)f(close)i
-(together.)35 b(Because)p eop end
-%%Page: 10 10
-TeXDict begin 10 9 bop 0 83 a Ff(of)34 b(c)n(h)n(unk)g(sharing,)h(this)
-g(can)f(not)h(b)r(e)g(done)f(p)r(erfectly)g(in)h(general;)i(rather,)e
-(an)f(impro)n(v)n(ed)f(a)n(v)n(erage)f(case)h(should)i(b)r(e)0
-183 y(sough)n(t.)53 b(A)n(t)33 b(a)g(higher)g(lev)n(el,)h(man)n(y)f
-(\034lesystems)f(implicitly)i(rely)f(on)g(details)g(of)g(la)n(y)n(out)f
-(linearit)n(y)-7 b(,)34 b(for)e(p)r(erformance)0 282
-y(reasons.)40 b(Muc)n(h)29 b(e\033ort)g(is)h(required)e(b)n(y)h(the)g
-(allo)r(cator)f(to)h(a)n(v)n(oid)f(violating)g(suc)n(h)h(assumptions)g
-(to)r(o)f(sev)n(erely)-7 b(.)41 b(This)29 b(is)0 382
-y(w)n(ork)d(for)h(the)h(future.)0 714 y Fg(5.5)112 b(Lo)s(c)m(king)0
-967 y Ff(Sync)n(hronization)28 b(via)h(lo)r(c)n(king)g(is)g(only)h
-(required)e(b)r(et)n(w)n(een)i(snapshot)f(reads)f(and)i(origin)e
-(writes.)43 b(This)29 b(lo)r(c)n(king)g(tak)n(es)0 1066
-y(place)c(en)n(tirely)g(within)h(the)g(serv)n(er)e(so)g(no)i(cluster)f
-(lo)r(c)n(k)g(manager)e(is)j(in)n(v)n(olv)n(ed.)34 b(\(In)26
-b(fact)g(the)g(serv)n(er)d(is)j(a)f(lo)r(c)n(k)g(manager)0
-1166 y(for)h(the)h(limited)h(case)d(of)i(snapshot)f(reads.\))36
-b(The)26 b(lo)r(c)n(ks)g(are)g(simple,)h(hashed)f(lo)r(c)n(ks.)36
-b(The)26 b(cost)h(of)f(this)h(lo)r(c)n(king)f(will)h(b)r(e)0
-1266 y(one)22 b(hash)h(lo)r(okup)f(p)r(er)h(snapshot)f(read)g(or)g
-(origin)g(write)g(of)h(a)g(shared)f(c)n(h)n(unk,)h(plus)g(the)g(unlo)r
-(c)n(k)g(messages.)33 b(This)23 b(lo)r(c)n(king)0 1365
-y(is)28 b(only)f(required)g(when)h(snapshot)f(and)g(origin)g(virtual)g
-(devices)g(are)g(activ)n(e)g(at)g(the)h(same)g(time;)g(e.g.,)f(the)i
-(serv)n(er)c(do)r(es)0 1465 y(not)i(ha)n(v)n(e)f(to)h(tak)n(e)g(an)n(y)
-f(lo)r(c)n(ks)h(to)g(service)f(origin)g(write)h(requests)f(if)i(no)f
-(snapshot)f(device)h(is)g(activ)n(e,)g(ev)n(en)f(if)i(snapshots)0
-1565 y(are)f(b)r(eing)g(held.)0 1897 y Fg(5.6)112 b(Snapshot)39
-b(Deletion)0 2149 y Ff(Because)g(it)h(pac)n(ks)f(together)g
-(information)h(for)f(m)n(ultiple)i(snapshots)e(in)h(eac)n(h)f(leaf)h
-(no)r(de,)j(the)e(exception)e(btree)h(is)0 2249 y(optimized)30
-b(for)f(lo)r(okup)g(and)h(exception)f(insertion)g(as)g(it)h(should)f(b)
-r(e.)44 b(Ho)n(w)n(ev)n(er,)28 b(snapshot)h(deletion)h(is)f(not)h(as)f
-(simple)0 2349 y(an)e(op)r(eration)g(as)g(it)h(w)n(ould)f(b)r(e)h(if)g
-(eac)n(h)f(snapshot)g(had)g(its)h(o)n(wn)f(tree.)36 b(\(But)28
-b(if)h(eac)n(h)d(snapshot)h(had)h(its)f(o)n(wn)g(tree)h(then)0
-2448 y(exception)g(creation)f(time)h(w)n(ould)g(increase)f(with)h(the)h
-(n)n(um)n(b)r(er)f(of)g(snapshots,)f(m)n(uc)n(h)h(more)f(space)g(w)n
-(ould)h(b)r(e)h(used)f(for)0 2548 y(m)n(ultiple)33 b(snapshots)f(and)g
-(k)n(eeping)g(trac)n(k)f(of)i(exception)f(sharing)g(w)n(ould)g(b)r(e)h
-(less)f(e\036cien)n(t.\))52 b(In)33 b(general,)g(deleting)f(a)0
-2648 y(snapshot)c(requires)g(examining)g(the)h(en)n(tire)g(btree)g(and)
-g(mo)r(difying)g(eac)n(h)f(leaf)h(blo)r(c)n(k)f(that)h(con)n(tains)f
-(an)h(exception)g(for)0 2747 y(the)k(snapshot.)50 b(This)32
-b(could)g(amoun)n(t)g(to)g(quite)g(a)g(lot)g(of)g(IO)g(tra\036c)g(and)g
-(tak)n(e)g(a)f(signi\034can)n(t)h(amoun)n(t)f(of)i(time.)51
-b(The)0 2847 y(snapshot)26 b(serv)n(er)e(will)j(therefore)e(simply)h
-(log)g(the)g(status)g(of)h(the)f(snapshot)g(as)g("in)g(pro)r(cess)f(of)
-h(deleting")g(and)g(indicate)0 2946 y(completion)d(immediately)g(to)g
-(the)g(requesting)f(clien)n(t.)36 b(The)23 b(actual)f(deletion)h(will)g
-(pro)r(ceed)g(in)g(the)g(bac)n(kground.)34 b(When)0 3046
-y(the)e(deletion)g(is)g(\034nished,)h(whic)n(h)f(could)f(require)g
-(tens)h(of)g(seconds)f(for)g(a)h(large)e(v)n(olume,)i(the)g(snapshot)g
-(is)f(logged)g(as)0 3146 y(a)n(v)-5 b(ailable)26 b(for)h(reuse.)0
-3345 y(A)j(p)r(ossible)e(optimization)h(is)g(to)g(defer)g(deletions)g
-(un)n(til)h(sev)n(eral)d(snapshots)i(can)f(b)r(e)i(deleted)g(in)f(one)g
-(pass,)g(whic)n(h)g(will)0 3445 y(require)20 b(less)h(time)h(than)f
-(deleting)h(eac)n(h)e(individually)-7 b(.)35 b(Ho)n(w)21
-b(m)n(uc)n(h)g(less)g(dep)r(ends)g(on)g(ho)n(w)g(common)g(it)h(is)f
-(for)g(exceptions)0 3544 y(of)j(sev)n(eral)e(snapshots)h(b)r(eing)i
-(deleted)f(to)g(lie)g(in)h(the)f(same)g(btree)g(no)r(de.)35
-b(Another)24 b(p)r(ossible)g(optimization)g(is)g(to)g(include)0
-3644 y(in)34 b(eac)n(h)e(index)i(no)r(de)f(a)g(bitmap)h(indicating)f
-(whic)n(h)h(snapshots)e(ha)n(v)n(e)g(exceptions)h(in)h(the)f(subtree)h
-(descending)e(from)0 3743 y(that)c(no)r(de)f(so)g(that)h(en)n(tire)f
-(subtrees)g(can)g(b)r(e)h(skipp)r(ed)g(during)f(the)h(tra)n(v)n(ersal)d
-(if)j(they)g(do)f(not)h(need)f(to)h(b)r(e)g(mo)r(di\034ed.)0
-3943 y(A)33 b(more)f(aggressiv)n(e)e(and)j(considerably)e(more)i
-(di\036cult)g(optimization)g(w)n(ould)f(in)n(v)n(olv)n(e)g(in)n(tro)r
-(ducing)g(the)h(concept)g(of)0 4042 y(snapshot)28 b(set)h(generations)e
-(and)i(tagging)e(eac)n(h)h(leaf)h(blo)r(c)n(k)f(with)i(a)e(the)h
-(snapshot)f(generation)g(as)g(of)h(the)g(most)g(recen)n(t)0
-4142 y(alteration.)34 b(Then)23 b(a)g(snapshot)f(could)h(b)r(e)g
-(deleted)h(b)n(y)e(creating)g(a)h(new)g(generation)e(that)i(do)r(es)g
-(not)g(include)g(the)h(deleted)0 4242 y(snapshot.)44
-b(A)31 b(leaf)g(blo)r(c)n(k)e(tagged)h(with)h(an)f(earlier)f
-(generation)f(w)n(ould)i(b)r(e)h(seen)f(as)g("stale")f(and)h(w)n(ould)g
-(b)r(e)h(mo)r(di\034ed)0 4341 y(when)i(next)h(encoun)n(tered,)f(to)g
-(remap)f(it)i(to)f(the)g(curren)n(t)g(generation,)g(remo)n(ving)e
-(exceptions)h(b)r(elonging)h(to)g(deleted)0 4441 y(snapshots)38
-b(in)h(the)g(pro)r(cess.)69 b(The)39 b(complexit)n(y)f(of)h(this)g
-(approac)n(h)e(mak)n(es)h(it)h(unattractiv)n(e,)i(ho)n(w)n(ev)n(er)c
-(if)i(snapshot)0 4540 y(deletion)28 b(p)r(erformance)e(turns)h(out)h
-(to)f(b)r(e)h(a)g(problem)f(it)h(migh)n(t)f(turn)h(out)f(to)h(b)r(e)g
-(w)n(orth)e(the)i(e\033ort.)0 4872 y Fg(5.7)112 b(Expanding)38
-b(the)f(Snapshot)i(Store)0 5125 y Ff(The)30 b(only)f(tric)n(ky)g(part)g
-(of)g(expanding)g(the)h(snapshot)f(store)f(is)i(increasing)e(the)i
-(size)f(of)h(the)g(allo)r(cation)e(bitmap)i(table.)0
-5225 y(These)d(are)f(held)i(in)g(a)f(radix)f(tree)h(to)g(facilitate)h
-(this.)37 b(New)27 b(index)h(and)f(bitmap)h(blo)r(c)n(ks)e(are)g(added)
-i(on)f(the)g(righ)n(t.)36 b(Is)28 b(it)0 5325 y(b)r(est)e(to)f(k)n(eep)
-f(the)i(bitmap)f(blo)r(c)n(ks)f(near)h(the)g(journal,)g(or)f(to)h(disp)
-r(erse)g(them)h(throughout)e(the)h(snapshot)g(store,)f(nearb)n(y)p
-eop end
-%%Page: 11 11
-TeXDict begin 11 10 bop 0 83 a Ff(the)37 b(snapshot)e(store)g(areas)g
-(that)h(they)h(map?)63 b(Curren)n(tly)-7 b(,)38 b(all)e(bitmap)g(blo)r
-(c)n(ks)g(are)f(lo)r(cated)h(near)f(the)i(base)e(of)i(the)0
-183 y(snapshot)29 b(store,)h(whic)n(h)h(w)n(as)e(the)i(simplest)f
-(thing)g(to)h(implemen)n(t.)45 b(But)31 b(if)f(the)h(snapshot)f(store)f
-(is)h(expanded,)h(where)0 282 y(should)i(the)h(new)f(bitmap)h(blo)r(c)n
-(ks)e(come)h(from?)54 b(Should)33 b(w)n(e)g(try)g(to)h(lea)n(v)n(e)e
-(some)g(free)h(space)g(at)g(the)h(b)r(ottom)f(of)h(the)0
-382 y(snapshot)27 b(store?)36 b(Should)27 b(w)n(e)h(relo)r(cate)e(some)
-h(data)g(blo)r(c)n(ks)g(to)g(mak)n(e)g(space?)36 b(These)28
-b(are)e(op)r(en)i(questions.)0 581 y(Once)35 b(the)g(bitmap)h(allo)r
-(cation)e(table)h(is)g(expanded,)h(the)g(new)f(size)g(of)g(the)g
-(snapshot)g(store)f(is)h(recorded)f(in)h(the)g(su-)0
-681 y(p)r(erblo)r(c)n(k,)27 b(and)g(that)h(is)g(that.)0
-1013 y Fg(5.8)112 b(User)38 b(Space)g(Serv)m(er)0 1266
-y Ff(Since)33 b(the)h(cluster)f(snapshot)f(clien)n(t)h(connects)g(to)g
-(the)h(snapshot)e(serv)n(er)f(solely)i(b)n(y)g(means)f(of)i(a)e(net)n
-(w)n(ork)g(so)r(c)n(k)n(et,)i(it)0 1365 y(w)n(as)c(natural)f(to)i
-(implemen)n(t)g(the)g(serv)n(er)e(in)i(user)f(space)g(rather)f(than)i
-(k)n(ernel.)45 b(Since)31 b(the)g(serv)n(er)d(is)j(rather)f(complex,)0
-1465 y(this)d(lik)n(ely)f(sa)n(v)n(ed)f(some)h(time,)h(and)f(certainly)
-g(mak)n(es)f(the)i(co)r(de)f(accessible)g(to)g(a)g(wider)g(comm)n(unit)
-n(y)g(of)h(programmers)0 1565 y(than)34 b(w)n(ould)e(b)r(e)i(the)g
-(case)f(for)g(k)n(ernel)f(co)r(de.)54 b(On)33 b(the)h(other)f(hand,)i
-(fully)f(async)n(hronous)d(IO)i(is)g(considerably)f(more)0
-1664 y(di\036cult)e(to)f(ac)n(hiev)n(e)f(in)h(user)g(space)f(than)h(in)
-h(k)n(ernel)e(co)r(de,)i(and)f(certain)f(op)r(erations)g(could)h(w)n
-(ell)g(b)r(e)g(less)g(e\036cien)n(t.)42 b(It)0 1764 y(is)32
-b(exp)r(ected)g(that)g(the)g(serv)n(er)e(will)j(b)r(e)f(p)r(orted)g(to)
-f(k)n(ernel)g(space)h(at)f(some)h(p)r(oin)n(t,)h(if)f(only)g(to)g
-(learn)f(the)h(truth)g(ab)r(out)0 1863 y(relativ)n(e)26
-b(p)r(erformance.)35 b(This)27 b(migh)n(t)g(also)f(b)r(e)i(desirable)e
-(when)h(it)h(comes)e(time)i(to)f(implemen)n(t)g(a)g(single-no)r(de)f(v)
--5 b(arian)n(t.)0 2179 y Fd(User)32 b(Space)g(Bu\033er)g(La)m(y)m(er)0
-2432 y Ff(Lik)n(e)h(man)n(y)f(problems)g(in)i(computer)f(science,)h
-(the)f(snapshot)g(serv)n(er's)e(handling)i(of)g(its)g(disk-based)f
-(btree)h(amoun)n(ts)0 2531 y(to)h(a)h(cac)n(hing)e(problem.)58
-b(As)34 b(suc)n(h,)i(the)f(classic)f(Unix)h(\020getblk\021)40
-b(pro)n(vides)34 b(an)g(elegan)n(t)g(mo)r(del.)58 b(It)35
-b(lends)f(itself)h(w)n(ell)0 2631 y(to)29 b(incremen)n(tal)g(up)r
-(dating)g(of)h(the)g(btree,)f(bitmaps)h(and)f(other)g(\034le-bac)n(k)n
-(ed)f(data)h(items.)42 b(It)30 b(supp)r(orts)f(async)n(hronous)0
-2731 y(writing)h(and)h(m)n(ulti-threaded)f(writing)h(w)n(ell.)46
-b(And)32 b(imp)r(ortan)n(tly)-7 b(,)31 b(it)g(will)g(greatly)f
-(simplify)h(the)g(task)g(of)f(p)r(orting)h(the)0 2830
-y(serv)n(er)23 b(to)j(k)n(ernel)e(space.)35 b(An)26 b(analog)e(of)h
-(Lin)n(ux's)g(bu\033er)g(la)n(y)n(er)f(w)n(as)g(therefore)g(implemen)n
-(ted)i(in)g(user)e(space,)h(complete)0 2930 y(with)j(bu\033er)g(heads,)
-g(ob)5 b(ject)28 b(referencing,)e(and)i(a)g(bu\033er)g(hash.)37
-b(This)28 b(is)g(recommended)f(reading)g(for)g(those)h(in)n(terested)0
-3029 y(in)g(ho)n(w)f(Lin)n(ux's)g(bu\033er)h(IO)f(mo)r(del)g(w)n(orks.)
-0 3345 y Fd(Memory)j(In)m(v)m(ersion)i(Deadlo)s(c)m(k)0
-3598 y Ff(Implemen)n(ting)g(p)r(ortions)g(of)g(blo)r(c)n(k)g(devices)g
-(\(or)g(\034lesystems)g(for)f(that)i(matter\))f(in)h(user)f(space)f(in)
-n(tro)r(duces)h(the)h(p)r(os-)0 3697 y(sibilit)n(y)i(of)h(memory)e(in)n
-(v)n(ersion)g(deadlo)r(c)n(k.)59 b(This)36 b(o)r(ccurs)e(when)i(a)f
-(user)g(space)f(program)g(exists)h(somewhere)f(in)i(the)0
-3797 y(virtual)d(memory)f(writeout)h(path,)i(whic)n(h)f(will)f(b)r(e)h
-(the)g(case)e(for)h(an)n(y)g(blo)r(c)n(k)f(device)i(or)e(\034lesystem.)
-54 b(When)34 b(the)g(VM)0 3897 y(system)24 b(tries)f(to)h(\035ush)h
-(some)e(dirt)n(y)h(data)f(to)h(disk,)h(the)f(user)g(space)f(program)f
-(ma)n(y)h(b)r(e)i(in)n(v)n(ok)n(ed)d(and)i(ma)n(y)f(kno)n(wingly)g(or)0
-3996 y(unkno)n(wingly)28 b(attempt)i(to)g(allo)r(cate)e(some)h(memory)
--7 b(.)42 b(If)29 b(the)h(memory)f(a)n(v)-5 b(ailable)28
-b(for)h(non-critical)f(pro)r(cesses)g(suc)n(h)h(as)0
-4096 y(user)g(space)g(programs)e(is)j(nearly)f(exhausted,)g(the)h
-(memory)f(allo)r(cation)g(request)g(ma)n(y)g(blo)r(c)n(k,)h(whic)n(h)f
-(also)g(blo)r(c)n(ks)g(the)0 4195 y(memory)f(writeout)g(attempt,)h
-(whic)n(h)g(can)f(ev)n(en)n(tually)g(deadlo)r(c)n(k)f(all)h(user)g
-(space)g(programs)e(due)j(to)f(memory)g(exhaus-)0 4295
-y(tion.)40 b(This)29 b(problem)f(has)g(b)r(een)h(observ)n(ed)e(in)i
-(practice.)40 b(The)28 b(standard)g(solution)g(is)h(to)f(run)h(an)n(y)f
-(net)n(w)n(ork)f(serv)n(ers)f(on)0 4395 y(dedicated)c(serv)n(ers)d(so)i
-(that)h(the)g(serv)n(er)e(is)i(not)f(in)h(an)n(y)f(VM)h(writeout)g
-(path.)35 b(This)21 b(solution)g(is)h(not)g(en)n(tirely)f(satisfactory)
-0 4494 y(b)r(ecause)27 b(it)h(requires)e(an)i(extra)e(serv)n(er,)g
-(whic)n(h)h(ma)n(y)g(end)h(up)g(underutilized.)0 4694
-y(Suc)n(h)g(in)n(v)n(ersion)e(can)i(b)r(e)g(prev)n(en)n(ted)f(b)n(y)h
-(running)f(the)i(user)e(space)g(pro)r(cess)g(in)h(\020PF_MEMALLOC\021)
-35 b(mo)r(de,)28 b(in)g(whic)n(h)0 4793 y(a)36 b(sp)r(ecial)g(pro)r
-(cess)e(\035ag)i(is)g(set)g(giving)f(the)i(pro)r(cess)e(privileged)g
-(access)g(to)h(a)g(p)r(o)r(ol)g(of)g(system)g(memory)f(whic)n(h)h(will)
-0 4893 y(automatically)31 b(b)r(e)i(tapp)r(ed)g(when)g(memory)f(runs)g
-(lo)n(w.)51 b(In)33 b(this)f(mo)r(de,)i(a)f(pro)r(cess)e(m)n(ust)i(b)r
-(e)g(sure)e(that)i(its)g(memory)0 4992 y(usage)27 b(is)h(strictly)g(b)r
-(ounded)h(so)f(that)g(the)h(system)f(memory)g(p)r(o)r(ol)g(is)g(guaran)
-n(teed)f(nev)n(er)g(to)h(b)r(e)h(exhausted.)38 b(Enforcing)0
-5092 y(suc)n(h)29 b(a)f(b)r(ound)h(is)g(p)r(ossible)g(with)g(the)h
-(help)f(of)g(the)g(mlo)r(c)n(k)-5 b(all)29 b(system)f(call,)h(preallo)r
-(cation)e(of)i(su\036cien)n(t)g(pro)r(cess)f(stac)n(k)0
-5192 y(space,)f(and)g(careful)f(analysis)g(of)h(the)h(user)e(space)h
-(program)e(in)i(question.)36 b(While)28 b(some)f(details)g(remain)f(to)
-h(b)r(e)h(w)n(ork)n(ed)0 5291 y(out,)g(it)h(is)f(an)n(ticipated)g(that)
-h(a)f(w)n(ork)-5 b(able)26 b(solution)i(to)g(the)h(memory)e(in)n(v)n
-(ersion)g(deadlo)r(c)n(k)g(problem)h(will)g(b)r(e)h(found,)g(so)0
-5391 y(that)e(this)g(problem)f(in)h(itself)g(will)g(not)f(b)r(ecome)h
-(a)f(comp)r(elling)g(reason)f(to)i(p)r(ort)f(the)h(snapshot)f(serv)n
-(er)f(to)h(k)n(ernel)g(space.)p eop end
-%%Page: 12 12
-TeXDict begin 12 11 bop 0 83 a Fg(5.9)112 b(Messaging)0
-336 y Ff(The)39 b(snapshot)f(serv)n(er)e(uses)j(an)f(async)n(hronous)e
-(stream)i(messaging)f(mo)r(del)i(o)n(v)n(er)e(a)h(so)r(c)n(k)n(et)g
-(transp)r(ort.)69 b(A)39 b(simple)0 436 y(message)27
-b(formatting)h(mo)r(del)g(w)n(as)g(devised)g(in)h(whic)n(h)f(eac)n(h)g
-(message)f(header)g(has)h(just)h(t)n(w)n(o)f(\034elds:)38
-b(co)r(de)29 b(and)f(length.)0 535 y(Since)19 b(the)h(messages)e
-(themselv)n(es)g(tend)i(to)f(b)r(e)h(small)f(and)g(n)n(umerous,)h(the)f
-(size)g(of)h(the)f(message)f(header)g(has)h(a)g(signi\034can)n(t)0
-635 y(e\033ect)26 b(on)e(net)n(w)n(ork)g(load.)35 b(The)25
-b(philosoph)n(y)f(is,)i(ev)n(ery)e(\034eld)h(that)g(can)g(b)r(e)g(mo)n
-(v)n(ed)g(out)g(of)g(the)g(header)f(in)n(to)h(the)h(message)0
-734 y(b)r(o)r(dy)i(should)f(b)r(e.)37 b(This)28 b(w)n(a)n(y)-7
-b(,)26 b(messages)g(do)h(not)h(incur)f(size)h(p)r(enalties)f(for)g
-(\034elds)h(they)g(do)f(not)g(require.)0 934 y(Generally)-7
-b(,)30 b(t)n(w)n(o)f(so)r(c)n(k)n(et)g(read)g(op)r(erations)f(are)h(p)r
-(erformed)h(for)f(eac)n(h)g(incoming)h(message,)f(the)h(\034rst)g(to)g
-(learn)f(the)h(size)0 1033 y(and)c(a)f(second)h(to)g(read)f(the)h
-(message)f(b)r(o)r(dy)-7 b(.)36 b(The)26 b(size)g(of)g(the)g(message)f
-(header)g(could)h(b)r(e)g(further)g(reduced,)g(since)g(the)0
-1133 y(size)j(of)g(man)n(y)f(messages)g(can)g(b)r(e)i(inferred)e(from)h
-(the)g(message)f(co)r(de,)h(ho)n(w)n(ev)n(er)e(doing)i(so)f(w)n(ould)g
-(require)g(three)h(reads)0 1233 y(instead)d(of)g(t)n(w)n(o)f(for)h(v)-5
-b(ariable)25 b(length)h(messages.)35 b(F)-7 b(or)25 b(the)i(time)f(b)r
-(eing,)h(the)f(curren)n(t)g(eigh)n(t)f(b)n(yte)h(message)f(headers)g
-(are)0 1332 y(deemed)j(satisfactory)-7 b(.)0 1531 y(Outgoing)40
-b(messages)g(are)g(t)n(ypically)h(sen)n(t)g(directly)g(from)g(a)g(pro)r
-(cess)g(stac)n(k)f(in)i(a)f(single)g(so)r(c)n(k)n(et)f(write)h(op)r
-(eration.)0 1631 y(Syn)n(tactic)19 b(macro)f(sugar)f(is)i(pro)n(vided)f
-(that)i(allo)n(ws)e(\034xed-length)g(messages)g(to)h(b)r(e)g(expressed)
-f(in)i(a)f(single-line)f(functional)0 1731 y(st)n(yle,)27
-b(using)g(sym)n(b)r(olic)g(\034elds)h(as)f(opp)r(osed)g(to)g(p)r
-(ositional.)0 1930 y(Message)g(con)n(ten)n(ts)g(are)g(binary)-7
-b(,)28 b(whic)n(h)g(requires)e(careful)i(atten)n(tion)g(to)g
-(considerations)e(of)i(\034eld)h(alignmen)n(t,)e(size)h(and)0
-2030 y(b)n(yte)23 b(order.)35 b(All)24 b(n)n(umeric)f(\034elds)h(are)e
-(con)n(v)n(erted)g(to)i(net)n(w)n(ork)e(b)n(yte)h(order)g(on)g(sending)
-g(and)h(nativ)n(e)f(order)f(on)h(receiving.)0 2129 y(Message)18
-b(format)h(is)g(de\034ned)g(using)g(gcc-sp)r(eci\034c)g(pac)n(k)n(ed)f
-(structure)h(attributes,)h(whic)n(h)g(in)f(theory)g(pro)n(vides)f
-(predictable)0 2229 y(\034eld)27 b(size)f(and)g(alignmen)n(t,)g(and)g
-(alignmen)n(t)g(indep)r(endence)h(across)e(pro)r(cessor)f(arc)n
-(hitectures.)35 b(W)-7 b(e)27 b(shall)f(see)g(ho)n(w)f(w)n(ell)0
-2328 y(this)j(w)n(orks)e(in)i(practice,)e(in)i(the)g(face)f(of)h(p)r
-(ossible)f(compiler)g(bugs)g(in)h(this)g(relativ)n(ely)e(un)n(tested)i
-(area.)0 2528 y(A)d(fully)f(async)n(hronous)e(message)h(mo)r(del)i(is)f
-(used)g(in)h(whic)n(h)f(the)h(sender)e(nev)n(er)h(w)n(aits)f(for)h(a)g
-(reply)-7 b(.)35 b(Instead,)25 b(the)g(sender)0 2627
-y(implemen)n(ts)d(an)g(incoming)g(message)e(handler)i(capable)f(of)h
-(handling)g(an)n(y)f(message)g(that)h(migh)n(t)g(b)r(e)h(receiv)n(ed,)f
-(including)0 2727 y(message)g(replies.)35 b(An)n(y)23
-b(required)g(state)g(transformations)f(are)g(implemen)n(ted)i(within)h
-(the)f(incoming)f(message)f(handler,)0 2827 y(whic)n(h)d(runs)f(as)g
-(its)h(o)n(wn)f(task.)33 b(Th)n(us,)20 b(sync)n(hronous)d(messages)g
-(requiring)g(replies,)j(suc)n(h)e(as)g(the)h(clien)n(t's)g(initial)f
-(\020iden)n(tify\021)0 2926 y(message,)i(m)n(ust)h(b)r(e)g(handled)f
-(with)h(in)n(terpro)r(cess)e(comm)n(unication)h(while)h(the)f(common)g
-(and)h(p)r(erformance-critical)d(case)0 3026 y(of)28
-b(async)n(hronous)d(messaging)g(is)j(the)g(default.)0
-3400 y Fh(6)131 b(Clien)l(t)45 b(Implemen)l(tation)g(Details)0
-3681 y Ff(Compared)40 b(to)g(the)h(serv)n(er,)i(the)e(snapshot)f(clien)
-n(t)g(is)h(simple.)76 b(Implemen)n(ted)42 b(as)e(a)g(device)g(mapp)r
-(er)h(target,)i(it)e(is)0 3781 y(resp)r(onsible)25 b(for)g(deferring)g
-(IO)g(requests)g(as)g(appropriate,)f(querying)g(the)i(serv)n(er)e(for)h
-(information)g(it)h(do)r(es)g(not)f(ha)n(v)n(e)g(in)0
-3881 y(cac)n(he,)k(receiving)g(replies,)g(and)h(submitting)g(requests)e
-(to)i(the)g(correct)e(device.)43 b(It)29 b(m)n(ust)h(also)f(w)n(orry)e
-(ab)r(out)j(releasing)0 3980 y(snapshot)39 b(read)f(lo)r(c)n(ks,)j(and)
-f(a)e(handle)i(a)f(few)g(system)g(in)n(terface)g(details,)j(suc)n(h)d
-(as)g(setting)g(up)g(a)g(net)n(w)n(ork)f(so)r(c)n(k)n(et)0
-4080 y(connection)21 b(to)g(the)g(snapshot)g(serv)n(er,)f(reconnecting)
-h(if)g(the)h(connection)f(is)g(brok)n(en,)g(and)g(resp)r(onding)f(to)h
-(the)h(o)r(ccasional)0 4180 y(message)i(to)i(clear)e(cac)n(he)h(b)r
-(ecause)g(a)g(new)g(snapshot)g(w)n(as)g(created.)35 b(Essen)n(tially)-7
-b(,)26 b(the)g(clien)n(t)f(implemen)n(ts)h(some)f(simple)0
-4279 y(message)h(handling)h(and)h(a)f(cac)n(he,)g(the)h(latter)f(b)r
-(eing)g(non-essen)n(tial.)0 4478 y(On)k(initialization,)i(the)f(clien)n
-(t)f(is)h(passed)f(a)g(snapshot)g(n)n(um)n(b)r(er)g(\(-1)g(=)h
-(origin\))e(and)i(the)g(lo)r(cal)f(device)g(names)g(of)h(t)n(w)n(o)0
-4578 y(shared)39 b(storage)e(devices:)61 b(the)40 b(origin)f(and)g(the)
-h(snapshot)f(store.)72 b(The)40 b(clien)n(t)g(starts)f(t)n(w)n(o)g(k)n
-(ernel)g(daemons,)i(one)0 4678 y(b)r(eing)32 b(a)g(blo)r(c)n(king)f
-(thread)h(to)g(handle)g(serv)n(er)e(replies)i(and)g(the)g(other,)h(a)f
-(non)n(blo)r(c)n(king)e(w)n(ork)n(er)g(thread)i(to)g(tak)n(e)f(care)0
-4777 y(of)f(v)-5 b(arious)28 b(min)n(utiae.)43 b(A)n(t)30
-b(this)g(p)r(oin)n(t)g(the)g(clien)n(t)g(initialization)f(completes)g
-(and)h(the)g(virtual)f(device)g(app)r(ears)g(in)h(the)0
-4877 y(/dev/mapp)r(er)d(directory)-7 b(.)38 b(It)29 b(can)f(receiv)n(e)
-f(IO)h(requests,)g(but)h(not)f(complete)h(them)g(y)n(et,)f(b)r(ecause)g
-(it)h(do)r(es)f(not)g(ha)n(v)n(e)f(a)0 4977 y(so)r(c)n(k)n(et)f
-(connection)h(to)h(the)g(serv)n(er.)35 b(A)28 b(user)f(space)f(utilit)n
-(y)i(supplies)g(this)g(later)f(via)g(an)g(io)r(ctl)h(on)f(the)h
-(virtual)f(device.)0 5176 y(One)f(migh)n(t)g(argue)e(that)i(the)h(so)r
-(c)n(k)n(et)e(connection)g(should)h(b)r(e)g(created)g(b)r(efore)f(the)i
-(virtual)e(device)h(is)g(created,)f(ho)n(w)n(ev)n(er)0
-5275 y(the)j(virtual)f(device)g(needs)h(to)f(b)r(e)h(able)f(to)h(op)r
-(erate)e(for)h(short)g(p)r(erio)r(ds)g(with)h(the)g(so)r(c)n(k)n(et)f
-(disconnected)g(an)n(yw)n(a)n(y)-7 b(,)25 b(since)0 5375
-y(the)j(so)r(c)n(k)n(et)e(connection)h(migh)n(t)h(break)e(and)i(need)f
-(to)h(b)r(e)g(reconnected.)p eop end
-%%Page: 13 13
-TeXDict begin 13 12 bop 0 83 a Ff(Finally)-7 b(,)34 b(b)r(efore)f(it)h
-(can)e(complete)h(an)n(y)g(IO)f(requests,)i(the)f(clien)n(t)h(iden)n
-(ti\034es)f(itself)g(to)g(the)h(snapshot)e(serv)n(er,)h(stating)0
-183 y(whic)n(h)c(snapshot)f(it)i(is)f(a)g(clien)n(t)g(for.)41
-b(The)29 b(serv)n(er)f(informs)g(the)i(clien)n(t)f(of)g(the)h(c)n(h)n
-(unk)f(size,)g(then)h(the)f(clien)n(t)g(initializes)0
-282 y(its)f(cac)n(he)e(and)i(pro)r(ceeds)e(to)i(send)f(queries,)g
-(receiv)n(e)f(replies)h(and)h(retire)e(IO)i(requests.)0
-482 y(The)i(clien)n(t-side)g(query)f(cac)n(he)g(is)h(implemen)n(ted)h
-(lazily)e(as)h(an)g(ino)r(de)g(address)f(space.)43 b(This)30
-b(has)g(the)g(adv)-5 b(an)n(tage)29 b(that)0 581 y(it)34
-b(is)f(managed)f(automatically)g(b)n(y)i(the)f(VM)h(system:)48
-b(when)34 b(there)f(is)g(no)g(memory)g(pressure,)g(the)h(cac)n(he)e
-(will)i(gro)n(w)0 681 y(without)28 b(b)r(ound,)g(but)g(when)g(there)f
-(is,)h(the)g(VM)g(system)f(can)g(easily)g(shrink)g(it.)0
-1055 y Fh(7)131 b(F)-11 b(ailure)45 b(T)-11 b(olerance)0
-1353 y Fg(7.1)112 b(Serv)m(er)38 b(Restart)0 1606 y Ff(Though)c(c)n
-(hallenging)f(to)i(implemen)n(t)g(e\036cien)n(tly)-7
-b(,)36 b(the)f(A)n(CID)g(prop)r(erties)e(of)i(consistency)f(and)g
-(durabilit)n(y)g(immedi-)0 1705 y(ately)e(yield)g(an)g(imp)r(ortan)n(t)
-g(b)r(ene\034t:)47 b(fail-o)n(v)n(er)30 b(of)i(the)h(snapshot)e(serv)n
-(er)f(is)i(trivial.)50 b(This)33 b(is)f(b)r(ecause)f(all)h(the)h(serv)n
-(er)0 1805 y(state)c(information)f(connected)h(with)g(completed)g(IO)g
-(requests)f(is)h(alw)n(a)n(ys)e(a)n(v)-5 b(ailable)28
-b(on)g(disk.)41 b(State)29 b(information)g(for)0 1905
-y(uncompleted)34 b(IO)f(requests)f(is)h(alw)n(a)n(ys)f(main)n(tained)h
-(in)h(the)f(memory)g(of)g(clien)n(ts,)i(and)e(none)g(needs)h(to)f(b)r
-(e)h(uploaded)0 2004 y(to)29 b(the)h(new)f(serv)n(er.)40
-b(Restarting)29 b(the)g(snapshot)g(serv)n(er)e(is)i(the)h(same)f(as)f
-(starting)h(it)g(initially)-7 b(,)30 b(except)f(that)h(snapshot)0
-2104 y(clien)n(ts)f(\(but)h(not)g(origin)e(clien)n(ts\))h(m)n(ust)h
-(upload)f(their)g(p)r(ending)h(snapshot)e(read)h(lo)r(c)n(ks)f(b)r
-(efore)h(the)h(serv)n(er)e(ma)n(y)g(allo)n(w)0 2204 y(an)n(y)g(origin)g
-(writes)h(to)g(pro)r(ceed.)1048 2173 y Fc(2)1084 2204
-y Ff(Otherwise,)g(the)g(new)h(serv)n(er)d(need)i(only)g(load)f(the)i
-(snapshot)e(store)g(sup)r(erblo)r(c)n(k)g(and)0 2303
-y(repla)n(y)g(the)j(journal.)43 b(Finally)-7 b(,)30 b(there)f(is)h(no)g
-(requiremen)n(t)f(to)g(clear)g(clien)n(t)h(cac)n(he.)43
-b(So)29 b(serv)n(er)f(restart)h(is)g(v)n(ery)g(fast,)h(on)0
-2403 y(the)e(order)e(of)i(tens)f(of)h(milliseconds.)36
-b(Detecting)28 b(serv)n(er)e(failure)h(is)h(lik)n(ely)f(to)g(tak)n(e)g
-(m)n(uc)n(h)g(longer.)36 b(There)27 b(will)h(b)r(e)g(some)0
-2502 y(sligh)n(t)h(p)r(erformance)f(degradation)f(due)j(to)f(the)g
-(loss)g(of)g(serv)n(er)e(metadata)i(cac)n(he)f(and)h(the)h(need)f(for)g
-(snapshot)f(clien)n(ts)0 2602 y(to)f(retry)g(p)r(ending)h(reads,)e(but)
-j(it)f(is)f(unlik)n(ely)g(to)h(b)r(e)g(noticeable.)0
-2934 y Fg(7.2)112 b(Clien)m(t)36 b(failure)0 3187 y Ff(The)23
-b(serv)n(er)e(notices)i(that)g(a)g(clien)n(t)g(has)g(failed)g(when)g
-(its)g(so)r(c)n(k)n(et)f(connection)h(breaks.)34 b(In)23
-b(resp)r(onse,)g(the)h(serv)n(er)d(releases)0 3287 y(an)n(y)27
-b(snapshot)g(read)f(lo)r(c)n(ks)h(held)h(b)n(y)f(that)h(clien)n(t,)g
-(and)f(that)h(is)f(that.)0 3619 y Fg(7.3)112 b(F)-9 b(ailure)37
-b(Detection)0 3871 y Ff(Cluster)26 b(heartb)r(eating)f(in)h(itself)h
-(is)e(insu\036cien)n(t)i(to)f(detect)g(some)f(common)h(failure)f
-(conditions.)36 b(F)-7 b(or)26 b(example,)f(a)h(no)r(de)0
-3971 y(migh)n(t)38 b(con)n(tin)n(ue)g(resp)r(onding)f(to)h(heartb)r
-(eat)g(messages,)h(ev)n(en)f(though)g(its)g(cluster)g(snapshot)f
-(device)h(has)g(stopp)r(ed)0 4071 y(servicing)33 b(IO)h(requests)f(for)
-h(one)g(reason)f(or)g(another,)j(e.g.,)f(a)f(memory)g(failure.)57
-b(If)34 b(that)h(clien)n(t)g(holds)f(crucial)f(read)0
-4170 y(lo)r(c)n(ks,)f(the)g(en)n(tire)g(cluster)f(could)h(lo)r(c)n(k)f
-(up.)50 b(So)32 b(at)g(least)f(in)h(the)h(case)e(of)g(snapshot)h(clien)
-n(ts,)g(the)h(serv)n(er)d(m)n(ust)i(detect)0 4270 y(timeouts)21
-b(and)g(ping)g(the)h(clien)n(t)f(for)g(liv)n(eness.)33
-b(If)22 b(the)g(clien)n(t)f(fails)g(the)g(test,)i(the)f(serv)n(er)d
-(will)i(break)f(its)i(so)r(c)n(k)n(et)e(connection)0
-4370 y(and)28 b(release)e(its)i(lo)r(c)n(ks.)37 b(It)29
-b(is)e(p)r(ossible)h(that)g(the)g(clien)n(t)g(ma)n(y)g(ha)n(v)n(e)e
-(simply)i(b)r(e)h(slo)n(w,)e(not)h(dead,)f(in)i(whic)n(h)e(case)h(it)g
-(will)0 4469 y(reconnect)f(to)g(the)h(cluster)f(and)h(carry)e(on.)0
-4668 y(Similarly)-7 b(,)28 b(clien)n(ts)f(need)i(to)e(detect)i(serv)n
-(er)d(problems)h(that)h(heartb)r(eating)g(ma)n(y)f(miss.)38
-b(If)28 b(a)g(snapshot)f(or)g(origin)g(clien)n(t)0 4768
-y(detects)k(a)f(timeout,)j(it)e(should)f(rep)r(ort)h(it)g(to)g(the)g
-(cluster)f(infrastructure)g(so)g(that)h(further)g(remedial)f(action)h
-(ma)n(y)f(b)r(e)0 4868 y(tak)n(en.)41 b(It)29 b(ma)n(y)g(b)r(e)g(that)h
-(the)f(cluster)g(resource)e(manager)h(or)g(h)n(uman)h(administrator)e
-(simply)j(assigned)d(the)j(serv)n(er)d(to)0 4967 y(an)k(under-p)r(o)n
-(w)n(ered)f(or)h(o)n(v)n(erloaded)e(no)r(de,)k(in)f(whic)n(h)f(case)g
-(the)h(correct)f(action)g(is)g(to)h(restart)e(the)i(serv)n(er)e(on)i(a)
-f(more)0 5067 y(appropriate)26 b(no)r(de.)37 b(In)27
-b(an)n(y)g(ev)n(en)n(t,)g(this)h(is)g(not)f(a)g(problem)g(that)h
-(heartb)r(eating)f(can)g(detect.)p 0 5136 1548 4 v 92
-5190 a Fb(2)127 5213 y Fa(A)f(future)i(optimization)f(to)g(eliminate)e
-(cop)n(y-on-write)j(for)f(snapshot)h(writes)f(will)e(require)j(similar)
-d(treatmen)n(t)j(in-progress)g(snapshot)0 5292 y(writes.)p
-eop end
-%%Page: 14 14
-TeXDict begin 14 13 bop 0 83 a Fh(8)131 b(Utilities)0
-364 y Ff(Sev)n(eral)26 b(utilities)i(programs)e(are)g(pro)n(vided)h(to)
-g(supp)r(ort)g(the)h(use)g(of)f(the)h(cluster)f(blo)r(c)n(k)g(device:)0
-564 y Fd(mksnapstore)f Ff(-)i(Initialize)f(a)g(ph)n(ysical)g(v)n(olume)
-g(as)g(a)g(snapshot)g(store)0 763 y Fd(csnap-connect)i
-Ff(-)e(Connect)g(a)g(virtual)g(snapshot)g(device)g(to)h(a)f(snapshot)g
-(serv)n(er)0 962 y Fd(csnap-create)i Ff(-)e(Create)g(a)g(new)h
-(snapshot)0 1161 y Fd(csnap-delete)f Ff(-)g(Delete)h(a)g(snapshot)0
-1361 y(The)21 b(latter)g(t)n(w)n(o)f(functions)i(ha)n(v)n(e)e
-(traditionally)g(b)r(een)h(p)r(erformed)g(via)f(an)h(io)r(ctl)g(on)g
-(the)h(Device)f(Mapp)r(er)g(con)n(trol)e(device.)0 1460
-y(Ho)n(w)n(ev)n(er)i(it)j(mak)n(es)e(more)g(sense)h(for)g(a)g(cluster)g
-(snapshot)f(implemen)n(tation)h(to)g(request)g(a)g(new)g(snapshot)g
-(directly)f(from)0 1560 y(the)28 b(snapshot)f(serv)n(er)e(so)i(that)h
-(the)g(device)f(itself)h(is)g(not)f(required)g(to)g(act)h(as)f(a)g
-(mere)g(rela)n(y)-7 b(.)0 1934 y Fh(9)131 b(In)l(terface)45
-b(to)f(Cluster)g(Infrastructure)0 2216 y Ff(The)32 b(cluster)f
-(snapshot)g(blo)r(c)n(k)g(device)g(w)n(as)g(designed)g(to)g(b)r(e)h(op)
-r(erated)f(either)h(man)n(ually)e(or)h(under)h(the)g(con)n(trol)e(of)h
-(a)0 2315 y(cluster)d(manager)f(system.)40 b(Its)29 b(needs)f(are)g(mo)
-r(dest:)39 b(it)29 b(requires)e(a)i(so)r(c)n(k)n(et)e(connection)h(to)h
-(a)f(snapshot)g(serv)n(er,)f(and)i(it)0 2415 y(needs)24
-b(a)g(w)n(a)n(y)f(of)h(rep)r(orting)f(errors)f(when)j(things)f(go)f
-(wrong.)35 b(F)-7 b(or)23 b(example,)i(if)f(its)h(serv)n(er)d
-(connection)i(breaks,)g(it)g(needs)0 2514 y(to)29 b(b)r(e)g(able)g(to)g
-(request)f(a)g(new)h(one.)41 b(These)29 b(are)f(the)h(only)f(things)h
-(it)h(requires)d(b)r(ey)n(ond)i(the)g(existing)g(Device)f(Mapp)r(er)0
-2614 y(infrastructure.)0 2813 y(A)d(user)e(space)h(program)e(connects)i
-(a)g(so)r(c)n(k)n(et)f(to)h(the)g(virtual)g(device)g(after)g(Device)g
-(Mapp)r(er)g(has)g(instan)n(tiated)g(it)h(via)e(an)0
-2913 y(io)r(ctl)30 b(on)f(the)h(virtual)f(device,)h(passing)f(the)h(FD)
-g(n)n(um)n(b)r(er)f(of)h(a)f(so)r(c)n(k)n(et)g(it)h(has)f(op)r(ened.)43
-b(A)30 b(simple)g(utilit)n(y)g(is)f(pro)n(vided)0 3013
-y(so)f(that)g(this)h(ma)n(y)e(b)r(e)i(done)f(from)g(the)h(shell,)f(for)
-g(the)g(purp)r(ose)g(of)g(running)g(the)h(snapshot)e(blo)r(c)n(k)h
-(device)g(without)h(the)0 3112 y(aid)e(of)h(an)n(y)f(cluster)g
-(infrastructure.)0 3311 y(A)d(simple)g(user)f(space)h(monitoring)e
-(utilit)n(y)j(is)e(planned,)i(to)f(receiv)n(e)e(reconnection)h
-(requests)g(from)h(the)g(snapshot)f(clien)n(t.)0 3411
-y(These)d(are)e(deliv)n(ered)h(o)n(v)n(er)g(a)g(second,)i(lo)r(cal)e
-(so)r(c)n(k)n(et)g(connection.)34 b(So)19 b(the)i(snapshot)e(clien)n(t)
-h(has)f(t)n(w)n(o)g(so)r(c)n(k)n(et)g(connections:)0
-3511 y(one)25 b(to)f(the)i(serv)n(er)d(o)n(v)n(er)g(the)i(net)n(w)n
-(ork,)f(and)h(a)g(lo)r(cal)f(connection)g(to)h(a)g(monitor)f(daemon.)35
-b(If)26 b(b)r(oth)f(connections)f(break,)0 3610 y(the)k(snapshot)f
-(target)f(will)i(giv)n(e)f(up)h(in)f(disgust)h(and)f(return)g(failure)g
-(for)g(all)h(further)f(IO)g(requests.)0 3810 y(The)35
-b(snapshot)f(serv)n(er)g(m)n(ust)h(rely)f(on)h(the)g(cluster)g
-(infrastructure)f(to)h(satisfy)f(the)i(requiremen)n(t)e(that)h(all)g
-(snapshot)0 3909 y(clien)n(ts)25 b(reconnect)g(and)h(upload)f(their)g
-(read)g(lo)r(c)n(ks)g(on)g(serv)n(er)f(restart.)35 b(A)25
-b(suitable)h(in)n(terface)f(to)g(supp)r(ort)h(this)f(requires)0
-4009 y(further)i(researc)n(h.)0 4383 y Fh(10)131 b(P)l(erformance)44
-b(Characteristics)0 4681 y Fg(10.1)112 b(Assumptions)0
-4934 y Ff(P)n(erformance)36 b(estimates)h(b)r(elo)n(w)g(are)f(based)h
-(on)g(using)g(the)g(smallest)g(c)n(h)n(unk)g(size,)j(4K.)c(Eac)n(h)h
-(new)h(exception)f(uses)0 5034 y(20)g(b)n(ytes)g(\(exception)g(store)f
-(address,)j(sharing)d(bitmap)i(and)f(directory)f(en)n(try\))h(so)g(eac)
-n(h)g(btree)g(leaf)g(no)r(de)g(holds)g(a)0 5133 y(maxim)n(um)30
-b(of)h(ab)r(out)g(200)e(exceptions.)45 b(Due)31 b(to)g(splitting,)h
-(leaf)e(no)r(des)h(are)e(normally)h(not)g(full.)78 b(In)31
-b(fact)g(w)n(orst)e(case)0 5233 y(fullness)37 b(of)g(50\045)f(is)h(exp)
-r(ected)g(for)g(the)g(early)f(implemen)n(tations,)j(so)d(leaf)h(no)r
-(des)f(will)i(hold)e(ab)r(out)h(100)f(exceptions)0 5332
-y(eac)n(h.)g(The)28 b(p)r(erformance)e(estimates)h(here)g(assume)g
-(async)n(hronous)e(IO,)i(whic)n(h)h(for)f(user)g(space)g(is)g(not)h(y)n
-(et)f(a)g(practical)p eop end
-%%Page: 15 15
-TeXDict begin 15 14 bop 0 83 a Ff(p)r(ossibilit)n(y)30
-b(in)g(Lin)n(ux,)g(therefore)g(a)f(k)n(ernel)g(implemen)n(tation)h(is)g
-(assumed.)44 b(The)30 b(initial)g(implemen)n(tation)g(ho)n(w)n(ev)n(er)
-e(is)0 183 y(in)g(user)f(space;)f(without)i(async)n(hronous)d(IO)i(the)
-h(user)f(space)g(implemen)n(tation)g(will)h(not)f(p)r(erform)g(as)g(w)n
-(ell)g(as)g(a)g(k)n(ernel)0 282 y(implemen)n(tation.)36
-b(It)27 b(is)f(exp)r(ected)h(that)f(b)r(oth)h(implemen)n(tations)f
-(will)h(b)r(e)f(dev)n(elop)r(ed)g(and)g(main)n(tained;)h(that)f(the)h
-(user)0 382 y(implemen)n(tation)j(will)g(b)r(e)g(a)n(v)-5
-b(ailable)29 b(\034rst;)i(that)f(a)g(k)n(ernel)f(implemen)n(tation)h
-(will)g(sup)r(ercede)g(it)g(in)g(p)r(erformance;)g(and)0
-482 y(that)38 b(the)f(user)g(space)f(implemen)n(tation)i(will)f(ev)n
-(en)n(tually)f(pull)i(ev)n(en)f(with)h(the)f(k)n(ernel)g(implemen)n
-(tation)g(b)n(y)g(taking)0 581 y(adv)-5 b(an)n(tage)26
-b(of)h(newly)h(a)n(v)-5 b(ailable)26 b(async)n(hronous)f(IO)i(and)h
-(user)f(space)f(lo)r(c)n(king)h(facilities.)0 913 y Fg(10.2)112
-b(E\033ect)37 b(of)h(c)m(h)m(unk)f(size)0 1166 y Ff(Larger)d(c)n(h)n
-(unk)h(size)g(will)i(help)f(p)r(erformance)e(for)i(sequen)n(tial)f(and)
-g(h)n(urt)h(for)f(random)g(write)g(loads.)61 b(The)36
-b(total)f(size)0 1266 y(of)c(metadata)g(reduces)g(linearly)f(with)i
-(the)f(c)n(h)n(unk)g(size,)h(sa)n(ving)e(space,)i(IO)f(bandwidth)g(and)
-h(seeking.)47 b(On)31 b(the)h(other)0 1365 y(hand,)38
-b(larger)33 b(c)n(h)n(unks)i(increase)f(in)n(ternal)h(fragmen)n(tation)
-g(of)g(the)h(snapshot)f(store,)i(esp)r(ecially)e(for)g(sparse,)h
-(random)0 1465 y(access)27 b(loads,)h(and)h(the)g(o)n(v)n(erhead)e(of)h
-(metadata)g(up)r(dating)h(is)g(supp)r(osed)f(to)h(b)r(e)g(small)f(in)h
-(relation)f(to)g(data)g(transfers.)0 1565 y(Therefore)f(it)h(is)g(hop)r
-(ed)f(that)i(the)f(p)r(erformance)e(and)i(metadata)f(size)h(cost)f(of)h
-(small)f(c)n(h)n(unk)h(sizes)f(will)h(b)r(e)g(out)n(w)n(eighed)0
-1664 y(reduced)f(in)n(ternal)g(fragmen)n(tation,)f(sa)n(ving)g(space)h
-(in)h(the)g(snapshot)f(store.)36 b(This)27 b(remains)g(to)g(b)r(e)h
-(tested)g(in)g(practice.)0 1996 y Fg(10.3)112 b(E\033ect)37
-b(of)h(metadata)f(blo)s(c)m(k)g(size)0 2249 y Ff(Larger)22
-b(metadata)i(blo)r(c)n(ks)g(will)h(impro)n(v)n(e)e(p)r(erformance)g
-(somewhat)h(on)g(largely)f(serial)g(write)i(loads)e(due)i(do)f
-(requiring)f(a)0 2349 y(few)n(er)i(n)n(um)n(b)r(er)g(of)h(larger)e
-(IOs,)h(esp)r(ecially)g(if)i(the)f(snapshot)f(metadata)g(is)g(fragmen)n
-(ted.)36 b(Ho)n(w)n(ev)n(er,)24 b(for)h(the)h(time)g(b)r(eing)0
-2448 y(Lin)n(ux)f(do)r(es)h(not)f(supp)r(ort)h(IO)f(bu\033ers)g(larger)
-f(than)i(ph)n(ysical)f(page)f(size,)i(so)f(it)h(is)g(exp)r(ected)f
-(that)h(metadata)f(blo)r(c)n(k)g(size)0 2548 y(will)g(not)g(increase)e
-(un)n(til)j(this)f(issue)f(is)h(addressed,)f(at)h(least)f(for)h(a)f(k)n
-(ernel)g(implemen)n(tation)h(of)g(the)g(snapshot)f(metadata)0
-2648 y(serv)n(er.)34 b(F)-7 b(or)26 b(compatibilit)n(y)g(with)g(the)h
-(exp)r(ected)f(k)n(ernel)f(metadata)g(serv)n(er,)g(the)h(user)g(space)f
-(implemen)n(tation)h(will)g(use)0 2747 y(4K)h(blo)r(c)n(ks.)0
-2946 y(It)g(is)g(though)n(t)f(that)h(comm)n(unication)f(o)n(v)n(erhead)
-e(and)j(serv)n(er)d(load)i(will)h(not)g(b)r(e)g(signi\034can)n(t)f(p)r
-(erformance)f(factors,)h(due)0 3046 y(to)f(these)g(b)r(eing)g(highly)g
-(optimized.)36 b(Con)n(ten)n(tion)24 b(on)h(large)f(clusters)g(with)h
-(parallel)f(loads)g(should)h(not)g(b)r(e)g(a)g(signi\034can)n(t)0
-3146 y(factor)31 b(either,)j(since)e(a)f(single)h(serv)n(er)e(should)i
-(b)r(e)h(able)f(to)g(handle)g(the)g(tra\036c)g(of)g(man)n(y)g(no)r(des)
-g(of)g(similar)f(p)r(o)n(w)n(er)g(to)0 3245 y(itself.)k(The)21
-b(exception)f(to)h(this)g(is)g(cop)n(y-out)e(o)n(v)n(erhead)g(whic)n(h)
-i(could)f(easily)g(saturate)g(a)g(serv)n(er's)f(bus;)k(a)e(simple)g
-(solution)0 3345 y(is)27 b(a)n(v)-5 b(ailable:)36 b(farm)27
-b(out)h(the)g(cop)n(y-out)e(tra\036c)h(to)g(ligh)n(tly-loaded)f(no)r
-(des)i(as)f(necessary)-7 b(.)0 3677 y Fg(10.4)112 b(E\033ect)37
-b(of)h(Holding)d(Multiple)h(Snapshots)0 3930 y Ff(The)d(more)f
-(snapshots)h(that)g(are)f(held,)j(the)e(more)g(btree)g(leaf)g(no)r(des)
-g(will)g(b)r(e)h(required)e(to)h(hold)g(them.)54 b(Journalling)0
-4029 y(the)30 b(extra)f(btree)h(lea)n(v)n(es)f(to)g(disk)h(consumes)f
-(IO)h(bandwidth,)h(causes)e(more)g(seeking)g(and)h(generates)e(cac)n
-(he)h(pressure.)0 4129 y(Reading)22 b(in)h(the)h(extra)e(btree)g(no)r
-(des)h(increases)f(latency)-7 b(.)35 b(Ho)n(w)n(ev)n(er,)21
-b(b)r(ecause)i(exceptions)f(for)h(all)f(snapshots)g(are)g(stored)0
-4229 y(adjacen)n(t)27 b(in)h(the)g(btree,)f(the)h(o)n(v)n(erhead)e(is)h
-(not)h(as)e(large)h(as)f(if)j(a)e(separate)f(map)h(had)h(to)f(b)r(e)h
-(up)r(dated)g(on)f(disk)h(for)f(eac)n(h)0 4328 y(snapshot.)57
-b(Imp)r(ortan)n(tly)-7 b(,)36 b(the)f(pro)r(cess)e(of)i(determining)f
-(whether)g(a)g(giv)n(en)g(c)n(h)n(unk)g(is)h(shared)e(nev)n(er)h
-(requires)f(more)0 4428 y(than)28 b(a)f(single)g(leaf)g(no)r(de)h(to)f
-(b)r(e)h(examined.)0 4627 y(Sharing)d(bitmaps)h(are)f(used)h(within)h
-(leaf)e(no)r(des)h(to)g(a)n(v)n(oid)f(ha)n(ving)f(to)i(en)n(ter)g(an)n
-(y)f(giv)n(en)g(snapshot)g(store)g(address)g(more)0 4727
-y(than)h(once)e(in)n(to)i(the)f(no)r(de,)h(and)g(also)e(p)r(erforms)h
-(the)h(function)f(of)h(sp)r(ecifying)f(whic)n(h)h(snapshot)e(uses)h(a)g
-(giv)n(en)g(snapshot)0 4826 y(store)d(address.)34 b(The)23
-b(w)n(orst)f(case)g(arises)f(when)j(a)e(giv)n(en)g(logical)g(c)n(h)n
-(unk)h(is)f(written)i(at)e(least)h(once)g(after)f(ev)n(ery)g(snapshot.)
-0 4926 y(Then)29 b(the)h(leaf)f(no)r(de)g(en)n(tries)f(for)h(that)g(c)n
-(h)n(unk)g(ha)n(v)n(e)f(a)h(bitmap)g(and)g(a)g(snapshot)f(store)h
-(address)e(for)i(ev)n(ery)f(snapshot.)0 5026 y(Since)23
-b(leaf)g(no)r(des)f(are)g(exp)r(ected)h(to)g(b)r(e)g(50\045)f(full)i
-(in)f(the)g(initial)g(implemen)n(tation,)h(w)n(e)e(can)h(end)g(up)g
-(with)g(one)f(exception)0 5125 y(stored)i(in)h(eac)n(h)g(leaf)f(no)r
-(de.)36 b(Then)26 b(the)f(n)n(um)n(b)r(er)g(of)g(btree)f(no)r(des)h
-(that)g(ha)n(v)n(e)f(to)h(b)r(e)h(journalled)e(is)h(equal)f(to)h(the)g
-(n)n(um)n(b)r(er)0 5225 y(of)32 b(c)n(h)n(unks)f(written.)50
-b(The)32 b(journalled)g(no)r(de)g(has)f(to)h(b)r(e)g(written)g(t)n
-(wice,)h(once)f(to)f(the)i(journal)e(and)h(once)f(to)h(its)g(true)0
-5325 y(destination.)j(So)23 b(the)g(w)n(orst)f(case)g(is)g(a)h(factor)f
-(of)h(3)f(degradation)g(in)h(write)f(p)r(erformance)g(due)h(to)g(btree)
-g(up)r(dating)g(alone.)p eop end
-%%Page: 16 16
-TeXDict begin 16 15 bop 0 83 a Ff(T)-7 b(o)23 b(ameliorate)f(suc)n(h)i
-(degradation)d(it)k(w)n(ould)e(b)r(e)h(wise)f(to)h(use)f(a)g(larger)f
-(c)n(h)n(unk)h(size)g(when)h(large)e(n)n(um)n(b)r(ers)i(of)f(snapshots)
-0 183 y(are)k(exp)r(ected.)0 382 y(The)34 b(w)n(orst)g(case)f
-(degradation)g(ab)r(o)n(v)n(e)g(can)h(b)r(e)g(temp)r(ered)h(somewhat)f
-(b)n(y)g(impro)n(ving)f(the)h(btree)h(up)r(date)f(algorithm)0
-482 y(to)28 b(use)g(a)f(b+tree)g(algorithm,)g(whic)n(h)h(guaran)n(tees)
-e(2/3rds)g(leaf)i(fullness,)g(enough)f(to)h(hold)g(t)n(w)n(o)f
-(exceptions)g(instead)h(of)0 581 y(one.)50 b(Larger)30
-b(metadata)h(blo)r(c)n(ks)h(will)g(help)g(reduce)g(seeking)f(o)n(v)n
-(erhead,)g(when)h(they)g(b)r(ecome)g(practical.)50 b(Ev)n(en)n(tually)0
-681 y(though,)33 b(the)f(b)r(est)g(strategy)f(is)h(to)f(in)n(tro)r
-(duce)h(v)-5 b(arian)n(t)31 b(leaf)g(no)r(de)h(formats)f(that)i
-(optimize)f(for)f(the)h(man)n(y-snapshots)0 780 y(case)24
-b(b)n(y)g(represen)n(ting)g(ranges)f(of)i(snapshot)f(store)f(c)n(h)n
-(unks)i(compactly)-7 b(,)24 b(esp)r(ecially)h(where)f(the)h(snapshot)f
-(store)g(c)n(h)n(unks)0 880 y(are)j(allo)r(cated)f(sequen)n(tially)-7
-b(,)27 b(whic)n(h)h(is)f(something)g(w)n(e)g(w)n(an)n(t)g(to)h(ac)n
-(hiev)n(e)e(an)n(yw)n(a)n(y)-7 b(.)0 1079 y(In)33 b(brief,)h(the)f
-(metadata)g(up)r(date)g(comp)r(onen)n(t)f(of)h(origin)f(and)g(snapshot)
-g(write)h(p)r(erformance)f(will)h(degrade)e(linearly)0
-1179 y(with)f(the)g(n)n(um)n(b)r(er)f(of)h(snapshots)e(held,)i(but)h
-(with)f(a)f(m)n(uc)n(h)g(shallo)n(w)n(er)e(slop)r(e)j(than)f(if)h
-(snapshot)f(store)g(data)g(w)n(ere)f(not)0 1279 y(shared)i(and)g
-(metadata)g(w)n(ere)g(not)g(group)r(ed)g(together)g(b)n(y)g(logical)g
-(address.)44 b(In)31 b(the)g(latter)f(case,)h(cop)n(y-out)e(o)n(v)n
-(erhead)0 1378 y(w)n(ould)c(increase)f(directly)i(with)g(n)n(um)n(b)r
-(er)f(of)h(snapshots.)35 b(Exception)26 b(table)f(up)r(date)h(o)n(v)n
-(erhead)d(w)n(ould)j(increase)e(rapidly)0 1478 y(as)j(w)n(ell,)g
-(though)h(the)g(exact)f(rate)f(is)i(harder)e(to)i(c)n(haracterize)d(b)r
-(ecause)i(it)h(dep)r(ends)g(on)f(the)h(c)n(h)n(unk)f(sharing)f
-(patterns.)0 1677 y(With)j(the)f(maxim)n(um)f(n)n(um)n(b)r(er)h(of)f
-(snapshots)g(held)h(\(64\))f(the)h(new)g(design)f(should)h(p)r(erform)f
-(b)r(etter)h(than)g(the)g(old)f(one)0 1777 y(b)n(y)36
-b(a)g(factor)f(of)h(thirt)n(y)g(or)f(more.)62 b(F)-7
-b(urthermore,)37 b(some)f(fairly)f(straigh)n(tforw)n(ard)e(impro)n(v)n
-(emen)n(ts)i(to)h(the)g(btree)g(leaf)0 1876 y(format)30
-b(can)g(mak)n(e)f(the)i(slop)r(e)f(m)n(uc)n(h)g(shallo)n(w)n(er,)f(to)i
-(the)f(p)r(oin)n(t)h(where)f(the)h(o)n(v)n(erhead)d(of)i(holding)g(64)g
-(snapshots)f(ma)n(y)0 1976 y(b)r(e)f(hard)f(to)g(notice.)0
-2175 y(With)e(a)f(single)f(snapshot)g(held,)j(the)e(new)g(design)g(not)
-g(p)r(erform)f(quite)i(as)e(w)n(ell)h(as)f(the)i(existing)e
-(device-mapp)r(er)h(design,)0 2275 y(but)29 b(only)e(b)r(ecause)h(the)g
-(existing)g(design)f(do)r(es)h(not)g(pro)n(vide)f(durable)g(recording)f
-(of)i(snapshot)f(store)g(up)r(dates.)39 b(In)28 b(an)n(y)0
-2374 y(case,)f(the)i(o)n(v)n(erhead)d(of)i(the)g(durable)g(snapshot)f
-(recording)f(is)i(exp)r(ected)h(to)f(b)r(e)g(only)g(ab)r(out)g(2\045)g
-(w)n(orst-case)d(o)n(v)n(erhead)0 2474 y(vs)g(ra)n(w)f(writing,)h(far)g
-(less)g(than)g(the)g(200\045)f(w)n(orst-case)f(o)n(v)n(erhead)g(of)i
-(cop)n(y-outs)f(when)h(a)g(single)g(snapshot)f(is)h(held,)h(and)0
-2574 y(shrinks)i(roughly)g(linearly)h(with)g(the)h(c)n(h)n(unk)e(size)h
-(\(extra)g(seeking)f(in)i(the)f(metadata)g(region)f(mak)n(es)g(this)h
-(relationship)0 2673 y(sligh)n(tly)f(more)f(complex\).)39
-b(So)29 b(b)n(y)f(using)g(a)g(256K)e(c)n(h)n(unk)i(size,)h(metadata)e
-(up)r(date)i(can)f(most)g(lik)n(ely)g(b)r(e)h(held)g(to)f(a)g(few)0
-2773 y(p)r(ercen)n(t)f(of)h(\034rst-time)f(write)h(o)n(v)n(erhead)d(ev)
-n(en)i(when)h(the)g(maxim)n(um)f(n)n(um)n(b)r(er)g(of)h(snapshots)e
-(are)h(held.)0 3105 y Fg(10.5)112 b(Origin)36 b(Read)i(P)m(erformance)0
-3358 y Ff(Origin)21 b(reads)h(are)g(passed)f(straigh)n(t)h(through)f
-(to)i(the)g(underlying)f(v)n(olume.)34 b(Since)23 b(the)g(o)n(v)n
-(erhead)e(of)h(the)h(device)f(mapp)r(er)0 3457 y(handling)27
-b(is)h(insigni\034can)n(t,)f(origin)f(read)h(p)r(erformance)f(is)i
-(essen)n(tially)e(unc)n(hanged)0 3789 y Fg(10.6)112 b(Sequen)m(tial)37
-b(Origin)f(W)-9 b(rite)35 b(P)m(erformance)0 4042 y Ff(Origin)k(write)g
-(throughput)h(is)f(a\033ected)h(mainly)f(b)n(y)h(the)g(frequency)f(of)h
-(c)n(h)n(unk)f(cop)n(y-outs)f(and)i(metadata)f(up)r(date)0
-4142 y(o)n(v)n(erhead.)45 b(Cop)n(y-outs)30 b(require)g(reading)g(and)h
-(writing,)g(requiring)f(a)h(minim)n(um)g(of)h(200\045)d(additional)i
-(bandwidth)g(vs)0 4242 y(ra)n(w)26 b(write)h(and)g(additional)f
-(seeking)g(as)h(w)n(ell,)g(esp)r(ecially)f(for)h(the)g(single-spindle)g
-(case)f(where)g(the)i(origin)e(v)n(olume)g(and)0 4341
-y(snapshot)21 b(store)g(will)i(b)r(e)f(far)g(apart.)34
-b(Throughput)21 b(is)h(impro)n(v)n(ed)f(at)h(the)g(exp)r(ense)g(of)g
-(latency)g(b)n(y)g(batc)n(hing)f(the)h(cop)n(y-out)0
-4441 y(reads)30 b(and)h(cop)n(y-out)e(writes,)j(whic)n(h)f(happ)r(ens)g
-(naturally)f(with)h(async)n(hronous)e(IO.)h(There)h(will)g(th)n(us)g(b)
-r(e)g(few)n(er)g(long)0 4540 y(seeks)c(b)r(et)n(w)n(een)g(the)h(origin)
-f(and)g(snapshot)g(store.)0 4740 y(W)-7 b(orst)20 b(case)f(origin)g
-(write)h(p)r(erformance)f(is)h(obtained)g(when)g(the)h(snapshot)e
-(store)g(is)h(created)g(with)g(the)h(smallest)e(p)r(ossible)0
-4839 y(c)n(h)n(unk)25 b(size)g(\(4K\))g(and)h(the)g(load)f(requires)f
-(a)h(cop)n(y-out)f(for)h(ev)n(ery)f(c)n(h)n(unk)h(write.)36
-b(Suc)n(h)26 b(a)f(load)g(is)g(easy)g(to)g(generate,)g(for)0
-4939 y(example)30 b(b)n(y)g(setting)g(a)f(snapshot)h(and)g(then)h
-(immediately)f(unpac)n(king)f(an)h(arc)n(hiv)n(e)e(in)n(to)i(the)h(v)n
-(olume.)44 b(Required)29 b(IO)0 5039 y(bandwidth)f(will)g(triple,)h
-(seeking)e(b)r(et)n(w)n(een)g(the)i(origin)d(and)i(snapshot)f(store)g
-(will)h(increase,)f(and)h(metadata)f(up)r(dating)0 5138
-y(will)34 b(increase.)53 b(W)-7 b(riting)34 b(in)f(this)h(case)f
-(should)g(b)r(e)h(largely)e(linear)g(and)i(batc)n(hing)f(amortizes)f
-(the)i(seeking)e(o)n(v)n(erhead,)0 5238 y(so)g(the)h(dominan)n(t)g
-(e\033ect)g(is)g(exp)r(ected)g(to)g(b)r(e)g(the)g(increased)f(IO)g
-(bandwidth.)54 b(F)-7 b(or)32 b(this)h(load)f(w)n(e)h(should)f(exp)r
-(ect)h(to)0 5337 y(see)g(a)h(3)f(times)h(slo)n(wdo)n(wn)e(v)n(ersus)h
-(ra)n(w)f(v)n(olume)h(access.)54 b(F)-7 b(ragmen)n(tation)33
-b(of)g(the)i(snapshot)e(store)f(could)i(mak)n(e)f(this)p
-eop end
-%%Page: 17 17
-TeXDict begin 17 16 bop 0 83 a Ff(considerably)26 b(w)n(orse,)g(p)r
-(erhaps)h(b)n(y)g(another)g(factor)f(of)i(three.)0 282
-y(Since)c(suc)n(h)g(a)f(load)g(is)h(easy)f(to)g(generate)g(it)h(is)g(w)
-n(orrisome.)33 b(It)25 b(is)e(p)r(ossible)h(that)g(in)g(the)g(long)f
-(run,)i(general)d(p)r(erformance)0 382 y(for)27 b(a)g(snapshot)g(v)n
-(olume)g(could)g(b)r(ecome)h(b)r(etter)g(than)f(for)g(the)h(origin,)f
-(see)g(b)r(elo)n(w.)0 581 y(F)-7 b(ragmen)n(tation)24
-b(of)i(the)h(snapshot)e(store)g(will)h(in)n(tro)r(duce)f(additional)g
-(seeking)g(and)h(rotational)e(latency)i(p)r(enalties.)36
-b(Re-)0 681 y(ducing)21 b(suc)n(h)g(fragmen)n(tation)f(b)n(y)h(clev)n
-(er)f(snapshot)h(store)f(allo)r(cation)g(p)r(olicy)h(will)h(yield)f
-(signi\034can)n(t)g(p)r(erformance)f(gains,)0 780 y(ho)n(w)n(ev)n(er)32
-b(suc)n(h)h(allo)r(cation)g(p)r(olicy)h(impro)n(v)n(emen)n(ts)e
-(require)h(considerable)f(time)i(to)g(dev)n(elop.)55
-b(A)34 b(highly)g(fragmen)n(ted)0 880 y(snapshot)k(store)f(could)i
-(aggra)n(v)-5 b(ate)36 b(w)n(orst)h(case)h(write)g(p)r(erformance)f(b)n
-(y)i(an)f(additional)g(factor)g(of)g(a)g(few)h(h)n(undred)0
-980 y(p)r(ercen)n(t.)0 1312 y Fg(10.7)112 b(Random)38
-b(Origin)d(W)-9 b(rite)36 b(P)m(erformance)0 1565 y Ff(A)28
-b(load)f(that)h(consists)f(of)g(100\045)g(single-sector)e(writes)i
-(distributed)h(randomly)f(o)n(v)n(er)f(the)i(en)n(tire)f(v)n(olume)g
-(immediately)0 1664 y(after)22 b(setting)h(a)f(snapshot)g(will)h(cause)
-e(cop)n(y-out)h(bandwidth)h(to)f(b)r(e)h(m)n(uc)n(h)g(more)e(than)i
-(200\045)e(of)i(ra)n(w)e(write)i(bandwidth,)0 1764 y(and)33
-b(will)h(also)f(cause)g(a)g(great)g(deal)g(of)h(additional)f(seeking.)
-54 b(Metadata)33 b(o)n(v)n(erhead)e(will)j(also)f(increase)f
-(signi\034can)n(tly)0 1863 y(since)g(t)n(ypically)f(only)h(a)f(single)h
-(c)n(h)n(unk)g(on)f(eac)n(h)g(leaf)h(no)r(de)g(will)h(b)r(e)f(up)r
-(dated)h(eac)n(h)e(time)h(the)h(no)r(de)f(is)g(journalled)f(to)0
-1963 y(disk;)d(rotational)e(latency)i(will)g(increase)e(signi\034can)n
-(tly)h(during)h(metadata)f(access.)37 b(P)n(erformance)26
-b(under)i(this)g(random)0 2063 y(load)33 b(will)h(t)n(ypically)f(b)r(e)
-h(dominated)g(b)n(y)f(seeking)g(rather)g(than)h(bandwidth.)56
-b(Analysis)33 b(is)h(complex,)g(ho)n(w)n(ev)n(er)e(I)i(will)0
-2162 y(sp)r(eculate)c(no)n(w)f(that)h(the)g(p)r(erformance)f(of)h(the)g
-(snapshotted)g(v)n(olume)f(could)g(degrade)g(b)n(y)g(a)h(factor)f(of)h
-(3)f(to)h(4)g(v)n(ersus)0 2262 y(the)e(ra)n(w)e(v)n(olume)h(due)h(to)f
-(additional)g(seeking)g(and)g(rotational)f(latency)i(for)f(cop)n
-(y-outs)f(and)h(metadata)g(up)r(dating.)0 2461 y(F)-7
-b(ragmen)n(tation)33 b(of)i(the)h(snapshot)e(store)g(can)g(and)h
-(should)g(b)r(e)g(addressed)f(o)n(v)n(er)f(time.)60 b(F)-7
-b(or)34 b(origin)g(writes,)i(nothing)0 2561 y(that)30
-b(can)g(b)r(e)h(done)e(ab)r(out)h(the)h(cop)n(y-out)e(o)n(v)n(erhead.)
-42 b(Snapshot)29 b(writes)h(on)g(the)g(other)g(hand)g(do)g(not)g(incur)
-f(cop)n(y-out)0 2660 y(o)n(v)n(erhead.)41 b(They)29 b(do)g(incur)h
-(seeking)e(and)i(rotational)e(p)r(enalties)h(due)h(to)g(fragmen)n
-(tation)e(in)i(the)g(snapshot)e(store,)h(but)0 2760 y(so)34
-b(do)g(origin)f(writes.)57 b(F)-7 b(urthermore)33 b(snapshot)g(reads)h
-(also)f(su\033er)h(from)g(fragmen)n(tation)f(p)r(enalties)h(whereas)f
-(origin)0 2860 y(reads)g(do)g(not.)56 b(V)-7 b(ery)33
-b(go)r(o)r(d)g(snapshot)g(store)g(la)n(y)n(out)f(optimization)i(could)f
-(reduce)h(b)r(oth)g(the)g(p)r(enalt)n(y)g(for)f(snapshot)0
-2959 y(reading)e(and)h(writing,)i(in)e(whic)n(h)h(case)e(general)g(p)r
-(erformance)h(on)g(a)g(snapshot)f(v)n(olume)h(could)g(b)r(e)h(b)r
-(etter)g(than)f(on)g(a)0 3059 y(snapshotted)27 b(origin)f(v)n(olume.)37
-b(Whether)27 b(this)h(can)f(b)r(e)h(realized)f(in)h(practice)f(remains)
-f(to)i(b)r(e)g(seen.)0 3391 y Fg(10.8)112 b(Snapshot)39
-b(Read)f(P)m(erformance)0 3644 y Ff(Unlik)n(e)29 b(origin)f(reads,)g
-(snapshot)h(read)f(throughput)h(is)g(a\033ected)g(b)n(y)g(snapshot)f
-(store)g(fragmen)n(tation.)40 b(Snapshot)29 b(read)0
-3743 y(latency)k(is)g(increased)f(b)n(y)h(the)h(requiremen)n(t)f(of)g
-(lo)r(c)n(king)f(against)g(origin)h(writes.)53 b(Read-ahead)32
-b(results)h(in)h(a)f(kind)g(of)0 3843 y(lo)r(c)n(k-ahead,)39
-b(so)f(under)g(loads)f(where)g(read-ahead)f(is)i(e\033ectiv)n(e,)j
-(increased)c(snapshot)h(read)f(latency)h(will)g(not)g(h)n(urt)0
-3943 y(read)30 b(throughput.)45 b(The)31 b(predominan)n(t)f(visible)g
-(e\033ect)h(is)g(exp)r(ected)f(to)h(b)r(e)g(read)f(fragmen)n(tation.)44
-b(With)31 b(large)f(c)n(h)n(unk)0 4042 y(sizes,)36 b(e.g.,)h(256K)c
-(and)i(up,)i(mo)r(derate)d(fragmen)n(tation)g(should)h(cause)f(only)h
-(sligh)n(t)f(degradation)f(in)j(snapshot)e(read)0 4142
-y(p)r(erformance.)i(Ho)n(w)n(ev)n(er,)26 b(without)i(sp)r(ecial)g
-(atten)n(tion)f(to)h(snapshot)f(store)g(allo)r(cation)f(p)r(olicy)-7
-b(,)28 b(fragmen)n(tation)e(can)i(b)r(e)0 4242 y(exp)r(ected)g(to)f(b)r
-(e)g(fairly)g(sev)n(ere,)f(so)g(snapshot)h(read)f(p)r(erformance)g(is)h
-(not)h(exp)r(ected)f(to)g(b)r(e)h(stellar)e(in)i(early)e(implemen-)0
-4341 y(tations.)47 b(F)-7 b(ortunately)g(,)32 b(since)f(the)h(main)f
-(purp)r(ose)g(of)g(reading)f(from)h(a)g(snapshot)f(is)h(to)g(bac)n(k)g
-(it)g(up)h(or)e(restore)g(a)h(few)0 4441 y(\034les,)d(some)e(read)h(p)r
-(erformance)g(degradation)e(is)j(acceptable)f(and)g(is)g(unlik)n(ely)h
-(to)f(b)r(e)h(noticed.)0 4640 y(In)d(the)g(long)f(run)h(it)g(is)f
-(desirable)g(to)h(impro)n(v)n(e)e(snapshot)h(read)g(p)r(erformance)f(b)
-n(y)i(con)n(trolling)e(snapshot)h(store)f(fragmen-)0
-4740 y(tation)30 b(as)f(m)n(uc)n(h)h(as)f(p)r(ossible,)h(in)g(order)f
-(to)g(tak)n(e)h(adv)-5 b(an)n(tage)28 b(of)i(the)g(inheren)n(tly)g(sup)
-r(erior)e(p)r(erformance)h(of)h(snapshot)0 4839 y(writing)d(v)n(ersus)f
-(origin)h(writing.)p eop end
-%%Page: 18 18
-TeXDict begin 18 17 bop 0 83 a Fg(10.9)112 b(Snapshot)39
-b(W)-9 b(rite)35 b(P)m(erformance)0 336 y Ff(Snapshot)e(writes)g(to)g
-(not)g(require)f(cop)n(y-outs;)j(if)e(an)g(origin)g(c)n(h)n(unk)f(or)h
-(shared)f(snapshot)g(store)h(c)n(h)n(unk)f(needs)i(to)f(b)r(e)0
-436 y(written,)d(the)f(logical)f(c)n(h)n(unk)g(is)h(\034rst)g(remapp)r
-(ed)g(to)g(a)f(new)h(c)n(h)n(unk)g(in)g(the)h(snapshot)e(store.)40
-b(With)30 b(some)f(t)n(w)n(eaking)e(of)0 535 y(the)d(message)f(proto)r
-(col,)g(writing)h(to)f(the)i(c)n(h)n(unk)e(could)h(pro)r(ceed)f(as)g
-(so)r(on)g(as)g(the)i(new)f(allo)r(cation)e(is)i(kno)n(wn,)g(in)g
-(parallel)0 635 y(with)k(the)g(logging)e(of)h(the)h(new)g(exception.)36
-b(So)28 b(snapshot)f(writes)g(are)f(inheren)n(tly)h(quite)h(e\036cien)n
-(t.)0 834 y(Snapshot)34 b(write)f(o)n(v)n(erhead)f(comes)h(from)h
-(metadata)f(up)r(date)i(o)n(v)n(erhead)c(and)j(snapshot)f(store)g
-(fragmen)n(tation.)55 b(The)0 934 y(former)25 b(is)h(supp)r(osed)g(to)f
-(b)r(e)i(small,)f(on)f(the)i(order)d(of)i(a)g(few)g(p)r(ercen)n(t.)36
-b(The)26 b(latter)g(could)f(b)r(e)i(v)n(ery)e(large,)g(and)g(probably)0
-1033 y(will)32 b(b)r(e)g(in)g(initial)h(implemen)n(tation,)g(p)r
-(erhaps)e(on)g(the)i(order)d(of)i(a)f(factor)g(of)h(10.)49
-b(Larger)30 b(c)n(h)n(unk)h(sizes)g(will)h(reduced)0
-1133 y(this)d(seeking)f(o)n(v)n(erhead,)f(roughly)h(linearly)g(with)h
-(the)g(c)n(h)n(unk)g(size.)40 b(Careful)28 b(la)n(y)n(out)g
-(optimization)g(could)h(conceiv)-5 b(ably)0 1233 y(reduce)27
-b(this)h(to)f(a)h(few)f(p)r(ercen)n(t,)h(ev)n(en)f(with)h(small)f(c)n
-(h)n(unks.)36 b(W)-7 b(e)28 b(shall)f(see.)0 1565 y Fg(10.10)112
-b(Net)m(w)m(ork)37 b(P)m(erformance)0 1817 y Ff(The)24
-b(amoun)n(t)f(of)h(message)e(data)h(needed)h(for)f(eac)n(h)g(c)n(h)n
-(unk)h(is)f(small,)i(esp)r(ecially)e(since)g(the)h(message)f(format)g
-(is)h(designed)0 1917 y(from)f(the)h(outset)g(to)g(handle)f(ranges)f
-(of)i(c)n(h)n(unks)f(and)h(m)n(ultiple)g(ranges)e(in)i(eac)n(h)f
-(message.)34 b(Except)24 b(for)f(snapshot)g(reads,)0
-2017 y(eac)n(h)h(message)f(sequence)i(is)f(only)h(t)n(w)n(o)f(messages)
-f(long)h(\(note:)36 b(appro)n(ximately)-7 b(.)34 b(Serv)n(er)23
-b(resp)r(onses)g(do)i(not)g(corresp)r(ond)0 2116 y(exactly)g(requests;)
-h(e.g.,)g(an)n(y)g(unshared)f(c)n(h)n(unks)h(can)f(b)r(e)i(ac)n(kno)n
-(wledged)d(immediately\).)37 b(Message)24 b(tra\036c)i(is)g(exp)r
-(ected)0 2216 y(to)21 b(b)r(e)h(less)f(than)h(1\045)f(of)h(disk)f(arra)
-n(y)e(tra\036c.)35 b(Assuming)21 b(that)h(the)g(general)e(purp)r(ose)h
-(net)n(w)n(ork)f(in)n(terconnect)h(and)g(storage)0 2316
-y(arra)n(y)30 b(in)n(terconnect)i(ha)n(v)n(e)g(similar)g(bandwidth,)i
-(this)f(is)g(where)f(the)h(exp)r(ectation)g(that)g(this)g(arc)n
-(hitecture)e(will)i(scale)0 2415 y(linearly)27 b(to)g(ab)r(out)h(100)e
-(clien)n(ts)h(comes)g(from.)0 2747 y Fg(10.11)112 b(Ov)m(erall)36
-b(P)m(erformance)0 3000 y Ff(It)19 b(is)f(exp)r(ected)h(that)g(t)n
-(ypical)f(usage)f(of)h(a)g(snapshotted)g(origin)g(v)n(olume)f(will)i
-(sho)n(w)f(only)g(sligh)n(t)g(reduction)g(of)g(p)r(erformance)0
-3100 y(v)n(ersus)39 b(the)j(ra)n(w)d(origin)h(v)n(olume,)k(due)d(to)f
-(reading)g(b)r(eing)h(more)f(common)g(than)h(writing.)77
-b(Rewriting)40 b(c)n(h)n(unks)g(is)0 3199 y(optimized)26
-b(b)n(y)f(the)h(clien)n(t's)g(bitmap)g(cac)n(he,)f(whic)n(h)h(is)g
-(compact)f(and)g(probably)g(capable)g(of)g(cac)n(hing)g(all)g(the)i
-(hot)e(sp)r(ots)0 3299 y(of)g(a)f(v)n(olume,)g(ev)n(en)g(for)g(large)f
-(v)n(olumes.)35 b(So)24 b(rewriting)g(should)g(sho)n(w)g(no)n(w)g
-(visible)g(degradation.)34 b(The)25 b(p)r(erformance)e(of)0
-3399 y(fresh)29 b(writes)f(to)h(snapshotted)g(c)n(h)n(unks)f(will)i
-(degrade)d(signi\034can)n(tly)-7 b(,)29 b(due)g(to)g(cop)n(y-out)f
-(bandwidth,)i(and)f(to)g(snapshot)0 3498 y(store)f(fragmen)n(tation,)f
-(that)i(latter)g(b)r(eing)g(sub)5 b(ject)28 b(to)h(optimization)f
-(while)h(the)g(former)f(is)h(una)n(v)n(oidable.)39 b(In)28
-b(general,)0 3598 y(more)g(frequen)n(t)h(snapshots)f(cause)g(more)h
-(fresh)g(writes,)g(with)g(the)h(frequency)e(of)h(fresh)g(writes)g(p)r
-(eaking)f(just)i(after)f(the)0 3697 y(snapshot)e(and)g(declining)h(o)n
-(v)n(er)d(time,)j(till)h(the)f(next)f(snapshot.)0 3897
-y(So:)52 b(what)35 b(will)h(b)r(e)f(the)h(balance)f(of)g(fresh)g
-(writes)g(vs)g(reads)f(and)h(rewrites?)59 b(Ho)n(w)35
-b(frequen)n(tly)g(will)g(w)n(e)g(see)g(will)g(w)n(e)0
-3996 y(see)29 b(the)h(balance)f(shift)h(for)f(a)g(short)g(time)h(in)g
-(the)g(direction)f(of)g(the)h(w)n(orst)e(case?)43 b(Ho)n(w)29
-b(bad)g(is)g(the)h(w)n(orst)f(case?)42 b(Ho)n(w)0 4096
-y(lik)n(ely)24 b(is)g(it)h(that)g(the)g(user)f(will)g(notice)h(the)f
-(shifts)h(in)g(write)f(p)r(erformance?)35 b(These)24
-b(all)g(a)n(w)n(ait)g(measuremen)n(t)f(under)h(liv)n(e)0
-4196 y(loads.)36 b(Ho)n(w)n(ev)n(er)26 b(at)i(this)g(p)r(oin)n(t)g(I)f
-(will)h(sp)r(eculate)g(that)g(ev)n(en)f(a)h(relativ)n(ely)e(early)h
-(implemen)n(tation)g(will)h(sho)n(w)f(a)n(v)n(erage)0
-4295 y(p)r(erformance)33 b(degradation)g(v)n(ersus)g(a)h(ra)n(w)g(v)n
-(olume)g(of)g(less)g(than)h(ten)g(p)r(ercen)n(t,)h(and)e(that,)j(at)d
-(w)n(orst,)h(p)r(erformance)0 4395 y(degradation)25 b(will)i(b)r(e)g
-(limited)g(to)g(a)f(factor)g(of)g(four)h(or)e(so)h(just)i(after)e(a)g
-(snapshot.)36 b(F)-7 b(or)26 b(man)n(y)g(users,)g(and)h(particularly)0
-4494 y(en)n(terprise)19 b(users,)h(the)g(b)r(ene\034ts)h(of)f
-(snapshotting)f(will)h(out)n(w)n(eigh)f(the)h(p)r(erformance)f(loss:)32
-b(it)20 b(is)g(easy)f(to)h(buy)g(bandwidth,)0 4594 y(not)32
-b(as)e(easy)h(to)g(buy)h(liv)n(e)f(bac)n(kup)g(capabilit)n(y)-7
-b(.)48 b(F)-7 b(or)31 b(others,)h(the)g(una)n(v)n(oidable)e(p)r
-(erformance)g(degradation)g(of)h(origin)0 4694 y(writing)h(will)h(mak)n
-(e)f(snapshotting)f(unattractiv)n(e)h(enough)g(to)g(discourage)f(its)h
-(use.)52 b(Ev)n(en)n(tually)32 b(w)n(e)g(ma)n(y)g(b)r(e)h(able)f(to)0
-4793 y(satisfy)d(this)h(group)e(as)h(w)n(ell,)h(b)n(y)f(impro)n(ving)f
-(snapshot)h(store)g(allo)r(cation)f(p)r(olicy)h(to)h(the)g(p)r(oin)n(t)
-f(where)g(the)h(origin)e(can)0 4893 y(b)r(e)g(made)f(optional)g(and)g
-(all)h(IO)f(tak)n(e)g(place)g(in)h(the)g(snapshot)e(store.)0
-5092 y(The)38 b(p)r(essimism)h(in)f(this)h(section)f(should)g(b)r(e)h
-(temp)r(ered)f(b)n(y)g(observing)f(that)h(in)h(man)n(y)f(resp)r(ects,)i
-(p)r(erformance)d(is)0 5192 y(exp)r(ected)28 b(to)f(b)r(e)h(go)r(o)r
-(d:)p eop end
-%%Page: 19 19
-TeXDict begin 19 18 bop 138 83 a Fe(\001)42 b Ff(Large)25
-b(n)n(um)n(b)r(er)j(of)f(snapshots)g(can)g(b)r(e)h(held)g(without)g
-(a\033ecting)f(p)r(erformance)f(m)n(uc)n(h)138 249 y
-Fe(\001)42 b Ff(Snapshot)27 b(store)f(utilization)i(is)f(go)r(o)r(d)138
-415 y Fe(\001)42 b Ff(Net)n(w)n(ork)26 b(tra\036c)h(is)g(minimal)138
-581 y Fe(\001)42 b Ff(Rewrites)27 b(are)f(highly)i(optimized)0
-863 y(In)e(other)e(w)n(ords,)h(if)h(y)n(ou)f(need)g(snapshots)f(then)i
-(this)g(implemen)n(tation)f(is)h(lik)n(ely)f(to)g(deliv)n(er)f(go)r(o)r
-(d)h(p)r(erformance)g(v)n(ersus)0 963 y(alternativ)n(es.)35
-b(Plus)26 b(there)g(is)f(a)h(clear)f(path)h(forw)n(ard)e(to)h(ac)n
-(hieving)g(near-optimal)f(p)r(erformance,)h(b)n(y)h(w)n(orking)e(to)n
-(w)n(ards)0 1063 y(a)j(system)g(where)g(the)h(snapshot)f(store)g(can)g
-(b)r(e)h(used)f(e\033ectiv)n(ely)h(alone,)f(with)h(no)f(origin)f(v)n
-(olume.)0 1437 y Fh(11)131 b(F)-11 b(urther)44 b(W)-11
-b(ork)0 1735 y Fg(11.1)112 b(P)m(arallelizing)34 b(the)j(Arc)m
-(hitecture)0 1988 y Ff(Normally)f(the)h(\034rst)f(question)g(I)h(am)f
-(ask)n(ed)g(ab)r(out)g(this)h(clustered)f(snapshot)g(design)g(is)h(wh)n
-(y)f(isn't)h(it)g(symmetric?)0 2087 y(The)32 b(answ)n(er:)45
-b(b)r(ecause)31 b(a\))h(it)h(do)r(esn't)f(ha)n(v)n(e)f(to)h(b)r(e)h(in)
-f(order)f(to)h(p)r(erform)g(w)n(ell)f(on)h(to)r(da)n(y's)g(t)n(ypical)f
-(clusters)h(and)g(b\))0 2187 y(distributing)h(a)f(tree)h(structure)f
-(across)f(indep)r(enden)n(t)i(cac)n(hes)f(is)h(a)f(complex,)i
-(error-prone)29 b(pro)r(cess,)k(and)g(in)n(tro)r(duces)0
-2287 y(o)n(v)n(erhead)25 b(of)j(its)g(o)n(wn.)36 b(A)n(t)28
-b(some)f(p)r(oin)n(t,)h(ho)n(w)n(ev)n(er,)d(the)j(single)f(no)r(de)h
-(serv)n(er)d(arc)n(hitecture)i(will)h(b)r(ecome)f(a)g(b)r(ottlenec)n
-(k,)0 2386 y(so)g(I)h(discuss)f(parallelizing)f(strategies)g(here.)0
-2585 y(The)32 b(easiest)f(thing)h(w)n(e)f(can)h(do,)g(and)g(with)g(the)
-h(strongest)d(immediate)i(e\033ect)g(is)g(to)g(ha)n(v)n(e)e(the)j(serv)
-n(er)d(distribute)i(the)0 2685 y(cop)n(y-out)26 b(w)n(ork)g(to)h
-(underused)g(no)r(des.)37 b(This)27 b(will)h(tak)n(e)e(signi\034can)n
-(t)h(IO)g(bandwidth)h(load)f(o\033)g(the)h(serv)n(er's)d(bus)i(at)h
-(the)0 2785 y(exp)r(ense)g(of)f(a)h(little)g(messaging)e(latency)-7
-b(.)37 b(By)27 b(doing)g(this,)h(a)g(single)f(serv)n(er)f(can)h(lik)n
-(ely)g(scale)g(to)h(handle)f(a)h(h)n(undred)f(or)0 2884
-y(so)f(busy)g(no)r(des)h(of)f(similar)g(p)r(o)n(w)n(er)f(to)i(itself:)
-37 b(the)27 b(real)e(b)r(ottlenec)n(k)i(will)f(probably)g(b)r(e)h(the)g
-(storage)d(arra)n(y)-7 b(.)35 b(A)27 b(user)e(who)0 2984
-y(can)e(a\033ord)g(to)h(upgrade)f(the)h(storage)e(arra)n(y)f(to)j
-(handle)g(ev)n(en)f(larger)f(n)n(um)n(b)r(ers)h(of)h(clien)n(ts)g(can)f
-(lik)n(ely)h(a\033ord)f(to)g(upgrade)0 3084 y(the)28
-b(snapshot)f(serv)n(er)e(as)i(w)n(ell.)0 3283 y(A)n(t)39
-b(some)e(p)r(oin)n(t,)k(p)r(erhaps)d(t)n(w)n(o)f(or)h(three)g(h)n
-(undred)f(clien)n(ts,)k(the)e(snapshot)e(serv)n(er)g(b)r(ecomes)g(a)h
-(b)r(ottlenec)n(k)g(again.)0 3383 y(F)-7 b(urther)26
-b(scaling)f(is)g(easily)g(ac)n(hiev)n(ed)g(b)n(y)h(dividing)g(up)g(the)
-g(w)n(ork)f(b)r(et)n(w)n(een)g(a)h(n)n(um)n(b)r(er)g(of)f(snapshot)h
-(serv)n(ers,)e(b)n(y)h(logical)0 3482 y(address)g(range.)36
-b(Eac)n(h)26 b(snapshot)g(serv)n(er)f(main)n(tains)h(a)g(separate)f
-(btree)i(in)f(a)h(distinct)g(range)e(of)i(logical)e(addresses)g(and)0
-3582 y(op)r(erates)31 b(its)i(o)n(wn)f(journal.)50 b(Care)32
-b(m)n(ust)g(b)r(e)h(tak)n(en)f(that)g(allo)r(cation)g(bitmaps)g(are)f
-(divided)i(up)g(cleanly;)h(this)f(is)f(not)0 3681 y(hard)j(\(e.g.,)i
-(ev)n(en)e(if)h(a)f(logical)f(address)g(range)g(b)r(oundary)g(lies)h
-(in)h(the)g(middle)f(of)h(a)f(bitmap)g(blo)r(c)n(k,)i(the)f(b)r
-(oundary)0 3781 y(bitmap)24 b(can)f(b)r(e)g(replicated)g(b)r(et)n(w)n
-(een)g(t)n(w)n(o)g(no)r(des,)h(with)f(logic)g(to)g(prev)n(en)n(t)f
-(allo)r(cation)h(outside)g(the)g(b)r(oundary)g(-)g(needed)0
-3881 y(an)n(yw)n(a)n(y)g(for)i(error)e(c)n(hec)n(king\).)35
-b(Shared)25 b(metadata)f(suc)n(h)h(as)g(the)h(curren)n(t)e(snapshot)g
-(list,)i(sup)r(erblo)r(c)n(k,)f(etc.,)h(is)g(up)r(dated)0
-3980 y(using)k(a)h(R)-9 b(W)31 b(lo)r(c)n(king)f(strategy)f(\(i.e.,)j
-(using)e(a)h(DLM\).)g(Assuming)g(that)g(w)n(orkload)e(is)h(distributed)
-h(relativ)n(ely)f(ev)n(enly)0 4080 y(across)f(the)i(logical)e(address)g
-(range,)h(this)h(simple)g(parallelization)e(strategy)g(will)i(serv)n(e)
-e(up)i(to)g(a)f(thousand)h(clien)n(ts)f(or)0 4180 y(so,)d(and)g(the)h
-(disk)g(will)f(once)h(again)e(b)r(e)i(the)g(b)r(ottlenec)n(k.)0
-4379 y(If)i(w)n(e)g(w)n(an)n(t)f(to)h(scale)f(to)h(far)g(larger)e(n)n
-(um)n(b)r(ers)h(of)h(clien)n(ts)g(w)n(e)g(probably)f(ha)n(v)n(e)f(to)i
-(bite)h(the)f(bullet)h(and)e(distribute)i(the)0 4478
-y(btrees)25 b(and)g(allo)r(cation)g(bitmaps.)62 b(Ho)n(w)n(ev)n(er)23
-b(I)j(do)f(not)h(think)g(this)f(problem)g(is)h(imminen)n(t;)h(there)e
-(is)g(plen)n(t)n(y)h(of)f(time)h(to)0 4578 y(think)i(ab)r(out)g(it.)0
-4910 y Fg(11.2)112 b(A)m(daptation)37 b(to)g(Single)f(No)s(de)i(Clien)m
-(t)0 5163 y Ff(It)25 b(is)g(exp)r(ected)h(that)f(this)g(cluster)g
-(snapshot)f(design)h(will)g(ev)n(en)n(tually)f(b)r(e)h(adapted)g(for)f
-(single-no)r(de)h(use,)g(replacing)f(the)0 5263 y(curren)n(t)j
-(snapshot)f(target,)h(with)h(the)g(follo)n(wing)f(b)r(ene\034ts:)p
-eop end
-%%Page: 20 20
-TeXDict begin 20 19 bop 138 83 a Fe(\001)42 b Ff(Cop)n(y-out)26
-b(o)n(v)n(erhead)f(will)j(no)f(longer)g(increase)f(linearly)g(with)i(n)
-n(um)n(b)r(er)g(of)f(snapshots)g(held)138 249 y Fe(\001)42
-b Ff(Excessiv)n(e)26 b(memory)h(fo)r(otprin)n(t)g(for)g(large)f(v)n
-(olumes)h(eliminated)138 415 y Fe(\001)42 b Ff(Clien)n(t)27
-b(cac)n(he)g(memory)g(shrinks)f(in)i(resp)r(onse)f(to)g(memory)g
-(pressure)138 581 y Fe(\001)42 b Ff(Only)27 b(one)g(snapshot)g(store)f
-(v)n(olume)h(to)h(manage)e(instead)h(of)h(one)f(p)r(er)g(snapshot)138
-747 y Fe(\001)42 b Ff(Sharing)26 b(exceptions)h(b)r(et)n(w)n(een)h
-(snapshots)e(sa)n(v)n(es)g(disk)h(space)138 913 y Fe(\001)42
-b Ff(All)28 b(snapshots)e(allo)r(cate)h(disk)g(memory)g(from)g(common)g
-(p)r(o)r(ol)138 1079 y Fe(\001)42 b Ff(Just)27 b(one)g(device)g(mapp)r
-(er)h(target)e(to)i(kno)n(w)f(ab)r(out)g(instead)g(of)h(t)n(w)n(o)0
-1362 y(When)f(it)g(comes)f(time)h(to)g(tak)n(e)f(this)h(step,)g(a)f
-(decision)g(m)n(ust)h(b)r(e)g(tak)n(en)f(whether)g(to)h(\(lazily\))f(k)
-n(eep)h(the)g(message-based)0 1461 y(in)n(terface)g(and)g(clien)n
-(t-serv)n(er)e(factoring,)i(or)g(to)g(create)f(a)i(v)-5
-b(arian)n(t)26 b(where)h(the)h(serv)n(er)e(logic)g(is)i(in)n(tegrated)e
-(directly)h(in)n(to)0 1561 y(the)h(clien)n(t.)0 1935
-y Fh(12)131 b(Conclusion)0 2216 y Ff(Blo)r(c)n(k)36 b(device)h
-(snapshots)f(ha)n(v)n(e)g(a)h(n)n(um)n(b)r(er)f(of)h(in)n(teresting)g
-(applications,)h(including)f(the)h(crucial)e(one)h(of)g(enabling)0
-2316 y(online)31 b(bac)n(kup)g(for)g(en)n(terprise)g(Lin)n(ux)g(users.)
-48 b(With)33 b(the)e(imminen)n(t)i(addition)e(of)h(cluster)f
-(\034lesystems)g(to)g(the)h(Lin)n(ux)0 2416 y(k)n(ernel,)f(cluster)f
-(snapshot)g(capabilit)n(y)g(is)g(required.)46 b(This)30
-b(presen)n(ts)g(a)h(n)n(um)n(b)r(er)f(of)h(in)n(teresting)f(sync)n
-(hronization)e(and)0 2515 y(structural)f(problems,)f(for)i(whic)n(h)f
-(original)f(solutions)h(ha)n(v)n(e)f(b)r(een)i(devised)f(and)h(presen)n
-(ted)f(in)g(this)h(pap)r(er.)0 2715 y(As)33 b(w)n(ell)h(as)e(extending)
-i(snapshot)e(capabilit)n(y)h(to)g(clusters,)h(some)f(problems)g(with)g
-(the)h(existing,)h(single)d(no)r(de)i(snap-)0 2814 y(shot)27
-b(design)g(w)n(ere)g(solv)n(ed,)f(yielding)h(signi\034can)n(t)g(impro)n
-(v)n(emen)n(ts)f(in)i(IO)f(p)r(erformance,)f(disk)h(space)g
-(consumption)g(and)0 2914 y(memory)e(usage.)36 b(A)26
-b(protot)n(yp)r(e)g(implemen)n(tation)g(exhibiting)g(go)r(o)r(d)g(p)r
-(erformance)f(c)n(haracteristics)f(has)h(b)r(een)i(created.)0
-3013 y(F)-7 b(urther)22 b(w)n(ork)f(remains)g(to)h(b)r(e)g(done)g(to)g
-(transform)f(the)h(protot)n(yp)r(e)g(in)n(to)g(a)f(fully)i(reliable,)f
-(fault-toleran)n(t)f(facilit)n(y)h(ready)0 3113 y(to)27
-b(b)r(e)h(deplo)n(y)n(ed)f(in)h(a)f(pro)r(duction)g(en)n(vironmen)n(t.)
-p eop end
-%%Trailer
-
-userdict /end-hook known{end-hook}if
-%%EOF
diff --git a/csnap/patches/csnap-2.6.7-2.4.26 b/csnap/patches/csnap-2.6.7-2.4.26
deleted file mode 100644
index 589521e..0000000
--- a/csnap/patches/csnap-2.6.7-2.4.26
+++ /dev/null
@@ -1,195 +0,0 @@
---- /src/2.6.7.csnap/drivers/md/dm-csnap.c	2004-09-03 21:02:24.000000000 +0000
-+++ dm-csnap.c	2004-09-03 21:01:15.000000000 +0000
-@@ -6,7 +6,6 @@
- #include <linux/file.h>
- #include <net/sock.h>
- #include <asm/uaccess.h>
--#include <linux/bio.h>
- #include "dm.h"
- #include "dm-csnap.h"
- 
-@@ -19,12 +18,22 @@
- 
- #define trace trace_on
- 
-+#define bio buffer_head
-+#define bi_sector b_rsector
-+#define bio_end_io_t bh_end_io_t
-+#define bi_private b_private
-+#define bi_end_io b_end_io
-+#define bi_bdev b_rdev
-+#define CLONE_KERNEL (CLONE_FS|CLONE_FILES|CLONE_SIGNAL)
-+#define SOCKET_I(inode) (&inode->u.socket_i)
-+#define bio_data_dir(bio) rw
-+#define bi_size b_size
-+
- /* Pipe helpers */
- 
- static int rwpipe(struct file *file, const void *buffer, unsigned int count,
--	ssize_t (*op)(struct kiocb *, const char *, size_t, loff_t), int mode)
-+	ssize_t (*op)(struct file *, const char *, size_t, loff_t *), int mode)
- {
--	struct kiocb iocb;
- 	mm_segment_t oldseg;
- 	int err = 0;
- 
-@@ -33,12 +42,10 @@
- 		return -EBADF;
- 	if (!op)
- 		return -EINVAL;
--	init_sync_kiocb(&iocb, file); // new in 2.5 (hmm)
--	iocb.ki_pos = file->f_pos;
- 	oldseg = get_fs();
- 	set_fs(get_ds());
- 	while (count) {
--		int chunk = (*op)(&iocb, buffer, count, iocb.ki_pos);
-+		int chunk = (*op)(file, buffer, count, &file->f_pos);
- 		if (chunk <= 0) {
- 			err = chunk? chunk: -EPIPE;
- 			break;
-@@ -48,18 +55,17 @@
- 		buffer += chunk;
- 	}
- 	set_fs(oldseg);
--	file->f_pos = iocb.ki_pos;
- 	return err;
- }
- 
- static inline int readpipe(struct file *file, void *buffer, unsigned int count)
- {
--	return rwpipe(file, buffer, count, (void *)file->f_op->aio_read, FMODE_READ);
-+	return rwpipe(file, buffer, count, (void *)file->f_op->read, FMODE_READ);
- }
- 
- static inline int writepipe(struct file *file, void *buffer, unsigned int count)
- {
--	return rwpipe(file, buffer, count, file->f_op->aio_write, FMODE_WRITE);
-+	return rwpipe(file, buffer, count, file->f_op->write, FMODE_WRITE);
- }
- 
- #define outbead(SOCK, CODE, STRUCT, VALUES...) ({ \
-@@ -167,6 +173,7 @@
- 	u64 chunk;
- 	unsigned chunks;
- 	unsigned id;
-+	unsigned rw;
- 	struct bio *bio;
- 	list_t list;
- };
-@@ -256,7 +263,7 @@
- 	list_t list;
- };
- 
--static int snapshot_read_end_io(struct bio *bio, unsigned int done, int error)
-+static void snapshot_read_end_io(struct bio *bio, int uptodate)
- {
- 	struct end_io_hook *hook = bio->bi_private;
- 	struct snapinfo *info = hook->info;
-@@ -269,7 +276,7 @@
- 
- 	bio->bi_private = hook->old_private;
- 	bio->bi_end_io = hook->old_end_io;
--	return bio->bi_end_io(bio, done, error);
-+	bio->bi_end_io(bio, uptodate);
- }
- 
- /* This is the part that does all the work. */
-@@ -306,7 +313,7 @@
- 		if (chunks != pending->chunks) {
- 			warn("Message mismatch, expected %x got %x", chunks, chunks);
- 			kmem_cache_free(pending_cache, pending);
--			bio_io_error(bio, bio->bi_size);
-+			buffer_IO_error(bio);
- 			return -1;
- 		}
- 
-@@ -317,7 +324,7 @@
- 				u64 logical = bio->bi_sector;
- 				u64 physical = (*p2++ << shift) + (logical & mask);
- 				trace(warn("logical %Lx = physical %Lx", logical, physical));
--				bio->bi_bdev = info->snapdev->bdev;
-+				bio->bi_bdev = info->snapdev->dev;
- 				bio->bi_sector = physical;
- 			}
- 			p = (struct chunk_range *)p2;
-@@ -338,7 +345,7 @@
- 			bio->bi_private = hook;
- 		}
- 
--		generic_make_request(bio);
-+		generic_make_request(rw, bio);
- 		submitted++;
- #ifdef CACHE
- 		for (j = 0; j < p->chunks; j++)
-@@ -347,13 +354,7 @@
- 		kmem_cache_free(pending_cache, pending);
- 	}
- 	if (submitted){
--		request_queue_t *q;
--		q = bdev_get_queue(info->orgdev->bdev);
--		if (q->unplug_fn)
--			q->unplug_fn(q);
--		q = bdev_get_queue(info->snapdev->bdev);
--		if (q->unplug_fn)
--			q->unplug_fn(q);
-+		run_task_queue(&tq_disk);
- 	}
- 	return 0;
- }
-@@ -582,14 +583,14 @@
-  * at the moment, or may not have been established yet, in which case we have
-  * to defer the request until the server becomes available.
-  */
--static int csnap_map(struct dm_target *target, struct bio *bio, union map_info *context)
-+static int csnap_map(struct dm_target *target, struct bio *bio, int rw, union map_info *context)
- {
- 	struct snapinfo *info = target->private;
- 	struct pending *pending;
- 	chunk_t chunk;
- 	unsigned id;
- 
--	bio->bi_bdev = info->orgdev->bdev;
-+	bio->bi_bdev = info->orgdev->dev;
- 	if (bio_data_dir(bio) == READ && !is_snapshot(info))
- 		return 1;
- 
-@@ -625,7 +626,7 @@
- 
- 	id = info->nextid;
- 	info->nextid = (id + 1) & ~(-1 << ID_BITS);
--	*pending = (struct pending){ .id = id, .bio = bio, .chunk = chunk, .chunks = 1 };
-+	*pending = (struct pending){ .id = id, .bio = bio, .rw = rw, .chunk = chunk, .chunks = 1 };
- 	spin_lock(&info->pending_lock);
- 	list_add(&pending->list, info->pending_buckets + hash_pending(pending->id));
- 	spin_unlock(&info->pending_lock);
-@@ -787,7 +788,6 @@
- 	if ((err = kernel_thread((void *)worker, target, CLONE_KERNEL)) < 0)
- 		goto eek;
- 	warn("Created snapshot device origin=%s snapstore=%s snapshot=%i", argv[0], argv[1], snap);
--	target->split_io = 1 << info->chunkshift; // !!! lose this as soon as possible
- 	return 0;
- 
- eek:	warn("Virtual device create error %i: %s!", err, error);
-@@ -828,8 +828,6 @@
- 
- static int csnap_status(struct dm_target *target, status_type_t type, char *result, unsigned int maxlen)
- {
--	char orgbuffer[32];
--	char snapbuffer[32];
- 	struct snapinfo *info = target->private;
- 
- 	switch (type) {
-@@ -838,10 +836,10 @@
- 		break;
- 
- 	case STATUSTYPE_TABLE:
--		format_dev_t(orgbuffer, info->orgdev->bdev->bd_dev);
--		format_dev_t(snapbuffer, info->snapdev->bdev->bd_dev);
- 		snprintf(result, maxlen, "%s %s %u",
--			 orgbuffer, snapbuffer, 1 << info->chunksize_bits);
-+			dm_kdevname(info->orgdev->dev),
-+			dm_kdevname(info->snapdev->dev),
-+			1 << info->chunksize_bits);
- 		break;
- 	}
- 
diff --git a/csnap/patches/csnap-2.6.8.1 b/csnap/patches/csnap-2.6.8.1
deleted file mode 100644
index 23c11a9..0000000
--- a/csnap/patches/csnap-2.6.8.1
+++ /dev/null
@@ -1,1321 +0,0 @@
-diff -up --recursive 2.6.8.1.csnap.clean/drivers/md/Kconfig 2.6.8.1.csnap/drivers/md/Kconfig
---- 2.6.8.1.csnap.clean/drivers/md/Kconfig	2004-08-14 06:54:50.000000000 -0400
-+++ 2.6.8.1.csnap/drivers/md/Kconfig	2004-10-04 16:39:41.000000000 -0400
-@@ -200,5 +200,15 @@ config DM_ZERO
- 	  A target that discards writes, and returns all zeroes for
- 	  reads.  Useful in some recovery situations.
- 
-+config DM_CSNAP
-+	tristate "Cluster snapshot target support"
-+	depends on BLK_DEV_DM && EXPERIMENTAL
-+	---help---
-+	  This device-mapper target allows you to create a virtual device
-+	  that can take snapshots of an underlying device.  This device
-+	  can be accessed simultaneously by multiple nodes of a cluster.
-+
-+	  If unsure, say N.
-+
- endmenu
- 
-diff -up --recursive 2.6.8.1.csnap.clean/drivers/md/Makefile 2.6.8.1.csnap/drivers/md/Makefile
---- 2.6.8.1.csnap.clean/drivers/md/Makefile	2004-08-14 06:55:33.000000000 -0400
-+++ 2.6.8.1.csnap/drivers/md/Makefile	2004-10-23 23:43:09.000000000 -0400
-@@ -29,6 +29,8 @@ obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
- obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
- obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o
- obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
-+obj-$(CONFIG_DM_CSNAP)		+= dm-csnap.o
-+obj-$(CONFIG_DM_CSNAP)		+= dm-cmirror.o
- 
- quiet_cmd_unroll = UNROLL  $@
-       cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
-diff -up --recursive 2.6.8.1.csnap.clean/drivers/md/dm-csnap.c 2.6.8.1.csnap/drivers/md/dm-csnap.c
---- 2.6.8.1.csnap.clean/drivers/md/dm-csnap.c	2004-10-14 12:58:07.000000000 -0400
-+++ 2.6.8.1.csnap/drivers/md/dm-csnap.c	2004-12-08 13:13:02.000000000 -0500
-@@ -0,0 +1,1144 @@
-+#include <linux/fs.h>
-+#include <linux/slab.h>
-+#include <linux/mm.h>
-+#include <linux/module.h>
-+#include <linux/pagemap.h>
-+#include <linux/file.h>
-+#include <linux/syscalls.h> // recvmsg
-+#include <linux/socket.h>
-+#include <linux/un.h>
-+#include <net/sock.h>
-+#include <asm/uaccess.h>
-+#include <linux/bio.h>
-+#include "dm.h"
-+#include "dm-csnap.h"
-+
-+#define BREAK BUG()
-+#define warn(string, args...) do { printk("%s: " string "\n", __func__, ##args); } while (0)
-+#define error(string, args...) do { warn(string, ##args); BREAK; } while (0)
-+#define assert(expr) do { if (!(expr)) error("Assertion " #expr " failed!\n"); } while (0)
-+#define trace_on(args) args
-+#define trace_off(args)
-+
-+#define trace trace_off
-+
-+/*
-+ * To do:
-+ *
-+ * - variable length bios
-+ * - unique cache
-+ * - receive chunk size
-+ * - make pending and hook a union
-+ * - get rid of multiple ranges per message misfeature
-+ * - rationalize sector vs chunk usage in messages
-+ * - detect message id wrap
-+ * - detect message timeout
-+ */
-+
-+/* Useful gizmos */
-+
-+static int rwpipe(struct file *file, const void *buffer, unsigned int count,
-+	ssize_t (*op)(struct kiocb *, const char *, size_t, loff_t), int mode)
-+{
-+	struct kiocb iocb;
-+	mm_segment_t oldseg;
-+	int err = 0;
-+
-+	trace_off(warn("%s %i bytes", mode == FMODE_READ? "read": "write", count);)
-+	if (!(file->f_mode & mode))
-+		return -EBADF;
-+	if (!op)
-+		return -EINVAL;
-+	init_sync_kiocb(&iocb, file); // new in 2.5 (hmm)
-+	iocb.ki_pos = file->f_pos;
-+	oldseg = get_fs();
-+	set_fs(get_ds());
-+	while (count) {
-+		int chunk = (*op)(&iocb, buffer, count, iocb.ki_pos);
-+		if (chunk <= 0) {
-+			err = chunk? chunk: -EPIPE;
-+			break;
-+		}
-+		BUG_ON(chunk > count);
-+		count -= chunk;
-+		buffer += chunk;
-+	}
-+	set_fs(oldseg);
-+	file->f_pos = iocb.ki_pos;
-+	return err;
-+}
-+
-+static inline int readpipe(struct file *file, void *buffer, unsigned int count)
-+{
-+	return rwpipe(file, buffer, count, (void *)file->f_op->aio_read, FMODE_READ);
-+}
-+
-+static inline int writepipe(struct file *file, void *buffer, unsigned int count)
-+{
-+	return rwpipe(file, buffer, count, file->f_op->aio_write, FMODE_WRITE);
-+}
-+
-+#define outbead(SOCK, CODE, STRUCT, VALUES...) ({ \
-+	struct { struct head head; STRUCT body; } PACKED message = \
-+		{ { CODE, sizeof(STRUCT) }, { VALUES } }; \
-+	writepipe(SOCK, &message, sizeof(message)); })
-+
-+/*
-+ * This gets the job done but it sucks as an internal interface: there
-+ * is no reason to deal with fds at all, we just want to receive the
-+ * (struct file *), we do not want to have to wrap the socket in a
-+ * fd just to call recv_fd, and user space pointer for the (bogus) data
-+ * payload is just silly.  Never mind the danger of triggering some
-+ * wierdo signal handling cruft deep in the socket layer.  This kind of
-+ * posturing - lathering layers of cruft upon cruft - is the stuff
-+ * Windows is made of, Linux is not supposed to be like that.  Fixing
-+ * this requires delving into the SCM_RIGHTS path deep inside sys_recvmsg
-+ * and breaking out the part that actually does the work, to be a usable
-+ * internal interface.  Put it on the list of things to do.
-+ */
-+static int recv_fd(int sock, char *bogus, unsigned *len)
-+{
-+	char payload[CMSG_SPACE(sizeof(int))];
-+	struct msghdr msg = {
-+		.msg_control = payload,
-+		.msg_controllen = sizeof(payload),
-+		.msg_iov = &(struct iovec){ .iov_base = bogus, .iov_len = *len },
-+		.msg_iovlen = 1,
-+	};
-+	mm_segment_t oldseg = get_fs();
-+	struct cmsghdr *cmsg;
-+	int result;
-+
-+	set_fs(get_ds());
-+	result = sys_recvmsg(sock, &msg, 0);
-+	set_fs(oldseg);
-+
-+	if (result <= 0)
-+		return result;
-+	if (!(cmsg = CMSG_FIRSTHDR(&msg)))
-+		return -ENODATA;
-+	if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)) ||
-+		cmsg->cmsg_level != SOL_SOCKET ||
-+		cmsg->cmsg_type != SCM_RIGHTS)
-+		return -EBADMSG;
-+
-+	*len = result;
-+	return *((int *)CMSG_DATA(cmsg));
-+}
-+
-+static void kick(struct block_device *dev)
-+{
-+	request_queue_t *q = bdev_get_queue(dev);
-+	if (q->unplug_fn)
-+		q->unplug_fn(q);
-+}
-+
-+/* ...Useful gizmos */
-+
-+typedef u64 chunk_t;
-+
-+#define SECTOR_SHIFT 9
-+#define IS_SNAP_FLAG (1 << 0)
-+#define REPORT_BIT 1
-+#define RECOVER_FLAG (1 << 2)
-+#define FINISH_FLAG (1 << 3)
-+#define NUM_BUCKETS 64
-+#define MASK_BUCKETS (NUM_BUCKETS - 1)
-+#define ID_BITS 16
-+
-+struct snapinfo {
-+	u64 id;
-+	unsigned long flags;
-+	unsigned chunksize_bits;
-+	unsigned chunkshift;
-+//	sector_t len;
-+	int snap, nextid;
-+	u32 *shared_bitmap; // !!! get rid of this, use the inode cache
-+	struct inode  *inode; /* the cache */
-+	struct dm_dev *orgdev;
-+	struct dm_dev *snapdev;
-+	struct file *sock;
-+	struct file *control_socket;
-+	struct semaphore server_in_sem;
-+	struct semaphore server_out_sem;
-+	struct semaphore more_work_sem;
-+	struct semaphore recover_sem;
-+	struct semaphore exit1_sem;
-+	struct semaphore exit2_sem;
-+	struct semaphore exit3_sem;
-+	struct list_head pending[NUM_BUCKETS];
-+	struct list_head queries;
-+	struct list_head releases;
-+	struct list_head locked;
-+	spinlock_t pending_lock;
-+	spinlock_t end_io_lock;
-+	int dont_switch_lists;
-+};
-+
-+static inline int is_snapshot(struct snapinfo *info)
-+{
-+	return !!(info->flags & IS_SNAP_FLAG);
-+}
-+
-+static inline int running(struct snapinfo *info)
-+{
-+	return !(info->flags & FINISH_FLAG);
-+}
-+
-+static inline int worker_running(struct snapinfo *info)
-+{
-+        return !(info->flags & (FINISH_FLAG|RECOVER_FLAG));
-+}
-+
-+static void report_error(struct snapinfo *info)
-+{
-+	if (test_and_set_bit(REPORT_BIT, &info->flags))
-+		return;
-+	up(&info->more_work_sem);
-+	down(&info->recover_sem);
-+	info->flags |= RECOVER_FLAG;
-+}
-+
-+/* Static caches, shared by all csnap instances */
-+
-+static kmem_cache_t *pending_cache;
-+static kmem_cache_t *end_io_cache;
-+static struct super_block *snapshot_super;
-+
-+/* We cache query results because we are greedy about speed */
-+
-+#ifdef CACHE
-+static u64 *snap_map_cachep(struct address_space *mapping, chunk_t chunk, struct page **p)
-+{
-+	u32 page_index;
-+	u32 page_pos;
-+	struct page *page;
-+	u64 *exceptions;
-+
-+	page_index = chunk / (PAGE_SIZE / sizeof(u64));
-+	page_pos = chunk % (PAGE_SIZE / sizeof(u64));
-+
-+	page = find_or_create_page(mapping, page_index, GFP_KERNEL);
-+	if (page) {
-+		/* Clean page if it's a new one */
-+		if (!Page_Uptodate(page)) {
-+			memset(page_address(page), 0, PAGE_SIZE);
-+			SetPageUptodate(page);
-+		}
-+
-+		exceptions = page_address(page);
-+		*p = page;
-+		return &exceptions[page_pos];
-+	}
-+	return NULL;
-+}
-+
-+static inline int get_unshared_bit(struct snapinfo *info, chunk_t chunk)
-+{
-+	return (info->shared_bitmap[chunk >> 5] >> (chunk & 31)) & 1;
-+}
-+
-+static inline void set_unshared_bit(struct snapinfo *info, chunk_t chunk)
-+{
-+	info->shared_bitmap[chunk >> 5] |= 1 << (chunk & 31);
-+}
-+#endif
-+
-+/* Hash table matches up query replies to pending requests */
-+
-+struct pending {
-+	unsigned id;
-+	u64 chunk;
-+	unsigned chunks;
-+	struct bio *bio;
-+	struct list_head list;
-+};
-+
-+static void show_pending(struct snapinfo *info)
-+{
-+	unsigned i, total = 0;
-+
-+	spin_lock(&info->pending_lock);
-+	warn("Pending server queries...");
-+	for (i = 0; i < NUM_BUCKETS; i++) {
-+		struct list_head *list;
-+		list_for_each(list, info->pending + i) {
-+			struct pending *pending = list_entry(list, struct pending, list);
-+			if (!total)
-+				printk("[%u]: ", i);
-+			printk("%u:%Lx ", pending->id, pending->chunk);
-+			total++;
-+		}
-+	}
-+	printk("(%u)\n", total);
-+	if (!list_empty(&info->queries)) {
-+		struct list_head *list;
-+		total = 0;
-+		warn("Queued queries...");
-+		list_for_each(list, &info->queries) {
-+			struct pending *pending = list_entry(list, struct pending, list);
-+			printk("%Lx ", pending->chunk);
-+			total++;
-+		}
-+		printk("(%u)\n", total);
-+	}
-+	spin_unlock(&info->pending_lock);
-+}
-+
-+static inline unsigned hash_pending(unsigned id)
-+{
-+	return id & MASK_BUCKETS;
-+}
-+
-+/* Ah, now it gets interesting.  Called in interrupt context */
-+
-+struct hook {
-+	struct snapinfo *info;
-+	sector_t sector;
-+	/* needed only for end_io, make it a union */
-+	bio_end_io_t *old_end_io;
-+	void *old_private;
-+	/* needed after end_io, for release, make it a union */
-+	struct list_head list;
-+};
-+
-+static int snapshot_read_end_io(struct bio *bio, unsigned int done, int error)
-+{
-+	struct hook *hook = bio->bi_private;
-+	struct snapinfo *info = hook->info;
-+
-+	trace(warn("sector %Lx", (long long)hook->sector);)
-+	spin_lock(&info->end_io_lock);
-+	bio->bi_end_io = hook->old_end_io;
-+	bio->bi_private = hook->old_private;
-+	hook->old_end_io = NULL;
-+	if (info->dont_switch_lists == 0)
-+		list_move(&hook->list, &info->releases);
-+	spin_unlock(&info->end_io_lock);
-+	up(&info->more_work_sem);
-+
-+	return bio->bi_end_io(bio, done, error);
-+}
-+
-+/* This is the part that does all the work. */
-+
-+int replied_rw(struct dm_target *target, struct rw_request *body, unsigned length, int rw, int snap)
-+{
-+	struct snapinfo *info = target->private;
-+	struct chunk_range *p = body->ranges;
-+	unsigned shift = info->chunksize_bits - SECTOR_SHIFT, mask = (1 << shift) - 1;
-+	int i, j, submitted = 0;
-+
-+	trace(show_pending(info);)
-+	trace(warn("id = %u, %u ranges, %s %s", body->id, body->count,
-+		rw == READ? "read from": "write to", snap? "snapshot": "origin");)
-+
-+	for (i = 0; i < body->count; i++) { // !!! check for length overrun
-+		unsigned chunks = p->chunks, id = body->id;
-+		struct list_head *list, *bucket = info->pending + hash_pending(id);
-+		struct pending *pending;
-+		struct bio *bio;
-+
-+		trace(warn("[%Lx/%x]", p->chunk, chunks);)
-+		assert(chunks == 1);
-+
-+		spin_lock(&info->pending_lock);
-+		list_for_each(list, bucket)
-+			if ((pending = list_entry(list, struct pending, list))->id == id)
-+				goto found;
-+		warn("Can't find pending rw for chunk %u:%Lx", id, p->chunk);
-+		spin_unlock(&info->pending_lock);
-+		return -1;
-+found:
-+		list_del(&pending->list);
-+		spin_unlock(&info->pending_lock);
-+
-+		bio = pending->bio;
-+		trace(warn("Handle pending IO sector %Lx", (long long)bio->bi_sector);)
-+
-+		if (chunks != pending->chunks) {
-+			warn("Message mismatch, expected %x got %x", chunks, chunks);
-+			kmem_cache_free(pending_cache, pending);
-+			bio_io_error(bio, bio->bi_size);
-+			return -1;
-+		}
-+
-+		++p;
-+		if (snap) {
-+			chunk_t *p2 = (chunk_t *)p;
-+			for (j = 0; j < chunks; j++) {
-+				u64 physical = (*p2++ << shift) + (bio->bi_sector & mask);
-+				trace(warn("logical %Lx = physical %Lx", (u64)bio->bi_sector, physical));
-+				bio->bi_bdev = info->snapdev->bdev;
-+				bio->bi_sector = physical;
-+			}
-+			p = (struct chunk_range *)p2;
-+		} else if (rw == READ) {
-+			/* snapshot read from origin */
-+			struct hook *hook;
-+			trace(warn("hook end_io for %Lx", (long long)bio->bi_sector));
-+			hook = kmem_cache_alloc(end_io_cache, GFP_KERNEL|__GFP_NOFAIL); // !!! union with pending
-+			*hook = (struct hook){
-+				.info = info,
-+				.sector = bio->bi_sector,
-+				.old_end_io = bio->bi_end_io,
-+				.old_private = bio->bi_private };
-+			bio->bi_end_io = snapshot_read_end_io;
-+			bio->bi_private = hook;
-+			list_add(&hook->list, &info->locked);
-+		}
-+
-+		generic_make_request(bio);
-+		submitted++;
-+#ifdef CACHE
-+		for (j = 0; j < p->chunks; j++)
-+			set_unshared_bit(info, chunk + j);
-+#endif
-+		kmem_cache_free(pending_cache, pending);
-+	}
-+	if (submitted){
-+		kick(info->orgdev->bdev);
-+		kick(info->snapdev->bdev);
-+	}
-+	return 0;
-+}
-+
-+/*
-+ * There happen to be four flavors of server replies to rw queries, two
-+ * write and two read, but the symmetry ends there.  Only one flavor
-+ * (write) is for origin IO, because origin reads do not need global
-+ * synchronization.  The remaining three flavors are for snapshot IO.
-+ * Snapshot writes are always to the snapshot store, so there is only
-+ * one flavor.  On the other hand, snapshot reads can be from either
-+ * the origin or the snapshot store.  Only the server can know which.
-+ * Either or both kinds of snapshot read reply are possible for a given
-+ * query, which is where things get nasty.  These two kinds of replies
-+ * can be interleaved arbitrarily along the original read request, and
-+ * just to add a little more spice, the server may not send back the
-+ * results for an entire query in one message (it may decide to service
-+ * other queries first, or reply about the 'easiest' chunks first). The
-+ * client has to match up all these reply fragments to the original
-+ * request and decide what to do.  Such bizarre fragmentation of the
-+ * incoming request is unavoidable, it results from write access
-+ * patterns to the origin.  We just have to grin and deal with it.  So
-+ * without further ado, here is how the various reply flavors work:
-+ *
-+ * - Origin write replies just have logical ranges, since origin physical 
-+ *   address is the same as logical.
-+ *
-+ * - Snapshot read replies come back in two separate messages, one for
-+ *   the origin reads (if any) and one for the snapstore reads (if any),
-+ *   the latter includes snapstore addresses.  Origin reads are globally
-+ *   locked by the server, so we must send release messages on
-+ *   completion.
-+ *
-+ * - Snapshot writes are always to the snapstore, so snapstore write
-+ *   replies always include snapstore addresses.
-+ *
-+ * We know whether we're supposed to be a snapshot or origin client,
-+ * but we only use that knowledge as a sanity check.  The message codes
-+ * tell us explicitly whether the IO target is origin or snapstore.
-+ */
-+
-+/*
-+ * For now, we just block on incoming message traffic, so this daemon
-+ * can't do any other useful work.  It could if we used nonblocking pipe
-+ * IO but we have been too lazy to implement it so far.  So we have one
-+ * more daemon than we really need, and maybe we will get energetic one
-+ * day soon and get rid of it.
-+ *
-+ * When it comes time to destroy things, the daemon has to be kicked
-+ * out of its blocking wait, if it is in one, which it probably is.  We
-+ * do that by shutting down the socket.  This unblocks the waiters and
-+ * feeds them errors.  Does this work for all flavors of sockets?  I
-+ * don't know.  It obviously should, but we've seen some pretty silly
-+ * limitations in our long life, so nothing would surprise us at this
-+ * point.
-+ */
-+static int incoming(struct dm_target *target)
-+{
-+	struct snapinfo *info = target->private;
-+	struct messagebuf message; // !!! have a buffer in the target->info
-+	struct file *sock;
-+	struct task_struct *task = current;
-+	int err, length;
-+
-+	strcpy(task->comm, "csnap-client");
-+	down(&info->exit2_sem);
-+	trace(warn("Client thread started, pid=%i", current->pid);)
-+connect:
-+	trace(warn("Request socket connection");)
-+	outbead(info->control_socket, NEED_SERVER, struct { });
-+	trace(warn("Wait for socket connection");)
-+	down(&info->server_in_sem);
-+	trace(warn("got socket %p", info->sock);)
-+	sock = info->sock;
-+
-+	while (running(info)) { // stop on module exit
-+		int rw, to_snap;
-+
-+		trace(warn("wait message");)
-+		if ((err = readpipe(sock, &message.head, sizeof(message.head))))
-+			goto socket_error;
-+		length = message.head.length;
-+		if (length > maxbody)
-+			goto message_too_long;
-+		trace(warn("%x/%u", message.head.code, length);)
-+		if ((err = readpipe(sock, &message.body, length)))
-+			goto socket_error;
-+	
-+		switch (message.head.code) {
-+		case REPLY_ORIGIN_WRITE:
-+			rw = WRITE;
-+			to_snap = 0;
-+			break;
-+
-+		case REPLY_SNAPSHOT_WRITE:
-+			rw = WRITE;
-+			to_snap = 1;
-+			break;
-+
-+		case REPLY_SNAPSHOT_READ_ORIGIN:
-+			rw = READ;
-+			to_snap = 0;
-+			break;
-+
-+		case REPLY_SNAPSHOT_READ:
-+			rw = READ;
-+			to_snap = 1;
-+			break;
-+
-+		case REPLY_IDENTIFY:
-+			trace(warn("identify succeeded");)
-+			up(&info->server_out_sem);
-+			outbead(info->control_socket, REPLY_CONNECT_SERVER, struct { });
-+			continue;
-+
-+		default: 
-+			warn("Unknown message %x", message.head.code);
-+			continue;
-+		}
-+		if (length < sizeof(struct rw_request))
-+			goto message_too_short;
-+
-+		replied_rw(target, (void *)message.body, length, rw, to_snap);
-+	}
-+out:
-+	up(&info->exit2_sem); /* !!! will crash if module unloaded before ret executes */
-+	warn("%s exiting", task->comm);
-+	return 0;
-+message_too_long:
-+	warn("message %x too long (%u bytes)", message.head.code, message.head.length);
-+	goto out;
-+message_too_short:
-+	warn("message %x too short (%u bytes)", message.head.code, message.head.length);
-+	goto out;
-+socket_error:
-+	warn("socket error %i", err);
-+	if (!running(info))
-+		goto out;
-+
-+	warn("halt worker");
-+	report_error(info);
-+	goto connect;
-+}
-+
-+/*
-+ * Here is our nonblocking worker daemon.  It handles all events other
-+ * than incoming socket traffic.  At the moment, its only job is to
-+ * send read release messages that can't be sent directly from the read
-+ * end_io function, which executes in interrupt context.  But soon its
-+ * duties will be expanded to include submitting IO that was blocked
-+ * because no server pipe is connected yet, or something broke the
-+ * pipe.  It may also have to resubmit some server queries, if the
-+ * server dies for some reason and a new one is incarnated to take its
-+ * place.  We also want to check for timed-out queries here.  Sure, we
-+ * have heartbeating in the cluster, but why not have the guy who knows
-+ * what to expect do the checking?  When we do detect timeouts, we will
-+ * punt the complaint upstairs using some interface that hasn't been
-+ * invented yet, because nobody has thought too deeply about what you
-+ * need to do, to detect faults really quickly and reliably.
-+ *
-+ * We throttle this daemon using a counting semaphore: each up on the
-+ * semaphore causes the daemon to loop through its polling sequence
-+ * once.  So we make sure we up the daemon's semaphore every time we
-+ * queue an event.  The daemon may well process more than one event per
-+ * cycle (we want that, actually, because then it can do some, e.g.,
-+ * message batching if it wants to) and will therefore end up looping
-+ * a few times without doing any work.  This is harmless, and much much
-+ * less nasty than missing an event.  When there are no pending events,
-+ * the daemon sleeps peacefully.  Killing the daemon is easy, we just
-+ * pull down the running flag and up the work semaphore, which causes
-+ * our faithful worker to drop out the bottom.
-+ */
-+void upload_locks(struct snapinfo *info)
-+{
-+	unsigned long irqflags;
-+	struct hook *hook;
-+	struct list_head *entry, *tmp;
-+
-+	spin_lock_irqsave(&info->end_io_lock, irqflags);
-+	info->dont_switch_lists = 1;
-+	while(!list_empty(&info->releases)){
-+		entry = info->releases.prev;
-+		hook = list_entry(entry, struct hook, list);
-+		list_del(entry);
-+		kmem_cache_free(end_io_cache, hook);
-+	}
-+	spin_unlock_irqrestore(&info->end_io_lock, irqflags);
-+	list_for_each_safe(entry, tmp, &info->locked){
-+		chunk_t chunk;
-+
-+		hook = list_entry(entry, struct hook, list);
-+		spin_lock_irqsave(&info->end_io_lock, irqflags);
-+		if (hook->old_end_io == NULL){
-+			list_del(entry);
-+			kmem_cache_free(end_io_cache, hook);
-+			spin_unlock_irqrestore(&info->end_io_lock, irqflags);
-+			continue;
-+		}
-+		spin_unlock_irqrestore(&info->end_io_lock, irqflags);
-+		chunk = hook->sector >> info->chunkshift;
-+		outbead(info->sock, UPLOAD_LOCK, struct rw_request1, .count = 1, .ranges[0].chunk = chunk, .ranges[0].chunks = 1);
-+	}
-+	outbead(info->sock, FINISH_UPLOAD_LOCK, struct {});
-+	spin_lock_irqsave(&info->end_io_lock, irqflags);
-+	list_for_each_safe(entry, tmp, &info->locked){
-+		hook = list_entry(entry, struct hook, list);
-+		if (hook->old_end_io == NULL)
-+			list_move(&hook->list, &info->releases);
-+	}
-+	info->dont_switch_lists = 0;
-+	spin_unlock_irqrestore(&info->end_io_lock, irqflags);
-+}
-+
-+static void requeue_queries(struct snapinfo *info)
-+{
-+	unsigned i;
-+
-+	trace(show_pending(info);)
-+	spin_lock(&info->pending_lock);
-+	warn("");
-+	for (i = 0; i < NUM_BUCKETS; i++) {
-+		struct list_head *bucket = info->pending + i;
-+
-+		while (!list_empty(bucket)) {
-+			struct list_head *entry = bucket->next;
-+			struct pending *pending = list_entry(entry, struct pending, list);
-+			trace_on(warn("requeue %u:%Lx", pending->id, pending->chunk);)
-+
-+			list_move(entry, &info->queries);
-+			up(&info->more_work_sem);
-+		}
-+	}
-+	spin_unlock(&info->pending_lock);
-+	trace(show_pending(info);)
-+}
-+
-+static int worker(struct dm_target *target)
-+{
-+	struct snapinfo *info = target->private;
-+	struct task_struct *task = current;
-+	int err;
-+
-+	strcpy(task->comm, "csnap-worker");
-+	trace(warn("Worker thread started, pid=%i", current->pid);)
-+	down(&info->exit1_sem);
-+	goto recover; /* just for now we'll always upload locks, even on fresh start */
-+restart:
-+	while (worker_running(info)) {
-+		unsigned long irqflags;
-+		down(&info->more_work_sem);
-+
-+		/* Send message for each pending request. */
-+		spin_lock(&info->pending_lock);
-+		while (!list_empty(&info->queries) && worker_running(info)) {
-+			struct list_head *entry = info->queries.prev;
-+			struct pending *pending = list_entry(entry, struct pending, list);
-+
-+			list_del(entry);
-+			list_add(&pending->list, info->pending + hash_pending(pending->id));
-+			spin_unlock(&info->pending_lock);
-+			trace(show_pending(info);)
-+
-+			down(&info->server_out_sem);
-+			trace(warn("Server query [%Lx/%x]", pending->chunk, pending->chunks);)
-+			if ((err = outbead(info->sock,
-+				bio_data_dir(pending->bio) == WRITE? QUERY_WRITE: QUERY_SNAPSHOT_READ,
-+				struct rw_request1,
-+					.id = pending->id, .count = 1,
-+					.ranges[0].chunk = pending->chunk,
-+					.ranges[0].chunks = pending->chunks)))
-+				goto report;
-+			up(&info->server_out_sem);
-+			spin_lock(&info->pending_lock);
-+		}
-+		spin_unlock(&info->pending_lock);
-+
-+		/* Send message for each pending read release. */
-+		spin_lock_irqsave(&info->end_io_lock, irqflags);
-+		while (!list_empty(&info->releases) && worker_running(info)) {
-+			struct list_head *entry = info->releases.prev;
-+			struct hook *hook = list_entry(entry, struct hook, list);
-+			chunk_t chunk = hook->sector >> info->chunkshift;
-+
-+			list_del(entry);
-+			spin_unlock_irqrestore(&info->end_io_lock, irqflags);
-+			trace(warn("release sector %Lx, chunk %Lx", (long long)hook->sector, chunk);)
-+			kmem_cache_free(end_io_cache, hook);
-+			down(&info->server_out_sem);
-+			if ((err = outbead(info->sock, FINISH_SNAPSHOT_READ, struct rw_request1,
-+				.count = 1, .ranges[0].chunk = chunk, .ranges[0].chunks = 1)))
-+				goto report;
-+			up(&info->server_out_sem);
-+			spin_lock_irqsave(&info->end_io_lock, irqflags);
-+		}
-+		spin_unlock_irqrestore(&info->end_io_lock, irqflags);
-+
-+		trace(warn("Yowza! More work?");)
-+	}
-+	if ((info->flags & RECOVER_FLAG)) {
-+		down(&info->server_out_sem);
-+		up(&info->more_work_sem);
-+		goto recover;
-+	}
-+finish:
-+	up(&info->exit1_sem); /* !!! crashes if module unloaded before ret executes */
-+	trace_on(warn("%s exiting", task->comm);)
-+	return 0;
-+
-+report:
-+	warn("worker socket error %i", err);
-+	report_error(info);
-+recover:
-+	trace_on(warn("worker recovering");)
-+	down(&info->recover_sem);
-+	if ((info->flags & FINISH_FLAG))
-+		goto finish;
-+	if (is_snapshot(info))
-+		upload_locks(info);
-+	requeue_queries(info);
-+	trace_on(warn("worker resuming");)
-+
-+	info->flags &= ~(RECOVER_FLAG|(1 << REPORT_BIT));
-+	up(&info->recover_sem);
-+	goto restart;
-+}
-+
-+/*
-+ * Yikes, a third daemon, that makes four including the user space
-+ * monitor.  This daemon proliferation is due to not using poll, which
-+ * we should fix at some point.  Or maybe we should wait for aio to
-+ * work properly for sockets, and use that instead.  Either way, we
-+ * can combine the two socket-waiting daemons into one, which will look
-+ * nicer in ps.  Practically speaking, it doesn't matter a whole lot
-+ * though, if we just stay lazy and have too many daemons.
-+ *
-+ * At least, combine this code with incoming, with just the switches
-+ * different.
-+ */
-+static int control(struct dm_target *target)
-+{
-+	struct task_struct *task = current;
-+	struct snapinfo *info = target->private;
-+	struct messagebuf message; // !!! have a buffer in the target->info
-+	struct file *sock;
-+	int err, length;
-+
-+	strcpy(task->comm, "csnap-control");
-+	trace(warn("Control thread started, pid=%i", current->pid);)
-+	sock = info->control_socket;
-+	trace(warn("got socket %p", sock);)
-+
-+	down(&info->exit3_sem);
-+	while (running(info)) {
-+		trace(warn("wait message");)
-+		if ((err = readpipe(sock, &message.head, sizeof(message.head))))
-+			goto socket_error;
-+		trace(warn("got message header code %x", message.head.code);)
-+		length = message.head.length;
-+		if (length > maxbody)
-+			goto message_too_long;
-+		trace(warn("%x/%u", message.head.code, length);)
-+		if ((err = readpipe(sock, &message.body, length)))
-+			goto socket_error;
-+	
-+		switch (message.head.code) {
-+		case SET_IDENTITY:
-+			info->id = ((struct set_id *)message.body)->id;
-+			warn("id set: %Lu", info->id);
-+			break;
-+		case CONNECT_SERVER: {
-+			unsigned len = 4;
-+			char bogus[len];
-+			int sock_fd = get_unused_fd(), fd;
-+
-+			if (sock_fd < 0) {
-+				warn("Can't get fd, error %i", sock_fd);
-+				break;
-+			}
-+			fd_install(sock_fd, sock);
-+			if ((fd = recv_fd(sock_fd, bogus, &len)) < 0) {
-+				warn("recv_fd failed, error %i", fd);
-+				put_unused_fd(sock_fd);
-+				break;
-+			}
-+			trace(warn("Received socket %i", fd);)
-+			info->sock = fget(fd);
-+			current->files->fd[fd] = NULL; /* this is sooo hokey */
-+			put_unused_fd(sock_fd);
-+			sys_close(fd);
-+			up(&info->server_in_sem);
-+			outbead(info->sock, IDENTIFY, struct identify, .id = info->id, .snap = info->snap);
-+			up(&info->recover_sem); /* worker uploads locks now */
-+			break;
-+		}
-+		default: 
-+			warn("Unknown message %x", message.head.code);
-+			continue;
-+		}
-+	}
-+out:
-+	up(&info->exit3_sem); /* !!! will crash if module unloaded before ret executes */
-+	warn("%s exiting", task->comm);
-+	return 0;
-+message_too_long:
-+	warn("message %x too long (%u bytes)", message.head.code, message.head.length);
-+	goto out;
-+socket_error:
-+	warn("socket error %i", err);
-+	goto out;
-+}
-+
-+/*
-+ * This is the device mapper mapping method, which does one of three things:
-+ * (1) tells device mapper to go ahead and submit the request with a default
-+ * identity mapping (return 1) (2) tells device mapper to forget about the
-+ * request (return 0), goes off and does its own thing, or (3) on a bad
-+ * day, tells device mapper to fail the IO (return negative errnum).
-+ *
-+ * This is pretty simple: we just hand any origin reads back to device mapper
-+ * after filling in the origin device.  Then, we check the cache to see if
-+ * conditions are right to map the request locally, otherwise we need help
-+ * from the server, so we remember the request in the pending hash and send
-+ * off the appropriate server query.
-+ *
-+ * To make this a little more interesting, our server connection may be broken
-+ * at the moment, or may not have been established yet, in which case we have
-+ * to defer the request until the server becomes available.
-+ */
-+static int csnap_map(struct dm_target *target, struct bio *bio, union map_info *context)
-+{
-+	struct snapinfo *info = target->private;
-+	struct pending *pending;
-+	chunk_t chunk;
-+	unsigned id;
-+
-+	bio->bi_bdev = info->orgdev->bdev;
-+	if (bio_data_dir(bio) == READ && !is_snapshot(info))
-+		return 1;
-+
-+	chunk = bio->bi_sector >> info->chunkshift;
-+	trace(warn("map %Lx/%x, chunk %Lx", (long long)bio->bi_sector, bio->bi_size, chunk);)
-+	assert(bio->bi_size <= 1 << info->chunksize_bits);
-+#ifdef CACHE
-+	if (is_snapshot(info)) { // !!! use page cache for both
-+		struct page *page;
-+		u64 *exception = snap_map_cachep(info->inode->i_mapping, chunk, &page);
-+	
-+		if (!exception) {
-+			printk("Failed to get a page for sector %ld\n", bio->bi_sector);
-+			return -1;
-+		}
-+
-+		u64 exp_chunk = *exception;
-+		UnlockPage(page);
-+		if (exp_chunk) {
-+			bio->bi_sector = bio->bi_sector + ((exp_chunk - chunk) << info->chunkshift);
-+			return 1;
-+		}
-+	} else {
-+		if (info->shared_bitmap && get_unshared_bit(info, chunk))
-+			return 1;
-+	}
-+#endif
-+	id = info->nextid;
-+	info->nextid = (id + 1) & ~(-1 << ID_BITS);
-+	pending = kmem_cache_alloc(pending_cache, GFP_NOIO|__GFP_NOFAIL);
-+	*pending = (struct pending){ .id = id, .bio = bio, .chunk = chunk, .chunks = 1 };
-+	spin_lock(&info->pending_lock);
-+	list_add(&pending->list, &info->queries);
-+	spin_unlock(&info->pending_lock);
-+	up(&info->more_work_sem);
-+	return 0;
-+}
-+
-+/*
-+ * Carefully crafted not to care about how far we got in the process
-+ * of instantiating our client.  As such, it serves both for error
-+ * abort and device unload destruction.  We have to scour our little
-+ * world for resources and give them all back, including any pending
-+ * requests, context structures and daemons.  The latter have to be
-+ * convinced to exit on demand, and we must be sure they have exited,
-+ * so we synchronize that with semaphores.  This isn't 100% foolproof;
-+ * there is still the possibility that the destructor could gain
-+ * control between the time a daemon ups its exit semaphore and when
-+ * it has actually returned to its caller.  In that case, the module
-+ * could be unloaded and the exiting thread will segfault.  This is
-+ * a basic flaw in Linux that I hope to get around to fixing at some
-+ * point, one way or another.
-+ */
-+static int shutdown_socket(struct file *socket)
-+{
-+	struct socket *sock = SOCKET_I(socket->f_dentry->d_inode);
-+	return sock->ops->shutdown(sock, RCV_SHUTDOWN);
-+}
-+
-+static void csnap_destroy(struct dm_target *target)
-+{
-+	struct snapinfo *info = target->private;
-+	int err; /* I have no mouth but I must scream */
-+
-+	trace(warn("%p", target);)
-+	if (!info)
-+		return;
-+
-+	/* Unblock helper threads */
-+	info->flags |= FINISH_FLAG;
-+	up(&info->server_in_sem); // unblock incoming thread
-+	up(&info->server_out_sem); // unblock io request threads
-+	up(&info->recover_sem); // unblock worker recovery
-+
-+	if (info->sock && (err = shutdown_socket(info->sock)))
-+		warn("server socket shutdown error %i", err);
-+	if (info->sock && (err = shutdown_socket(info->control_socket)))
-+		warn("control socket shutdown error %i", err);
-+
-+	up(&info->more_work_sem);
-+
-+	// !!! wrong! the thread might be just starting, think about this some more
-+	// ah, don't let csnap_destroy run while csnap_create is spawning threads
-+	down(&info->exit1_sem);
-+	warn("thread 1 exited");
-+	down(&info->exit2_sem);
-+	warn("thread 2 exited");
-+	down(&info->exit3_sem);
-+	warn("thread 3 exited");
-+
-+	if (info->sock)
-+		fput(info->sock);
-+	if (info->inode)
-+		iput(info->inode);
-+	if (info->shared_bitmap)
-+		vfree(info->shared_bitmap);
-+	if (info->snapdev)
-+		dm_put_device(target, info->snapdev);
-+	if (info->orgdev)
-+		dm_put_device(target, info->orgdev);
-+	kfree(info);
-+}
-+
-+/*
-+ * Woohoo, we are going to instantiate a new cluster snapshot virtual
-+ * device, what fun.
-+ */
-+static int get_control_socket(char *sockname)
-+{
-+	mm_segment_t oldseg = get_fs();
-+	struct sockaddr_un addr = { .sun_family = AF_UNIX };
-+	int addr_len = sizeof(addr) - sizeof(addr.sun_path) + strlen(sockname); // !!! check too long
-+	int sock = sys_socket(AF_UNIX, SOCK_STREAM, 0), err = 0;
-+
-+	trace(warn("Connect to control socket %s", sockname);)
-+	if (sock <= 0)
-+		return sock;
-+	strncpy(addr.sun_path, sockname, sizeof(addr.sun_path));
-+	if (sockname[0] == '@')
-+		addr.sun_path[0] = 0;
-+
-+	set_fs(get_ds());
-+	while ((err = sys_connect(sock, (struct sockaddr *)&addr, addr_len)) == -ECONNREFUSED)
-+		break;
-+//		yield();
-+	set_fs(oldseg);
-+
-+	return err? err: sock;
-+}
-+
-+/*
-+ * Round up to nearest 2**k boundary
-+ * !!! lose this
-+ */
-+static inline ulong round_up(ulong n, ulong size)
-+{
-+	return (n + size - 1) & ~(size - 1);
-+}
-+
-+static int csnap_create(struct dm_target *target, unsigned argc, char **argv)
-+{
-+	u64 chunksize_bits = 12; // !!! when chunksize isn't always 4K, have to move all this to identify reply handler
-+	struct snapinfo *info;
-+	int err, i, snap, flags = 0;
-+	char *error;
-+#ifdef CACHE
-+	unsigned bm_size;
-+#endif
-+
-+	error = "csnap usage: orgdev snapdev sockname snapnum";
-+	err = -EINVAL;
-+	if (argc != 4)
-+		goto eek;
-+
-+	snap = simple_strtol(argv[3], NULL, 0);
-+	if (snap >= 0)
-+		flags |= IS_SNAP_FLAG;
-+
-+	err = -ENOMEM;
-+	error = "can't get kernel memory";
-+	if (!(info = kmalloc(sizeof(struct snapinfo), GFP_KERNEL)))
-+		goto eek;
-+
-+	*info = (struct snapinfo){ 
-+		.flags = flags, .snap = snap,
-+		.chunksize_bits = chunksize_bits,
-+		.chunkshift = chunksize_bits - SECTOR_SHIFT};
-+	target->private = info;
-+	sema_init(&info->server_in_sem, 0);
-+	sema_init(&info->server_out_sem, 0);
-+	sema_init(&info->recover_sem, 0);
-+	sema_init(&info->exit1_sem, 1);
-+	sema_init(&info->exit2_sem, 1);
-+	sema_init(&info->exit3_sem, 1);
-+	sema_init(&info->more_work_sem, 0);
-+	spin_lock_init(&info->pending_lock);
-+	spin_lock_init(&info->end_io_lock);
-+	INIT_LIST_HEAD(&info->queries);
-+	INIT_LIST_HEAD(&info->releases);
-+	INIT_LIST_HEAD(&info->locked);
-+	for (i = 0; i < NUM_BUCKETS; i++)
-+		INIT_LIST_HEAD(&info->pending[i]);
-+
-+	error = "Can't get snapshot device";
-+	if ((err = dm_get_device(target, argv[0], 0, target->len, dm_table_get_mode(target->table), &info->snapdev)))
-+		goto eek;
-+	error = "Can't get origin device";
-+	if ((err = dm_get_device(target, argv[1], 0, target->len, dm_table_get_mode(target->table), &info->orgdev)))
-+		goto eek;
-+	error = "Can't connect control socket";
-+	if ((err = get_control_socket(argv[2])) < 0)
-+		goto eek;
-+	info->control_socket = fget(err);
-+	sys_close(err);
-+
-+#ifdef CACHE
-+	bm_size = round_up((target->len  + 7) >> (chunksize_bits + 3), sizeof(u32)); // !!! wrong
-+	error = "Can't allocate bitmap for origin";
-+	if (!(info->shared_bitmap = vmalloc(bm_size)))
-+		goto eek;
-+	memset(info->shared_bitmap, 0, bm_size);
-+	if (!(info->inode = new_inode(snapshot_super)))
-+		goto eek;
-+#endif
-+
-+	error = "Can't start daemon";
-+	if ((err = kernel_thread((void *)incoming, target, CLONE_KERNEL)) < 0)
-+		goto eek;
-+	if ((err = kernel_thread((void *)worker, target, CLONE_KERNEL)) < 0)
-+		goto eek;
-+	if ((err = kernel_thread((void *)control, target, CLONE_KERNEL)) < 0)
-+		goto eek;
-+	warn("Created snapshot device origin=%s snapstore=%s socket=%s snapshot=%i", argv[0], argv[1], argv[2], snap);
-+	target->split_io = 1 << info->chunkshift; // !!! lose this as soon as possible
-+	return 0;
-+
-+eek:	warn("Virtual device create error %i: %s!", err, error);
-+	csnap_destroy(target);
-+	target->error = error;
-+	return err;
-+
-+	{ void *useme = show_pending; useme = useme; }
-+}
-+
-+/* Is this actually useful?  It's really trying to be a message */
-+
-+static int csnap_status(struct dm_target *target, status_type_t type, char *result, unsigned int maxlen)
-+{
-+	char orgbuffer[32];
-+	char snapbuffer[32];
-+	struct snapinfo *info = target->private;
-+
-+	switch (type) {
-+	case STATUSTYPE_INFO:
-+		result[0] = '\0';
-+		break;
-+
-+	case STATUSTYPE_TABLE:
-+		format_dev_t(orgbuffer, info->orgdev->bdev->bd_dev);
-+		format_dev_t(snapbuffer, info->snapdev->bdev->bd_dev);
-+		snprintf(result, maxlen, "%s %s %u",
-+			 orgbuffer, snapbuffer, 1 << info->chunksize_bits);
-+		break;
-+	}
-+
-+	return 0;
-+}
-+
-+static struct target_type csnap = {
-+	.name = "csnapshot",
-+	.version = {0, 0, 0},
-+	.module = THIS_MODULE,
-+	.ctr = csnap_create,
-+	.dtr = csnap_destroy,
-+	.map = csnap_map,
-+	.status = csnap_status,
-+};
-+
-+int __init dm_csnap_init(void)
-+{
-+	int err = -ENOMEM;
-+	char *what = "Cache create";
-+	if (!(pending_cache = kmem_cache_create("csnap-pending",
-+		sizeof(struct pending), __alignof__(struct pending), 0, NULL, NULL)))
-+		goto bad1;
-+	if (!(end_io_cache = kmem_cache_create("csnap-endio",
-+		sizeof(struct hook), __alignof__(struct hook), 0, NULL, NULL)))
-+		goto bad2;
-+	what = "register";
-+	if ((err = dm_register_target(&csnap)))
-+		goto bad3;
-+#ifdef CACHE
-+	err = -ENOMEM;
-+	what = "create snapshot superblock";
-+	if (!(snapshot_super = alloc_super()))
-+		goto bad4;
-+#endif
-+	return 0;
-+
-+#ifdef CACHE
-+bad4:
-+	dm_unregister_target(&csnap);
-+#endif
-+bad3:
-+	kmem_cache_destroy(end_io_cache);
-+bad2:
-+	kmem_cache_destroy(pending_cache);
-+bad1:
-+	DMERR("%s failed\n", what);
-+	return err;
-+}
-+
-+void dm_csnap_exit(void)
-+{
-+	int err;
-+	trace_on(warn(">>> module exit");)
-+	if ((err = dm_unregister_target(&csnap)))
-+		DMERR("Snapshot unregister failed %d", err);
-+	if (pending_cache)
-+		kmem_cache_destroy(pending_cache);
-+	if (end_io_cache)
-+		kmem_cache_destroy(end_io_cache);
-+	kfree(snapshot_super);
-+}
-+
-+module_init(dm_csnap_init);
-+module_exit(dm_csnap_exit);
-diff -up --recursive 2.6.8.1.csnap.clean/drivers/md/dm-csnap.h 2.6.8.1.csnap/drivers/md/dm-csnap.h
---- 2.6.8.1.csnap.clean/drivers/md/dm-csnap.h	2004-10-14 12:58:12.000000000 -0400
-+++ 2.6.8.1.csnap/drivers/md/dm-csnap.h	2004-12-01 23:11:46.000000000 -0500
-@@ -0,0 +1,90 @@
-+#define PACKED __attribute__ ((packed))
-+#define MAGIC  0xadbe
-+
-+struct head { uint32_t code; uint32_t length; } PACKED;
-+
-+enum csnap_codes
-+{
-+	REPLY_ERROR = 0xbead0000,
-+	IDENTIFY,
-+	REPLY_IDENTIFY,
-+	QUERY_WRITE,
-+	REPLY_ORIGIN_WRITE,
-+	REPLY_SNAPSHOT_WRITE,
-+	QUERY_SNAPSHOT_READ,
-+	REPLY_SNAPSHOT_READ,
-+	REPLY_SNAPSHOT_READ_ORIGIN,
-+	FINISH_SNAPSHOT_READ,
-+	CREATE_SNAPSHOT,
-+	REPLY_CREATE_SNAPSHOT,
-+	DELETE_SNAPSHOT,
-+	REPLY_DELETE_SNAPSHOT,
-+	DUMP_TREE,
-+	INITIALIZE_SNAPSTORE,
-+	NEED_SERVER,
-+	CONNECT_SERVER,
-+	REPLY_CONNECT_SERVER,
-+	CONTROL_SOCKET,
-+	SERVER_READY,
-+	START_SERVER,
-+	SHUTDOWN_SERVER,
-+	SET_IDENTITY,
-+	UPLOAD_LOCK,
-+	FINISH_UPLOAD_LOCK,
-+	NEED_CLIENTS,
-+	UPLOAD_CLIENT_ID,
-+	FINISH_UPLOAD_CLIENT_ID,
-+	REMOVE_CLIENT_IDS,
-+};
-+
-+struct match_id { uint64_t id; uint64_t mask; } PACKED;
-+struct set_id { uint64_t id; } PACKED;
-+struct identify { uint64_t id; int32_t snap; } PACKED;
-+struct create_snapshot { uint32_t snap; } PACKED;
-+
-+typedef uint16_t shortcount; /* !!! what is this all about */
-+
-+struct rw_request
-+{
-+	uint16_t id;
-+	shortcount count;
-+	struct chunk_range
-+	{
-+		uint64_t chunk;
-+		shortcount chunks;
-+	} PACKED ranges[];
-+} PACKED;
-+
-+/* !!! can there be only one flavor of me please */
-+struct rw_request1
-+{
-+	uint16_t id;
-+	shortcount count;
-+	struct chunk_range PACKED ranges[1];
-+} PACKED;
-+
-+/* decruft me... !!! */
-+#define maxbody 500
-+struct rwmessage { struct head head; struct rw_request body; };
-+struct messagebuf { struct head head; char body[maxbody]; };
-+/* ...decruft me */
-+
-+/* The endian conversions that libc forgot */
-+
-+static inline uint64_t ntohll(uint64_t n)
-+{
-+#if __BYTE_ORDER == __LITTLE_ENDIAN
-+	return (((uint64_t)ntohl(n)) << 32) | ntohl(n >> 32);
-+#else
-+	return n; 
-+#endif
-+}
-+
-+static inline uint64_t htonll(uint64_t n)
-+{
-+#if __BYTE_ORDER == __LITTLE_ENDIAN
-+	return (((uint64_t)htonl(n)) << 32) | htonl(n >> 32);
-+#else
-+	return n; 
-+#endif
-+}
-diff -up --recursive 2.6.8.1.csnap.clean/fs/super.c 2.6.8.1.csnap/fs/super.c
---- 2.6.8.1.csnap.clean/fs/super.c	2004-08-14 06:55:22.000000000 -0400
-+++ 2.6.8.1.csnap/fs/super.c	2004-10-04 16:39:41.000000000 -0400
-@@ -51,7 +51,7 @@ spinlock_t sb_lock = SPIN_LOCK_UNLOCKED;
-  *	Allocates and initializes a new &struct super_block.  alloc_super()
-  *	returns a pointer new superblock or %NULL if allocation had failed.
-  */
--static struct super_block *alloc_super(void)
-+struct super_block *alloc_super(void)
- {
- 	struct super_block *s = kmalloc(sizeof(struct super_block),  GFP_USER);
- 	static struct super_operations default_op;
-@@ -87,6 +87,8 @@ out:
- 	return s;
- }
- 
-+EXPORT_SYMBOL(alloc_super);
-+
- /**
-  *	destroy_super	-	frees a superblock
-  *	@s: superblock to free
-diff -up --recursive 2.6.8.1.csnap.clean/include/linux/fs.h 2.6.8.1.csnap/include/linux/fs.h
---- 2.6.8.1.csnap.clean/include/linux/fs.h	2004-10-14 13:10:56.000000000 -0400
-+++ 2.6.8.1.csnap/include/linux/fs.h	2004-10-12 02:09:03.000000000 -0400
-@@ -1122,6 +1122,7 @@ void generic_shutdown_super(struct super
- void kill_block_super(struct super_block *sb);
- void kill_anon_super(struct super_block *sb);
- void kill_litter_super(struct super_block *sb);
-+struct super_block *alloc_super(void);
- void deactivate_super(struct super_block *sb);
- int set_anon_super(struct super_block *s, void *data);
- struct super_block *sget(struct file_system_type *type,
-diff -up --recursive 2.6.8.1.csnap.clean/net/socket.c 2.6.8.1.csnap/net/socket.c
---- 2.6.8.1.csnap.clean/net/socket.c	2004-08-14 06:55:10.000000000 -0400
-+++ 2.6.8.1.csnap/net/socket.c	2004-11-01 23:10:54.000000000 -0500
-@@ -2086,6 +2086,12 @@ void socket_seq_show(struct seq_file *se
- }
- #endif /* CONFIG_PROC_FS */
- 
-+/* Cluster devices need these, or better: kernel interfaces */
-+
-+EXPORT_SYMBOL_GPL(sys_connect);
-+EXPORT_SYMBOL_GPL(sys_recvmsg);
-+EXPORT_SYMBOL_GPL(sys_socket);
-+
- /* ABI emulation layers need these two */
- EXPORT_SYMBOL(move_addr_to_kernel);
- EXPORT_SYMBOL(move_addr_to_user);
diff --git a/csnap/src/Makefile b/csnap/src/Makefile
deleted file mode 100644
index 2be7ab8..0000000
--- a/csnap/src/Makefile
+++ /dev/null
@@ -1,44 +0,0 @@
-###############################################################################
-###############################################################################
-##
-##  Copyright (C) 2006 Red Hat, Inc.  All rights reserved.
-##
-##  This copyrighted material is made available to anyone wishing to use,
-##  modify, copy, or redistribute it subject to the terms and conditions
-##  of the GNU General Public License v.2.
-##
-###############################################################################
-###############################################################################
-
-binaries = mksnapstore csnap-server csnap-create csnap-delete csnap-agent
-deps = csnap.h trace.h sock.h buffer.h list.h
-
-top_srcdir=..
-include $(top_srcdir)/make/defines.mk
-
-all: buffer.o $(binaries)
-
-buffer.o: buffer.c $(deps)
-	cc -g -Wall buffer.c -c
-
-mksnapstore: csnap.c buffer.o $(deps)
-	cc -g -Wall csnap.c buffer.o -DCREATE -o mksnapstore -lpopt
-
-csnap-server: csnap.c buffer.o $(deps)
-	cc -g -Wall csnap.c buffer.o -DSERVER -o csnap-server -lpopt
-
-csnap-agent: agent.c $(deps)
-	cc -g -Wall agent.c -I../../../ -ldlm -lmagma -ldl -lpthread -o csnap-agent -lpopt
-
-csnap-create csnap-delete: create.c $(deps)
-	cc -Wall create.c -DCREATE -o csnap-create -lpopt
-	cc -Wall create.c -DDELETE -o csnap-delete -lpopt
-
-install: all
-	if [ ! -d ${sbindir} ]; then \
-		install -d ${sbindir}; \
-	fi
-	install -m755 ${binaries} ${sbindir}
-
-clean:
-	rm -f $(binaries) *.o a.out
diff --git a/csnap/src/agent.c b/csnap/src/agent.c
deleted file mode 100644
index b3f5fb9..0000000
--- a/csnap/src/agent.c
+++ /dev/null
@@ -1,359 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h> 
-#include <unistd.h> // read
-#include <sys/socket.h>
-#include <sys/poll.h>
-#include <sys/un.h>
-#include <netinet/in.h>
-#include <libdlm.h>
-#include <linux/dm-csnap.h> // message codes
-#include "csnap.h" // outbead
-#include "trace.h"
-#include "sock.h" // send_fd, read/writepipe, connect_socket
-
-#define trace trace_off
-
-#define LOCK "csnap" // !!! choose a sensible name and/or use a lockspace
-
-struct lvb_address{ u32 type; u32 port; u8 address_len; char address[16]; };;
-
-struct client { int sock; enum { CLIENT_CON, SERVER_CON } type; };
-
-struct context {
-	struct server active, local;
-	int serv;
-	int waiters; 
-	struct client *waiting[100];
-	struct dlm_lksb lksb;
-	char lvb[DLM_LVB_LEN];
-	int polldelay;
-	unsigned ast_state;
-};
-
-static inline int have_address(struct server *server)
-{
-	return !!server->address_len;
-}
-
-int connect_clients(struct context *context)
-{
-	warn("connect clients to %x", *(int *)(context->active.address));
-	while (context->waiters)
-	{
-		struct client *client = context->waiting[0];
-		int control = client->sock;
-		struct server *server = &context->active;
-		struct sockaddr_in addr = { .sin_family = server->type, .sin_port = server->port };
-		int sock;
-
-		if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
-			error("Can't get socket");
-		memcpy(&addr.sin_addr.s_addr, server->address, server->address_len);
-		if (connect(sock, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
-			warn("Can't connect to server, %s (%i)", strerror(errno), errno);
-//			warn("try again later");
-//			context->polldelay = 500;
-			return -1;
-		}
-		if (outbead(control, CONNECT_SERVER, struct { }) < 0)
-			error("Could not send connect message");
-		if (send_fd(control, sock, "fark", 4) < 0)
-			error("Could not pass server connection to target");
-		context->waiting[0] = context->waiting[--context->waiters];
-	}
-	return 0;
-}
-
-/*
- * Server instantiation algorithm using dlm+lvb:
- *
- * Repeat until bored:
- *    - Try to grab Protected Write without waiting
- *    - If we got it, write our server address to the lvb, start it, done
- *    - Otherwise, convert to Concurrent Read without waiting
- *    - If there's a server address in the lvb, use it, done
- *
- * Then punt to a human: somebody out there is sitting on the PW lock
- * but not distributing a server address.
- */
-
-enum ast_state {
-	dormant,
-	read_lvb_done,
-	write_lvb_done,
-	check_write_lock,
-};
-
-void ast(void *arg); // bogus forward ref
-
-void read_lvb(struct context *context, int flags) // bogus function
-{
-	struct dlm_lksb *lksb = &context->lksb;
-
-	warn("read lvb");
-	if (dlm_lock(LKM_CRMODE, lksb, flags|LKF_NOQUEUE|LKF_VALBLK, LOCK, strlen(LOCK), 0, ast, context, NULL, NULL))
-		error("convert failed");
-	context->ast_state = read_lvb_done;
-}
-
-void ast(void *arg)
-{
-	struct context *context = arg;
-	struct dlm_lksb *lksb = &context->lksb;
-
-	if (lksb->sb_status == EUNLOCK) {
-		warn("released lock");
-		memset(&context->active, 0, sizeof(struct server));
-		return;
-	}
-
-	switch (context->ast_state) {
-	case read_lvb_done:
-		warn("read_lvb_done");
-		if (lksb->sb_status)
-			error("unexpected lock status (%i)", lksb->sb_status);
-	
-		if (have_address((struct server *)lksb->sb_lvbptr)) {
-			memcpy(&context->active, lksb->sb_lvbptr, sizeof(struct server));
-			context->ast_state = dormant;
-			connect_clients(context);
-			return;
-		}
-
-		/* No address in lvb?  Sigh, we have to busywait. */
-		context->ast_state = dormant;
-		context->polldelay = 100;
-		return;
-
-	case write_lvb_done:
-		warn("write_lvb_done");
-		/* if this didn't work the dlm broken, might as well die */
-		if (lksb->sb_status)
-			error("unexpected lock status (%i)", lksb->sb_status);
-
-		warn("Activate local server");
-		memcpy(&context->active, &context->local, sizeof(struct server));
-		if (outbead(context->serv, START_SERVER, struct { }) < 0)
-			error("Could not send message to server");
-		connect_clients(context);
-		context->ast_state = dormant;
-		return;
-
-	case check_write_lock:
-		warn("check_write_lock");
-		warn("status = %i, %s", lksb->sb_status, strerror(lksb->sb_status));
-		if (lksb->sb_status == EAGAIN) {
-			/* We lost the race to start a server (probably) */
-			read_lvb(context, 0);
-			return;
-		}
-
-		warn("got write lock");
-		memcpy(lksb->sb_lvbptr, &context->local, sizeof(struct server));
-		if (dlm_lock(LKM_PWMODE, lksb, LKF_CONVERT|LKF_NOQUEUE|LKF_VALBLK, LOCK, strlen(LOCK), 0, ast, context, NULL, NULL))
-			error("convert failed");
-		context->ast_state = write_lvb_done;
-		return;
-	default:
-		error("Bad ast state %i", context->ast_state);
-	}
-}
-
-int try_to_instantiate(struct context *context)
-{
-	warn("Try to instantiate server");
-	struct dlm_lksb *lksb = &context->lksb;
-	if (dlm_lock(LKM_PWMODE, lksb, LKF_NOQUEUE, LOCK, strlen(LOCK), 0, ast, context, NULL, NULL)) {
-		if (errno == EAGAIN) // bogus double handling of lock collision if master is local
-			read_lvb(context, 0);
-		else
-			error("lock failed (%i) %s", errno, strerror(errno));
-	} else
-		context->ast_state = check_write_lock;
-	return 0;
-}
-
-int incoming(struct context *context, struct client *client)
-{
-	int err;
-	struct messagebuf message;
-	int sock = client->sock;
-
-	if ((err = readpipe(sock, &message.head, sizeof(message.head))))
-		goto pipe_error;
-	if (message.head.length > maxbody)
-		goto message_too_long;
-	if ((err = readpipe(sock, &message.body, message.head.length)))
-		goto pipe_error;
-
-	switch (message.head.code) {
-	case SERVER_READY:
-		warn("received server ready");
-		assert(message.head.length == sizeof(struct server));
-		memcpy(&context->local, message.body, sizeof(struct server));
-		context->serv = sock; // !!! refuse more than one
-		client->type = SERVER_CON;
-		goto instantiate;
-
-	case NEED_SERVER:
-		context->waiting[context->waiters++] = client;
-		/*
-		 * If we have a local server, try to instantiate it as the master.
-		 * If there's already a master out there, connect to it.  If there
-		 * was a master but it went away then the exclusive lock is up for
-		 * grabs.  Always ensure the exclusive is still there before by
-		 * trying to get it, before relying on the lvb server address,
-		 * because that could be stale.
-		 *
-		 * If there's no local server, don't do anything: instantiation
-		 * will be attempted when/if the local server shows up.
-		 */
-		if (have_address(&context->active)) {
-			connect_clients(context);
-			break;
-		}
-		if (have_address(&context->local) && context->ast_state == dormant)
-			goto instantiate;
-		break;
-	case REPLY_CONNECT_SERVER:
-		warn("Everything connected properly, all is well");
-		break;
-	default: 
-		warn("Unknown message %x", message.head.code);
-		break;
-	}
-	return 0;
-
-instantiate:
-	return try_to_instantiate(context);
-
-message_too_long:
-	warn("message %x too long (%u bytes)\n", message.head.code, message.head.length);
-pipe_error:
-	return -1;
-}
-
-int monitor(char *sockname, struct context *context)
-{
-	unsigned maxclients = 100, clients = 0, others = 2;
-	struct pollfd pollvec[others+maxclients];
-	struct client *clientvec[maxclients];
-	struct sockaddr_un addr = { .sun_family = AF_UNIX };
-	int addr_len = sizeof(addr) - sizeof(addr.sun_path) + strlen(sockname);
-	int listener = socket(AF_UNIX, SOCK_STREAM, 0), locksock;
-
-	assert(listener > 0);
-	strncpy(addr.sun_path, sockname, sizeof(addr.sun_path));
-	if (sockname[0] == '@')
-		addr.sun_path[0] = 0;
-	else
-		unlink(sockname);
-
-	if (bind(listener, (struct sockaddr *)&addr, addr_len) || listen(listener, 5))
-		error("Can't bind to control socket (is it in use?)");
-
-	/* Set up lock manager */
-	if ((locksock = dlm_get_fd()) < 0)
-		error("dlm error %i, %s", errno, strerror(errno));
-	context->lksb.sb_lvbptr = context->lvb; /* Yuck! */
-	memset(context->lvb, 0, sizeof(context->lvb));
-
-	/* Launch daemon and exit */
-	switch (fork()) {
-	case -1:
-		error("fork failed");
-	case 0:
-		break; // !!! should daemonize properly
-	default:
-		return 0;
-	}
-
-	pollvec[0] = (struct pollfd){ .fd = listener, .events = POLLIN };
-	pollvec[1] = (struct pollfd){ .fd = locksock, .events = POLLIN };
-	assert(pollvec[0].fd > 0);
-
-	while (1) {
-		switch (poll(pollvec, others+clients, context->polldelay)) {
-		case -1:
-			if (errno == EINTR)
-				continue;
-			error("poll failed, %s", strerror(errno));
-		case 0:
-			/* Timeouts happen here */
-			context->polldelay = -1;
-			warn("try again");
-			connect_clients(context);
-			// If we go through this too many times it means somebody
-			// out there is sitting on the PW lock but did not write
-			// the lvb, this is breakage that should be reported to a
-			// human.  So we should do that, but also keep looping
-			// forever in case somebody is just being slow or in the
-			// process of being fenced/ejected, in which case the PW
-			// will eventually come free again.  Yes this sucks.
-			continue;
-		}
-
-		/* New connection? */
-		if (pollvec[0].revents) {
-			struct sockaddr_in addr;
-			int addr_len = sizeof(addr), sock;
-
-			if (!(sock = accept(listener, (struct sockaddr *)&addr, &addr_len)))
-				error("Cannot accept connection");
-			trace_on(warn("Received connection %i", clients);)
-			assert(clients < maxclients); // !!! make the array bigger
-
-			struct client *client = malloc(sizeof(struct client));
-			*client = (struct client){ .sock = sock };
-			clientvec[clients] = client;
-			pollvec[others+clients] = (struct pollfd){ .fd = sock, .events = POLLIN };
-			clients++;
-		}
-
-		/* Lock event? */
-		if (pollvec[1].revents)
-			dlm_dispatch(locksock);
-
-		/* Activity on connection? */
-		unsigned i = 0;
-		while (i < clients) {
-			if (pollvec[others+i].revents) { // !!! check for poll error
-				struct client **clientp = clientvec + i, *client = *clientp;
-
-				if (incoming(context, client) == -1) {
-					warn("Lost connection %i", i);
-					if (client->type == SERVER_CON) {
-						warn("local server died...");
-						if (!memcmp(&context->active, &context->local, sizeof(struct server))) {
-							warn("release lock");
-							struct dlm_lksb *lksb = &context->lksb;
-							memset(&context->active, 0, sizeof(struct server));
-							memset(lksb->sb_lvbptr, 0, sizeof(struct server));
-						        if (dlm_unlock(lksb->sb_lkid, 0, lksb, context) < 0)
-		 						warn("dlm error %i, %s", errno, strerror(errno));
-						}
-						memset(&context->local, 0, sizeof(struct server));
-					}
-					close(client->sock);
-					free(client);
-					--clients;
-					clientvec[i] = clientvec[clients];
-					pollvec[others + i] = pollvec[others + clients];
-//					memmove(clientp, clientp + 1, sizeof(struct client *) * clients);
-//					memmove(pollvec + i + others, pollvec + i + others + 1, sizeof(struct pollfd) * clients);
-					continue;
-				}
-			}
-			i++;
-		}
-	}
-}
-
-int main(int argc, char *argv[])
-{
-	if (argc != 2)
-		error("usage: %s sockname", argv[0]);
-
-	return monitor(argv[1], &(struct context){ .polldelay = -1 });
-}
diff --git a/csnap/src/buffer.c b/csnap/src/buffer.c
deleted file mode 100644
index 7d08916..0000000
--- a/csnap/src/buffer.c
+++ /dev/null
@@ -1,268 +0,0 @@
-#define _XOPEN_SOURCE 500 /* pwrite */
-#include <string.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include <unistd.h>
-#include <errno.h> 
-#include "list.h"
-#include "buffer.h"
-#include "trace.h"
-
-#define buftrace trace_off
-
-/*
- * Kernel-like buffer layer
- */
-
-/*
- * Even though we are in user space, for reasons of durability and speed
- * we need to access the block directly, handle our own block caching and
- * keep track block by block of which parts of the on-disk data structures
- * as they are accessed and modified.  There's no need to reinvent the
- * wheel here.  I have basically cloned the traditional Unix kernel buffer
- * paradigm, with one small twists of my own, that is, instead of state
- * bits we use scalar values.  This captures the notion of buffer state
- * transitions more precisely than the traditional approach.
- *
- * One big benefit of using a buffer paradigm that looks and acts very
- * much like the kernel incarnation is, porting this into the kernel is
- * going to be a whole lot easier.  Most higher level code will not need
- * to be modified at all.  Another benefit is, it will be much easier to
- * add async IO.
- */
-
-static struct buffer *buffer_table[BUFFER_BUCKETS];
-LIST_HEAD(dirty_buffers);
-unsigned dirty_buffer_count;
-
-void set_buffer_dirty(struct buffer *buffer)
-{
-	buftrace(printf("set_buffer_dirty %llx state=%u\n", buffer->sector, buffer->flags & BUFFER_STATE_MASK);)
-	if (!buffer_dirty(buffer)) {
-		list_add_tail(&buffer->list, &dirty_buffers);
-		dirty_buffer_count++;
-	}
-	buffer->flags = BUFFER_STATE_DIRTY | (buffer->flags & ~BUFFER_STATE_MASK);
-}
-
-void set_buffer_uptodate(struct buffer *buffer)
-{
-	if (buffer_dirty(buffer)) {
-		list_del(&buffer->list);
-		dirty_buffer_count--;
-	}
-	buffer->flags = BUFFER_STATE_CLEAN | (buffer->flags & ~BUFFER_STATE_MASK);
-}
-
-void brelse(struct buffer *buffer)
-{
-	buftrace(printf("Release buffer %llx\n", buffer->sector);)
-	if (!--buffer->count)
-		trace_off(printf("Free buffer %llx\n", buffer->sector));
-}
-
-void brelse_dirty(struct buffer *buffer)
-{
-	buftrace(printf("Release dirty buffer %llx\n", buffer->sector);)
-	set_buffer_dirty(buffer);
-	brelse(buffer);
-}
-
-int read_buffer(struct buffer *buffer)
-{
-	buftrace(warn("read buffer %llx", buffer->sector);)
-	lseek(buffer->fd, buffer->sector << SECTOR_BITS , SEEK_SET);
-
-	unsigned count = 0;
-	while (count < buffer->size)
-	{
-		int n = read(buffer->fd, buffer->data, buffer->size - count);
-		if (n == -1)
-{
-	printf("read error %i %s %i\n", errno, strerror(errno), buffer->size - count);
-			return errno;
-}
-		count += n;
-	}
-	set_buffer_uptodate(buffer);
-	return 0;
-}
-
-int write_buffer_to(struct buffer *buffer, sector_t sector)
-{
-	while (pwrite(buffer->fd, buffer->data, buffer->size, sector  << SECTOR_BITS) == -1)
-		if (errno != EAGAIN)
-			return errno;
-	return 0;
-}
-
-int write_buffer(struct buffer *buffer)
-{
-	buftrace(warn("write buffer %Lx/%u", buffer->sector, buffer->size);)
-	int err;
-
-	if ((err = write_buffer_to(buffer, buffer->sector)))
-		return err;
-	set_buffer_uptodate(buffer);
-	return 0;
-}
-
-unsigned buffer_hash(sector_t sector)
-{
-	return (((sector >> 32) ^ (sector_t)sector) * 978317583) % BUFFER_BUCKETS;
-}
-
-struct buffer *new_buffer(sector_t sector, unsigned size)
-{
-	buftrace(printf("Allocate buffer for %llx\n", sector);)
-	struct buffer *buffer = (struct buffer *)malloc(sizeof(struct buffer));
-	buffer->data = malloc_aligned(size, size); // what if malloc fails?
-	buffer->count = 1;
-	buffer->flags = 0;
-	buffer->size = size;
-	buffer->sector = sector;
-	return buffer;
-}
-
-struct buffer *getblk(unsigned fd, sector_t sector, unsigned size)
-{
-	struct buffer **bucket = &buffer_table[buffer_hash(sector)], *buffer;
-
-	for (buffer = *bucket; buffer; buffer = buffer->hashlist)
-		if (buffer->sector == sector) {
-			buftrace(printf("Found buffer for %llx\n", sector);)
-			buffer->count++;
-			return buffer;
-		}
-
-	buffer = new_buffer(sector, size);
-	buffer->fd = fd;
-	buffer->hashlist = *bucket;
-	*bucket = buffer;
-	return buffer;
-}
-
-struct buffer *bread(unsigned fd, sector_t sector, unsigned size)
-{
-	struct buffer *buffer = getblk(fd, sector, size);
-
-	if (buffer_uptodate(buffer) || buffer_dirty(buffer))
-		return buffer;
-
-	read_buffer(buffer);
-	if (buffer_uptodate(buffer))
-		return buffer;
-
-	brelse(buffer);
-error("bad read");
-	return NULL;
-}
-
-void evict_buffer(struct buffer *buffer)
-{
-	if (buffer_dirty(buffer))
-		set_buffer_uptodate(buffer);
-
-	struct buffer **pbuffer = &buffer_table[buffer_hash(buffer->sector)];
-
-	for (; *pbuffer; pbuffer = &(*pbuffer)->hashlist)
-		if (*pbuffer == buffer) {
-			*pbuffer = buffer->hashlist;
-			buftrace(printf("Evict buffer for %llx\n", buffer->sector);)
-//			free(buffer->data); // !!! malloc_aligned means pointer is wrong
-			free(buffer);
-			return;
-		}
-	error("buffer not found");
-}
-
-void evict_buffers(void) // !!! should use lru list
-{
-	unsigned i;
-	for (i = 0; i < BUFFER_BUCKETS; i++)
-	{
-		struct buffer *buffer;
-		for (buffer = buffer_table[i]; buffer;) {
-			struct buffer *next = buffer->hashlist;
-			if (!buffer->count)
-				evict_buffer(buffer);
-			buffer = next;
-		}
-	}
-}
-
-void flush_buffers(void) // !!! should use lru list
-{
-	while (!list_empty(&dirty_buffers)) {
-		struct list_head *entry = dirty_buffers.next;
-		struct buffer *buffer = list_entry(entry, struct buffer, list);
-
-		if (buffer_dirty(buffer))
-			write_buffer(buffer);
-	}
-}
-
-void show_buffer(struct buffer *buffer)
-{
-	printf("%s%llx/%i ", 
-		buffer_dirty(buffer)? "+": buffer_uptodate(buffer)? "": "?", 
-		buffer->sector, buffer->count);
-}
-
-void show_buffers_(int all)
-{
-	unsigned i;
-
-	for (i = 0; i < BUFFER_BUCKETS; i++)
-	{
-		struct buffer *buffer = buffer_table[i];
-
-		if (!buffer)
-			continue;
-
-		printf("[%i] ", i);
-		for (; buffer; buffer = buffer->hashlist)
-			if (all || buffer->count)
-				show_buffer(buffer);
-		printf("\n");
-	}
-}
-
-void show_active_buffers(void)
-{
-	printf("Active buffers:\n");
-	show_buffers_(0);
-}
-
-void show_buffers(void)
-{
-	printf("Buffers:\n");
-	show_buffers_(1);
-}
-
-void show_dirty_buffers(void)
-{
-	struct list_head *list;
-
-	printf("Dirty buffers: ");
-	list_for_each(list, &dirty_buffers) {
-		struct buffer *buffer = list_entry(list, struct buffer, list);
-		printf("%llx ", buffer->sector);
-	}
-	printf("\n");
-}
-
-#if 0
-void dump_buffer(struct buffer *buffer, unsigned offset, unsigned length)
-{
-	hexdump(buffer->data + offset, length);
-}
-#endif
-
-void init_buffers(void)
-{
-	memset(buffer_table, 0, sizeof(buffer_table));
-	INIT_LIST_HEAD(&dirty_buffers);
-	dirty_buffer_count = 0;
-}
diff --git a/csnap/src/buffer.h b/csnap/src/buffer.h
deleted file mode 100644
index fe5ee33..0000000
--- a/csnap/src/buffer.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#define SECTOR_BITS 9
-#define BUFFER_STATE_INVAL 0
-#define BUFFER_STATE_CLEAN 1
-#define BUFFER_STATE_DIRTY 2
-#define BUFFER_STATE_MASK 3
-#define BUFFER_BUCKETS 9999
-
-typedef unsigned long long sector_t;
-typedef unsigned long long offset_t;
-
-struct buffer
-{
-	struct buffer *hashlist;
-	struct list_head list;
-	unsigned count; // should be atomic_t
-	unsigned flags;
-	unsigned size;
-	sector_t sector;
-	unsigned char *data;
-	unsigned fd;
-};
-
-struct list_head dirty_buffers;
-extern unsigned dirty_buffer_count;
-
-void show_dirty_buffers(void);
-void set_buffer_dirty(struct buffer *buffer);
-void set_buffer_uptodate(struct buffer *buffer);
-void brelse(struct buffer *buffer);
-void brelse_dirty(struct buffer *buffer);
-int write_buffer_to(struct buffer *buffer, offset_t pos);
-int write_buffer(struct buffer *buffer);
-int read_buffer(struct buffer *buffer);
-unsigned buffer_hash(sector_t sector);
-struct buffer *new_buffer(sector_t sector, unsigned size);
-struct buffer *getblk(unsigned fd, sector_t sector, unsigned size);
-struct buffer *bread(unsigned fd, sector_t sector, unsigned size);
-void evict_buffer(struct buffer *buffer);
-void evict_buffers(void);
-void flush_buffers(void);
-void show_buffer(struct buffer *buffer);
-void show_active_buffers(void);
-void show_buffers(void);
-void init_buffers(void);
-
-static inline int buffer_dirty(struct buffer *buffer)
-{
-	return (buffer->flags & BUFFER_STATE_MASK) == BUFFER_STATE_DIRTY;
-}
-
-static inline int buffer_uptodate(struct buffer *buffer)
-{
-	return (buffer->flags & BUFFER_STATE_MASK) == BUFFER_STATE_CLEAN;
-}
-
-static inline void *malloc_aligned(size_t size, unsigned binalign)
-{
-	unsigned long p = (unsigned long)malloc(size + binalign - 1);
-	return (void *)(p + (-p & (binalign - 1)));
-}
diff --git a/csnap/src/buffertest.c b/csnap/src/buffertest.c
deleted file mode 100644
index 8a7692c..0000000
--- a/csnap/src/buffertest.c
+++ /dev/null
@@ -1,15 +0,0 @@
-#include <stdlib.h>
-
-#include "list.h"
-#include "buffer.h"
-
-int main(int argc, char *argv[])
-{
-	struct buffer *buffer = new_buffer(0x64, 4096);
-	show_dirty_buffers();
-	set_buffer_dirty(buffer);
-	show_dirty_buffers();
-	set_buffer_uptodate(buffer);
-	show_dirty_buffers();
-	return 0;
-}
diff --git a/csnap/src/create.c b/csnap/src/create.c
deleted file mode 100644
index 4b94668..0000000
--- a/csnap/src/create.c
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-//#include <fcntl.h>
-#include <errno.h>
-#include <inttypes.h>
-#include <netinet/in.h>
-#include <linux/dm-csnap.h>
-#include "csnap.h"
-#include "trace.h"
-#include "sock.h"
-
-#ifdef DELETE
-#  define THIS_CODE DELETE_SNAPSHOT
-#  define THIS_REPLY REPLY_DELETE_SNAPSHOT
-#else
-#  define THIS_CODE CREATE_SNAPSHOT
-#  define THIS_REPLY REPLY_CREATE_SNAPSHOT
-#endif
-
-int main(int argc, char *argv[])
-{
-	int sock, err;
-	if (argc < 3)
-		error("usage: %s host:port snapshot", argv[0]);
-
-	int snap = atoi(argv[2]);
-	char *host = argv[1];
-	int len = strlen(host), port = parse_port(host, &len);
-
-	if (port < 0)
-		error("expected host:port, not %s", host);
-	host[len] = 0;
-
-	if (!(sock = open_socket(host, port)))
-		error("Can't connect to %s:%i", host, port);
-
-	outbead(sock, THIS_CODE, struct create_snapshot, snap);
-
-	struct head head;
-	unsigned maxbuf = 500;
-	char buf[maxbuf];
-
-	if ((err = readpipe(sock, &head, sizeof(head))))
-		goto pipe_error;
-	assert(head.length < maxbuf); // !!! don't die
-	if ((err = readpipe(sock, buf, head.length)))
-		goto pipe_error;
-	trace_on(printf("reply = %x\n", head.code);)
-	err  = head.code != THIS_REPLY;
-
-	if (head.code == REPLY_ERROR)
-		error("%.*s", head.length - 4, buf + 4);
-pipe_error:
-	close(sock);
-	return err;
-}
diff --git a/csnap/src/csnap.c b/csnap/src/csnap.c
deleted file mode 100644
index 25615a2..0000000
--- a/csnap/src/csnap.c
+++ /dev/null
@@ -1,2623 +0,0 @@
-/*
- * Clustered Snapshot Metadata Server
- *
- * Daniel Phillips, Nov 2003 to May 2004
- * (c) 2003 Sistina Software Inc.
- * (c) 2004 Red Hat Software Inc.
- *
- */
-
-#define _GNU_SOURCE /* Berserk glibc headers: O_DIRECT not defined unless _GNU_SOURCE defined */
-
-#include <string.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <errno.h> 
-#include <time.h>
-#include <signal.h>
-#include <sys/poll.h>
-#include <sys/types.h> 
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <sys/ioctl.h>
-#include <netinet/in.h>
-#include <netdb.h> // gethostbyname2_r
-#include <linux/fs.h> // BLKGETSIZE
-#include <popt.h>
-#include "list.h"
-#include "buffer.h"
-#include "csnap.h"
-#include <linux/dm-csnap.h>
-#include "trace.h"
-
-#define trace trace_off
-#define jtrace trace_off
-#define BUSHY
-
-/*
-Todo:
-
-BTree
-  * coalesce leafs/nodes for delete
-  - B*Tree splitting
-
-Allocation bitmaps
-  - allocation statistics
-  - Per-snapshot free space as full-tree pass
-  - option to track specific snapshot(s) on the fly
-  - return stats to client (on demand? always?)
-  * Bitmap block radix tree - resizing
-  - allocation policy
-
-Journal
-  \ allocation
-  \ write commit block
-  \ write target blocks
-  \ recovery
-  - stats and misc data in commit block?
-
-File backing
-  \ double linked list ops
-  - buffer lru 
-  - buffer writeout policy
-  - buffer eviction policy
-  - verify no busy buffers between operations
-
-Snapshot vs origin locking
-  - anti-starvation measures
-
-Message handling
-  - send reply on async write completion
-  - build up immediate reply in separate buffer
-
-Snapshot handling path
-  - background copyout thread
-  - try AIO
-  - coalesce leaves/nodes on delete
-     - should wait for current queries on snap to complete
-  - background deletion optimization
-     - record current deletion list in superblock
-
-Multithreading
-  - separate thread for copyouts
-  - separate thread for buffer flushing
-  - separate thread for new connections (?)
-
-Utilities
-  - don't include anything not needed for create
-  - snapshot store integrity check (snapcheck)
-
-Error recovery
-  \ Mark superblock active/inactive
-  + upload client locks on server restart
-  + release snapshot read locks for dead client
-  - Examine entire tree to initialize statistics after unsaved halt
-
-General
-  \ Prevent multiple server starts on same snapshot store
-  + More configurable tracing
-  - Add more internal consistency checks
-  - Magic number + version for superblock
-  - Flesh out and audit error paths
-  - Make it endian-neutral
-  - Verify wordsize neutral
-  - Add an on-the-fly verify path
-  + strip out the unit testing gunk
-  + More documentation
-  - Audits and more audits
-  - Memory inversion prevention
-
-Cluster integration
-  + Restart/Error recovery/reporting
-*/
-
-/*
- * Miscellaneous Primitives
- */
-
-typedef int fd_t;
-
-/*
- * Ripped from libiddev.  It's not quite ugly enough to convince me to
- * add a new dependency on a library that nobody has yet, but it's close.
- */
-/*
- * fd_size: store in *bytes the size of the regular file or block device
- * open on fd.  Returns 0 on success, otherwise the nonzero result of the
- * failing fstat() or ioctl() call.
- */
-static int fd_size(int fd, u64 *bytes)
-{
-	struct stat stat;
-	int error;
-
-	if ((error = fstat(fd, &stat)))
-		return error;
-
-	if (S_ISREG(stat.st_mode)) {
-		*bytes = stat.st_size;
-		return 0;
-	}
-	if ((error = ioctl(fd, BLKGETSIZE64, bytes))) {
-		/*
-		 * Fall back to the older ioctl.  BLKGETSIZE stores an
-		 * unsigned long, so a plain unsigned would be overrun
-		 * wherever long is wider than int.
-		 */
-		unsigned long sectors;
-
-		if ((error = ioctl(fd, BLKGETSIZE, &sectors)))
-			return error;
-		*bytes = ((u64)sectors) << 9; /* count is in 512-byte sectors */
-	}
-	return 0;
-}
-
-/*
- * hexdump: print length bytes starting at data to stdout as two-digit hex,
- * up to 16 per row, each row prefixed with the address of its first byte.
- */
-void hexdump(void *data, unsigned length)
-{
-	/* Walk a byte pointer: arithmetic on void * is only a GNU extension. */
-	unsigned char *byte = data;
-
-	while (length) {
-		unsigned row = length < 16? length: 16;
-
-		printf("%p: ", (void *)byte);
-		length -= row;
-		while (row--)
-			printf("%02hhx ", *byte++);
-		printf("\n");
-	}
-}
-
-/* BTree Operations */
-
-/* Directory at the base of the leaf block */
-
-#define MAX_SNAPSHOTS 64
-
-struct enode
-{	/* Index node layout; buffer2node() casts a buffer's data to this */
-	u32 count; /* number of entries[] in use */
-	u32 unused;
-	struct index_entry
-	{
-		u64 key; // note: entries[0].key never accessed
-		sector_t sector; // node sector address goes here
-	} entries[];
-};
-
-struct eleaf
-{	/* Leaf header followed by the directory map; buffer2leaf() casts a buffer's data to this */
-	le_u16 magic; /* init_leaf() stores 0x1eaf here and probe() asserts it */
-	le_u16 version;
-	le_u32 count; /* number of map[] entries in use, not counting the sentinel that follows them */
-	le_u64 base_chunk; /* subtracted from a chunk address to form the rchunk stored in map[] */
-	le_u64 using_mask;
-	struct etree_map
-	{
-		le_u32 offset; /* byte offset from the start of the leaf to this entry's first exception, see emap() */
-		le_u32 rchunk; /* chunk address relative to base_chunk */
-	}
-	map[];
-};
-
-/* View the data area of a buffer as a btree index node. */
-static inline struct enode *buffer2node(struct buffer *buffer)
-{
-	void *payload = buffer->data;
-
-	return payload;
-}
-
-/* View the data area of a buffer as a btree leaf. */
-static inline struct eleaf *buffer2leaf(struct buffer *buffer)
-{
-	void *payload = buffer->data;
-
-	return payload;
-}
-
-/* On-disk Format */
-
-struct exception
-{	/* One exception record inside a leaf, located through emap() */
-	le_u64 share; /* bitmap with one bit per snapshot that uses this exception */
-	le_u64 chunk; /* exception address as passed to add_exception_to_leaf() */
-};
-
-/* Address of the first exception belonging to directory entry i of a leaf. */
-static inline struct exception *emap(struct eleaf *leaf, unsigned i)
-{
-	char *base = (char *)leaf;
-
-	return (struct exception *)(base + leaf->map[i].offset);
-}
-
-struct superblock
-{
-	/* Persistent, saved to disk */
-	struct disksuper
-	{
-		char magic[8]; /* presumably holds SB_MAGIC -- confirm against the code that writes it */
-		sector_t etree_root; /* sector of the root index node, read by probe() and show_tree() */
-		sector_t bitmap_base; /* first sector of the allocation bitmaps, set by init_allocation() */
-		sector_t chunks, freechunks; /* store size in chunks; free count maintained by alloc/free */
-		sector_t orgchunks;
-		chunk_t last_alloc; /* chunk most recently returned by alloc_chunk(); next search starts here */
-		u64 flags;
-		u32 blocksize_bits, chunksize_bits; /* log2 sizes, used as shift counts */
-		u64 deleting;
-		struct snapshot
-		{
-			u8 tag;
-			u8 bit;
-			u32 create_time;
-			u16 reserved;
-		} snaplist[MAX_SNAPSHOTS];
-		u32 snapshots;
-		u32 etree_levels; /* number of index levels probe() walks above the leaves */
-		u32 bitmap_blocks; /* number of bitmap blocks, set by init_allocation() */
-		s32 journal_base, journal_next, journal_size; /* first journal sector; next block index to write; journal length in blocks */
-		u32 sequence; /* sequence number given to the next commit block */
-	} image;
-
-	/* Derived, not saved to disk */
-	u64 snapmask; /* passed as the active mask to add_exception_to_leaf() */
-	u32 blocksize, chunksize, blocks_per_node;
-	u32 sectors_per_block_bits, sectors_per_block;
-	u32 sectors_per_chunk_bits, sectors_per_chunk;
-	unsigned flags; /* set_sb_dirty() ors SB_DIRTY into this */
-	unsigned snapdev, orgdev; /* snapdev is the device argument for bread()/getblk() on the snapshot store */
-	unsigned snaplock_hash_bits;
-	struct snaplock **snaplocks;
-	unsigned copybuf_size;
-	char *copybuf;
-	chunk_t source_chunk;
-	chunk_t dest_exception;
-	unsigned copy_chunks;
-	unsigned max_commit_blocks; /* upper bound on commit_block entries, asserted in commit_transaction() */
-};
-
-#define SB_BUSY 1
-#define SB_MAGIC "snapshot"
-
-/* Journal handling */
-
-#define JMAGIC "MAGICNUM"
-
-struct commit_block
-{	/* Journal commit record; buf2block() casts a buffer's data to this */
-	char magic[8]; /* JMAGIC marks a commit block, see is_commit_block() */
-	u32 checksum; /* commit_transaction() sets this to the negated checksum_block() sum */
-	s32 sequence; /* taken from sb->image.sequence at commit time */
-	u32 entries; /* number of sector[] slots in use */
-	u64 sector[]; /* home sector of each journaled data block, in the order listed by commit_transaction() */
-} PACKED;
-
-/* Sector address of journal block i. */
-static sector_t journal_sector(struct superblock *sb, unsigned i)
-{
-	unsigned sector_offset = i << sb->sectors_per_block_bits;
-
-	return sb->image.journal_base + sector_offset;
-}
-
-/* View the data area of a buffer as a journal commit block. */
-static inline struct commit_block *buf2block(struct buffer *buf)
-{
-	void *payload = (void *)buf->data;
-
-	return payload;
-}
-
-/*
- * Hand out the current journal block index and advance journal_next,
- * wrapping to zero when it reaches journal_size.
- */
-unsigned next_journal_block(struct superblock *sb)
-{
-	unsigned current = sb->image.journal_next;
-
-	sb->image.journal_next++;
-	if (sb->image.journal_next == sb->image.journal_size)
-		sb->image.journal_next = 0;
-
-	return current;
-}
-
-/* Nonzero when the block starts with the journal magic string. */
-static int is_commit_block(struct commit_block *block)
-{
-	int differs = memcmp(&block->magic, JMAGIC, sizeof(block->magic));
-
-	return !differs;
-}
-
-/*
- * checksum_block: 32-bit wrapping sum of the leading words of a block.
- * commit_transaction() stores the negated sum in the commit block, so a
- * valid commit block sums to zero.
- *
- * NOTE(review): the word count is derived from blocksize_bits rather than
- * from the block size in bytes, so only the first few words are covered.
- * Confirm whether sb->blocksize >> 2 was intended before changing it, since
- * that alters which journals verify.
- */
-static u32 checksum_block(struct superblock *sb, u32 *data)
-{
-	u32 words = sb->image.blocksize_bits >> 2;
-	u32 sum = 0; /* unsigned so the sum wraps instead of overflowing a signed int */
-	u32 i;
-
-	for (i = 0; i < words; i++)
-		sum += data[i];
-	return sum;
-}
-
-/* getblk() a buffer for journal block i on the snapshot device. */
-static struct buffer *jgetblk(struct superblock *sb, unsigned i)
-{
-	sector_t where = journal_sector(sb, i);
-
-	return getblk(sb->snapdev, where, sb->blocksize);
-}
-
-/* bread() journal block i from the snapshot device. */
-static struct buffer *jread(struct superblock *sb, unsigned i)
-{
-	sector_t where = journal_sector(sb, i);
-
-	return bread(sb->snapdev, where, sb->blocksize);
-}
-
-/*
- * For now there is only ever one open transaction in the journal, the newest
- * one, so we don't have to check for journal wrap, but just ensure that each
- * transaction stays small enough to fit in the journal.
- *
- * Since we don't have any asynchronous IO at the moment, journal commit is
- * straightforward: walk through the dirty blocks once, writing them to the
- * journal, then again, adding sector locations to the commit block.  We know
- * the dirty list didn't change between the two passes.  When asynchronous
- * IO arrives here, this all has to be handled a lot more carefully.
- */
-static void commit_transaction(struct superblock *sb)
-{
-// flush_buffers();
-// return;
-	if (list_empty(&dirty_buffers)) /* nothing dirty, nothing to commit */
-		return;
-
-	struct list_head *list;
-
-	list_for_each(list, &dirty_buffers) { /* pass 1: copy every dirty buffer into the next journal block */
-		struct buffer *buffer = list_entry(list, struct buffer, list);
-		unsigned pos = next_journal_block(sb);
-		jtrace(warn("journal data sector = %Lx [%u]", buffer->sector, pos);)
-		assert(buffer_dirty(buffer));
-		write_buffer_to(buffer, journal_sector(sb, pos));
-	}
-
-	unsigned pos = next_journal_block(sb); /* the commit block takes the journal block after the data */
-	struct buffer *commit_buffer = jgetblk(sb, pos);
-	struct commit_block *commit = buf2block(commit_buffer);
-	*commit = (struct commit_block){ .magic = JMAGIC, .sequence = sb->image.sequence++ }; /* entries and checksum start at zero */
-
-	while (!list_empty(&dirty_buffers)) { /* pass 2: record each home sector and write the buffer there */
-		struct list_head *entry = dirty_buffers.next;
-		struct buffer *buffer = list_entry(entry, struct buffer, list);
-		jtrace(warn("write data sector = %Lx", buffer->sector);)
-		assert(buffer_dirty(buffer));
-		assert(commit->entries < sb->max_commit_blocks);
-		commit->sector[commit->entries++] = buffer->sector;
-		write_buffer(buffer); // deletes it from dirty (fixme: fragile)
-		// we hope the order we just listed these is the same as committed above
-	}
-
-	jtrace(warn("commit journal block [%u]", pos);)
-	commit->checksum = 0; /* field must be zero while the sum is taken */
-	commit->checksum = -checksum_block(sb, (void *)commit); /* negated so checksum_block() of the finished block is zero */
-	write_buffer_to(commit_buffer, journal_sector(sb, pos));
-	brelse(commit_buffer);
-}
-
-/*
- * recover_journal: scan the whole journal for the newest commit block and
- * replay it by writing the journal blocks that precede it to the sectors
- * the commit block lists.  On success sets sb->image.journal_next and
- * sb->image.sequence to follow the replayed commit and returns 0.  On
- * failure sets errno to EIO, reports the reason through error() and
- * returns -1.
- *
- * At most one commit block with a bad checksum ("scribbled") is tolerated.
- */
-int recover_journal(struct superblock *sb)
-{
-	struct buffer *buffer;
-	/* -1 means no commit block seen yet; this used to be read uninitialized */
-	typeof(((struct commit_block *)NULL)->sequence) sequence = -1;
-	int scribbled = -1, last_block = -1, newest_block = -1;
-	int data_from_start = 0, data_from_last = 0;
-	int size = sb->image.journal_size;
-	char *why = "";
-	unsigned i;
-
-	/* Scan full journal, find newest commit */
-
-	for (i = 0; i < size; brelse(buffer), i++) {
-		buffer = jread(sb, i);
-		struct commit_block *block = buf2block(buffer);
-
-		if (!is_commit_block(block)) {
-			jtrace(warn("[%i] <data>", i);)
-			if (sequence == -1)
-				data_from_start++;
-			else
-				data_from_last++;
-			continue;
-		}
-
-		if (checksum_block(sb, (void *)block)) {
-			warn("block %i failed checksum", i);
-			hexdump(block, 40);
-			if (scribbled != -1) {
-				why = "Too many scribbled blocks in journal";
-				goto failed_release;
-			}
-
-			if (newest_block != -1 && newest_block != last_block) {
-				why = "Bad block not last written";
-				goto failed_release;
-			}
-
-			scribbled = i;
-			if (last_block != -1)
-				newest_block = last_block;
-			sequence++;
-			continue;
-		}
-
-		jtrace(warn("[%i] seq=%i", i, block->sequence);)
-
-		/* A break in the sequence marks where the journal wrapped */
-		if (last_block != -1 && block->sequence != sequence + 1) {
-			int delta = sequence - block->sequence;
-
-			if  (delta <= 0 || delta > size) {
-				why = "Bad sequence";
-				goto failed_release;
-			}
-
-			if (newest_block != -1) {
-				why = "Multiple sequence wraps";
-				goto failed_release;
-			}
-
-			if (!(scribbled == -1 || scribbled == i - 1)) {
-				why = "Bad block not last written";
-				goto failed_release;
-			}
-			newest_block = last_block;
-		}
-		data_from_last = 0;
-		last_block = i;
-		sequence = block->sequence;
-	}
-
-	if (last_block == -1) {
-		why = "No commit blocks found";
-		goto failed;
-	}
-
-	if (newest_block == -1) {
-		/* test for all the legal scribble positions here */
-		newest_block = last_block;
-	}
-
-	jtrace(warn("found newest commit [%u]", newest_block);)
-	buffer = jread(sb, newest_block);
-	struct commit_block *commit = buf2block(buffer);
-	unsigned entries = commit->entries;
-
-	/* The data blocks of a transaction sit directly before its commit block */
-	for (i = 0; i < entries; i++) {
-		unsigned pos = (newest_block - entries + i + size) % size;
-		struct buffer *databuf = jread(sb, pos);
-		struct commit_block *block = buf2block(databuf);
-
-		if (is_commit_block(block)) {
-			error("data block [%u] marked as commit block", pos);
-			brelse(databuf); /* was leaked by the bare continue */
-			continue;
-		}
-
-		jtrace(warn("write journal [%u] data to %Lx", pos, commit->sector[i]);)
-		write_buffer_to(databuf, commit->sector[i]);
-		brelse(databuf);
-	}
-	sb->image.journal_next = (newest_block + 1 + size) % size;
-	sb->image.sequence = commit->sequence + 1;
-	brelse(buffer);
-	return 0;
-
-failed_release:
-	/* Failures inside the scan loop still hold the block being examined */
-	brelse(buffer);
-failed:
-	errno = EIO; /* return a misleading error (be part of the problem) */
-	error("Journal recovery failed, %s", why);
-	return -1;
-}
-
-/*
- * _show_journal: print one line per journal block to stdout: "<data>" for
- * a data block, or the sequence number, entry count and listed sectors of a
- * commit block.
- */
-static void _show_journal(struct superblock *sb)
-{
-	int i, j;
-	for (i = 0; i < sb->image.journal_size; i++) {
-		struct buffer *buf = jread(sb, i);
-		struct commit_block *block = buf2block(buf);
-
-		if (!is_commit_block(block)) {
-			printf("[%i] <data>\n", i);
-			brelse(buf); /* was leaked: the continue skips the release below */
-			continue;
-		}
-
-		printf("[%i] seq=%i (%i)", i, block->sequence, block->entries);
-		for (j = 0; j < block->entries; j++)
-			printf(" %Lx", (long long)block->sector[j]);
-		printf("\n");
-		brelse(buf);
-	}
-	printf("\n");
-}
-
-#define show_journal(sb) do { warn("Journal..."); _show_journal(sb); } while (0)
-
-/* BTree leaf operations */
-
-/*
- * We operate directly on the BTree leaf blocks to insert exceptions and
- * to enquire the sharing status of given chunks.  This means all the data
- * items in the block need to be properly aligned for architecture
- * independence.  To save space and to permit binary search a directory
- * map at the beginning of the block points at the exceptions stored
- * at the top of the block.  The difference between two successive directory
- * pointers gives the number of distinct exceptions for a given chunk.
- * Each exception is paired with a bitmap that specifies which snapshots
- * the exception belongs to.
- *
- * The chunk addresses in the leaf block directory are relative to a base
- * chunk to save space.  These are currently 32 bit values but may become
- * 16 bit values.  Since each is paired with a pointer into the list of
- * exceptions, 16 bit emap entries would limit the blocksize to 64K.
- *
- * A mask in the leaf block header specifies which snapshots are actually
- * encoded in the chunk.  This allows lazy deletion (almost, needs fixing)
- *
- * The leaf operations need to know the size of the block somehow.
- * Currently that is accomplished by inserting the block size as a sentinel
- * in the block directory map; this may change.
- *
- * When an exception is created by a write to the origin it is initially
- * shared by all snapshots that don't already have exceptions.  Snapshot
- * writes may later unshare some of these exceptions.
- */
-
-/*
- * To do:
- *   - Check leaf, index structure
- *   - Mechanism for identifying which snapshots are in each leaf
- *   - binsearch for leaf, index lookup
- *   - enforce 32 bit address range within leaf
- */
-
-/* Read one block-sized buffer at the given sector of the snapshot device. */
-struct buffer *snapread(struct superblock *sb, sector_t sector)
-{
-	struct buffer *buffer = bread(sb->snapdev, sector, sb->blocksize);
-
-	return buffer;
-}
-
-/*
- * Print the contents of a leaf on one line: for every directory entry its
- * relative chunk, then each exception as chunk/sharemap.
- */
-void show_leaf(struct eleaf *leaf)
-{
-	int slot;
-
-	printf("%i chunks: ", leaf->count);
-	for (slot = 0; slot < leaf->count; slot++) {
-		struct exception *cur = emap(leaf, slot);
-
-		printf("%x=", leaf->map[slot].rchunk);
-		while (cur < emap(leaf, slot + 1)) {
-			int more = cur + 1 < emap(leaf, slot + 1);
-
-			printf("%Lx/%08llx%s", cur->chunk, cur->share, more? ",": " ");
-			cur++;
-		}
-	}
-	printf("\n");
-}
-
-/*
- * origin_chunk_unique: an origin logical chunk is shared unless all snapshots
- * have exceptions.
- */
-
-int origin_chunk_unique(struct eleaf *leaf, u64 chunk, u64 snapmask)
-{
-	unsigned target = chunk - leaf->base_chunk;
-	unsigned slot = 0;
-	u64 covered = 0;
-	struct exception *cur;
-
-	/* Linear search of the directory for the chunk */
-	while (slot < leaf->count && leaf->map[slot].rchunk != target)
-		slot++;
-
-	/* No entry at all: unique only when there are no snapshots */
-	if (slot == leaf->count)
-		return !snapmask;
-
-	/* Collect every snapshot that already has an exception here */
-	for (cur = emap(leaf, slot); cur < emap(leaf, slot + 1); cur++)
-		covered |= cur->share;
-
-	return (~covered & snapmask) == 0;
-}
-
-/*
- * snapshot_chunk_unique: a snapshot logical chunk is shared if it has no
- * exception or has the same exception as another snapshot.  In any case
- * if the chunk has an exception we need to know the exception address.
- */
-
-int snapshot_chunk_unique(struct eleaf *leaf, u64 chunk, int snapshot, u64 *exception)
-{
-	u64 mask = 1LL << snapshot;
-	unsigned target = chunk - leaf->base_chunk;
-	unsigned slot = 0;
-	struct exception *cur;
-
-	while (slot < leaf->count && leaf->map[slot].rchunk != target)
-		slot++;
-	if (slot == leaf->count)
-		return 0;
-
-	for (cur = emap(leaf, slot); cur < emap(leaf, slot + 1); cur++) {
-		if (!(cur->share & mask))
-			continue;
-		/* Found this snapshot's exception; unique when no other bit is set */
-		*exception = cur->chunk;
-		return (cur->share & ~mask) == 0;
-	}
-	return 0;
-}
-
-/*
- * add_exception_to_leaf:
- *  - cycle through map to find existing logical chunk or insertion point
- *  - if not found need to add new chunk address
- *      - move tail of map up
- *      - store new chunk address in map
- *  - otherwise
- *      - for origin:
- *          - or together all sharemaps, invert -> new map
- *      - for snapshot:
- *          - clear out bit for existing exception
- *              - if sharemap zero warn and reuse this location
- *  - insert new exception
- *      - move head of exceptions down
- *      - store new exception/sharemap
- *      - adjust map head offsets
- *
- * If the new exception won't fit in the leaf, return an error so that
- * higher level code may split the leaf and try again.  This keeps the
- * leaf-editing code complexity down to a dull roar.
- */
-
-/* Bytes between the end of the directory map (sentinel included) and the first exception. */
-unsigned leaf_freespace(struct eleaf *leaf)
-{
-	char *map_end = (char *)(&leaf->map[leaf->count + 1]);
-	char *data_start = (char *)emap(leaf, 0);
-
-	return data_start - map_end;
-}
-
-/* Bytes used by the directory entries (sentinel excluded) plus the exceptions. */
-unsigned leaf_payload(struct eleaf *leaf)
-{
-	int map_bytes = (char *)(&leaf->map[leaf->count]) - (char *)leaf->map;
-	int data_bytes = (char *)emap(leaf, leaf->count) - (char *)emap(leaf, 0);
-
-	return map_bytes + data_bytes;
-}
-
-/*
- * Insert one exception for chunk into the leaf.  snapshot is the bit number
- * of the snapshot that owns the exception, or -1 for an origin write, in
- * which case the exception is shared by every snapshot in active that does
- * not already have one for this chunk.  Returns 0 on success or -EFULL when
- * the leaf has no room, leaving the leaf unmodified.
- */
-int add_exception_to_leaf(struct eleaf *leaf, u64 chunk, u64 exception, int snapshot, u64 active)
-{
-	unsigned i, j, target = chunk - leaf->base_chunk;
-	/* Only form the bit for a real snapshot: shifting by -1 is undefined */
-	u64 mask = snapshot < 0? 0: 1ULL << snapshot, sharemap;
-	struct exception *ins, *exceptions = emap(leaf, 0);
-	char *maptop = (char *)(&leaf->map[leaf->count + 1]); // include sentinel
-	int free = (char *)exceptions - maptop
-#ifdef BUSHY
- - 10
-#endif
-;
-	trace(warn("chunk %Lx exception %Lx, snapshot = %i", chunk, exception, snapshot);)
-
-	for (i = 0; i < leaf->count; i++) // !!! binsearch goes here
-		if (leaf->map[i].rchunk >= target)
-			break;
-
-	if (i == leaf->count || leaf->map[i].rchunk > target) {
-		/*
-		 * Compare as int: free can go negative, and against a size_t
-		 * a negative value converts to a huge one and defeats the test.
-		 */
-		if (free < (int)(sizeof(struct exception) + sizeof(struct etree_map)))
-			return -EFULL;
-
-		/* New chunk: open a directory slot at i */
-		ins = emap(leaf, i);
-		memmove(&leaf->map[i+1], &leaf->map[i], maptop - (char *)&leaf->map[i]);
-		leaf->map[i].offset = (char *)ins - (char *)leaf;
-		leaf->map[i].rchunk = target;
-		leaf->count++;
-		sharemap = snapshot == -1? active: mask;
-		goto insert;
-	}
-
-	if (free < (int)sizeof(struct exception))
-		return -EFULL;
-
-	if (snapshot == -1) {
-		/* Origin write: share with every active snapshot lacking an exception */
-		for (sharemap = 0, ins = emap(leaf, i); ins < emap(leaf, i+1); ins++)
-			sharemap |= ins->share;
-		sharemap = (~sharemap) & active;
-	} else {
-		/* Snapshot write: drop this snapshot from the exception it shared */
-		for (ins = emap(leaf, i); ins < emap(leaf, i+1); ins++)
-			if ((ins->share & mask)) {
-				ins->share &= ~mask;
-				break;
-			}
-		sharemap = mask;
-	}
-	ins = emap(leaf, i);
-insert:
-	/* Slide the exceptions below the insertion point down by one slot */
-	memmove(exceptions - 1, exceptions, (char *)ins - (char *)exceptions);
-	ins--;
-	ins->share = sharemap;
-	ins->chunk = exception;
-
-	for (j = 0; j <= i; j++)
-		leaf->map[j].offset -= sizeof(struct exception);
-
-	return 0;
-}
-
-/*
- * split_leaf: Split one leaf into two approximately in the middle.  Copy
- * the upper half of entries to the new leaf and move the lower half of
- * entries to the top of the original block.
- *
- * leaf keeps its first (count + 1) / 2 directory entries; leaf2 receives
- * the remaining ones together with a copy of the leaf header.  Returns the
- * chunk address (rchunk + base_chunk) of the first entry moved to leaf2.
- */
-u64 split_leaf(struct eleaf *leaf, struct eleaf *leaf2)
-{
-	unsigned i, nhead = (leaf->count + 1) / 2, ntail = leaf->count - nhead, tailsize;
-	/* Should split at middle of data instead of median exception */
-	u64 splitpoint = leaf->map[nhead].rchunk + leaf->base_chunk;
-	char *phead, *ptail;
-
-	phead = (char *)emap(leaf, 0); /* lowest exception in the leaf */
-	ptail = (char *)emap(leaf, nhead); /* first exception that moves to leaf2 */
-	tailsize = (char *)emap(leaf, leaf->count) - ptail; /* bytes of exception data that move */
-
-	/* Copy upper half to new leaf */
-	memcpy(leaf2, leaf, offsetof(struct eleaf, map)); // header
-	memcpy(&leaf2->map[0], &leaf->map[nhead], (ntail + 1) * sizeof(struct etree_map)); // map
-	memcpy(ptail - (char *)leaf + (char *)leaf2, ptail, tailsize); // data, kept at the same byte offset so the copied map offsets stay valid
-	leaf2->count = ntail;
-
-	/* Move lower half to top of block */
-	memmove(phead + tailsize, phead, ptail - phead);
-	leaf->count = nhead;
-	for (i = 0; i <= nhead; i++) // also adjust sentinel
-		leaf->map[i].offset += tailsize;
-	leaf->map[nhead].rchunk = 0; // tidy up
-
-	return splitpoint;
-}
-
-/*
- * Append the directory entries and exceptions of leaf2 to leaf.  The
- * exceptions of leaf move down to make room and those of leaf2 are placed
- * above them.
- */
-void merge_leaves(struct eleaf *leaf, struct eleaf *leaf2)
-{
-	unsigned head_count = leaf->count, tail_count = leaf2->count;
-	unsigned tail_bytes = (char *)emap(leaf2, tail_count) - (char *)emap(leaf2, 0);
-	char *data_lo, *data_hi;
-	unsigned slot;
-
-	/* Lower every offset of leaf, the sentinel included */
-	for (slot = 0; slot <= head_count; slot++)
-		leaf->map[slot].offset -= tail_bytes;
-
-	/* Slide the existing exceptions down to their new offsets */
-	data_lo = (char *)emap(leaf, 0);
-	data_hi = (char *)emap(leaf, head_count);
-	memmove(data_lo, data_lo + tail_bytes, data_hi - data_lo);
-
-	/* Bring over the exceptions and directory (with sentinel) of leaf2 */
-	memcpy(data_hi, (char *)emap(leaf2, 0), tail_bytes);
-	memcpy(&leaf->map[head_count], &leaf2->map[0], (tail_count + 1) * sizeof(struct etree_map));
-	leaf->count += tail_count;
-}
-
-/* Append all index entries of node2 to the end of node. */
-void merge_nodes(struct enode *node, struct enode *node2)
-{
-	struct index_entry *dest = &node->entries[node->count];
-	struct index_entry *src = &node2->entries[0];
-
-	memcpy(dest, src, node2->count * sizeof(struct index_entry));
-	node->count += node2->count;
-}
-
-/*
- * Set up an empty leaf: magic, version, no entries, and a sentinel offset
- * of block_size (overridden with 200 when BUSHY is defined).
- */
-void init_leaf(struct eleaf *leaf, int block_size)
-{
-	leaf->count = 0;
-	leaf->base_chunk = 0;
-	leaf->version = 0;
-	leaf->magic = 0x1eaf;
-	leaf->map[0].offset = block_size;
-#ifdef BUSHY
-	leaf->map[0].offset = 200;
-#endif
-}
-
-/*
- * Chunk allocation via bitmaps
- */
-
-#define SB_DIRTY 1
-#define SB_SECTOR 8
-#define SB_SIZE 4096
-
-/* Flag the in-memory superblock as modified. */
-void set_sb_dirty(struct superblock *sb)
-{
-	sb->flags = sb->flags | SB_DIRTY;
-}
-
-/* Nonzero when the given bit of the bitmap is set (bit 0 is the low bit of byte 0). */
-static inline int get_bitmap_bit(unsigned char *bitmap, unsigned bit)
-{
-	unsigned char byte = bitmap[bit >> 3];
-
-	return byte & (1 << (bit & 7));
-}
-
-/* Set the given bit of the bitmap. */
-static inline void set_bitmap_bit(unsigned char *bitmap, unsigned bit)
-{
-	unsigned char *byte = &bitmap[bit >> 3];
-
-	*byte |= 1 << (bit & 7);
-}
-
-/* Clear the given bit of the bitmap. */
-static inline void clear_bitmap_bit(unsigned char *bitmap, unsigned bit)
-{
-	unsigned char *byte = &bitmap[bit >> 3];
-
-	*byte &= ~(1 << (bit & 7));
-}
-
-/* Number of bitmap blocks needed for one bit per chunk, rounded up. */
-static unsigned calc_bitmap_blocks(struct superblock *sb, u64 chunks)
-{
-	unsigned chunkshift = sb->image.chunksize_bits;
-	unsigned bits_shift = chunkshift + 3; /* eight bits per bitmap byte */
-
-	return (chunks + (1 << bits_shift) - 1) >> bits_shift;
-}
-
-static void init_allocation(struct superblock *sb)
-{	/* Writes zeroed allocation bitmaps for sb->image.chunks chunks, marks
-	 * the chunks holding the superblock area, the bitmaps and the journal
-	 * as allocated, and fills in bitmap_base, bitmap_blocks, freechunks,
-	 * last_alloc and journal_base in sb->image. */
-	u64 chunks = sb->image.chunks;
-	unsigned bitmaps = calc_bitmap_blocks(sb, chunks);
-	unsigned bitmap_base_chunk = (SB_SECTOR + sb->sectors_per_block + sb->sectors_per_chunk  - 1) >> sb->sectors_per_chunk_bits; /* first chunk past SB_SECTOR plus one block, rounded up */
-	unsigned bitmap_chunks = sb->image.bitmap_blocks = bitmaps; // !!! chunksize same as blocksize
-	unsigned reserved = bitmap_base_chunk + bitmap_chunks + sb->image.journal_size; // !!! chunksize same as blocksize
-	unsigned sector = sb->image.bitmap_base = bitmap_base_chunk << sb->sectors_per_chunk_bits;
-
-	warn("snapshot store size: %Lu chunks (%Lu sectors)", chunks, chunks << sb->sectors_per_chunk_bits);
-	printf("Initializing %u bitmap blocks... ", bitmaps);
-
-	unsigned i;
-	for (i = 0; i < bitmaps; i++, sector += sb->sectors_per_block) {
-		struct buffer *buffer = getblk(sb->snapdev, sector, sb->blocksize);
-		printf("%Lx ", buffer->sector);
-		memset(buffer->data, 0, sb->blocksize);
-		/* Reserve bitmaps and superblock */
-		if (i == 0) { /* NOTE(review): all reserved bits are set in the first bitmap block only */
-			unsigned i; /* shadows the outer loop counter */
-			for (i = 0; i < reserved; i++)
-				set_bitmap_bit(buffer->data, i);
-		}
-		/* Suppress overrun allocation in partial last byte */
-		if (i == bitmaps - 1 && (chunks & 7))
-			buffer->data[(chunks >> 3) & (sb->blocksize - 1)] |= 0xff << (chunks & 7);
-		trace_off(dump_buffer(buffer, 0, 16);)
-		brelse_dirty(buffer);
-	}
-	printf("\n");
-	sb->image.freechunks = chunks - reserved;
-	sb->image.last_alloc = 0;
-	sb->image.journal_base = (bitmap_base_chunk + bitmap_chunks) << sb->sectors_per_chunk_bits; /* journal starts right after the bitmaps */
-}
-
-/*
- * Clear the allocation bit of a chunk and count it as free.  Warns and
- * changes nothing when the bit is already clear.
- */
-static void free_chunk(struct superblock *sb, chunk_t chunk)
-{
-	unsigned bitmap_shift = sb->image.blocksize_bits + 3;
-	unsigned bitmap_mask = (1 << bitmap_shift ) - 1;
-	u64 bitmap_block = chunk >> bitmap_shift;
-	struct buffer *bitmap;
-
-	trace(printf("free chunk %Lx\n", chunk);)
-	bitmap = snapread(sb, sb->image.bitmap_base + (bitmap_block << sb->sectors_per_block_bits));
-	if (get_bitmap_bit(bitmap->data, chunk & bitmap_mask)) {
-		clear_bitmap_bit(bitmap->data, chunk & bitmap_mask);
-		brelse_dirty(bitmap);
-		sb->image.freechunks++;
-		set_sb_dirty(sb); // !!! optimize this away
-		return;
-	}
-	warn("chunk %Lx already free!", (long long)chunk);
-	brelse(bitmap);
-}
-
-/* Free the chunk containing the given sector address. */
-static inline void free_block(struct superblock *sb, sector_t address)
-{
-	// !!! assumes blocksize = chunksize
-	chunk_t chunk = address >> sb->sectors_per_chunk_bits;
-
-	free_chunk(sb, chunk);
-}
-
-/* Mark one specific chunk as allocated; asserts it was free.  Just for testing. */
-void grab_chunk(struct superblock *sb, chunk_t chunk)
-{
-	unsigned bitmap_shift = sb->image.blocksize_bits + 3;
-	unsigned bitmap_mask = (1 << bitmap_shift ) - 1;
-	u64 bitmap_block = chunk >> bitmap_shift;
-	struct buffer *bitmap;
-
-	bitmap = snapread(sb, sb->image.bitmap_base + (bitmap_block << sb->sectors_per_block_bits));
-	assert(!get_bitmap_bit(bitmap->data, chunk & bitmap_mask));
-	set_bitmap_bit(bitmap->data, chunk & bitmap_mask);
-	brelse_dirty(bitmap);
-}
-
-chunk_t alloc_chunk_range(struct superblock *sb, chunk_t chunk, chunk_t range)
-{	/* Scans the allocation bitmap a byte at a time, starting at the byte
-	 * that holds `chunk` and covering (range + bit + 7) >> 3 bytes,
-	 * wrapping from the last bitmap block to block 0.  Sets the lowest
-	 * clear bit of the first byte that is not 0xff, decrements
-	 * freechunks and returns that chunk number.  Returns 0 when every
-	 * byte scanned is full. */
-	unsigned bitmap_shift = sb->image.blocksize_bits + 3, bitmap_mask = (1 << bitmap_shift ) - 1;
-	u64 blocknum = chunk >> bitmap_shift;
-	unsigned bit = chunk & 7, offset = (chunk & bitmap_mask) >> 3; /* bit within the first byte; byte offset within the bitmap block */
-	u64 length = (range + bit + 7) >> 3; /* bytes left to scan */
-
-	while (1) {
-		struct buffer *buffer = snapread(sb, sb->image.bitmap_base + (blocknum << sb->sectors_per_block_bits));
-		unsigned char c, *p = buffer->data + offset;
-		unsigned tail = sb->blocksize  - offset, n = tail > length? length: tail; /* n = bytes to scan in this block */
-	
-		trace_off(printf("search %u bytes of bitmap %Lx from offset %u\n", n, blocknum, offset);)
-		// dump_buffer(buffer, 4086, 10);
-	
-		for (length -= n; n--; p++)
-			if ((c = *p) != 0xff) {
-				int i, bit; /* this bit is a mask and shadows the outer bit; NOTE(review): the starting bit within the first byte is not honoured */
-				trace_off(printf("found byte at offset %u of bitmap %Lx = %hhx\n", p - buffer->data, blocknum, c);)
-				for (i = 0, bit = 1;; i++, bit <<= 1)
-					if (!(c & bit)) {
-						chunk = i + ((p - buffer->data) << 3) + (blocknum << bitmap_shift); /* bit index back to a chunk number */
-						assert(!get_bitmap_bit(buffer->data, chunk & bitmap_mask));
-						set_bitmap_bit(buffer->data, chunk & bitmap_mask);
-						brelse_dirty(buffer);
-						sb->image.freechunks--;
-						set_sb_dirty(sb); // !!! optimize this away
-						return chunk;
-					}
-			}
-	
-		brelse(buffer);
-		if (!length) /* whole range scanned without finding a free chunk */
-			return 0;
-		if (++blocknum == sb->image.bitmap_blocks)
-			 blocknum = 0;
-		offset = 0;
-		trace_off(printf("go to bitmap %Lx\n", blocknum);)
-	}
-}
-
-/*
- * Allocate one chunk, searching from the last allocation to the end of the
- * store and then from the start up to the last allocation.  Reports through
- * error() when both searches come back empty.
- */
-chunk_t alloc_chunk(struct superblock *sb)
-{
-	chunk_t last = sb->image.last_alloc;
-	chunk_t total = sb->image.chunks;
-	chunk_t found = alloc_chunk_range(sb, last, total - last);
-
-	if (!found) {
-		found = alloc_chunk_range(sb, 0, last);
-		if (!found)
-			error("snapshot store full, do something");
-	}
-
-	sb->image.last_alloc = found;
-	set_sb_dirty(sb); // !!! optimize this away
-	return found;
-}
-
-/* Snapshot Store Allocation */
-
-/* Allocate a chunk and return its sector address. */
-sector_t alloc_block(struct superblock *sb)
-{
-	// !!! assume blocksize = chunksize
-	chunk_t chunk = alloc_chunk(sb);
-
-	return chunk << sb->sectors_per_chunk_bits;
-}
-
-/* Allocate a chunk to hold an exception and return its chunk number. */
-u64 alloc_exception(struct superblock *sb)
-{
-	chunk_t chunk = alloc_chunk(sb);
-
-	return chunk;
-}
-
-/* Allocate a block in the snapshot store and getblk() a buffer for it. */
-struct buffer *new_block(struct superblock *sb)
-{
-	sector_t where = alloc_block(sb);
-
-	return getblk(sb->snapdev, where, sb->blocksize);
-}
-
-/* Allocate a block, initialise it as an empty leaf and mark it dirty. */
-struct buffer *new_leaf(struct superblock *sb)
-{
-	struct buffer *leafbuf;
-
-	trace(printf("New leaf\n");)
-	leafbuf = new_block(sb);
-	init_leaf(buffer2leaf(leafbuf), sb->blocksize);
-	set_buffer_dirty(leafbuf);
-	return leafbuf;
-}
-
-/* Allocate a block, initialise it as an index node with no entries and mark it dirty. */
-struct buffer *new_node(struct superblock *sb)
-{
-	struct buffer *nodebuf;
-
-	trace(printf("New node\n");)
-	nodebuf = new_block(sb);
-	buffer2node(nodebuf)->count = 0;
-	set_buffer_dirty(nodebuf);
-	return nodebuf;
-}
-
-/* BTree debug dump */
-
-/*
- * Recursively print the subtree under an index node.  levels is the number
- * of index levels below node; at zero the children are leaves.
- */
-void show_subtree(struct superblock *sb, struct enode *node, int levels, int indent)
-{
-	int child;
-
-	printf("%*s", indent, "");
-	printf("%i nodes:\n", node->count);
-	for (child = 0; child < node->count; child++) {
-		struct index_entry *entry = &node->entries[child];
-		struct buffer *childbuf = snapread(sb, entry->sector);
-
-		/* entries[0] has no pivot key */
-		if (child != 0)
-			printf("pivot = %Lx\n", (long long)entry->key);
-
-		if (!levels) {
-			printf("%*s", indent + 3, "");
-			show_leaf(buffer2leaf(childbuf));
-		} else
-			show_subtree(sb, buffer2node(childbuf), levels - 1, indent + 3);
-
-		brelse(childbuf);
-	}
-}
-
-/* Print the whole exception btree starting from the root index node. */
-void show_tree(struct superblock *sb)
-{
-	struct buffer *rootbuf = snapread(sb, sb->image.etree_root);
-	int below_root = sb->image.etree_levels - 1;
-
-	show_subtree(sb, buffer2node(rootbuf), below_root, 0);
-	brelse(rootbuf);
-}
-
-/* High Level BTree Editing */
-
-/*
- * BTree insertion is a little hairy, as expected.  We keep track of the
- * access path in a vector of etree_path elements, each of which holds
- * a node buffer and a pointer into the buffer data giving the address at
- * which the next buffer in the path was found, which is also where a new
- * node will be inserted if necessary.  If a leaf is split we may need to
- * work all the way up from the bottom to the top of the path, splitting
- * index nodes as well.  If we split the top index node we need to add
- * a new tree level.  We have to keep track of which nodes were modified
- * and keep track of refcounts of all buffers involved, which can be quite
- * a few.
- *
- * Note that the first key of an index block is never accessed.  This is
- * because for a btree, there is always one more key than nodes in each
- * index node.  In other words, keys lie between node pointers.  We will
- * micro-optimize by placing the node count in the first key, which allows
- * a node to contain an esthetically pleasing binary number of pointers.
- * (Not done yet.)
- */
-
-#define MAX_ETREE_DEPTH 6
-
-struct etree_path { struct buffer *buffer; struct index_entry *pnext; }; /* one level of a root-to-leaf path: the node buffer, and the entry just after the child probe() descended into */
-
-/*
- * Walk from the root to the leaf that covers chunk, recording each index
- * node buffer and the entry following the chosen child in path[].  Returns
- * the leaf buffer; the node buffers stay held in path[].
- */
-struct buffer *probe(struct superblock *sb, u64 chunk, struct etree_path *path)
-{
-	unsigned depth = sb->image.etree_levels, level;
-	struct buffer *cur = snapread(sb, sb->image.etree_root);
-
-	for (level = 0; level < depth; level++) {
-		struct enode *node = buffer2node(cur);
-		struct index_entry *limit = node->entries + node->count;
-		struct index_entry *entry = node->entries + 1;
-
-		/* The key of entries[0] is never examined */
-		while (entry < limit && entry->key <= chunk)
-			entry++;
-
-		path[level].buffer = cur;
-		path[level].pnext = entry;
-		cur = snapread(sb, (entry - 1)->sector);
-	}
-	assert(((struct eleaf *)cur->data)->magic == 0x1eaf);
-	return cur;
-}
-
-/* Release the node buffers held in the first `levels` elements of a path. */
-void brelse_path(struct etree_path *path, unsigned levels)
-{
-	struct etree_path *step;
-
-	for (step = path; step < path + levels; step++)
-		brelse(step->buffer);
-}
-
-void show_tree_range(struct superblock *sb, chunk_t start, unsigned leaves)
-{	/* Prints `leaves` consecutive leaves with show_leaf(), beginning at
-	 * the leaf that probe() finds for chunk `start` and continuing in
-	 * tree order.  Returns early if the tree runs out first. */
-	int levels = sb->image.etree_levels, level = -1;
-	struct etree_path path[levels];
-	struct buffer *nodebuf;
-	struct enode *node;
-	struct buffer *leafbuf;
-
-#if 1
-	leafbuf = probe(sb, start, path); /* fills path[] down to the starting leaf */
-	level = levels - 1;
-	nodebuf = path[level].buffer;
-	node = buffer2node(nodebuf);
-	goto start; /* jumps into the leaf loop below with the probed leaf in hand */
-#endif
-
-	while (1) {
- 		do { /* descend to the lowest index level, recording the path */
-			level++;
-			nodebuf = snapread(sb, level? path[level - 1].pnext++->sector: sb->image.etree_root);
-			node = buffer2node(nodebuf);
-			path[level].buffer = nodebuf;
-			path[level].pnext = node->entries;
-			trace(printf("push to level %i, %i nodes\n", level, node->count);)
-		} while (level < levels - 1);
-
-		trace(printf("do %i leaf nodes level = %i\n", node->count, level);)
-		while (path[level].pnext  < node->entries + node->count) { /* remaining leaves under this node */
-			leafbuf = snapread(sb, path[level].pnext++->sector);
-start:		show_leaf(buffer2leaf(leafbuf));
-			brelse(leafbuf);
-			if (!--leaves) { /* requested number of leaves shown */
-				brelse_path(path, level + 1);
-				return;
-			}
-		}
-
-		do { /* climb until a node with unvisited children is found */
-			brelse(nodebuf);
-			if (!level) /* root exhausted: nothing left to show */
-				return;
-			nodebuf = path[--level].buffer;
-			node = buffer2node(nodebuf);
-			trace(printf("pop to level %i, %i of %i nodes\n", level, path[level].pnext - node->entries, node->count);)
-		} while (path[level].pnext == node->entries + node->count);
-	};
-}
-
-/*
- * Insert a (childkey, child sector) entry at position p of an index node,
- * shifting the entries from p onward up by one.
- */
-void insert_child(struct enode *node, struct index_entry *p, sector_t child, u64 childkey)
-{
-	struct index_entry *end = &node->entries[0] + node->count;
-
-	memmove(p + 1, p, (char *)end - (char *)p);
-	p->key = childkey;
-	p->sector = child;
-	node->count++;
-}
-
-/*
- * Record `exception` for chunk `target` and snapshot `snapnum` in the leaf held
- * by `leafbuf`.  When the leaf has no room it is split and the new leaf is
- * inserted into the parent index nodes recorded in `path`, splitting full
- * nodes on the way up and adding a new root if the split reaches the top.
- * `leafbuf` is released here; the buffers in `path` are not.
- */
-void add_exception_to_tree(struct superblock *sb, struct buffer *leafbuf, u64 target, u64 exception, int snapnum, struct etree_path path[], unsigned levels)
-{
-	/* Fast path: a zero return from add_exception_to_leaf is treated as success */
-	if (!add_exception_to_leaf(buffer2leaf(leafbuf), target, exception, snapnum, sb->snapmask)) {
-		brelse_dirty(leafbuf);
-		return;
-	}
-
-	trace(printf("add leaf\n");)
-	struct buffer *childbuf = new_leaf(sb);
-	u64 childkey = split_leaf(buffer2leaf(leafbuf), buffer2leaf(childbuf));
-	sector_t childsector = childbuf->sector;
-
-	/* Retry in whichever half now covers `target`; a second failure is fatal */
-	if (add_exception_to_leaf(target < childkey? buffer2leaf(leafbuf): buffer2leaf(childbuf), target, exception, snapnum, sb->snapmask))
-		error("can't happen");
-	brelse_dirty(leafbuf);
-	brelse_dirty(childbuf);
-
-	/* Walk up the path inserting (childkey, childsector) into each parent */
-	while (levels--) {
-		struct index_entry *pnext = path[levels].pnext;
-		struct buffer *parentbuf = path[levels].buffer;
-		struct enode *parent = buffer2node(parentbuf);
-
-		/* Parent has room: insert and finish */
-		if (parent->count < sb->blocks_per_node) {
-			insert_child(parent, pnext, childsector, childkey);
-			set_buffer_dirty(parentbuf);
-			return;
-		}
-
-		/* Parent is full: move its upper half into a freshly allocated node */
-		unsigned half = parent->count / 2;
-		u64 newkey = parent->entries[half].key;
-		struct buffer *newbuf = new_node(sb);
-		struct enode *newnode = buffer2node(newbuf);
-
-		newnode->count = parent->count - half;
-		memcpy(&newnode->entries[0], &parent->entries[half], newnode->count * sizeof(struct index_entry));
-		parent->count = half;
-
-		/* If the insertion point moved to the new node, retarget pnext there */
-		if (pnext > &parent->entries[half]) {
-			pnext = pnext - &parent->entries[half] + newnode->entries;
-			set_buffer_dirty(parentbuf);
-			parentbuf = newbuf;
-			parent = newnode;
-		} else set_buffer_dirty(newbuf);
-
-		insert_child(parent, pnext, childsector, childkey);
-		set_buffer_dirty(parentbuf);
-		/* The new node becomes the child to insert one level further up */
-		childkey = newkey;
-		childsector = newbuf->sector;
-		brelse(newbuf);
-	}
-
-	/* The split reached the root: add a new root above the old one */
-	trace(printf("add tree level\n");)
-	struct buffer *newrootbuf = new_node(sb); // !!! handle error
-	struct enode *newroot = buffer2node(newrootbuf);
-
-	newroot->count = 2;
-	newroot->entries[0].sector = sb->image.etree_root;
-	newroot->entries[1].key = childkey;
-	newroot->entries[1].sector = childsector;
-	sb->image.etree_root = newrootbuf->sector;
-	sb->image.etree_levels++;
-	set_sb_dirty(sb);
-	brelse_dirty(newrootbuf);
-}
-/* Bit index of the top bit of a chunk_t; set in sb->source_chunk to mark a
- * copy whose source is the snapshot store rather than the origin volume. */
-#define chunk_highbit ((sizeof(chunk_t) * 8) - 1)
-
-/*
- * Perform the pending batched copy-out, if any: read sb->copy_chunks chunks
- * starting at sb->source_chunk (from the snapshot device when the high bit is
- * set, otherwise from the origin device) and write them to the snapshot
- * device at sb->dest_exception.  The pending batch is always cleared.
- *
- * Returns 0 on success or when nothing was pending, -1 when the read or the
- * write failed or transferred fewer bytes than requested.
- */
-int finish_copyout(struct superblock *sb)
-{
-	int err = 0;
-
-	if (sb->copy_chunks) {
-		int is_snap = sb->source_chunk >> chunk_highbit;
-		chunk_t source = sb->source_chunk & ~(1ULL << chunk_highbit);
-		unsigned size = sb->copy_chunks << sb->image.chunksize_bits;
-		trace(warn("copy %u %schunks from %Lx to %Lx", sb->copy_chunks, is_snap? "snapshot ": "", source, sb->dest_exception);)
-		assert(size <= sb->copybuf_size);
-		/* Do not write the buffer out if it could not be filled completely */
-		if (pread(is_snap? sb->snapdev: sb->orgdev, sb->copybuf, size, source << sb->image.chunksize_bits) != (ssize_t)size) {
-			warn("copyout read of chunk %Lx failed or was short", (long long)source);
-			err = -1;
-		} else if (pwrite(sb->snapdev, sb->copybuf, size, sb->dest_exception << sb->image.chunksize_bits) != (ssize_t)size) {
-			warn("copyout write to chunk %Lx failed or was short", (long long)sb->dest_exception);
-			err = -1;
-		}
-		sb->copy_chunks = 0;
-	}
-	return err;
-}
-
-/*
- * Queue a one-chunk copy from `chunk` to `exception`.  A request that directly
- * continues the pending batch (both source and destination are the next
- * chunk in sequence) and still fits in the copy buffer just extends the
- * batch; anything else flushes the pending batch and starts a new one.
- * Always returns 0.
- */
-int copyout(struct superblock *sb, chunk_t chunk, chunk_t exception)
-{
-#if 1
-	int contiguous = sb->source_chunk + sb->copy_chunks == chunk &&
-		sb->dest_exception + sb->copy_chunks == exception;
-	int has_room = sb->copy_chunks < sb->copybuf_size >> sb->image.chunksize_bits;
-
-	if (contiguous && has_room) {
-		sb->copy_chunks++;
-	} else {
-		finish_copyout(sb);
-		sb->source_chunk = chunk;
-		sb->dest_exception = exception;
-		sb->copy_chunks = 1;
-	}
-#else
-	int is_snap = sb->source_chunk >> chunk_highbit;
-	chunk_t source = chunk & ~((1ULL << chunk_highbit) - 1);
-	pread(is_snap? sb->snapdev: sb->orgdev, sb->copybuf, sb->chunksize, source << sb->image.chunksize_bits);  // 64 bit!!!
-	pwrite(sb->snapdev, sb->copybuf, sb->chunksize, exception << sb->image.chunksize_bits);  // 64 bit!!!
-#endif
-	return 0;
-}
-
-/*
- * Make `chunk` unique for `snapnum` (-1 selects the origin).  The leaf for the
- * chunk is probed and tested; only when the chunk is not already unique is a
- * new exception allocated, the data queued for copy-out and the exception
- * added to the btree.
- *
- * Returns the new exception when one was created.  Otherwise returns the
- * value left in `exception` by the uniqueness test, which stays 0 for the
- * origin case.
- */
-chunk_t make_unique(struct superblock *sb, chunk_t chunk, int snapnum)
-{
-	chunk_t exception = 0;
-	unsigned levels = sb->image.etree_levels;
-	struct etree_path path[levels + 1];
-	struct buffer *leafbuf = probe(sb, chunk, path);
-	trace(warn("chunk %Lx, snapnum %i", chunk, snapnum));
-
-	if (!(snapnum == -1?
-		origin_chunk_unique(buffer2leaf(leafbuf), chunk, sb->snapmask):
-		snapshot_chunk_unique(buffer2leaf(leafbuf), chunk, snapnum, &exception)))
-	{
-		u64 newex = alloc_exception(sb);
-		/* An existing exception is copied from the snapshot store: flag
-		 * that by setting the high bit of the source chunk */
-		chunk_t source = exception? (exception | (1ULL << chunk_highbit)): chunk;
-
-		copyout(sb, source, newex);
-		add_exception_to_tree(sb, leafbuf, chunk, newex, snapnum, path, levels);
-		exception = newex;
-	} else {
-		trace(warn("chunk %Lx already unique in snapnum %i", chunk, snapnum);)
-		brelse(leafbuf);
-	}
-	brelse_path(path, levels);
-	return exception;
-}
-
-/*
- * Report whether `chunk` is already unique for `snapnum` (-1 selects the
- * origin) without modifying the tree.  For a snapshot the uniqueness test is
- * handed `exception` to fill in.  All buffers read are released.
- */
-int test_unique(struct superblock *sb, chunk_t chunk, int snapnum, chunk_t *exception)
-{
-	unsigned levels = sb->image.etree_levels;
-	struct etree_path path[levels + 1];
-	struct buffer *leafbuf = probe(sb, chunk, path);
-	int result;
-	trace(warn("chunk %Lx, snapnum %i", chunk, snapnum));
-
-	if (snapnum == -1)
-		result = origin_chunk_unique(buffer2leaf(leafbuf), chunk, sb->snapmask);
-	else
-		result = snapshot_chunk_unique(buffer2leaf(leafbuf), chunk, snapnum, exception);
-
-	brelse(leafbuf);
-	brelse_path(path, levels);
-	return result;
-}
-
-/* Snapshot Store Superblock handling */
-
-/* Build the bitmask with one bit set for the `bit` of every listed snapshot. */
-u64 calc_snapmask(struct superblock *sb)
-{
-	u64 mask = 0;
-	int n = 0;
-
-	while (n < sb->image.snapshots) {
-		mask |= 1ULL << sb->image.snaplist[n].bit;
-		n++;
-	}
-	return mask;
-}
-
-/* Return the internal bit number of the snapshot tagged `tag`, or -1 if none. */
-int tag2snapnum(struct superblock *sb, unsigned tag)
-{
-	struct snapshot *snap = sb->image.snaplist;
-	unsigned left = sb->image.snapshots;
-
-	for (; left; left--, snap++) {
-		if (snap->tag == tag)
-			return snap->bit;
-	}
-	return -1;
-}
-
-/* Return the tag of the snapshot using internal bit `bit`, or -1 if none. */
-int snapnum2tag(struct superblock *sb, unsigned bit)
-{
-	struct snapshot *snap = sb->image.snaplist;
-	unsigned left = sb->image.snapshots;
-
-	for (; left; left--, snap++) {
-		if (snap->bit == bit)
-			return snap->tag;
-	}
-	return -1;
-}
-
-/*
- * Register a new snapshot under `snaptag`, giving it the lowest bit not set
- * in sb->snapmask.  Returns the assigned bit, -1 if the tag is already in
- * use, or -EFULL if all MAX_SNAPSHOTS bits are taken.
- */
-int create_snapshot(struct superblock *sb, unsigned snaptag)
-{
-	unsigned n, bit, snapshots = sb->image.snapshots;
-	struct snapshot *snapshot;
-
-	/* Reject a tag that is already in use */
-	for (n = 0; n < snapshots; n++) {
-		if (sb->image.snaplist[n].tag == snaptag)
-			return -1;
-	}
-
-	/* Pick the first bit not yet set in the snapshot mask */
-	bit = 0;
-	while (bit < MAX_SNAPSHOTS && (sb->snapmask & (1ULL << bit)))
-		bit++;
-	if (bit >= MAX_SNAPSHOTS)
-		return -EFULL;
-
-	trace_on(printf("Create snapshot %i (internal %i)\n", snaptag, bit);)
-	snapshot = sb->image.snaplist + sb->image.snapshots++;
-	*snapshot = (struct snapshot){ .tag = snaptag, .bit = bit, .create_time = time(NULL) };
-	sb->snapmask |= (1ULL << bit);
-	set_sb_dirty(sb);
-	return bit;
-}
-
-/*
- * delete_snapshots_from_leaf: clear the bits in `snapmask` from the share
- * mask of every exception in a leaf.  Exceptions are scanned from the top of
- * the exception list downwards; those whose share mask stays nonzero are
- * packed towards the top of the block and the others have their chunk freed.
- * The directory map is then scanned from the bottom up, dropping entries that
- * no longer own any exception.
- *
- * Once dirty_buffer_count reaches `max_dirty` no further bits are cleared in
- * this leaf (the remaining exceptions are only repacked).
- *
- * Returns 1 if any exception carried a bit from `snapmask`, otherwise 0.
- */
-int delete_snapshots_from_leaf(struct superblock *sb, struct eleaf *leaf, u64 snapmask, unsigned max_dirty)
-{
-	struct exception *p = emap(leaf, leaf->count), *dest = p;
-	struct etree_map *pmap, *dmap;
-	unsigned i, any = 0;
-
-	/* Scan top to bottom clearing snapshot bits and moving
-	 * non-zero entries to top of block */
-	for (i = leaf->count; i--;) {
-		while (p != emap(leaf, i)) {
-			/* Same sense as the threshold test in delete_tree_range: stop
-			 * deleting once the dirty buffer budget has been used up */
-			if (dirty_buffer_count >= max_dirty)
-				snapmask = 0;
-			u64 share = (--p)->share;
-
-			/* Collapse to 0/1 first: `any` is narrower than the u64 mask */
-			any |= !!(share & snapmask);
-			/* Store the cleared mask so the deleted bits are really gone */
-			if ((p->share = share & ~snapmask))
-				*--dest = *p;
-			else
-				free_chunk(sb, p->chunk);
-		}
-		leaf->map[i].offset = (char *)dest - (char *)leaf;
-	}
-	/* Remove empties from map */
-	dmap = pmap = &leaf->map[0];
-	for (i = 0; i < leaf->count; i++, pmap++)
-		if (pmap->offset != (pmap + 1)->offset)
-			*dmap++ = *pmap;
-	/* Carry the terminating map entry down behind the surviving entries */
-	dmap->offset = pmap->offset;
-	dmap->rchunk = 0; // tidy up
-	leaf->count = dmap - &leaf->map[0];
-	return !!any;
-}
-
-/*
- * Visit every leaf of the exception tree in index order and remove the
- * snapshots in `snapmask` from it.  A leaf that reports a change is released
- * dirty so the modification is written back; index nodes are only read.
- */
-void delete_snapshots_from_tree(struct superblock *sb, u64 snapmask)
-{
-	int levels = sb->image.etree_levels, level = -1;
-	struct etree_path path[levels];
-	struct buffer *nodebuf;
-	struct enode *node;
-
-	trace_on(printf("delete snapshot mask %Lx\n", snapmask);)
-	while (1) {
-		/* Push: read index nodes until the lowest index level is reached */
- 		do {
-			level++;
-			nodebuf = snapread(sb, level? path[level - 1].pnext++->sector: sb->image.etree_root);
-			node = buffer2node(nodebuf);
-			path[level].buffer = nodebuf;
-			path[level].pnext = node->entries;
-			trace(printf("push to level %i, %i nodes\n", level, node->count);)
-		} while (level < levels - 1);
-
-		trace(printf("do %i leaf nodes\n", node->count);)
-		while (path[level].pnext  < node->entries + node->count) {
-			struct buffer *leafbuf = snapread(sb, path[level].pnext++->sector);
-			trace_off(printf("process leaf %Lx\n", leafbuf->sector);)
-			/* -1 converts to the largest unsigned value: no dirty-buffer limit */
-			if (delete_snapshots_from_leaf(sb, buffer2leaf(leafbuf), snapmask, -1))
-				brelse_dirty(leafbuf);
-			else
-				brelse(leafbuf);
-		}
-
-		/* Pop: release finished nodes; done once the root itself is finished */
-		do {
-			brelse(nodebuf);
-			if (!level)
-				return;
-			nodebuf = path[--level].buffer;
-			node = buffer2node(nodebuf);
-			trace(printf("pop to level %i, %i of %i nodes\n", level, path[level].pnext - node->entries, node->count);)
-		} while (path[level].pnext == node->entries + node->count);
-	}
-}
-
-/*
- * Delete algorithm (flesh this out)
- *
- * reached the end of an index block:
- *    try to merge with an index block in hold[]
- *    if can't merge then maybe can rebalance
- *    if can't merge then release the block in hold[] and move this block to hold[]
- *    can't merge if there's no block in hold[] or can't fit two together
- *    if can merge
- *       release and free this index block and
- *       delete from parent:
- *         if parent count zero, the grandparent key is going to be deleted, updating the pivot
- *         otherwise parent's deleted key becomes new pivot 
-*/
-
-/* Return the index node stored in the buffer recorded at `level` of `path`. */
-static inline struct enode *path_node(struct etree_path path[], int level)
-{
-	struct buffer *nodebuf = path[level].buffer;
-
-	return buffer2node(nodebuf);
-}
-
-/* True when the cursor at `level` has moved past the last entry of its node. */
-static inline int finished_level(struct etree_path path[], int level)
-{
-	struct enode *node = buffer2node(path[level].buffer);
-	struct index_entry *end = node->entries + node->count;
-
-	return path[level].pnext == end;
-}
-
-/*
- * Delete the index entry just before the cursor (path[level].pnext - 1) from
- * the node at `level`, step the cursor back by one and mark the node dirty.
- * If the deleted entry was the first of its node and other entries remain,
- * the key at the cursor is stored in the nearest ancestor whose current entry
- * is not the first of its node.
- */
-void remove_index(struct etree_path path[], int level)
-{
-	struct enode *node = path_node(path, level);
-	chunk_t pivot = (path[level].pnext)->key; // !!! out of bounds for delete of last from full index
-	int count = node->count, i;
-
-	// stomps the node count (if 0th key holds count)
-	/* Shift the entries from the cursor to the end down over the deleted one */
-	memmove(path[level].pnext - 1, path[level].pnext,
-		(char *)&node->entries[count] - (char *)path[level].pnext);
-	node->count = count - 1;
-	--(path[level].pnext);
-	set_buffer_dirty(path[level].buffer);
-
-	// no pivot for last entry
-	if (path[level].pnext == node->entries + node->count)
-		return;
-
-	// climb up to common parent and set pivot to deleted key
-	// what if index is now empty? (no deleted key)
-	// then some key above is going to be deleted and used to set pivot
-	if (path[level].pnext == node->entries && level) {
-		/* Skip ancestors whose current entry is their first; give up at the root */
-		for (i = level - 1; path[i].pnext - 1 == path_node(path, i)->entries; i--)
-			if (!i)
-				return;
-		(path[i].pnext - 1)->key = pivot;
-		set_buffer_dirty(path[i].buffer);
-	}
-}
-
-
-/*
- * Release `buffer` and, provided its count has then dropped to zero, free its
- * block in the snapshot store and evict the buffer.  A buffer that still has
- * a nonzero count only produces a warning.
- */
-static void brelse_free(struct superblock *sb, struct buffer *buffer)
-{
-	brelse(buffer);
-	if (!buffer->count) {
-		free_block(sb, buffer->sector);
-		evict_buffer(buffer);
-		return;
-	}
-	warn("free block %Lx still in use!", (long long)buffer->sector);
-}
-
-/*
- * Walk the leaves in order starting at the leaf probe() returns for chunk
- * `resume`, removing the snapshots in `snapmask` from each one.  A leaf whose
- * payload fits into the free space of the previous leaf is merged into it;
- * finished index nodes are merged into the held previous node of the same
- * level when the combined count fits, and single-entry roots are dropped.
- *
- * Returns 0 when the walk reached the end of the tree and 1 when it stopped
- * early because dirty_buffer_count reached `max_dirty`.
- * NOTE(review): the early exit does not record where to resume (see the
- * commented-out delete_progress line).
- */
-static chunk_t delete_tree_range(struct superblock *sb, u64 snapmask, chunk_t resume, unsigned max_dirty)
-{
-	int levels = sb->image.etree_levels, level = levels - 1;
-	struct etree_path path[levels], hold[levels];
-	struct buffer *leafbuf, *prevleaf = NULL;
-	unsigned i;
-
-	/* hold[] keeps the previous node of each level as a merge target */
-	for (i = 0; i < levels; i++) // can be initializer if not dynamic array (change it?)
-		hold[i] = (struct etree_path){ };
-
-	leafbuf = probe(sb, resume, path);
-
-	while (1) { /* in-order leaf walk */
-		trace_off(show_leaf(buffer2leaf(leafbuf));)
-		// should pass in and act on max_dirty...
-		if (delete_snapshots_from_leaf(sb, buffer2leaf(leafbuf), snapmask, max_dirty))
-			set_buffer_dirty(leafbuf);
-
-		if (prevleaf) { /* try to merge this leaf with prev */
-			struct eleaf *this = buffer2leaf(leafbuf);
-			struct eleaf *prev = buffer2leaf(prevleaf);
-			trace_off(warn("check leaf %p against %p", leafbuf, prevleaf);)
-			trace_off(warn("need = %i, free = %i", leaf_payload(this), leaf_freespace(prev));)
-			if (leaf_payload(this) <= leaf_freespace(prev)) {
-				trace_off(warn(">>> can merge leaf %p into leaf %p", leafbuf, prevleaf);)
-				merge_leaves(prev, this);
-				remove_index(path, level);
-				set_buffer_dirty(prevleaf);
-				brelse_free(sb, leafbuf);
-				goto keep_prev_leaf;
-			}
-			brelse(prevleaf);
-		}
-		prevleaf = leafbuf;
-keep_prev_leaf:
-		if (finished_level(path, level)) {
-			do { /* pop and try to merge finished nodes */
-				if (hold[level].buffer) {
-					assert(level); /* root node can't have any prev */
-					struct enode *this = path_node(path, level);
-					struct enode *prev = path_node(hold, level);
-					trace_off(warn("check node %p against %p", this, prev);)
-					trace_off(warn("this count = %i prev count = %i", this->count, prev->count);)
-					if (this->count <= sb->blocks_per_node - prev->count) {
-						trace_off(warn(">>> can merge node %p into node %p", this, prev);)
-						merge_nodes(prev, this);
-						remove_index(path, level - 1);
-						set_buffer_dirty(hold[level].buffer);
-						brelse_free(sb, path[level].buffer);
-						goto keep_prev_node;
-					}
-					brelse(hold[level].buffer);
-				}
-				/* No merge: this node becomes the held node for its level */
-				hold[level].buffer = path[level].buffer;
-keep_prev_node:
-				if (!level) { /* remove levels if possible */
-					while (levels > 1 && path_node(hold, 0)->count == 1) {
-						trace_off(warn("drop btree level");)
-						sb->image.etree_root = hold[1].buffer->sector;
-						brelse_free(sb, hold[0].buffer);
-						levels = --sb->image.etree_levels;
-						memcpy(hold, hold + 1, levels * sizeof(hold[0]));
-						set_sb_dirty(sb);
-					}
-					/* Root finished: the whole tree has been walked */
-					brelse(prevleaf);
-					brelse_path(hold, levels);
-					return 0;
-				}
-
-				level--;
-				trace_off(printf("pop to level %i, %i of %i nodes\n", level, path[level].pnext - path_node(path, level)->entries, path_node(path, level)->count);)
-			} while (finished_level(path, level));
-
-			do { /* push back down to leaf level */
-				struct buffer *nodebuf = snapread(sb, path[level++].pnext++->sector);
-				path[level].buffer = nodebuf;
-				path[level].pnext = buffer2node(nodebuf)->entries;
-				trace_off(printf("push to level %i, %i nodes\n", level, path_node(path, level)->count);)
-			} while (level < levels - 1);
-		}
-
-		/* Exit if dirty buffer count above threshold */
-		// might incur a few extra index reads but oh well
-		if (dirty_buffer_count >= max_dirty) {
-			brelse(prevleaf);
-			brelse_path(path, levels);
-			for (i = 0; i < levels; i++)
-				if (hold[i].buffer)
-					brelse(hold[i].buffer);
-//			sb->delete_progress = ???;
-			return 1;
-		}
-
-		/* Advance to the next leaf under the current lowest-level index node */
-		leafbuf = snapread(sb, path[level].pnext++->sector);
-	};
-}
-
-/*
- * Delete the snapshot tagged `tag`: drop it from the snapshot list, clear its
- * bit in sb->snapmask and remove its exceptions from the tree.  Returns the
- * internal bit that was released, or -1 if no snapshot carries that tag.
- */
-int delete_snapshot(struct superblock *sb, unsigned tag)
-{
-	struct snapshot *snapshot;
-	unsigned i = 0, bit;
-
-	while (i < sb->image.snapshots && sb->image.snaplist[i].tag != tag)
-		i++;
-	if (i >= sb->image.snapshots)
-		return -1;
-
-	snapshot = sb->image.snaplist + i;
-	bit = snapshot->bit;
-	trace_on(printf("Delete snapshot %i (internal %i)\n", tag, bit);)
-	/* Close the gap left in the list by the removed entry */
-	sb->image.snapshots--;
-	memmove(snapshot, snapshot + 1, (char *)(sb->image.snaplist + sb->image.snapshots) - (char *)snapshot);
-	sb->snapmask &= ~(1ULL << bit);
-	delete_snapshots_from_tree(sb, 1ULL << bit);
-	set_sb_dirty(sb);
-	return bit;
-}
-
-/* Print the snapshot count followed by bit, tag and creation time of each. */
-void show_snapshots(struct superblock *sb)
-{
-	unsigned total = sb->image.snapshots;
-	struct snapshot *snap = sb->image.snaplist;
-	struct snapshot *end = snap + total;
-
-	printf("%u snapshots\n", total);
-	for (; snap != end; snap++)
-		printf("snapshot %u tag %u created %x\n", snap->bit, snap->tag, snap->create_time);
-}
-
-/* Lock snapshot reads against origin writes */
-
-/* Write the message head followed by head.length bytes of body to `sock`. */
-static void reply(fd_t sock, struct messagebuf *message)
-{
-	size_t total = message->head.length + sizeof(message->head);
-
-	trace(warn("%x/%u", message->head.code, message->head.length);)
-	writepipe(sock, &message->head, total);
-}
-
-/* One connected client of the snapshot server. */
-struct client
-{
-	/* Identifier printed when showing or tracing lock holders */
-	u64 id;
-	/* Descriptor that requests are read from and replies are written to */
-	fd_t sock;
-	/* -1 is handled as an origin client by incoming() */
-	int snap;
-};
-
-/* A reply held back until the snapshot read locks it waits on are released. */
-struct pending
-{
-	/* Outstanding holds; release_lock() sends the message when this hits zero */
-	unsigned holdcount;
-	/* Client whose socket the message is eventually sent to */
-	struct client *client;
-	/* Copy of the reply to send */
-	struct messagebuf message;
-};
-
-/* Entry in a lock's singly linked list of waiting pending replies. */
-struct snaplock_wait
-{
-	struct pending *pending;
-	struct snaplock_wait *next;
-};
-
-/* Entry in a lock's singly linked list of clients holding the lock. */
-struct snaplock_hold
-{
-	struct client *client;
-	struct snaplock_hold *next;
-};
-
-/* Read lock on one chunk, chained into a bucket of the sb->snaplocks table. */
-struct snaplock
-{
-	/* Pending replies to send once the last holder has released */
-	struct snaplock_wait *waitlist;
-	/* Clients currently holding the lock */
-	struct snaplock_hold *holdlist;
-	/* Next lock in the same hash bucket */
-	struct snaplock *next;
-	/* Chunk this lock covers */
-	chunk_t chunk;
-};
-
-/* Allocate an uninitialized snaplock; `sb` is unused.  NULL if malloc fails. */
-struct snaplock *new_snaplock(struct superblock *sb)
-{
-	struct snaplock *lock = malloc(sizeof *lock);
-
-	return lock;
-}
-
-/* Allocate an uninitialized wait entry; `sb` is unused.  NULL if malloc fails. */
-struct snaplock_wait *new_snaplock_wait(struct superblock *sb)
-{
-	struct snaplock_wait *wait = malloc(sizeof *wait);
-
-	return wait;
-}
-
-/* Allocate an uninitialized hold entry; `sb` is unused.  NULL if malloc fails. */
-struct snaplock_hold *new_snaplock_hold(struct superblock *sb)
-{
-	struct snaplock_hold *hold = malloc(sizeof *hold);
-
-	return hold;
-}
-
-/* Free a snaplock with free(); `sb` is unused. */
-void free_snaplock(struct superblock *sb, struct snaplock *p)
-{
-	free(p);
-}
-
-/* Free a hold entry with free(); `sb` is unused. */
-void free_snaplock_hold(struct superblock *sb, struct snaplock_hold *p)
-{
-	free(p);
-}
-
-/* Free a wait entry with free(); `sb` is unused. */
-void free_snaplock_wait(struct superblock *sb, struct snaplock_wait *p)
-{
-	free(p);
-}
-
-/*
- * Map `chunk` to a bucket index: multiply by a fixed constant, truncate to 32
- * bits and keep the top sb->snaplock_hash_bits bits of the product.
- */
-unsigned snaplock_hash(struct superblock *sb, chunk_t chunk)
-{
-	u32 mixed = (u32)(chunk * 3498734713U);
-
-	return mixed >> (32 - sb->snaplock_hash_bits);
-}
-
-/* Return the lock for `chunk` in the chain starting at `list`, or NULL. */
-struct snaplock *find_snaplock(struct snaplock *list, chunk_t chunk)
-{
-	struct snaplock *lock = list;
-
-	while (lock && lock->chunk != chunk)
-		lock = lock->next;
-	return lock;
-}
-
-/*
- * If `chunk` is currently locked, add *pending to the lock's wait list and
- * count one more hold on it.  *pending is allocated on first use with an
- * initial hold count of 1.  Nothing happens when the chunk is not locked.
- */
-void waitfor_chunk(struct superblock *sb, chunk_t chunk, struct pending **pending)
-{
-	struct snaplock *lock = find_snaplock(sb->snaplocks[snaplock_hash(sb, chunk)], chunk);
-	struct snaplock_wait *wait;
-
-	if (!lock)
-		return;
-
-	if (!*pending) {
-		// arguably we should know the client and fill it in here
-		*pending = calloc(1, sizeof(struct pending));
-		(*pending)->holdcount = 1;
-	}
-
-	/* Push a new waiter onto the front of the lock's wait list */
-	wait = new_snaplock_wait(sb);
-	wait->pending = *pending;
-	wait->next = lock->waitlist;
-	lock->waitlist = wait;
-	(*pending)->holdcount++;
-}
-
-/*
- * Record `client` as a holder of the read lock on `chunk`, creating the lock
- * at the head of its hash bucket if it does not exist yet.
- */
-void readlock_chunk(struct superblock *sb, chunk_t chunk, struct client *client)
-{
-	struct snaplock **bucket = &sb->snaplocks[snaplock_hash(sb, chunk)];
-	struct snaplock *lock = find_snaplock(*bucket, chunk);
-	struct snaplock_hold *hold;
-
-	if (!lock) {
-		lock = new_snaplock(sb);
-		*lock = (struct snaplock){ .next = *bucket, .chunk = chunk };
-		*bucket = lock;
-	}
-
-	/* Push the new holder onto the front of the hold list */
-	hold = new_snaplock_hold(sb);
-	hold->next = lock->holdlist;
-	hold->client = client;
-	lock->holdlist = hold;
-}
-
-/*
- * Remove one hold by `client` from `lock`.  When that was the last hold, each
- * waiter's pending hold count is decremented, pending replies that reach zero
- * are sent and freed, and the lock itself is freed.
- *
- * Returns `lock` if other holders remain, lock->next if the lock was freed,
- * and NULL if `client` was not a holder.  Note that lock->next may itself be
- * NULL, so a NULL return does not distinguish "not a holder" from "lock freed
- * and it was the last in its chain".
- */
-struct snaplock *release_lock(struct superblock *sb, struct snaplock *lock, struct client *client)
-{
-	struct snaplock *ret = lock;
-	struct snaplock_hold **holdp = &lock->holdlist;
-	/* Find the link that points at this client's hold record */
-	while (*holdp && (*holdp)->client != client)
-		holdp = &(*holdp)->next;
-
-	if (!*holdp) {
-		trace_on(printf("chunk %Lx holder %Lu not found\n", lock->chunk, client->id);)
-		return NULL;
-	}
-
-	/* Delete and free holder record */
-	struct snaplock_hold *next = (*holdp)->next;
-	free_snaplock_hold(sb, *holdp);
-	*holdp = next;
-
-	if (lock->holdlist)
-		return ret;
-
-	/* Release and delete waiters, delete lock */
-	struct snaplock_wait *list = lock->waitlist;
-	while (list) {
-		struct snaplock_wait *next = list->next;
-		assert(list->pending->holdcount);
-		/* Last hold on this pending reply gone: send it and free it */
-		if (!--(list->pending->holdcount)) {
-			struct pending *pending = list->pending;
-			reply(pending->client->sock, &pending->message);
-			free(pending);
-		}
-		free_snaplock_wait(sb, list);
-		list = next;
-	}
-	ret = lock->next;
-	free_snaplock(sb, lock);
-	return ret;
-}
-
-/*
- * Release the read lock that `client` holds on `chunk`.
- *
- * Returns 0 on success, -1 if the chunk is not locked at all and -2 if it is
- * locked but `client` is not one of the holders.
- */
-int release_chunk(struct superblock *sb, chunk_t chunk, struct client *client)
-{
-	trace(printf("release %Lx\n", chunk);)
-	struct snaplock **lockp = &sb->snaplocks[snaplock_hash(sb, chunk)];
-
-	/* Find pointer to lock record */
-	while (*lockp && (*lockp)->chunk != chunk)
-		lockp = &(*lockp)->next;
-	struct snaplock *lock = *lockp;
-
-	if (!lock) {
-		trace_on(printf("chunk %Lx not locked\n", chunk);)
-		return -1;
-	}
-
-	/*
-	 * Check for the holder here rather than relying on a NULL return from
-	 * release_lock: NULL is also what it returns after freeing a lock that
-	 * was last in its chain, and in that case the chain link must still be
-	 * updated or it would keep pointing at the freed lock.
-	 */
-	struct snaplock_hold *hold = lock->holdlist;
-	while (hold && hold->client != client)
-		hold = hold->next;
-	if (!hold) {
-		trace_on(printf("chunk %Lx holder %Lu not found\n", lock->chunk, client->id);)
-		return -2;
-	}
-
-	*lockp = release_lock(sb, lock, client);
-	return 0;
-}
-
-/*
- * Print every non-empty bucket of the lock hash table: for each lock its
- * chunk, its holders and its waiters.  Prints a placeholder line when there
- * are no locks at all.
- */
-void show_locks(struct superblock *sb)
-{
-	unsigned shown = 0, i;
-
-	for (i = 0; i < (1 << sb->snaplock_hash_bits); i++) {
-		struct snaplock *lock = sb->snaplocks[i];
-
-		if (lock) {
-			if (!shown)
-				printf("Locks:\n");
-			printf("[%03u] ", i);
-			for (; lock; lock = lock->next) {
-				struct snaplock_hold *hold;
-				struct snaplock_wait *wait;
-
-				printf("chunk %Lx ", lock->chunk);
-				for (hold = lock->holdlist; hold; hold = hold->next)
-					printf("held by client %Lu ", hold->client->id);
-				for (wait = lock->waitlist; wait; wait = wait->next)
-					printf("wait [%02hx/%u] ", snaplock_hash(sb, (u32)wait->pending), wait->pending->holdcount);
-			}
-			printf("\n");
-			shown++;
-		}
-	}
-	if (!shown)
-		printf("-- no locks --\n");
-}
-
-/* Build up a response as a list of chunk ranges */
-
-/* State for building a reply as a list of (first chunk, count) ranges. */
-struct addto
-{ 
-	/* Copied into the reply body's count field by finish_reply_() */
-	unsigned count;
-	/* First chunk of the range currently being built */
-	chunk_t firstchunk; 
-	/* Chunk that would extend the current range; any other starts a new one */
-	chunk_t nextchunk;
-	/* Reply buffer, allocated when the first chunk is added */
-	struct rwmessage *reply;
-	/* Where the length of the current range is stored once it is finished */
-	shortcount *countp;
-	/* Next free position in the reply */
-	chunk_t *top;
-	/* Limit that top is checked against before a new range is started */
-	char *lim;
-};
-
-/* Raise an error unless `bytes` more bytes fit below the reply's limit. */
-void check_response_full(struct addto *r, unsigned bytes)
-{
-	char *used = (char *)r->top;
-
-	if (used >= r->lim - bytes)
-		error("Need realloc");
-}
-
-/*
- * Append `chunk` to the reply being built in `r`.  A chunk equal to
- * r->nextchunk extends the current range; any other chunk closes the current
- * range (or allocates the reply buffer if there is none yet) and starts a new
- * range.  r->count is incremented for every range started, so it matches the
- * number of ranges written into the reply.
- */
-void addto_response(struct addto *r, chunk_t chunk)
-{
-	if (chunk != r->nextchunk) {
-		if (r->top) {
-			trace_off(warn("finish old range\n");)
-			*(r->countp) = (r->nextchunk -  r->firstchunk);
-		} else {
-			trace_off(warn("alloc new reply");)
-			r->reply = (void *) malloc(sizeof(struct messagebuf));
-			r->top = (chunk_t *)(((char *)r->reply) + sizeof(struct head) + offsetof(struct rw_request, ranges));
-			r->lim = ((char *)r->reply) + maxbody;
-		}
-		trace_off(warn("start new range");)
-		check_response_full(r, 2*sizeof(chunk_t));
-		/* Write the first chunk, then reserve the slot for the range length */
-		r->firstchunk = *(r->top)++ = chunk;
-		r->countp = (shortcount *)r->top;
-		r->top = (chunk_t *)(((shortcount *)r->top) + 1);
-		/* Count every range, not only the one that allocated the reply */
-		r->count++;
-	}
-	r->nextchunk = chunk + 1;
-}
-
-/*
- * Close the last open range of `r` and fill in the reply's code, length, id
- * and range count.  Returns 1 when a reply was completed and 0 when no range
- * was ever started.
- */
-int finish_reply_(struct addto *r, unsigned code, unsigned id)
-{
-	if (r->countp) {
-		*(r->countp) = (r->nextchunk -  r->firstchunk);
-		r->reply->head.code = code;
-		r->reply->head.length = (char *)r->top - (char *)r->reply - sizeof(struct head);
-		r->reply->body.id = id;
-		r->reply->body.count = r->count;
-		return 1;
-	}
-	return 0;
-}
-
-/* Complete the reply in `r`, send it to `sock` if it has content, and free it. */
-void finish_reply(int sock, struct addto *r, unsigned code, unsigned id)
-{
-	int ready = finish_reply_(r, code, id);
-
-	if (ready)
-		reply(sock, (struct messagebuf *)r->reply);
-	free(r->reply);
-}
-
-/* Initialization, State load/save */
-
-/*
- * Derive the in-memory geometry of the superblock from the block and chunk
- * size exponents stored in sb->image, allocate the copy-out buffer (32
- * chunks) and the snapshot lock hash table, and reset snapmask and flags.
- */
-void setup_sb(struct superblock *sb)
-{
-	unsigned blocksize_bits = sb->image.blocksize_bits;
-	/* Take the chunk size from its own field, not from the block size */
-	unsigned chunksize_bits = sb->image.chunksize_bits;
-	sb->blocksize = 1 << blocksize_bits;
-	sb->chunksize = 1 << chunksize_bits;
-	sb->sectors_per_block_bits = blocksize_bits - SECTOR_BITS;
-	sb->sectors_per_chunk_bits = chunksize_bits - SECTOR_BITS;
-	/* Number of index entries that fit in a block after the node header */
-	sb->blocks_per_node = (sb->blocksize - offsetof(struct enode, entries)) / sizeof(struct index_entry);
-#ifdef BUSHY
-	sb->blocks_per_node = 10;
-#endif
-	sb->copybuf = malloc_aligned(sb->copybuf_size = (32 * sb->chunksize), 4096); // !!! check failed
-	sb->sectors_per_block = 1 << sb->sectors_per_block_bits;
-	sb->sectors_per_chunk = 1 << sb->sectors_per_chunk_bits;
-	sb->snapmask = 0;
-	sb->flags = 0;
-
-	sb->max_commit_blocks = (sb->blocksize - sizeof(struct commit_block)) / sizeof(sector_t);
-
-	unsigned snaplock_hash_bits = 8;
-	sb->snaplock_hash_bits = snaplock_hash_bits;
-	sb->snaplocks = (struct snaplock **)calloc(1 << snaplock_hash_bits, sizeof(struct snaplock *));
-}
-
-/*
- * Read the on-disk superblock from the snapshot device into sb->image, assert
- * that its magic matches, then derive the in-memory fields and the active
- * snapshot mask.
- */
-void load_sb(struct superblock *sb)
-{
-	struct buffer *sbbuf = bread(sb->snapdev, SB_SECTOR, SB_SIZE);
-
-	memcpy(&sb->image, sbbuf->data, sizeof(sb->image));
-	assert(!memcmp(sb->image.magic, SB_MAGIC, sizeof(sb->image.magic)));
-	brelse(sbbuf);
-
-	setup_sb(sb);
-	/* setup_sb cleared the mask; rebuild it from the snapshot list */
-	sb->snapmask = calc_snapmask(sb);
-	trace_on(printf("Active snapshot mask: %016llx\n", sb->snapmask);)
-}
-
-/* Write sb->image to the superblock sector if SB_DIRTY is set, then clear it. */
-void save_sb(struct superblock *sb)
-{
-	struct buffer *sbbuf;
-
-	if (!(sb->flags & SB_DIRTY))
-		return;
-
-	sbbuf = getblk(sb->snapdev, SB_SECTOR, SB_SIZE);
-	memcpy(sbbuf->data, &sb->image, sizeof(sb->image));
-	write_buffer(sbbuf);
-	brelse(sbbuf);
-	sb->flags &= ~SB_DIRTY;
-}
-
-/* Flush the buffers first, then write the superblock if it is dirty. */
-void save_state(struct superblock *sb)
-{
-	flush_buffers();
-	save_sb(sb);
-}
-
-/*
- * This source compiles either the snapshot server or the snapshot store setup
- * utility, depending on whether the macro variable CREATE is defined.
- *
- * I'll leave all the testing hooks lying around in the main routine for now,
- * since the low level components still tend to break every now and then and
- * require further unit testing.
- */
-
-/*
- * Create a fresh snapshot store on sb->snapdev: initialize the superblock
- * image, size the store and the origin, set up allocation, write the journal
- * commit blocks, create a one-level tree (a root node pointing at one empty
- * leaf) and save everything.  Returns 0.
- *
- * The TEST_JOURNAL and "#if 0" sections are test hooks; several of the
- * disabled ones return early.
- */
-int init_snapstore(struct superblock *sb)
-{
-	int i, error;
-
-	/* 2^3 sectors per block; chunk size is set equal to block size below */
-	unsigned sectors_per_block_bits = 3;
-	sb->image = (struct disksuper){ .magic = SB_MAGIC };
-	sb->image.etree_levels = 1,
-	sb->image.blocksize_bits = SECTOR_BITS + sectors_per_block_bits;
-	sb->image.chunksize_bits = sb->image.blocksize_bits; // !!! just for now
-	setup_sb(sb);
-
-	/* Device sizes are converted to chunk counts */
-	u64 size;
-	if ((error = fd_size(sb->snapdev, &size)))
-		error("Error %i: %s determining snapshot store size", error, strerror(error));
-	sb->image.chunks = size >> sb->image.chunksize_bits;
-	/* NOTE(review): this message reports errno while the one above reports
-	 * the value returned by fd_size -- confirm which is intended */
-	if ((error = fd_size(sb->orgdev, &size)))
-		error("Error %i: %s determining origin volume size", errno, strerror(errno));
-	sb->image.orgchunks = size >> sb->image.chunksize_bits;
-
-	sb->image.journal_size = 100;
-#ifdef TEST_JOURNAL
-	sb->image.journal_size = 5;
-#endif
-	sb->image.journal_next = 0;
-	sb->image.sequence = sb->image.journal_size;
-	init_allocation(sb);
-	set_sb_dirty(sb);
-
-	/* Write one commit block per journal slot, sequence-numbered by slot */
-	for (i = 0; i < sb->image.journal_size; i++) {
-		struct buffer *buffer = jgetblk(sb, i);
-		struct commit_block *commit = (struct commit_block *)buffer->data;
-		*commit = (struct commit_block){ .magic = JMAGIC, .sequence = i };
-#ifdef TEST_JOURNAL
-		commit->sequence = (i + 3) % 5;
-#endif
-		/* Checksum field is zero while summing (fresh initializer), then
-		 * set to the negated block sum */
-		commit->checksum = -checksum_block(sb, (void *)commit);
-		brelse_dirty(buffer);
-	}
-#ifdef TEST_JOURNAL
-	show_journal(sb);
-	show_tree(sb);
-	flush_buffers();
-	recover_journal(sb);
-	show_buffers();
-#endif
-
-#if 0
-	printf("chunk = %Lx\n", alloc_chunk_range(sb, sb->image.chunks - 1, 1));
-//	struct buffer *buffer = snapread(sb, sb->image.bitmap_base + 3 * 8);
-//	dump_buffer(buffer, 4090, 6);
-return 0;
-#endif
-
-#if 0
-	grab_chunk(sb, 32769);
-	struct buffer *buffer = snapread(sb, sb->image.bitmap_base + 8);
-	printf("sector %Lx\n", buffer->sector);
-	free_chunk(sb, 32769);
-return 0;
-#endif
-
-	/* Initial tree: a root index node with a single entry for one new leaf */
-	struct buffer *leafbuf = new_leaf(sb);
-	struct buffer *rootbuf = new_node(sb);
-	buffer2node(rootbuf)->count = 1;
-	buffer2node(rootbuf)->entries[0].sector = leafbuf->sector;
-	sb->image.etree_root = rootbuf->sector;
-
-#if 0
-	printf("chunk = %Lx\n", alloc_chunk(sb));
-	printf("chunk = %Lx\n", alloc_chunk(sb));
-	printf("chunk = %Lx\n", alloc_chunk(sb));
-	printf("chunk = %Lx\n", alloc_chunk(sb));
-	printf("chunk = %Lx\n", alloc_chunk(sb));
-	printf("chunk = %Lx\n", alloc_chunk(sb));
-	printf("chunk = %Lx\n", alloc_chunk(sb));
-//	free_chunk(sb, 23);
-	printf("chunk = %Lx\n", alloc_chunk(sb));
-	printf("chunk = %Lx\n", alloc_chunk(sb));
-	printf("chunk = %Lx\n", alloc_chunk(sb));
-	printf("chunk = %Lx\n", alloc_chunk(sb));
-return 0;
-#endif
-
-	brelse_dirty(rootbuf);
-	brelse_dirty(leafbuf);
-#if 0
-	struct buffer *leafbuf1 = new_leaf(sb);
-	struct buffer *leafbuf2 = new_leaf(sb);
-	struct eleaf *leaf1 = buffer2leaf(leafbuf1);
-	struct eleaf *leaf2 = buffer2leaf(leafbuf2);
-	init_leaf(leaf1, 256);
-	init_leaf(leaf2, 256);
-	add_exception_to_leaf(leaf1, 0x111, 0x11, 0, 3);
-	add_exception_to_leaf(leaf2, 0x222, 0x11, 1, 3);
-	add_exception_to_leaf(leaf2, 0x333, 0x33, 1, 3);
-	show_leaf(leaf1);
-	show_leaf(leaf2);
-	merge_leaves(leaf1, leaf2);
-	show_leaf(leaf1);
-	return 0;
-#endif
-#ifdef TEST_JOURNAL
-	show_buffers();
-	show_dirty_buffers();
-	commit_transaction(sb);
-	evict_buffers();
-
-	show_journal(sb);
-	show_tree(sb);
-	recover_journal(sb);
-	evict_buffers();
-	show_tree(sb);
-#endif
-	/* Flush the dirty buffers and write the superblock */
-	save_state(sb);
-	return 0;
-}
-
-// Expand snapshot store:
-//   Calculate num bitmap blocks for new size
-//   Copy bitmap blocks to current top
-//   Clear new bitmap blocks
-//   Reserve new bitmap blocks
-//   Clear remainder bits in old last bitmap byte
-//   Set remainder bits in new last bitmap byte
-//   Set new bitmap base and chunks count
-//
-// NOTE(review): newbase is computed exactly like oldbase, so the bitmap is
-// copied onto itself instead of to a new location.  The intended new base is
-// not visible here and must be supplied before this function is used.
-
-static void expand_snapstore(struct superblock *sb, u64 newchunks)
-{
-	u64 oldchunks = sb->image.chunks;
-	unsigned oldbitmaps = sb->image.bitmap_blocks;
-	unsigned newbitmaps = calc_bitmap_blocks(sb, newchunks);
-	unsigned blocksize = sb->blocksize;
-	unsigned blockshift = sb->image.blocksize_bits;
-	u64 oldbase = sb->image.bitmap_base << SECTOR_BITS;
-	u64 newbase = sb->image.bitmap_base << SECTOR_BITS;
-	unsigned i;
-
-	for (i = 0; i < oldbitmaps; i++) {
-		// do it one block at a time for now !!! sucks
-		// maybe should do copy with bread/write?
-		/* Widen before shifting so the byte offset cannot wrap at 32 bits */
-		u64 offset = (u64)i << blockshift;
-		pread(sb->snapdev, sb->copybuf, blocksize, oldbase + offset);
-		pwrite(sb->snapdev, sb->copybuf, blocksize, newbase + offset);
-	}
-
-	/* Clear the overrun guard bits in the partial last byte of the old bitmap */
-	if ((oldchunks & 7)) {
-		sector_t sector = (oldbase >> SECTOR_BITS) + ((oldbitmaps - 1) << sb->sectors_per_block_bits);
-		struct buffer *buffer = getblk(sb->snapdev, sector, blocksize);
-		buffer->data[(oldchunks >> 3) & (blocksize - 1)] &= ~(0xff << (oldchunks & 7));
-		brelse_dirty(buffer);
-	}
-
-	for (i = oldbitmaps; i < newbitmaps; i++) {
-		/* Address the i-th bitmap block; previously every pass fetched the
-		 * first block of the bitmap */
-		sector_t sector = (newbase >> SECTOR_BITS) + ((sector_t)i << sb->sectors_per_block_bits);
-		struct buffer *buffer = getblk(sb->snapdev, sector, blocksize);
-		memset(buffer->data, 0, sb->blocksize);
-		/* Suppress overrun allocation in partial last byte */
-		if (i == newbitmaps - 1 && (newchunks & 7))
-			buffer->data[(newchunks >> 3) & (blocksize - 1)] |= 0xff << (newchunks & 7);
-		brelse_dirty(buffer);
-	}
-
-	/* Reserve the chunks now occupied by the bitmap itself */
-	for (i = 0; i < newbitmaps; i++) {
-		grab_chunk(sb, (newbase >> blockshift) + i); // !!! assume blocksize = chunksize
-	}
-
-	sb->image.bitmap_base = newbase >> SECTOR_BITS;
-	sb->image.chunks = newchunks;
-	save_state(sb);
-}
-
-/*
- * Scan every lock in the hash table for holds by `client`.  With `check`
- * nonzero, return 1 as soon as one is found without changing anything.
- * With `check` zero, release every such hold and return 0.  Also returns 0
- * when the client holds nothing.
- */
-int client_locks(struct superblock *sb, struct client *client, int check)
-{
-	int i;
-
-	for (i = 0; i < (1 << sb->snaplock_hash_bits); i++) {
-		struct snaplock **lockp = &sb->snaplocks[i];
-
-		while (*lockp) {
-			struct snaplock_hold *hold = (*lockp)->holdlist;
-
-			while (hold && hold->client != client)
-				hold = hold->next;
-
-			if (!hold) {
-				/* Not held by this client: move on to the next lock */
-				lockp = &(*lockp)->next;
-				continue;
-			}
-			if (check)
-				return 1;
-			/* Re-examine whatever now sits in this slot: the same lock if
-			 * other holds remain, otherwise its successor */
-			*lockp = release_lock(sb, *lockp, client);
-		}
-	}
-	return 0;
-}
-
-/* client_locks() in check mode: 1 if client y holds any lock, without releasing */
-#define check_client_locks(x, y) client_locks(x, y, 1)
-/* client_locks() in release mode: drop every lock held by client y */
-#define free_client_locks(x, y) client_locks(x, y, 0)
-
-/*
- * Responses to IO requests take two quite different paths through the
- * machinery:
- *
- *   - Origin write requests are just sent back with their message
- *     code changed, unless they have to wait for a snapshot read
- *     lock in which case the incoming buffer is copied and the
- *     response takes a kafkaesque journey through the read locking
- *     bureaucracy.
- *
- *   - Responses to snapshot read or write requests have to be built
- *     up painstakingly in allocated buffers, keeping a lot of state
- *     around so that they end up with a minimum number of contiguous
- *     chunk ranges.  Once complete they can always be sent
- *     immediately.
- *
- * To mess things up further, snapshot read requests can return both
- * a list of origin ranges and a list of snapshot store ranges.  In
- * the latter case the specific snapshot store chunks in each logical
- * range are also returned, because they can (normally will) be
- * discontiguous.  This goes back to the client in two separate
- * messages, on the theory that the client will find it nice to be
- * able to process the origin read ranges and snapshot read chunks
- * separately.  We'll see how good an idea that is.
- *
- * The implementation ends up looking nice and tidy, but appearances
- * can be deceiving.
- */
-int incoming(struct superblock *sb, struct client *client)
-{
-	struct messagebuf message;
-	unsigned sock = client->sock;
-	int i, j, err;
-
-	if ((err = readpipe(sock, &message.head, sizeof(message.head))))
-		goto pipe_error;
-	trace(warn("%x/%u", message.head.code, message.head.length);)
-	if (message.head.length > maxbody)
-		goto message_too_long;
-	if ((err = readpipe(sock, &message.body, message.head.length)))
-		goto pipe_error;
-
-	switch (message.head.code) {
-		case QUERY_WRITE:
-		if (client->snap == -1) {
-			struct pending *pending = NULL;
-			struct rw_request *body = (struct rw_request *)message.body;
-			struct chunk_range *p = body->ranges;
-			chunk_t chunk;
-			if (message.head.length < sizeof(*body))
-				goto message_too_short;
-			trace(printf("origin write query, %u ranges\n", body->count);)
-
-			for (i = 0; i < body->count; i++, p++)
-				for (j = 0, chunk = p->chunk; j < p->chunks; j++, chunk++)
-					if (make_unique(sb, chunk, -1))
-						waitfor_chunk(sb, chunk, &pending);
-			finish_copyout(sb);
-			commit_transaction(sb);
-
-			message.head.code = REPLY_ORIGIN_WRITE;
-			if (pending) {
-				pending->client = client;
-				memcpy(&pending->message, &message, message.head.length + sizeof(struct head));
-				pending->holdcount--;
-				break;
-			}
-			reply(sock, &message);
-			break;
-		} else {
-			struct rw_request *body = (struct rw_request *)message.body;
-			if (message.head.length < sizeof(*body))
-				goto message_too_short;
-			trace(printf("snapshot write request, %u ranges\n", body->count);)
-			struct addto snap = { .nextchunk = -1 };
-
-			for (i = 0; i < body->count; i++)
-				for (j = 0; j < body->ranges[i].chunks; j++) {
-					chunk_t chunk = body->ranges[i].chunk + j;
-					chunk_t exception = make_unique(sb, chunk, client->snap);
-					trace(printf("exception = %Lx\n", exception);)
-					addto_response(&snap, chunk);
-					check_response_full(&snap, sizeof(chunk_t));
-					*(snap.top)++ = exception;
-				}
-			finish_copyout(sb);
-			commit_transaction(sb);
-			finish_reply(client->sock, &snap, REPLY_SNAPSHOT_WRITE, body->id);
-			break;
-		}
-
-		case QUERY_SNAPSHOT_READ:
-		{
-			struct rw_request *body = (struct rw_request *)message. body;
-			if (message.head.length < sizeof(*body))
-				goto message_too_short;
-			trace(printf("snapshot read request, %u ranges\n", body->count);)
-			struct addto snap = { .nextchunk = -1 }, org = { .nextchunk = -1 };
-
-			for (i = 0; i < body->count; i++)
-				for (j = 0; j < body->ranges[i].chunks; j++) {
-					chunk_t chunk = body->ranges[i].chunk + j, exception = 0;
-					trace(warn("read %Lx", chunk));
-					test_unique(sb, chunk, client->snap, &exception);
-					if (exception) {
-						trace(warn("read exception %Lx", exception));
-						addto_response(&snap, chunk);
-						check_response_full(&snap, sizeof(chunk_t));
-						*(snap.top)++ = exception;
-					} else {
-						trace(warn("read origin %Lx", chunk));
-						addto_response(&org, chunk);
-						readlock_chunk(sb, chunk, client);
-					}
-				}
-			finish_reply(client->sock, &org, REPLY_SNAPSHOT_READ_ORIGIN, body->id);
-			finish_reply(client->sock, &snap, REPLY_SNAPSHOT_READ, body->id);
-			break;
-		}
-
-		case FINISH_SNAPSHOT_READ:
-		{
-			struct rw_request *body = (struct rw_request *)message.body;
-			if (message.head.length < sizeof(*body))
-				goto message_too_short;
-			trace(printf("finish snapshot read, %u ranges\n", body->count);)
-
-			for (i = 0; i < body->count; i++)
-				for (j = 0; j < body->ranges[i].chunks; j++)
-					release_chunk(sb, body->ranges[i].chunk + j, client);
-
-			break;
-		}
-
-		case IDENTIFY:
-		{
-			int tag = ((struct identify *)message.body)->snap, snap = tag2snapnum(sb, tag);
-			if (snap >= 0)
-				client->snap = snap;
-			client->id = ((struct identify *)message.body)->id;
-			warn("client id %Li, snapshot %i (snapnum %i)", client->id, tag, snap);
-			outbead(sock, REPLY_IDENTIFY, struct { });
-			break;
-		}
-
-		case UPLOAD_LOCK:
-			break;
-
-		case FINISH_UPLOAD_LOCK:
-			break;
-
-		case CREATE_SNAPSHOT:
-			create_snapshot(sb, ((struct create_snapshot *)message.body)->snap);
-			save_state(sb);
-			outbead(sock, REPLY_CREATE_SNAPSHOT, struct { });
-			break;
-
-		case DELETE_SNAPSHOT:
-			delete_snapshot(sb, ((struct create_snapshot *)message.body)->snap);
-			save_state(sb);
-			outbead(sock, REPLY_DELETE_SNAPSHOT, struct { });
-			break;
-
-		case INITIALIZE_SNAPSTORE:
-			init_snapstore(sb);
-			break;
-
-		case DUMP_TREE:
-			show_tree(sb);
-			break;
-
-		case START_SERVER:
-			warn("Activating server");
-			load_sb(sb);
-			if (sb->image.flags & SB_BUSY) {
-				warn("Server was not shut down properly");
-				jtrace(show_journal(sb);)
-				recover_journal(sb);
-			} else {
-				sb->image.flags |= SB_BUSY;
-				set_sb_dirty(sb);
-				save_sb(sb);
-			}
-			break;
-
-		case SHUTDOWN_SERVER:
-			return -2;
-
-		default: 
-			outbead(sock, REPLY_ERROR, struct { int code; char error[50]; }, message.head.code, "Unknown message"); // wrong!!!
-	}
-
-#if 0
-	static int messages = 0;
-	if (++messages == 5) {
-		warn(">>>>Simulate server crash<<<<");
-		exit(1);
-	}
-#endif
-	return 0;
-
-message_too_long:
-	warn("message %x too long (%u bytes)\n", message.head.code, message.head.length);
-	return -1;
-message_too_short:
-	warn("message %x too short (%u bytes)\n", message.head.code, message.head.length);
-	return -1;
-pipe_error:
-	return -1; /* we quietly drop the client if the connect breaks */
-}
-
-/* Signal Delivery via pipe */
-
-static int sigpipe;
-
-void sighandler(int signum)
-{
-	trace_off(printf("caught signal %i\n", signum);)
-	write(sigpipe, (char[]){signum}, 1);
-}
-
-int cleanup(struct superblock *sb)
-{
-	warn("cleaning up");
-	sb->image.flags &= ~SB_BUSY;
-	set_sb_dirty(sb);
-	save_state(sb);
-	return 0;
-}
-
-int resolve_host(char *name, int family, void *result, int length)
-{
-	struct hostent host, *bogus;
-	char work[500];
-	int err, dumb;
-
-	if ((err = gethostbyname2_r(name, family, &host, work, sizeof(work), &bogus, &dumb))) {
-		errno = err;
-		return -1;
-	}
-	memcpy(result, host.h_addr_list[0], host.h_length);
-	return host.h_length;
-}
-
-int resolve_self(int family, void *result, int length)
-{
-	char name[HOST_NAME_MAX + 1];
-	if (gethostname(name, HOST_NAME_MAX) == -1)
-		return -1;
-
-	return resolve_host(name, family, result, length);
-}
-
-int csnap_server(struct superblock *sb, const char *sockname, int port)
-{
-	unsigned maxclients = 100, clients = 0, others = 3;
-	struct client *clientvec[maxclients];
-	struct pollfd pollvec[others+maxclients];
-	int listener, getsig, pipevec[2], err = 0;
-
-	if (pipe(pipevec))
-		error("Can't open pipe");
-	sigpipe = pipevec[1];
-	getsig = pipevec[0];
-
-	struct server server = { .port = htons(port), .type = AF_INET,  };
-
-	if ((listener = socket(AF_INET, SOCK_STREAM, 0)) < 0) 
-		error("Can't get socket");
-
-	if (bind(listener,
-		(struct sockaddr *)&(struct sockaddr_in){
-			.sin_family = server.type, 
-			.sin_port = server.port,
-			.sin_addr = { .s_addr = INADDR_ANY } },
-		sizeof(struct sockaddr_in)) < 0) 
-		error("Can't bind to socket");
-	listen(listener, 5);
-
-	warn("csnap server bound to port %i", port);
-
-	/* Get agent connection */
-	struct sockaddr_un addr = { .sun_family = AF_UNIX };
-	int addr_len = sizeof(addr) - sizeof(addr.sun_path) + strlen(sockname);
-	int sock, len;
-
-	trace(warn("Connect to control socket %s", sockname);)
-	if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1)
-		error("Can't get socket");
-	strncpy(addr.sun_path, sockname, sizeof(addr.sun_path));
-	if (sockname[0] == '@')
-		addr.sun_path[0] = 0;
-
-	if (connect(sock, (struct sockaddr *)&addr, addr_len) == -1)
-		error("Can't connect to control socket");
-
-	trace_on(warn("Received control connection");)
-	pollvec[2] = (struct pollfd){ .fd = sock, .events = POLLIN };
-
-	if ((len = resolve_self(AF_INET, server.address, sizeof(server.address))) == -1)
-		error("Can't get own address, %s (%i)", strerror(errno), errno);
-	server.address_len = len;
-	warn("host = %x/%u", *(int *)server.address, server.address_len);
-	writepipe(sock, &(struct head){ SERVER_READY, sizeof(server) }, sizeof(struct head));
-	writepipe(sock, &server, sizeof(server));
-
-#if 1
-	switch (fork()) {
-	case -1:
-		error("fork failed");
-	case 0: // !!! daemonize goes here
-		break;
-	default:
-		return 0;
-	}
-#endif
-
-	pollvec[0] = (struct pollfd){ .fd = listener, .events = POLLIN };
-	pollvec[1] = (struct pollfd){ .fd = getsig, .events = POLLIN };
-
-	signal(SIGINT, sighandler);
-	signal(SIGTERM, sighandler);
-	signal(SIGPIPE, SIG_IGN);
-
-	while (1) {
-		int activity = poll(pollvec, others+clients, -1);
-
-		if (activity < 0) {
-			if (errno != EINTR)
-				error("poll failed, %s", strerror(errno));
-			continue;
-		}
-
-		if (!activity) {
-			printf("waiting...\n");
-			continue;
-		}
-
-		/* New connection? */
-		if (pollvec[0].revents) {
-			struct sockaddr_in addr;
-			int addr_len = sizeof(addr), sock;
-
-			if (!(sock = accept(listener, (struct sockaddr *)&addr, &addr_len)))
-				error("Cannot accept connection");
-
-			trace_on(warn("Received connection");)
-			assert(clients < maxclients); // !!! send error and disconnect
-
-			struct client *client = malloc(sizeof(struct client));
-			*client = (struct client){ .sock = sock };
-			clientvec[clients] = client;
-			pollvec[others+clients] = (struct pollfd){ .fd = sock, .events = POLLIN };
-			clients++;
-		}
-
-		/* Signal? */
-		if (pollvec[1].revents) {
-			u8 sig = 0;
-			/* it's stupid but this read also gets interrupted, so... */
-			do { } while (read(getsig, &sig, 1) == -1 && errno == EINTR);
-			trace_on(warn("caught signal %i", sig);)
-			cleanup(sb); // !!! don't do it on segfault
-			if (sig == SIGINT) { 
-		        	signal(SIGINT, SIG_DFL);
-        			kill(getpid(), sig); /* commit harikiri */
-			}
-			goto done;
-		}
-
-		/* Agent message? */
-		if (pollvec[2].revents)
-			incoming(sb, &(struct client){ .sock = sock, .id = -2, .snap = -2 });
-
-		/* Client message? */
-		unsigned i = 0;
-		while (i < clients) {
-			if (pollvec[others+i].revents) { // !!! check for poll error
-				struct client *client = clientvec[i];
-				int result;
-
-				trace_off(printf("event on socket %i = %x\n", client->sock, pollvec[others+i].revents);)
-				if ((result = incoming(sb, client)) == -1) {
-					warn("Client %Li disconnected", client->id);
-					save_state(sb); // !!! just for now
-					close(client->sock);
-					free(client);
-					--clients;
-					clientvec[i] = clientvec[clients];
-					pollvec[others + i] = pollvec[others + clients];
-					continue;
-				}
-
-				if (result == -2) { // !!! wrong !!!
-					cleanup(sb);
-					goto done;
-				}
-			}
-			i++;
-		}
-	}
-done:
-	// in a perfect world we'd close all the connections
-	close(listener);
-	return err;
-}
-
-#if 0
-void usage(poptContext optCon, int exitcode, char *error, char *addl) {
-	poptPrintUsage(optCon, stderr, 0);
-	if (error) fprintf(stderr, "%s: %s0", error, addl);
-	exit(exitcode);
-}
-#endif
-
-int main(int argc, const char *argv[])
-{
-	poptContext optCon;
-	unsigned volsize;
-
-	struct poptOption optionsTable[] = {
-	     { "size", 's', POPT_ARG_INT, &volsize, 0, "volume size", "size" },
-	     POPT_AUTOHELP
-	     { NULL, 0, 0, NULL, 0 }
-	};
-
-	optCon = poptGetContext(NULL, argc, argv, optionsTable, 0);
-
-#ifdef SERVER
-	poptSetOtherOptionHelp(optCon, "dev/snapshot dev/origin socket port");
-	if (argc < 5) {
-#else
-	poptSetOtherOptionHelp(optCon, "dev/snapshot dev/origin");
-	if (argc < 3) {
-#endif
-		poptPrintUsage(optCon, stderr, 0);
-		exit(1);
-	}
-
-	char c;
-	while ((c = poptGetNextOpt(optCon)) >= 0)
-		;
-#if 0
-	snapname = poptGetArg(optCon);
-	if(!(poptPeekArg(optCon) == NULL)) {
-		poptPrintUsage(optCon, stderr, 0);
-		exit(1);
-	}
-#endif
-	if (c < -1) {
-		 fprintf(stderr, "%s: %s\n",
-			 poptBadOption(optCon, POPT_BADOPTION_NOALIAS),
-			 poptStrerror(c));
-		 return 1;
-	}
-
-	struct superblock *sb = &(struct superblock){};
-
-	init_buffers();
-#ifdef SERVER
-	if (argc < 5)
-		error("usage: %s dev/snapshot dev/origin socket port", argv[0]);
-#else
-	if (argc < 3)
-		error("usage: %s dev/snapshot dev/origin", argv[0]);
-#endif
-	if (!(sb->snapdev = open(poptGetArg(optCon), O_RDWR | O_DIRECT)))
-		error("Could not open snapshot store %s", argv[1]);
-
-	if (!(sb->orgdev = open(poptGetArg(optCon), O_RDONLY | O_DIRECT)))
-		error("Could not open origin volume %s", argv[2]);
-#ifdef SERVER
-	const char *sockname = poptGetArg(optCon);
-	int port = atoi(poptGetArg(optCon));
-	poptFreeContext(optCon);
-	return csnap_server(sb, sockname, port);
-#else
-	poptFreeContext(optCon);
-#if 0
-	init_snapstore(sb); 
-	create_snapshot(sb, 0);
-
-	int i;
-	for (i = 0; i < 100; i++) {
-		make_unique(sb, i, 0);
-	}
-
-	flush_buffers();
-	evict_buffers();
-	warn("delete...");
-	delete_tree_range(sb, 1, 0, 5);
-	show_buffers();
-	warn("dirty buffers = %i", dirty_buffer_count);
-	show_tree(sb);
-	return 0;
-#endif
-	return init_snapstore(sb); 
-#endif
-
-	void *useme = _show_journal;
-	useme = useme;
-	useme = (void *)delete_tree_range;
-	useme = (void *)expand_snapstore;
-}
diff --git a/csnap/src/csnap.h b/csnap/src/csnap.h
deleted file mode 100644
index ce645c6..0000000
--- a/csnap/src/csnap.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#define u8 unsigned char
-#define u16 unsigned short
-#define s16 short
-#define s32 int
-#define u32 unsigned
-#define u64 unsigned long long
-
-#define le_u32 u32
-#define le_u16 u16
-#define le_u64 u64
-#define u64 unsigned long long
-#define EFULL ENOMEM
-#define PACKED __attribute__ ((packed))
-
-static inline int readpipe(int fd, void *buffer, size_t count)
-{
-	// printf("read %u bytes\n", count);
-	int n;
-	while (count) {
-		if ((n = read(fd, buffer, count)) < 1)
-			return n? n: -EPIPE;
-		buffer += n;
-		count -= n;
-	}
-	return 0;
-}
-
-#define writepipe write
-
-#define outbead(SOCK, CODE, STRUCT, VALUES...) ({ \
-	struct { struct head head; STRUCT body; } PACKED message = \
-		{ { CODE, sizeof(STRUCT) }, { VALUES } }; \
-	writepipe(SOCK, &message, sizeof(message)); })
-
-typedef unsigned long long chunk_t;
-
-#define MAX_ADDRESS 16
-
-struct server { u16 port; u8 type; u8 address_len; char address[MAX_ADDRESS]; } PACKED;
-
-#ifndef HOST_NAME_MAX
-#define HOST_NAME_MAX 256
-#endif
-
diff --git a/csnap/src/list.h b/csnap/src/list.h
deleted file mode 100644
index 8a28deb..0000000
--- a/csnap/src/list.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* List ops from include/linux/list.h */
-
-#define LIST_POISON1 ((void *) 0x00100100)
-#define LIST_POISON2 ((void *) 0x00200200)
-
-struct list_head { struct list_head *next, *prev; };
-
-static inline void __list_add(struct list_head *new,
-			      struct list_head *prev,
-			      struct list_head *next)
-{
-	next->prev = new;
-	new->next = next;
-	new->prev = prev;
-	prev->next = new;
-}
-
-static inline void list_add(struct list_head *new, struct list_head *head)
-{
-	__list_add(new, head, head->next);
-}
-
-static inline void list_add_tail(struct list_head *new, struct list_head *head)
-{
-	__list_add(new, head->prev, head);
-}
-
-static inline void __list_del(struct list_head *prev, struct list_head *next)
-{
-	next->prev = prev;
-	prev->next = next;
-}
-
-static inline void list_del(struct list_head *entry)
-{
-	__list_del(entry->prev, entry->next);
-	entry->next = LIST_POISON1;
-	entry->prev = LIST_POISON2;
-}
-
-static inline int list_empty(const struct list_head *head)
-{
-	return head->next == head;
-}
-
-#define LIST_HEAD_INIT(name) { &(name), &(name) }
-
-#define LIST_HEAD(name) \
-	struct list_head name = LIST_HEAD_INIT(name)
-
-#define INIT_LIST_HEAD(ptr) do { \
-	(ptr)->next = (ptr); (ptr)->prev = (ptr); \
-} while (0)
-
-#define container_of(ptr, type, member) ({ \
-	const typeof( ((type *)0)->member ) *__mptr = (ptr); \
-	(type *)( (char *)__mptr - offsetof(type,member) );})
-
-#define list_entry(ptr, type, member) \
-	container_of(ptr, type, member)
-
-#define list_for_each(pos, head) \
-	for (pos = (head)->next; pos != (head); pos = pos->next)
-
diff --git a/csnap/src/sock.h b/csnap/src/sock.h
deleted file mode 100644
index 2f3d2fe..0000000
--- a/csnap/src/sock.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#include <sys/socket.h>
-#include <netdb.h>
-
-/*
- * Find and return the port number of a host:port pair, shortening
- * the original string to include only the hostname.
- */
-static inline int parse_port(char *s, unsigned *len)
-{
-	char *p = memchr(s, ':', *len);
-	if (!p || p == s || p - s == *len)
-		return -1;
-	*len = p - s;
-	return atoi(p + 1);
-}
-
-/*
- * Dumbed down interface for opening an IPv4 connection.
- */
-static inline int open_socket(char *name, unsigned port)
-{
-	struct sockaddr_in addr = { .sin_family = AF_INET, .sin_port = htons(port) };
-	struct hostent *host;
-	int sock;
-
-	if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
-		error("Can't get socket");
-	if (!(host = gethostbyname(name)))
-		-h_errno;
-	memcpy(&addr.sin_addr.s_addr, host->h_addr, host->h_length);
-	if (connect(sock, (struct sockaddr *)&addr, sizeof(addr)) < 0)
-		return -errno;
-	return sock;
-}
-
-/*
- * Pass a fd over a local socket connection.  You have to send some stream
- * data as well, just to make an ugly interface even more irritating.
- */
-int send_fd(int sock, int fd, char *bogus, unsigned len)
-{
-	char payload[CMSG_SPACE(sizeof(int))];
-	struct msghdr msg = {
-		.msg_control = payload,
-		.msg_controllen = sizeof(payload),
-		.msg_iov = &(struct iovec){ .iov_base = bogus, .iov_len = len },
-		.msg_iovlen = 1,
-	};
-	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
-
-	*cmsg = (struct cmsghdr){ CMSG_LEN(sizeof(int)), SOL_SOCKET, SCM_RIGHTS };
-	*((int *)CMSG_DATA(cmsg)) = fd; // this is really an array, .cmsg_len gives count (??)
-
-	return sendmsg(sock, &msg, 0) != len? -EIO: len;
-}
diff --git a/csnap/src/trace.h b/csnap/src/trace.h
deleted file mode 100644
index 8815cb2..0000000
--- a/csnap/src/trace.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#define BREAK asm("int3")
-#define warn(string, args...) do { fprintf(stderr, "[%u] %s: " string "\n", getpid(), __func__, ##args); } while (0)
-#define error(string, args...) do { warn(string, ##args); BREAK; } while (0)
-#define assert(expr) do { if (!(expr)) error("Failed assertion \"%s\"\n", #expr); } while (0)
-
-#define trace_on(args) args
-#define trace_off(args)
diff --git a/csnap/tests/Makefile b/csnap/tests/Makefile
deleted file mode 100644
index 7b92e1e..0000000
--- a/csnap/tests/Makefile
+++ /dev/null
@@ -1,49 +0,0 @@
-###############################################################################
-###############################################################################
-##
-##  Copyright (C) 2006 Red Hat, Inc.  All rights reserved.
-##
-##  This copyrighted material is made available to anyone wishing to use,
-##  modify, copy, or redistribute it subject to the terms and conditions
-##  of the GNU General Public License v.2.
-##
-###############################################################################
-###############################################################################
-
-binaries = testclient devpoke devspam
-deps = ../src/csnap.h ../src/trace.h ../src/sock.h ../src/buffer.h ../src/list.h
-
-all: $(binaries)
-
-testclient: testclient.c $(deps)
-	cc -Wall testclient.c -o testclient -I../src
-
-devpoke: devpoke.c
-	cc -Wall devpoke.c -o devpoke -lpopt
-
-devspam: devspam.c
-	cc -Wall devspam.c -o devspam -lpopt
-
-clean:
-	rm -f $(binaries) *.o a.out
-
-test: test1 test2 test3
-
-test1:
-	killall csnap-server || true
-	sudo killall csnap-agent || true
-	./mksnapstore /dev/test-snapstore /dev/test-origin
-	sudo /sbin/dmsetup remove testdev || true
-	sudo ./csnap-agent @test
-	./csnap-server /dev/test-snapstore /dev/test-origin @test 8080
-
-test2:
-	sudo ./csnap-create localhost:8080 0
-	echo 0 497976 csnapshot /dev/test-snapstore /dev/test-origin @test -1 | sudo /sbin/dmsetup create testdev
-
-test3:
-	sudo ./devspam /dev/mapper/testdev write 19 77
-
-test9:
-	sudo /sbin/dmsetup remove testdev
-
diff --git a/csnap/tests/devpoke.c b/csnap/tests/devpoke.c
deleted file mode 100644
index 57a7fed..0000000
--- a/csnap/tests/devpoke.c
+++ /dev/null
@@ -1,55 +0,0 @@
-#define _GNU_SOURCE /* Berserk glibc headers: O_DIRECT not defined unless _GNU_SOURCE defined */
-#include <stdio.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <errno.h> 
-
-ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset);
-
-#define trace_on(args) args
-#define trace_off(args)
-#define BREAK asm("int3")
-#define warn(string, args...) do { fprintf(stderr, "%s: " string "\n", __FILE__, ##args); } while (0)
-#define error(string, args...) do { warn(string, ##args); BREAK; } while (0)
-
-#define trace trace_on
-
-void *malloc_aligned(size_t size, unsigned binalign)
-{
-	unsigned long p = (unsigned long)malloc(size + binalign - 1);
-	return (void *)(p + (-p & (binalign - 1)));
-}
-
-int main(int argc, char *argv[])
-{
-	int err, dev, iterations = 1, blockshift = 13, blocksize = 1 << blockshift;
-	char *buffer = malloc_aligned(blocksize, blocksize);
-
-	if (!(dev = open(argv[1], O_RDWR | O_DIRECT)))
-		error("Could not open %s", argv[1]);
-	if (argc < 3 || argc > 4)
-		error("usage: %s device read/write [iterations]", argv[0]);
-	if (argc > 3)
-		iterations = atoi(argv[3]);
-
-	int rw = !strcmp(argv[2], "write");
-	typeof(pread) *fn = rw? ((typeof(pread) *)pwrite): pread;
-	char *what = rw? "write": "read";
-	unsigned range = (lseek(dev, 0, SEEK_END) >> blockshift), total = 0;
-
-	printf("range = %u, iterations = %u\n", range, iterations);
-
-	while (iterations--) {
-		unsigned block = 1? (rand() % range): total;
-		trace(warn("%s block %x", what, block);)
-		if ((err = fn(dev, buffer, blocksize, block << blockshift)) < 0)
-			error("poke error %i", err);
-		trace(warn("...block %x done", block);)
-		total++;
-	}
-
-	return err;
-}
diff --git a/csnap/tests/devspam.c b/csnap/tests/devspam.c
deleted file mode 100644
index 09eb1ce..0000000
--- a/csnap/tests/devspam.c
+++ /dev/null
@@ -1,83 +0,0 @@
-#define _GNU_SOURCE /* O_DIRECT */
-#define _XOPEN_SOURCE 500 /* pwrite */
-#include <unistd.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-
-#define error(string, args...) do { printf(string "\n", ##args); exit(1); } while (0)
-
-void *malloc_aligned(size_t size, unsigned binalign)
-{
-	unsigned long p = (unsigned long)malloc(size + binalign - 1);
-	return (void *)(p + (-p & (binalign - 1)));
-}
-
-void spamdata(char *buf, unsigned len, unsigned tag, unsigned block)
-{
-	int i, j, k, n = len / 16;
-	char *p = buf;
-
-	for(i = 0; i < n; i++) {
-		int spams[3] = { tag, block, i };
-		memcpy(p, "SPAM", 4);
-		p += 4;
-		for(j = 0; j < 3; j++) {
-			int v = spams[j];
-			for (k = 0; k < 4; k++,v <<= 4) {
-				int d = (v >> 12) & 0xf;
-				*p++ = d < 10? d + '0': d - 10 + 'a'; 
-			}
-		}
-	}
-	/* assert(p == buf + len); */
-}
-
-int main(int argc, char *argv[])
-{
-	#define ncommands 4
-	char *commands[ncommands] = { "read", "write", "randread", "randwrite" };
-	int err, dev, command, blockshift = 12, blocksize = 1 << blockshift;
-	char *buffer = malloc_aligned(blocksize, blocksize);
-	char *buffer2 = malloc(blocksize);
-
-	if (argc != 5)
-usage:		error("usage: %s device read/write/randread/randwrite tag iterations", argv[0]);
-
-	if ((dev = open(argv[1], O_RDWR | O_DIRECT)) == -1)
-		error("Can't open %s, %s", argv[1], strerror(errno));
-
-	for (command = 0; command < ncommands; command++)
-		if (!strcmp(argv[2], commands[command]))
-			break;
-
-	if (command == ncommands)
-		goto usage;
-
-	int code = atoi(argv[4]), iterations = atoi(argv[3]);
-	int is_write = command & 1, is_rand = command >> 1;
-	unsigned range = (lseek(dev, 0, SEEK_END) >> blockshift), total = 0;
-	typeof(pread) *fn = is_write? ((typeof(pread) *)pwrite): pread;
-
-	printf("spam code = %u, iterations = %u, range = %u\n", code, iterations, range);
-
-	while (iterations--) {
-		unsigned block = is_rand? (rand() % range): total;
-
-		if (is_write)
-			spamdata(buffer, blocksize, code, block);
-
-		if ((err = fn(dev, buffer, blocksize, block << blockshift)) < 0)
-			error("spam %s error %i", commands[command], err);
-
-		if (!is_write) {
-			spamdata(buffer2, blocksize, code, block);
-			if (memcmp(buffer, buffer2, blocksize))
-				printf("block %u doesn't match\n", block);
-		}
-		total++;
-	}
-	return err;
-}
diff --git a/csnap/tests/testclient.c b/csnap/tests/testclient.c
deleted file mode 100644
index bd5401a..0000000
--- a/csnap/tests/testclient.c
+++ /dev/null
@@ -1,185 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <errno.h> 
-#include <netinet/in.h>
-#include <sys/poll.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <netdb.h> 
-#include "csnap.h"
-#include <linux/dm-csnap.h>
-#include "trace.h"
-
-#define trace trace_off
-
-int open_socket(char *name, unsigned port)
-{
-	int sock;
-
-	if ((sock = socket(AF_INET,  SOCK_STREAM, 0)) < 0)
-		error("Can't get socket");
-
-	struct hostent *host;
-	if (!(host = gethostbyname(name)))
-		error("Unknown host '%s'", name);
-
-	struct sockaddr_in sockaddr = { .sin_family = AF_INET, .sin_port = htons(port) };
-	memcpy(&sockaddr.sin_addr.s_addr, host->h_addr, host->h_length);
-	if (connect(sock, (struct sockaddr *)&sockaddr, sizeof(sockaddr)) < 0) 
-		error("Cannot connect to %s:%i", name, port);
-
-	return sock;
-}
-
-#if 0
-static unsigned seed = 0;
-
-unsigned myrand(void)
-{
-	return seed = seed * 1871923741 + 3298913417U;
-}
-#else
-#define myrand rand
-#endif
-
-int serviced = 0;
-
-int incoming(unsigned sock)
-{
-	struct messagebuf message;
-	int err, i, j;
-
-	if ((err = readpipe(sock, &message.head, sizeof(message.head))))
-		goto pipe_error;
-	if (message.head.length > maxbody)
-		goto message_too_long;
-	trace(warn("%x+%u", message.head.code, message.head.length);)
-	if ((err = readpipe(sock, &message.body, message.head.length)))
-		goto pipe_error;
-
-	switch (message.head.code) {
-		case REPLY_ORIGIN_WRITE:
-		{
-			struct rw_request *body = (struct rw_request *)message.body;
-			struct chunk_range *p = body->ranges;
-			if (message.head.length < sizeof(*body))
-				goto message_too_short;
-			trace(printf("origin write reply, %u ranges ", body->count);)
-			for (i = 0; i < body->count; i++, p++)
-				trace(printf("%llu/%u ", p->chunk, p->chunks);)
-			trace(printf("\n");)
-			serviced++;
-			break;
-		}
-
-		case REPLY_SNAPSHOT_WRITE:
-		{
-			struct rw_request *body = (struct rw_request *)message.body;
-			struct chunk_range *p = body->ranges;
-			if (message.head.length < sizeof(*body))
-				goto message_too_short;
-			trace(printf("snapshot write reply, %u ranges ", body->count);)
-			for (i = 0; i < body->count; i++) {
-				trace(printf("%llu/%u ", p->chunk, p->chunks);)
-				chunk_t *q = (chunk_t *)(p + 1);
-				for (j = 0; j < p->chunks; j++, q++)
-					trace(printf("%llu ", *q));
-				p = (struct chunk_range *)q;
-			}
-			trace(printf("\n");)
-			serviced++;
-			break;
-		}
-
-		case REPLY_CREATE_SNAPSHOT:
-			trace(warn("create snapshot succeeded");)
-			break;
-
-		default: 
-			warn("Unknown message %x", message.head.code);
-	}
-	return 0;
-
-message_too_long:
-	warn("message %x too long (%u bytes)\n", message.head.code, message.head.length);
-	return -1;
-message_too_short:
-	warn("message %x too short (%u bytes)\n", message.head.code, message.head.length);
-	return -1;
-pipe_error:
-	return -1;
-}
-
-unsigned available(unsigned sock)
-{
-	unsigned bytes;
-	ioctl(sock, FIONREAD, &bytes);
-	trace(if (bytes) printf("%u bytes waiting\n", bytes);)
-	return bytes;
-}
-
-static unsigned total_chunks;
-
-int main(int argc, char *argv[])
-{
-	int err, snapdev, orgdev, iterations = 1;
-
-#if 0
-	warn("rand = %u", myrand());
-	warn("rand = %u", myrand());
-	warn("rand = %u", myrand());
-	warn("rand = %u", myrand());
-	warn("rand = %u", myrand());
-return 0;
-#endif
-
-	if (argc < 5)
-		error("usage: %s dev/snapstore dev/origin hostname port [iterations]", argv[0]);
-	if (!(snapdev = open(argv[1], O_RDWR /*| O_DIRECT*/)))
-		error("Could not open snapshot store %s", argv[1]);
-	if (!(orgdev = open(argv[2], O_RDWR /*| O_DIRECT*/)))
-		error("Could not open origin volume %s", argv[2]);
-	if (argc > 5)
-		iterations = atoi(argv[5]);
-
-	int sock = open_socket(argv[3], atoi(argv[4]));
-	unsigned length_range = 32;
-	unsigned chunk_range = (lseek(orgdev, 0, SEEK_END) >> 12) - length_range;
-
-	outbead(sock, CREATE_SNAPSHOT, struct create_snapshot, 8);
-	outbead(sock, CREATE_SNAPSHOT, struct create_snapshot, 9);
-	outbead(sock, IDENTIFY, struct identify, .snap = -1);
-	trace_on(warn("start %u transfers", iterations);)
-
-	int i;
-	for (i=0; i < iterations; i++) {
-		unsigned length = offsetof(struct rw_request, ranges) + sizeof(struct chunk_range);
-		struct { struct head head; struct rw_request body; char tail[maxbody]; } PACKED message;
-		message.head.code = QUERY_WRITE;
-		message.head.length = length;
- 		message.body.count = 1;
-		message.body.ranges[0].chunk = 0? (myrand() % chunk_range): total_chunks;
-		message.body.ranges[0].chunks = 1;
-		total_chunks += message.body.ranges[0].chunks = 0? (myrand() % length_range + 1): 1;
-
-		if (write(sock, &message, sizeof(struct head) + length) < 0)
-			error("Error writing to socket");
-
-		while (available(sock) >= sizeof(struct head))
-			incoming(sock);
-	}
-	while (serviced < iterations) {
-		poll(NULL, 0, 100);
-		trace(warn("wait for %i responses", iterations - serviced);)
-		while (available(sock) >= sizeof(struct head))
-			incoming(sock);
-	}
-//	outbead(sock, DUMP_TREE, struct { });
-	close(sock);
-	return err;
-}
diff --git a/gfs2/debug/Makefile b/gfs2/debug/Makefile
deleted file mode 100644
index a6bc8c1..0000000
--- a/gfs2/debug/Makefile
+++ /dev/null
@@ -1,46 +0,0 @@
-top_srcdir=..
-all: ${TARGET}
-
-include ${top_srcdir}/make/defines.mk
-
-TARGET= gfs2_debug
-
-SOURCE=	\
-	block_device.c \
-	basic.c \
-	main.c \
-	ondisk.c \
-	readfile.c \
-	util.c
-
-CFLAGS+= -DHELPER_PROGRAM -D_FILE_OFFSET_BITS=64
-
-INCLUDE= -I${top_srcdir}/include -I${top_srcdir}/config \
-	-I${gfs2kincdir} -I${incdir}
-
-ifneq (${KERNEL_SRC}, )
-# Use the kernel tree if patched, otherwise, look where cluster headers
-#  should be installed
-INCLUDE += $(shell if [ -e ${KERNEL_SRC}/include/linux/gfs2_ondisk.h ]; then \
-		echo '-I${KERNEL_SRC}/include'; else \
-		echo '-I${incdir}'; fi)
-else
-INCLUDE += -I${incdir}
-endif
-
-LDFLAGS+= -L${libdir}
-
-
-gfs2_debug: ${SOURCE}
-	${CC} ${CFLAGS} ${INCLUDE} ${LDFLAGS} ${SOURCE} ${LDLIBS} -o $@
-
-install: all
-	if [ ! -d ${sbindir} ]; then \
-		install -d ${sbindir}; \
-	fi
-	install -m755 ${TARGET} ${sbindir}
-
-clean:
-	rm -f *.o ${TARGET}
-
-
diff --git a/gfs2/debug/basic.c b/gfs2/debug/basic.c
deleted file mode 100644
index b087f09..0000000
--- a/gfs2/debug/basic.c
+++ /dev/null
@@ -1,458 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <errno.h>
-#include <stdint.h>
-#include <inttypes.h>
-
-#include <linux/gfs2_ondisk.h>
-#include "linux_endian.h"
-
-#include "gfs2_debug.h"
-#include "basic.h"
-#include "block_device.h"
-
-/**
- * verify_block_size -
- * @bsize:
- *
- */
-
-static int
-verify_block_size(unsigned int bsize)
-{
-	unsigned int x = 512;
-
-	for (;;) {
-		if (!x)
-			return -1;
-		if (x == bsize)
-			return 0;
-		x <<= 1;
-	}
-}
-
-/**
- * verify_gfs2 -
- *
- */
-
-void
-verify_gfs2(void)
-{
-	char buf[GFS2_BASIC_BLOCK];
-	struct gfs2_sb sb;
-
-	if (device_size < (GFS2_SB_ADDR + 1) * GFS2_BASIC_BLOCK)
-		return;
-
-	do_lseek(device_fd, GFS2_SB_ADDR * GFS2_BASIC_BLOCK);
-	do_read(device_fd, buf, GFS2_BASIC_BLOCK);
-
-	gfs2_sb_in(&sb, buf);
-
-	if (sb.sb_header.mh_magic != GFS2_MAGIC ||
-	    sb.sb_header.mh_type != GFS2_METATYPE_SB ||
-	    sb.sb_bsize != 1 << sb.sb_bsize_shift ||
-	    verify_block_size(sb.sb_bsize))
-		return;
-
-	if (!block_size || block_size == sb.sb_bsize) {
-		unsigned int x;
-
-		is_gfs2 = TRUE;
-		block_size = sb.sb_bsize;
-		block_size_shift = sb.sb_bsize_shift;
-
-		sd_diptrs = (block_size - sizeof(struct gfs2_dinode)) / sizeof(uint64_t);
-		sd_inptrs = (block_size - sizeof(struct gfs2_meta_header)) / sizeof(uint64_t);
-		sd_jbsize = block_size - sizeof(struct gfs2_meta_header);
-		sd_hash_bsize = block_size / 2;
-		sd_hash_ptrs = sd_hash_bsize / sizeof(uint64_t);
-
-		sd_heightsize[0] = block_size - sizeof(struct gfs2_dinode);
-		sd_heightsize[1] = block_size * sd_diptrs;
-		for (x = 2;; x++) {
-			uint64_t space = sd_heightsize[x - 1] * sd_inptrs;
-			uint64_t d = space / sd_inptrs;
-			uint32_t m = space % sd_inptrs;
-
-			if (d != sd_heightsize[x - 1] || m)
-				break;
-			sd_heightsize[x] = space;
-		}
-		sd_max_height = x;
-
-		sd_jheightsize[0] = block_size - sizeof(struct gfs2_dinode);
-		sd_jheightsize[1] = sd_jbsize * sd_diptrs;
-		for (x = 2;; x++) {
-			uint64_t space = sd_jheightsize[x - 1] * sd_inptrs;
-			uint64_t d = space / sd_inptrs;
-			uint32_t m = space % sd_inptrs;
-
-			if (d != sd_jheightsize[x - 1] || m)
-				break;
-			sd_jheightsize[x] = space;
-		}
-		sd_max_jheight = x;
-	}
-}
-
-/**
- * must_be_gfs2 -
- *
- */
-
-void
-must_be_gfs2(void)
-{
-	if (!is_gfs2)
-		die("not a gfs2 filesystem\n");
-}
-
-/**
- * scan_device -
- *
- */
-
-void
-scan_device(void)
-{
-	char data[GFS2_BASIC_BLOCK];
-	uint64_t bb;
-	struct gfs2_meta_header mh;
-
-	for (bb = 0; (bb + 1) * GFS2_BASIC_BLOCK <= device_size; bb++) {
-		do_lseek(device_fd, bb * GFS2_BASIC_BLOCK);
-		do_read(device_fd, data, GFS2_BASIC_BLOCK);
-		gfs2_meta_header_in(&mh, data);
-
-		if (mh.mh_magic == GFS2_MAGIC &&
-		    mh.mh_type && mh.mh_type <= GFS2_METATYPE_EA)
-			printf("sector %"PRIu64": type %u\n",
-			       bb, mh.mh_type);
-	}
-}
-
-/**
- * print_superblock -
- *
- */
-
-void
-print_superblock(void)
-{
-	char *data;
-	struct gfs2_sb sb;
-
-	must_be_gfs2();
-
-	data = get_block(GFS2_SB_ADDR * GFS2_BASIC_BLOCK / block_size, TRUE);
-	gfs2_sb_in(&sb, data);
-	free(data);
-
-	gfs2_sb_print(&sb);
-}
-
-/**
- * print_bitmaps -
- * @data:
- * @offset:
- *
- */
-
-static void
-print_bitmaps(char *data, unsigned int offset)
-{
-	unsigned int bn = 0;
-	unsigned int bit;
-	unsigned char value;
-	char *type;
-
-	printf("\n");
-
-	for (; offset < block_size; offset++) {
-		for (bit = 0; bit < GFS2_NBBY; bit++) {
-			value = data[offset];
-			value = (value >> (bit * GFS2_BIT_SIZE)) & GFS2_BIT_MASK;
-			switch (value) {
-			case GFS2_BLKST_FREE:
-				type = "free";
-				break;
-			case GFS2_BLKST_USED:
-				type = "used data";
-				break;
-			case GFS2_BLKST_INVALID:
-				type = "invalid";
-				break;
-			case GFS2_BLKST_DINODE:
-				type = "dinode";
-				break;
-			default:
-				ASSERT(FALSE,);
-			}
-			printf("  block %u: %s\n", bn, type);
-			bn++;
-		}
-	}
-}
-
-/**
- * print_stuffed_hash -
- * @data:
- *
- */
-
-static void
-print_stuffed_hash(char *data)
-{
-	uint64_t *p = (uint64_t *)(data + sizeof(struct gfs2_dinode));
-	uint64_t *end = (uint64_t *)(((char *)p) + block_size / 2);
-	uint64_t this, last = 0;
-	unsigned int run = 0;
-	int first = TRUE;
-
-	printf("\n");
-
-	for (; p < end; p++) {
-		this = le64_to_cpu(*p);
-
-		if (first) {
-			first = FALSE;
-			run = 1;
-		} else {
-			if (this == last)
-				run++;
-			else {
-				printf("  pointer: %"PRIu64" (%u)\n",
-				       last, run);
-				run = 1;
-			}
-		}
-			
-		last = this;
-	}
-
-	printf("  pointer: %"PRIu64" (%u)\n",
-	       last, run);
-}
-
-/**
- * print_dirents -
- * @data:
- * @offset:
- *
- * Make this more robust
- *
- */
-
-void
-print_dirents(char *data, unsigned int offset)
-{
-	struct gfs2_dirent de;
-
-	for (; offset < block_size; offset += de.de_rec_len) {
-		printf("\n");
-		gfs2_dirent_in(&de, data + offset);
-
-		if (sizeof(struct gfs2_dirent) + de.de_name_len > de.de_rec_len)
-			continue;
-		if (offset + sizeof(struct gfs2_dirent) + de.de_name_len > block_size)
-			break;
-		if (de.de_inum.no_formal_ino)
-			gfs2_dirent_print(&de, data + offset + sizeof(struct gfs2_dirent));
-	}
-}
-
-/**
- * print_pointers -
- * @data:
- * @offset:
- *
- */
-
-static void
-print_pointers(char *data, unsigned int offset)
-{
-	uint64_t *p = (uint64_t *)(data + offset);
-	uint64_t *end = (uint64_t *)(data + block_size);
-	unsigned int x = 0;
-
-	printf("\n");
-
-	for (; p < end; p++, x++)
-		if (*p)
-			printf("  pointer #%u: %"PRIu64"\n",
-			       x, le64_to_cpu(*p));
-}
-
-/**
- * identify_block -
- *
- */
-
-void
-identify_block(void)
-{
-	char *data;
-	struct gfs2_meta_header mh;
-
-	must_be_gfs2();
-
-	data = get_block(block_number, TRUE);
-	gfs2_meta_header_in(&mh, data);
-
-	if (mh.mh_magic != GFS2_MAGIC) {
-		printf("Not GFS2 metadata\n");
-		free(data);
-		return;
-	}
-
-	switch (mh.mh_type) {
-	case GFS2_METATYPE_NONE:
-		printf("GFS2_METATYPE_NONE\n");
-		break;
-
-	case GFS2_METATYPE_SB:
-		printf("Super\n");
-		if (verbose) {
-			struct gfs2_sb sb;
-			gfs2_sb_in(&sb, data);
-			gfs2_sb_print(&sb);
-		}
-		break;
-
-	case GFS2_METATYPE_RG:
-		printf("Resource Group Header\n");
-		if (verbose) {
-			struct gfs2_rgrp rg;
-			gfs2_rgrp_in(&rg, data);
-			gfs2_rgrp_print(&rg);
-			if (verbose > 1)
-				print_bitmaps(data, sizeof(struct gfs2_rgrp));
-		}
-		break;
-
-	case GFS2_METATYPE_RB:
-		printf("Resource Group Bitmap\n");
-		if (verbose) {
-			gfs2_meta_header_print(&mh);
-			if (verbose > 1)
-				print_bitmaps(data, sizeof(struct gfs2_meta_header));
-		}
-		break;
-
-	case GFS2_METATYPE_DI:
-		printf("Dinode\n");
-		if (verbose) {
-			struct gfs2_dinode di;
-			gfs2_dinode_in(&di, data);
-			gfs2_dinode_print(&di);
-			if (verbose > 1) {
-				if (di.di_height)
-					print_pointers(data, sizeof(struct gfs2_dinode));
-				else {
-					if (S_ISREG(di.di_mode))
-						printf("\n  stuffed data\n");
-					else if (S_ISDIR(di.di_mode) &&
-						(di.di_flags & GFS2_DIF_EXHASH))
-						print_stuffed_hash(data);
-					else if (S_ISDIR(di.di_mode))
-						print_dirents(data, sizeof(struct gfs2_dinode));
-					else if (S_ISLNK(di.di_mode))
-						printf("\nsymlink to %s\n",
-						       data + sizeof(struct gfs2_dinode));
-				}
-			}
-		}
-		break;
-
-	case GFS2_METATYPE_IN:
-		printf("Indirect\n");
-		if (verbose) {
-			gfs2_meta_header_print(&mh);
-			if (verbose > 1) {
-				print_pointers(data, sizeof(struct gfs2_meta_header));
-			}
-		}
-		break;
-
-	case GFS2_METATYPE_LF:
-		printf("Directory Leaf\n");
-		if (verbose) {
-			struct gfs2_leaf lf;
-			gfs2_leaf_in(&lf, data);
-			gfs2_leaf_print(&lf);
-			if (verbose > 1)
-				print_dirents(data, sizeof(struct gfs2_leaf));
-		}
-		break;
-
-	case GFS2_METATYPE_JD:
-		printf("Journaled Data\n");
-		if (verbose)
-			gfs2_meta_header_print(&mh);
-		break;
-
-	case GFS2_METATYPE_LH:
-		printf("Log Header\n");
-		if (verbose) {
-			struct gfs2_log_header lh;
-			gfs2_log_header_in(&lh, data);
-			gfs2_log_header_print(&lh);
-		}
-		break;
-
-	case GFS2_METATYPE_LD:
-		printf("Lock Descriptor\n");
-		if (verbose) {
-			struct gfs2_log_descriptor ld;
-			gfs2_log_descriptor_in(&ld, data);
-			gfs2_log_descriptor_print(&ld);
-		}
-		break;
-
-	case GFS2_METATYPE_LB:
-		printf("Generic Log Block\n");
-		if (verbose)
-			gfs2_meta_header_print(&mh);
-		break;
-
-	case GFS2_METATYPE_EA:
-		printf("Extended Attribute\n");
-		if (verbose)
-			gfs2_meta_header_print(&mh);
-		break;
-
-	case GFS2_METATYPE_ED:
-		printf("Extended Attribute Data\n");
-		if (verbose)
-			gfs2_meta_header_print(&mh);
-		break;
-
-	case GFS2_METATYPE_UT:
-		printf("Unlinked Tags\n");
-		if (verbose)
-			gfs2_meta_header_print(&mh);
-		break;
-
-	case GFS2_METATYPE_QC:
-		printf("Quota Changes\n");
-		if (verbose)
-			gfs2_meta_header_print(&mh);
-		break;
-
-	default:
-		printf("Unknown metadata type\n");
-		if (verbose)
-			gfs2_meta_header_print(&mh);
-		break;
-	}
-
-	free(data);
-}
-
diff --git a/gfs2/debug/basic.h b/gfs2/debug/basic.h
deleted file mode 100644
index f0bdfed..0000000
--- a/gfs2/debug/basic.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef __BASIC_DOT_H__
-#define __BASIC_DOT_H__
-
-
-EXTERN unsigned int sd_diptrs INIT(0);
-EXTERN unsigned int sd_inptrs INIT(0);
-EXTERN unsigned int sd_jbsize INIT(0);
-EXTERN unsigned int sd_hash_bsize INIT(0);
-EXTERN unsigned int sd_hash_ptrs INIT(0);
-EXTERN uint32_t sd_max_height INIT(0);
-EXTERN uint64_t sd_heightsize[GFS2_MAX_META_HEIGHT];
-EXTERN uint32_t sd_max_jheight INIT(0);
-EXTERN uint64_t sd_jheightsize[GFS2_MAX_META_HEIGHT];
-
-
-void verify_gfs2(void);
-void must_be_gfs2(void);
-void scan_device(void);
-void print_superblock(void);
-void identify_block(void);
-
-void print_dirents(char *data, unsigned int offset);
-
-
-#endif /* __BASIC_DOT_H__ */
-
diff --git a/gfs2/debug/block_device.c b/gfs2/debug/block_device.c
deleted file mode 100644
index 14e8850..0000000
--- a/gfs2/debug/block_device.c
+++ /dev/null
@@ -1,117 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <errno.h>
-#include <stdint.h>
-#include <inttypes.h>
-
-#include "gfs2_debug.h"
-#include "block_device.h"
-
-/**
- * find_device_size -
- *
- */
-
-void
-find_device_size(void)
-{
-	device_size = lseek(device_fd, 0, SEEK_END);
-	if (device_size < 0)
-		die("can't determine device size: %s\n",
-		    strerror(errno));
-}
-
-/**
- * get_block -
- * @bn:
- * @fatal:
- *
- * Returns: the data in the block (needs to be freed)
- */
-
-char *
-get_block(uint64_t bn, int fatal)
-{
-	char *data;
-
-	if (device_size < (bn + 1) * block_size) {
-		fprintf(stderr, "%s: block %"PRIu64" is off the end of the device\n",
-			prog_name, bn);
-		if (fatal)
-			exit(EXIT_FAILURE);
-	}
-
-	data = malloc(block_size);
-	if (!data)
-		die("out of memory (%s, %u)\n",
-		    __FILE__, __LINE__);
-
-	do_lseek(device_fd, bn * block_size);
-	do_read(device_fd, data, block_size);
-
-	return data;
-}
-
-/**
- * print_size -
- *
- */
-
-void
-print_size(void)
-{
-	printf("%"PRIu64"\n", device_size);
-}
-
-/**
- * print_hexblock -
- *
- */
-
-void
-print_hexblock(void)
-{
-	char *data;
-	unsigned int x;
-
-	if (!block_size)
-		die("no block size set\n");
-
-	data = get_block(block_number, TRUE);
-
-	for (x = 0; x < block_size; x++) {
-		printf("%.2X", ((unsigned char *)data)[x]);
-		if (x % 16 == 15)
-			printf("\n");
-		else
-			printf(" ");
-	}
-
-	if (x % 16)
-		printf("\n");
-
-	free(data);
-}
-
-/**
- * print_rawblock -
- *
- */
-
-void
-print_rawblock(void)
-{
-	char *data;
-
-	if (!block_size)
-		die("no block size set\n");
-
-	data = get_block(block_number, TRUE);
-	do_write(STDOUT_FILENO, data, block_size);
-	free(data);
-}
diff --git a/gfs2/debug/block_device.h b/gfs2/debug/block_device.h
deleted file mode 100644
index 6af5437..0000000
--- a/gfs2/debug/block_device.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __BLOCK_DEVICE_DOT_H__
-#define __BLOCK_DEVICE_DOT_H__
-
-
-void find_device_size(void);
-char *get_block(uint64_t bn, int fatal);
-
-void print_size(void);
-void print_hexblock(void);
-void print_rawblock(void);
-
-
-#endif /* __BLOCK_DEVICE_DOT_H__ */
-
diff --git a/gfs2/debug/gfs2_debug.h b/gfs2/debug/gfs2_debug.h
deleted file mode 100644
index 68243d2..0000000
--- a/gfs2/debug/gfs2_debug.h
+++ /dev/null
@@ -1,83 +0,0 @@
-#ifndef __GFS2_DEBUG_DOT_H__
-#define __GFS2_DEBUG_DOT_H__
-
-
-#ifndef TRUE
-#define TRUE (1)
-#endif
-
-#ifndef FALSE
-#define FALSE (0)
-#endif
-
-#ifndef EXTERN
-#define EXTERN extern
-#define INIT(X)
-#else
-#undef EXTERN
-#define EXTERN
-#define INIT(X) =X 
-#endif
-
-
-#define die(fmt, args...) \
-do { \
-	fprintf(stderr, "%s: ", prog_name); \
-	fprintf(stderr, fmt, ##args); \
-	exit(EXIT_FAILURE); \
-} while (0)
-
-#define ASSERT(x, todo) \
-do { \
-	if (!(x)) { \
-		{todo} \
-		die("assertion failed on line %d of file %s\n", \
-		    __LINE__, __FILE__); \
-	} \
-} while (0)
-
-EXTERN char *prog_name;
-
-#define do_lseek(fd, off) \
-do { \
-	if (lseek((fd), (off), SEEK_SET) != (off)) \
-		die("bad seek on line %d of file %s: %s\n", \
-		    __LINE__, __FILE__, strerror(errno)); \
-} while (0)
-
-#define do_read(fd, buff, len) \
-do { \
-	if (read((fd), (buff), (len)) != (len)) \
-		die("bad read on line %d of file %s: %s\n", \
-		    __LINE__, __FILE__, strerror(errno)); \
-} while (0)
-
-#define do_write(fd, buff, len) \
-do { \
-	if (write((fd), (buff), (len)) != (len)) \
-		die("bad write on line %d of file %s: %s\n", \
-		    __LINE__, __FILE__, strerror(errno)); \
-} while (0)
-
-#define DIV_RU(x, y) (((x) + (y) - 1) / (y))
-
-
-/* Command line arguments */
-
-EXTERN unsigned int verbose INIT(0);
-
-EXTERN char *action INIT(NULL);
-
-EXTERN char *device INIT(NULL);
-EXTERN int device_fd INIT(-1);
-EXTERN off_t device_size INIT(-1);
-
-EXTERN int is_gfs2 INIT(FALSE);
-EXTERN unsigned int block_size INIT(0);
-EXTERN unsigned int block_size_shift INIT(0);
-
-EXTERN uint64_t block_number INIT(0);
-
-
-#endif /* __GFS2_DEBUG_DOT_H__ */
-
diff --git a/gfs2/debug/main.c b/gfs2/debug/main.c
deleted file mode 100644
index a4a7a53..0000000
--- a/gfs2/debug/main.c
+++ /dev/null
@@ -1,179 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <errno.h>
-#include <stdint.h>
-#include <inttypes.h>
-
-#include <linux/gfs2_ondisk.h>
-#include "copyright.cf"
-
-#define EXTERN
-#include "gfs2_debug.h"
-#include "basic.h"
-#include "block_device.h"
-#include "readfile.h"
-
-/**
- * print_usage - print out usage information
- *
- */
-
-static void
-print_usage(void)
-{
-	printf("Usage:\n");
-	printf("\n");
-	printf("%s [options] <action>\n", prog_name);
-	printf("\n");
-	printf("Actions:\n");
-	printf("  size             print the device size\n");
-	printf("  hexread          print a block in hex\n");
-	printf("  rawread          print a block raw\n");
-	printf("\n");
-	printf("GFS2-specific Actions:\n");
-	printf("  scan             scan the device looking for GFS2 blocks\n");
-	printf("  identify         identify the contents of a block\n");
-	printf("  sb               print superblock\n");
-	printf("  jindex           print journal index\n");
-	printf("  rindex           print resource index\n");
-	printf("  quota            print quota file\n");
-	printf("  root             print root directory\n");
-	printf("  readfile         print the contents of a file\n");
-	printf("  readdir          print the contents of a directory\n");
-	printf("\n");
-	printf("Options:\n");
-	printf("\n");
-	printf("  -B <bytes>       Set the block size\n");
-	printf("  -b <number>      Block number\n");
-	printf("  -d <device>      Device to look at\n");
-	printf("  -h               Print this help, then exit\n");
-	printf("  -v               Verbose\n");
-	printf("  -V               Print program version information, then exit\n");
-}
-
-/**
- * decode_arguments -
- * @argc:
- * @argv:
- *
- */
-
-static void
-decode_arguments(int argc, char *argv[])
-{
-	int cont = TRUE;
-	int optchar;
-
-	while (cont) {
-		optchar = getopt(argc, argv, "B:b:d:hVv");
-
-		switch (optchar) {
-		case 'B':
-			sscanf(optarg, "%u", &block_size);
-			if (!block_size)
-				die("can't have a zero block size\n");
-			break;
-
-		case 'b':
-			sscanf(optarg, "%"SCNu64, &block_number);
-			break;
-
-		case 'd':
-			device = optarg;
-			break;
-
-		case 'h':
-			print_usage();
-			exit(EXIT_SUCCESS);
-
-		case 'V':
-			printf("gfs2_mkfs %s (built %s %s)\n", RELEASE_VERSION, __DATE__, __TIME__);
-			printf("%s\n", REDHAT_COPYRIGHT);
-			exit(EXIT_SUCCESS);
-
-		case 'v':
-			verbose++;
-			break;
-
-		case EOF:
-			cont = FALSE;
-			break;
-
-		default:
-			die("unknown option: %c\n", optchar);
-		};
-	}
-
-	if (optind < argc) {
-		action = argv[optind];
-		optind++;
-	} else
-		die("no action specified\n");
-
-	if (optind < argc) 
-		die("Unrecognized option: %s\n", argv[optind]);
-
-	if (!device)
-		die("no device specified\n");
-}
-
-/**
- * main - 
- * @argc:
- * @argv:
- *
- * Returns: exit status
- */
-
-int
-main(int argc, char *argv[])
-{
-	prog_name = argv[0];
-
-	decode_arguments(argc, argv);
-
-	device_fd = open(device, O_RDWR);
-	if (device_fd < 0)
-		die("can't open device %s: %s\n",
-		    device, strerror(errno));
-
-	find_device_size();
-	verify_gfs2();
-
-	if (!strcmp(action, "size"))
-		print_size();
-	else if (!strcmp(action, "hexread"))
-		print_hexblock();
-	else if (!strcmp(action, "rawread"))
-		print_rawblock();
-	else if (!strcmp(action, "scan"))
-		scan_device();
-	else if (!strcmp(action, "identify"))
-		identify_block();
-	else if (!strcmp(action, "sb"))
-		print_superblock();
-	else if (!strcmp(action, "jindex"))
-		print_jindex();
-	else if (!strcmp(action, "rindex"))
-		print_rindex();
-	else if (!strcmp(action, "quota"))
-		print_quota();
-	else if (!strcmp(action, "root"))
-		print_root();
-	else if (!strcmp(action, "readfile"))
-		readfile();
-	else if (!strcmp(action, "readdir"))
-		readdir();
-	else
-		die("unknown action %s\n", action);
-		
-	close(device_fd);
-
-	exit(EXIT_SUCCESS);
-}
-
diff --git a/gfs2/debug/ondisk.c b/gfs2/debug/ondisk.c
deleted file mode 100644
index 4aa598c..0000000
--- a/gfs2/debug/ondisk.c
+++ /dev/null
@@ -1,12 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdint.h>
-#include <inttypes.h>
-
-#include "linux_endian.h"
-
-#define printk printf
-
-#define WANT_GFS2_CONVERSION_FUNCTIONS
-#include <linux/gfs2_ondisk.h>
diff --git a/gfs2/debug/readfile.c b/gfs2/debug/readfile.c
deleted file mode 100644
index 3c69074..0000000
--- a/gfs2/debug/readfile.c
+++ /dev/null
@@ -1,215 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <errno.h>
-#include <stdint.h>
-#include <inttypes.h>
-
-#include <linux/gfs2_ondisk.h>
-#include "linux_endian.h"
-
-#include "gfs2_debug.h"
-#include "basic.h"
-#include "block_device.h"
-#include "readfile.h"
-#include "util.h"
-
-void
-print_jindex(void)
-{
-	printf("FixMe!!!\n");
-}
-
-void
-print_rindex(void)
-{
-#if 0
-	struct gfs2_sb sb;
-	struct gfs2_dinode di;
-	struct gfs2_rindex ri;
-	char *data;
-	char buf[sizeof(struct gfs2_rindex)];
-	uint64_t o;
-	unsigned int x;
-	int error;
-
-	must_be_gfs2();
-
-	data = get_block(GFS2_SB_ADDR * GFS2_BASIC_BLOCK / block_size, TRUE);
-	gfs2_sb_in(&sb, data);
-	free(data);
-
-	data = get_block(sb.sb_rindex_di.no_addr, TRUE);
-	gfs2_dinode_in(&di, data);
-	free(data);
-
-	if (di.di_size % sizeof(struct gfs2_rindex))
-		fprintf(stderr, "%s: strange size for resource index %"PRIu64"\n",
-			prog_name, di.di_size);
-
-	for (o = 0, x = 0;; o += sizeof(struct gfs2_rindex), x++) {
-		error = gfs2_readi(&di, buf, o, sizeof(struct gfs2_rindex));
-		if (!error)
-			break;
-		if (error < sizeof(struct gfs2_rindex))
-			continue;
-		gfs2_rindex_in(&ri, buf);
-		printf("Resource Group %u:\n", x);
-		gfs2_rindex_print(&ri);
-		printf("\n");
-	}
-#else
-	printf("FixMe!!!\n");
-#endif
-}
-
-void
-print_quota(void)
-{
-#if 0
-	struct gfs2_sb sb;
-	struct gfs2_dinode di;
-	struct gfs2_quota qu;
-	char *data;
-	char buf[sizeof(struct gfs2_quota)];
-	uint64_t o;
-	unsigned int x;
-	int error;
-
-	must_be_gfs2();
-
-	data = get_block(GFS2_SB_ADDR * GFS2_BASIC_BLOCK / block_size, TRUE);
-	gfs2_sb_in(&sb, data);
-	free(data);
-
-	data = get_block(sb.sb_quota_di.no_addr, TRUE);
-	gfs2_dinode_in(&di, data);
-	free(data);
-
-	for (o = 0, x = 0;; o += sizeof(struct gfs2_quota), x++) {
-		error = gfs2_readi(&di, buf, o, sizeof(struct gfs2_quota));
-		if (!error)
-			break;
-		if (error < 0)
-			continue;
-		gfs2_quota_in(&qu, buf);
-
-		if (!qu.qu_limit && !qu.qu_warn && !qu.qu_value)
-			continue;
-
-		printf("Quota (%s, %u):\n", (x % 2) ? "group" : "user", x / 2);
-		gfs2_quota_print(&qu);
-		printf("\n");
-	}
-#else
-	printf("FixMe!!!\n");
-#endif
-}
-
-void
-print_root(void)
-{
-#if 0
-	struct gfs2_sb sb;
-	char *data;
-
-	must_be_gfs2();
-
-	data = get_block(GFS2_SB_ADDR * GFS2_BASIC_BLOCK / block_size, TRUE);
-	gfs2_sb_in(&sb, data);
-	free(data);
-
-	block_number = sb.sb_root_di.no_addr;
-
-	readdir();
-#else
-	printf("FixMe!!!\n");
-#endif
-}
-
-#define CHUNKSIZE (65536)
-
-void
-readfile(void)
-{
-	struct gfs2_dinode di;
-	char *data;
-	char buf[CHUNKSIZE];
-	uint64_t o = 0;
-	int error;
-
-	must_be_gfs2();
-
-	data = get_block(block_number, TRUE);
-	gfs2_dinode_in(&di, data);
-	free(data);
-
-	if (di.di_header.mh_magic != GFS2_MAGIC ||
-	    di.di_header.mh_type != GFS2_METATYPE_DI)
-		die("block %"PRIu64" isn't an inode\n",
-		    block_number);
-
-	if (!S_ISREG(di.di_mode))
-		die("block %"PRIu64" isn't a regular file\n",
-		    block_number);
-
-	for (;;) {
-		error = gfs2_readi(&di, buf, o, CHUNKSIZE);
-		if (error <= 0)
-			break;
-		write(STDOUT_FILENO, buf, error);
-		o += error;
-	}
-}
-
-static void
-do_readdir(struct gfs2_dinode *di, char *data,
-	   uint32_t index, uint32_t len, uint64_t leaf_no,
-	   void *opaque)
-{
-	struct gfs2_leaf leaf;
-
-	print_dirents(data, sizeof(struct gfs2_leaf));
-	gfs2_leaf_in(&leaf, data);
-
-	while (leaf.lf_next) {
-		data = get_block(leaf.lf_next, FALSE);
-		if (!data)
-			return;
-		print_dirents(data, sizeof(struct gfs2_leaf));
-		gfs2_leaf_in(&leaf, data);
-		free(data);
-	}
-}
-
-void
-readdir(void)
-{
-	struct gfs2_dinode di;
-	char *data;
-
-	must_be_gfs2();
-
-	data = get_block(block_number, TRUE);
-	gfs2_dinode_in(&di, data);
-
-	if (di.di_header.mh_magic != GFS2_MAGIC ||
-	    di.di_header.mh_type != GFS2_METATYPE_DI)
-		die("block %"PRIu64" isn't an inode\n",
-		    block_number);
-
-	if (!S_ISDIR(di.di_mode))
-		die("block %"PRIu64" isn't a directory\n",
-		    block_number);
-
-	if (di.di_flags & GFS2_DIF_EXHASH)
-		foreach_leaf(&di, do_readdir, NULL);
-	else
-		print_dirents(data, sizeof(struct gfs2_dinode));
-
-	free(data);
-}
diff --git a/gfs2/debug/readfile.h b/gfs2/debug/readfile.h
deleted file mode 100644
index e4e84e9..0000000
--- a/gfs2/debug/readfile.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __READFILE_DOT_H__
-#define __READFILE_DOT_H__
-
-
-void print_jindex(void);
-void print_rindex(void);
-void print_quota(void);
-void print_root(void);
-void readfile(void);
-void readdir(void);
-
-
-#endif /* __READFILE_DOT_H__ */
-
diff --git a/gfs2/debug/util.c b/gfs2/debug/util.c
deleted file mode 100644
index 5489349..0000000
--- a/gfs2/debug/util.c
+++ /dev/null
@@ -1,334 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <errno.h>
-#include <stdint.h>
-#include <inttypes.h>
-
-#include <linux/gfs2_ondisk.h>
-#include "linux_endian.h"
-
-#include "gfs2_debug.h"
-#include "basic.h"
-#include "block_device.h"
-#include "util.h"
-
-int
-check_type(char *data, unsigned int type)
-{
-	struct gfs2_meta_header mh;
-
-	gfs2_meta_header_in(&mh, data);
-	if (mh.mh_magic != GFS2_MAGIC ||
-	    mh.mh_type != type) {
-		fprintf(stderr, "%s: expected metadata type %u\n",
-			prog_name, type);
-		return -1;
-	}
-	return 0;
-}
-
-/**
- * recursive_scan - call a function for each block pointer in a file
- * @di:
- * @height:
- * @bn:
- * @pc:
- * @opaque:
- *
- */
-
-void
-recursive_scan(struct gfs2_dinode *di,
-	       unsigned int height, uint64_t bn,
-	       pointer_call_t pc, void *opaque)
-{
-	char *data = NULL;
-	uint64_t *top, *bottom;
-	uint64_t x;
-	
-	if (!height) {
-		data = get_block(di->di_num.no_addr, TRUE);
-
-		top = (uint64_t *)(data + sizeof(struct gfs2_dinode));
-		bottom = (uint64_t *)(data + sizeof(struct gfs2_dinode)) + sd_diptrs;
-	} else {
-		data = get_block(bn, FALSE);
-		if (!data)
-			return;
-		if (check_type(data, GFS2_METATYPE_IN))
-			return;
-
-		top = (uint64_t *)(data + sizeof(struct gfs2_meta_header));
-		bottom = (uint64_t *)(data + sizeof(struct gfs2_meta_header)) + sd_inptrs;
-	}
-
-	for ( ; top < bottom; top++) {
-		x = le64_to_cpu(*top);
-
-		pc(di, height, x, opaque);
-
-		if (x && height < di->di_height - 1)
-			recursive_scan(di,
-				       height + 1, x,
-				       pc, opaque);
-	}
-
-	free(data);
-}
-
-void
-foreach_leaf(struct gfs2_dinode *di,
-	     leaf_call_t lc, void *opaque)
-{
-	char *data;
-	struct gfs2_leaf leaf;
-	uint32_t hsize, len;
-	uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
-	uint32_t index = 0;
-	uint64_t lp[sd_hash_ptrs];
-	uint64_t leaf_no;
-	int error;
-
-	hsize = 1 << di->di_depth;
-	if (hsize * sizeof(uint64_t) != di->di_size)
-		die("bad hash table size\n");
-
-	while (index < hsize) {
-		lp_offset = index % sd_hash_ptrs;
-		ht_offset = index - lp_offset;
-
-		if (ht_offset_cur != ht_offset) {
-			error = gfs2_readi(di, (char *)lp, ht_offset * sizeof(uint64_t), sd_hash_bsize);
-			if (error != sd_hash_bsize)
-				die("FixMe!!!\n");
-			ht_offset_cur = ht_offset;
-		}
-
-		leaf_no = le64_to_cpu(lp[lp_offset]);
-		if (!leaf_no)
-			die("NULL leaf pointer\n");
-
-		data = get_block(leaf_no, TRUE);
-		gfs2_leaf_in(&leaf, data);
-		len = 1 << (di->di_depth - leaf.lf_depth);
-
-		lc(di, data, index, len, leaf_no, opaque);
-
-		free(data);
-		index += len;
-	}
-
-	if (index != hsize)
-		die("screwed up directory\n");
-}
-
-static unsigned int
-calc_tree_height(struct gfs2_dinode *di, uint64_t size)
-{
-	uint64_t *arr;
-	unsigned int max, height;
-
-	if (di->di_size > size)
-		size = di->di_size;
-
-	if (di->di_flags & GFS2_DIF_JDATA) {
-		arr = sd_jheightsize;
-		max = sd_max_jheight;
-	} else {
-		arr = sd_heightsize;
-		max = sd_max_height;
-	}
-
-	for (height = 0; height < max; height++)
-		if (arr[height] >= size)
-			break;
-
-	return height;
-}
-
-struct metapath {
-	unsigned int mp_list[GFS2_MAX_META_HEIGHT];
-};
-
-static struct metapath *
-find_metapath(struct gfs2_dinode *di, uint64_t block)
-{
-	struct metapath *mp;
-	uint64_t b = block;
-	unsigned int i;
-
-	mp = malloc(sizeof(struct metapath));
-	if (!mp)
-		die("out of memory (%s, %u)\n",
-		    __FILE__, __LINE__);
-	memset(mp, 0, sizeof(struct metapath));
-
-	for (i = di->di_height; i--;) {
-		mp->mp_list[i] = b % sd_inptrs;
-		b /= sd_inptrs;
-	}
-
-	return mp;
-}
-
-static uint64_t
-lookup_block(char *data, unsigned int height,
-	     struct metapath *mp)
-{
-	unsigned int head_size;
-	uint64_t block;
-
-	head_size = (height > 0) ?
-		sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
-	block = *(((uint64_t *)(data + head_size)) + mp->mp_list[height]);
-
-	if (block)
-		return le64_to_cpu(block);
-	else
-		return 0;
-}
-
-int
-gfs2_block_map(struct gfs2_dinode *di,
-	      uint64_t lblock, uint64_t *dblock)
-{
-	unsigned int bsize;
-	unsigned int height;
-	struct metapath *mp;
-	unsigned int end_of_metadata;
-	char *data;
-	unsigned int x;
-	int error = 0;
-
-	*dblock = 0;
-
-	if (!di->di_height) {
-		if (!lblock)
-			*dblock = di->di_num.no_addr;
-		return 0;
-	}
-
-	bsize = (di->di_flags & GFS2_DIF_JDATA) ? sd_jbsize : block_size;
-
-	height = calc_tree_height(di, (lblock + 1) * bsize);
-	if (di->di_height < height)
-		return 0;
-
-	mp = find_metapath(di, lblock);
-	end_of_metadata = di->di_height - 1;
-
-	data = get_block(di->di_num.no_addr, TRUE);
-
-	for (x = 0; x < end_of_metadata; x++) {
-		*dblock = lookup_block(data, x, mp);
-		free(data);
-		if (!*dblock)
-			goto out;
-
-		data = get_block(*dblock, FALSE);
-		if (!data) {
-			error = -1;
-			goto out;
-		}
-	}
-
-	*dblock = lookup_block(data, x, mp);
-
-	free(data);
-
- out:
-	free(mp);
-
-	return error;
-}
-
-static int
-copy2mem(char *data, void **buf,
-	     unsigned int offset, unsigned int size)
-{
-	char **p = (char **)buf;
-
-	if (data)
-		memcpy(*p, data + offset, size);
-	else
-		memset(*p, 0, size);
-
-	*p += size;
-
-	return 0;
-}
-
-int
-gfs2_readi(struct gfs2_dinode *di, void *buf,
-	  uint64_t offset, unsigned int size)
-{
-	int journaled = (di->di_flags & GFS2_DIF_JDATA);
-	uint64_t lblock, dblock;
-	unsigned int o;
-	unsigned int amount;
-	char *data;
-	int copied = 0;
-	int error = 0;
-
-	if (offset >= di->di_size)
-		return 0;
-
-	if ((offset + size) > di->di_size)
-		size = di->di_size - offset;
-
-	if (!size)
-		return 0;
-
-	if (journaled) {
-		lblock = offset / sd_jbsize;
-		o = offset % sd_jbsize;
-	} else {
-		lblock = offset >> block_size_shift;
-		o = offset & (block_size - 1);
-	}
-
-	if (!di->di_height)
-		o += sizeof(struct gfs2_dinode);
-	else if (journaled)
-		o += sizeof(struct gfs2_meta_header);
-
-	while (copied < size) {
-		amount = size - copied;
-		if (amount > block_size - o)
-			amount = block_size - o;
-
-		error = gfs2_block_map(di, lblock, &dblock);
-		if (error)
-			goto fail;
-
-		if (dblock) {
-			data = get_block(dblock, FALSE);
-			if (!data) {
-				error = -1;
-				goto fail;
-			}
-		} else
-			data = NULL;
-
-		copy2mem(data, &buf, o, amount);
-
-		if (data)
-			free(data);
-
-		copied += amount;
-		lblock++;
-
-		o = (journaled) ? sizeof(struct gfs2_meta_header) : 0;
-	}
-
-	return copied;
-
- fail:
-	return (copied) ? copied : error;
-}
-
diff --git a/gfs2/debug/util.h b/gfs2/debug/util.h
deleted file mode 100644
index 2c08b66..0000000
--- a/gfs2/debug/util.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef __UTIL_DOT_H__
-#define __UTIL_DOT_H__
-
-
-int check_type(char *data, unsigned int type);
-
-
-typedef void (*pointer_call_t)(struct gfs2_dinode *di,
-			       unsigned int height, uint64_t bn,
-			       void *opaque);
-void recursive_scan(struct gfs2_dinode *di,
-		    unsigned int height, uint64_t bn,
-		    pointer_call_t pc, void *opaque);
-
-typedef void (*leaf_call_t)(struct gfs2_dinode *di, char *data,
-			    uint32_t index, uint32_t len, uint64_t leaf_no,
-			    void *opaque);
-void foreach_leaf(struct gfs2_dinode *di,
-		  leaf_call_t lc, void *opaque);
-
-
-int gfs2_block_map(struct gfs2_dinode *di,
-		  uint64_t lblock, uint64_t *dblock);
-int gfs2_readi(struct gfs2_dinode *di, void *buf,
-	      uint64_t offset, unsigned int size);
-
-
-#endif /* __UTIL_DOT_H__ */
-


hooks/post-receive
--
Cluster Project


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2008-06-06 12:44 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-06-06 12:44 Cluster Project branch, master, updated. cluster-2.99.03-16-gb5fb9a7 fabbione

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).