qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH] Add a disk format named iROW , supporting high-effi


From: zhengjs . act
Subject: [Qemu-devel] [PATCH] Add a disk format named iROW , supporting high-efficiency VM snapshot
Date: Sat, 26 Jan 2013 16:15:37 +0800

From: Jingsheng Zheng <address@hidden>

iROW (improved Redirect-on-Write) is a disk format supporting high-efficiency
VM disk snapshots.
iROW uses a bitmap to reduce the amount of metadata, so that the performance of
both the key VM disk snapshot operations and the VM disk I/O is improved at the
same time.

Signed-off-by: Jingsheng Zheng <address@hidden>

---
 block/Makefile.objs |    1 +
 block/irow.c        | 2257 +++++++++++++++++++++++++++++++++++++++++++++++++++
 block/irow.h        |  135 +++
 3 files changed, 2393 insertions(+), 0 deletions(-)
 create mode 100644 block/irow.c
 create mode 100644 block/irow.h

diff --git a/block/Makefile.objs b/block/Makefile.objs
index c067f38..e045440 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,5 +1,6 @@
 block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o 
vvfat.o
 block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o 
qcow2-cache.o
+block-obj-y += irow.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
 block-obj-y += parallels.o blkdebug.o blkverify.o
diff --git a/block/irow.c b/block/irow.c
new file mode 100644
index 0000000..99b8579
--- /dev/null
+++ b/block/irow.c
@@ -0,0 +1,2257 @@
+/* IROW(Improved ROW)Disk Format
+ * */
+
+/*
+ * iROW (improved Redirect-on-Write) is a disk format supporting
+ * high-efficiency VM disk snapshots.
+ * iROW uses a bitmap to reduce the amount of metadata, so that the
+ * performance of both the key VM disk snapshot operations and the VM disk
+ * I/O is improved at the same time.
+ *
+ * An iROW VM disk image consists of a meta file and several snapshots.
+ *
+ * A snapshot consists of two files: a bitmap file (btmp file) and a VM disk
+ * data file (irvd file).
+ * The current state of the iROW VM disk also occupies a snapshot.
+ *
+ * The meta file consists of the meta header and the snapshot information.
+ * The meta header stores basic information about the VM disk image. The
+ * snapshot information sequentially stores every snapshot's name, id and
+ * other related information.
+ *
+ * The btmp file consists of a bitmap and the VM state data. The bitmap
+ * indicates whether each cluster exists in the corresponding irvd file.
+ * Each cluster in the VM disk image is mapped to one bit in the bitmap.
+ *
+ * The irvd file stores the actual data of the VM disk image. The smallest
+ * unit of storage is the cluster.
+ * iROW does not decide the addresses of the data clusters; it just writes
+ * each cluster at the same address in the VM disk image as the cluster's
+ * virtual address. Because the host machine's file system supports sparse
+ * files, the VM disk image also grows gradually with the actual disk usage.
+ *
+ */
+
+#include "qemu-common.h"
+#include "include/block/block_int.h"
+#include "include/qemu/module.h"
+#include "block/irow.h"
+
+#include <linux/falloc.h>
+
+BDRVIrowState **birows_cache = NULL; /* indexed by snapshot index; sized in irow_init_birows_cache(), entries are filled lazily with states from irow_open_previous_state() */
+ClusterCache *cluster_cache = NULL; /* single cluster-sized buffer created in irow_open_meta(); cluster_num is -1 right after creation and again once that cluster's bitmap bit gets set */
+
+/*
+ * Return log2(size) when size is a non-zero power of two, -1 otherwise.
+ */
+static int get_bits_from_size(size_t size)
+{
+    int bits;
+
+    /* Zero, or any value with more than one bit set, is not a valid size. */
+    if (size == 0 || (size & (size - 1)) != 0) {
+        return -1;
+    }
+
+    /* Count how many times the single set bit can be shifted down to 1. */
+    for (bits = 0; size > 1; size >>= 1) {
+        bits++;
+    }
+    return bits;
+}
+
+/*
+ * Format probe.  Scores 100 when buf holds at least sizeof(IRowMeta) bytes
+ * and its big-endian magic and version fields equal IROW_MAGIC and
+ * IROW_VERSION; scores 0 otherwise.  filename is not examined.
+ */
+static int irow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const IRowMeta *header = (const void *)buf;
+
+    if (buf_size < sizeof(IRowMeta)) {
+        return 0;
+    }
+    if (be32_to_cpu(header->magic) != IROW_MAGIC) {
+        return 0;
+    }
+    if (be32_to_cpu(header->version) != IROW_VERSION) {
+        return 0;
+    }
+    return 100;
+}
+
+/*
+ * Release the in-memory bitmap and close the bitmap (btmp) file.
+ *
+ * Both members are reset to NULL afterwards, so a repeated call is harmless.
+ */
+static void irow_close_btmp(BDRVIrowState *s)
+{
+    if (s->bitmap) {
+        /*
+         * The bitmap is allocated with qemu_memalign() in irow_open_btmp(),
+         * so it must be released with qemu_vfree(), not g_free().
+         */
+        qemu_vfree(s->bitmap);
+        s->bitmap = NULL;
+    }
+
+    if (s->irow_btmp) {
+        bdrv_delete(s->irow_btmp);
+        s->irow_btmp = NULL;
+    }
+}
+
+/* Close the data (irvd) file if it is open and forget the handle. */
+static void irow_close_irvd(BDRVIrowState *s)
+{
+    if (s->irow_irvd == NULL) {
+        return;
+    }
+    bdrv_delete(s->irow_irvd);
+    s->irow_irvd = NULL;
+}
+
+/*
+ * Free a snapshot table: the strings owned by each of the nb_snapshots
+ * entries, then the array itself.  A NULL table is accepted and ignored.
+ */
+static void irow_close_snapshots2(IRowSnapshot *snapshots, int nb_snapshots)
+{
+    int idx;
+
+    if (!snapshots) {
+        return;
+    }
+
+    for (idx = 0; idx < nb_snapshots; idx++) {
+        IRowSnapshot *entry = &snapshots[idx];
+
+        /* g_free() ignores NULL, so absent strings need no special case. */
+        g_free(entry->btmp_file);
+        entry->btmp_file = NULL;
+
+        g_free(entry->irvd_file);
+        entry->irvd_file = NULL;
+
+        g_free(entry->father_btmp_file);
+        entry->father_btmp_file = NULL;
+
+        g_free(entry->id_str);
+        entry->id_str = NULL;
+
+        g_free(entry->name);
+        entry->name = NULL;
+    }
+
+    g_free(snapshots);
+}
+
+/* Free the snapshot table owned by birows and clear the pointer to it. */
+static void irow_close_snapshots(BDRVIrowState *birows)
+{
+    IRowSnapshot *table = birows->snapshots;
+
+    birows->snapshots = NULL;
+    irow_close_snapshots2(table, birows->nb_snapshots);
+}
+
+/*
+ * Release everything tied to the meta file: the file-name strings, the meta
+ * BlockDriverState and the snapshot table.  Every released member is reset
+ * to NULL.
+ */
+static void irow_close_meta(BDRVIrowState *s)
+{
+    /* File-name strings; g_free() ignores NULL. */
+    g_free(s->meta_file);
+    s->meta_file = NULL;
+
+    g_free(s->current_btmp_file);
+    s->current_btmp_file = NULL;
+
+    g_free(s->father_btmp_file);
+    s->father_btmp_file = NULL;
+
+    g_free(s->irvd_file);
+    s->irvd_file = NULL;
+
+    g_free(s->opened_btmp_file);
+    s->opened_btmp_file = NULL;
+
+    if (s->irow_meta != NULL) {
+        bdrv_delete(s->irow_meta);
+        s->irow_meta = NULL;
+    }
+
+    if (s->snapshots != NULL) {
+        irow_close_snapshots(s);
+    }
+}
+
+/* Tear down one state: meta data first, then the bitmap, then the data file. */
+static void irow_close_state(BDRVIrowState *s)
+{
+    irow_close_meta(s);
+    irow_close_btmp(s);
+    irow_close_irvd(s);
+}
+
+/*
+ * Return 1 when every byte of the bitmap is 0xff, 0 as soon as one byte is
+ * not.  An empty bitmap (bitmap_size == 0) yields 1.
+ */
+static int irow_check_bitmap(BDRVIrowState *birows)
+{
+    uint64_t pos = 0;
+
+    while (pos < birows->bitmap_size) {
+        if (birows->bitmap[pos] != 0xff) {
+            return 0;
+        }
+        pos++;
+    }
+    return 1;
+}
+
+/*
+ * Flush pending bitmap-file changes.
+ *
+ * When the bitmap is dirty it is written at offset 0 of the btmp file and the
+ * dirty flag is cleared; complete_image is set if the bitmap is now all ones.
+ * When either the bitmap was written or VM state was saved, the btmp file is
+ * truncated to bitmap_size + vm_state_size.
+ *
+ * Returns 0 when nothing had to be done, -1 when the bitmap write fails, and
+ * otherwise the result of bdrv_truncate().
+ */
+static int irow_update_btmp(BDRVIrowState *birows)
+{
+    int need_truncate = 0;
+    int ret = 0;
+
+    if (birows->bitmap_is_dirty) {
+        if (bdrv_pwrite(birows->irow_btmp, 0, birows->bitmap,
+                        birows->bitmap_size) != birows->bitmap_size) {
+            fprintf(stderr, "Failed to write the IROW bitmap data to %s\n",
+                    birows->opened_btmp_file);
+            return -1;
+        }
+        birows->bitmap_is_dirty = 0;
+        need_truncate = 1;
+        if (irow_check_bitmap(birows)) {
+            birows->complete_image = 1;
+        }
+    }
+
+    if (birows->vmstate_is_saved) {
+        birows->vmstate_is_saved = 0;
+        need_truncate = 1;
+    }
+
+    /*
+     * Both cases resize the file to the same length, so truncate once.  The
+     * earlier code truncated twice and the second call's status replaced the
+     * first one, hiding a failure of the first truncate.
+     */
+    if (need_truncate) {
+        ret = bdrv_truncate(birows->irow_btmp,
+                            birows->bitmap_size + birows->vm_state_size);
+    }
+
+    return ret;
+}
+
+/*
+ * Bring the meta file up to date.
+ *
+ * birows                      state whose meta file is rewritten
+ * current_btmp                new "current btmp" name for the header, or NULL
+ *                             to keep the stored one
+ * change_copy_on_demand_state non-zero toggles the stored copy_on_demand flag
+ *
+ * When the in-memory snapshot table is dirty, every snapshot record is
+ * rewritten after the header and the header's snapshot count is refreshed.
+ * The header is then written back and the file is truncated to the header
+ * plus nb_snapshots records.
+ *
+ * Returns 0 when there was nothing to do, -1 on a read/write failure or a
+ * snapshot without a btmp/irvd name, otherwise the bdrv_truncate() result.
+ */
+static int irow_update_meta(BDRVIrowState *birows, const char *current_btmp,
+                            int change_copy_on_demand_state)
+{
+    IRowMeta header;
+    int idx;
+
+    /* Nothing requested and nothing dirty: leave the file untouched. */
+    if (!change_copy_on_demand_state && !birows->snapshots_is_dirty &&
+        !current_btmp) {
+        return 0;
+    }
+
+    if (bdrv_pread(birows->irow_meta, 0, &header, sizeof(header)) !=
+        sizeof(header)) {
+        fprintf(stderr, "Failed to read the meta data from %s\n",
+                birows->meta_file);
+        return -1;
+    }
+
+    if (change_copy_on_demand_state) {
+        /* Flip the on-disk flag: any non-zero value becomes 0, zero becomes 1. */
+        uint32_t flag = be32_to_cpu(header.copy_on_demand);
+
+        header.copy_on_demand = cpu_to_be32(flag ? 0 : 1);
+    }
+
+    if (current_btmp) {
+        memset(header.current_btmp, 0, MAX_FILE_NAME_LENGTH);
+        strncpy(header.current_btmp, current_btmp, MAX_FILE_NAME_LENGTH);
+    }
+
+    if (birows->snapshots_is_dirty) {
+        header.nb_snapshots = cpu_to_be32(birows->nb_snapshots);
+
+        for (idx = 0; idx < birows->nb_snapshots; idx++) {
+            IRowSnapshot *snap = &birows->snapshots[idx];
+            IRowSnapshotHeader record;
+
+            memset(&record, 0, sizeof(record));
+            record.snap_magic = cpu_to_be32(IROW_SNAPHEADER_MAGIC);
+            record.date_sec = snap->date_sec;
+            record.date_nsec = snap->date_nsec;
+            record.vm_clock_nsec = snap->vm_clock_nsec;
+            record.vm_state_size = snap->vm_state_size;
+            record.nb_children = snap->nb_children;
+            record.is_deleted = snap->is_deleted;
+
+            /* id, name and father are optional; btmp and irvd are not. */
+            if (snap->id_str) {
+                strncpy(record.id_str, snap->id_str, 128);
+            }
+            if (snap->name) {
+                strncpy(record.name, snap->name, 256);
+            }
+            if (!snap->btmp_file) {
+                fprintf(stderr, "Void btmp filename\n");
+                return -1;
+            }
+            strncpy(record.btmp_file, snap->btmp_file, MAX_FILE_NAME_LENGTH);
+            if (!snap->irvd_file) {
+                fprintf(stderr, "Void irvd filename\n");
+                return -1;
+            }
+            strncpy(record.irvd_file, snap->irvd_file, MAX_FILE_NAME_LENGTH);
+            if (snap->father_btmp_file) {
+                strncpy(record.father_btmp_file, snap->father_btmp_file,
+                        MAX_FILE_NAME_LENGTH);
+            }
+
+            /* Records are laid out back to back right after the header. */
+            if (bdrv_pwrite(birows->irow_meta,
+                            sizeof(header) + idx * sizeof(IRowSnapshotHeader),
+                            &record, sizeof(record)) != sizeof(record)) {
+                fprintf(stderr, "Failed to write the snapshot #%d info to %s\n",
+                        idx, birows->meta_file);
+                return -1;
+            }
+        }
+        birows->snapshots_is_dirty = 0;
+    }
+
+    if (bdrv_pwrite(birows->irow_meta, 0, &header, sizeof(header)) !=
+        sizeof(header)) {
+        fprintf(stderr, "Failed to write the meta data to %s\n",
+                birows->meta_file);
+        return -1;
+    }
+
+    return bdrv_truncate(birows->irow_meta,
+                         sizeof(header) +
+                         (birows->nb_snapshots) * sizeof(IRowSnapshotHeader));
+}
+
+/*
+ * Destroy a state created by irow_open_previous_state().
+ *
+ * Such a state only borrows the meta BlockDriverState of the state it was
+ * derived from, so the pointer is dropped first to keep irow_close_state()
+ * from deleting it.
+ */
+static void irow_close_previous_state(BDRVIrowState *birows)
+{
+    birows->irow_meta = NULL;
+
+    irow_close_state(birows);
+    g_free(birows);
+}
+
+/*
+ * Close every cached previous state and free the cache array itself.
+ * The first birows->nb_snapshots slots of the cache are visited.
+ */
+static void irow_free_birows_cache(BDRVIrowState *birows)
+{
+    int slot;
+
+    if (birows_cache == NULL) {
+        return;
+    }
+
+    for (slot = 0; slot < birows->nb_snapshots; slot++) {
+        if (birows_cache[slot] == NULL) {
+            continue;
+        }
+        irow_close_previous_state(birows_cache[slot]);
+    }
+
+    g_free(birows_cache);
+    birows_cache = NULL;
+}
+
+/*
+ * Close an iROW image: the cached previous states go first (the cache walk
+ * reads nb_snapshots from the state), then the state itself.
+ */
+static void irow_close(BlockDriverState *bs)
+{
+    BDRVIrowState *state = bs->opaque;
+
+    irow_free_birows_cache(state);
+    irow_close_state(state);
+}
+
+/*
+ * Load the snapshot table from the meta file into birows->snapshots.
+ *
+ * nb_snapshots records are read back to back starting at
+ * IROW_SNAPSHOT_OFFSET.  id, name and father btmp name are optional and stay
+ * NULL when empty on disk; the btmp and irvd names are mandatory.
+ *
+ * Returns 0 on success.  On failure returns -1 with the partially built
+ * table freed and birows->snapshots reset to NULL.
+ */
+static int irow_open_snapshots(BDRVIrowState *birows)
+{
+    IRowSnapshotHeader snap_header;
+    IRowSnapshot *snap_ptr;
+    int64_t offset = IROW_SNAPSHOT_OFFSET;
+    int i;
+
+    birows->snapshots = g_malloc0(sizeof(IRowSnapshot) * birows->nb_snapshots);
+    for (i = 0; i < birows->nb_snapshots; i++) {
+        if (bdrv_pread(birows->irow_meta, offset, &snap_header,
+                       sizeof(snap_header)) != sizeof(snap_header)) {
+            fprintf(stderr, "Failed to read snapshot #%d info from %s\n",
+                    i, birows->meta_file);
+            goto fail;
+        }
+        snap_ptr = birows->snapshots + i;
+        snap_ptr->date_sec = snap_header.date_sec;
+        snap_ptr->date_nsec = snap_header.date_nsec;
+        snap_ptr->vm_clock_nsec = snap_header.vm_clock_nsec;
+        snap_ptr->vm_state_size = snap_header.vm_state_size;
+        snap_ptr->nb_children = snap_header.nb_children;
+        snap_ptr->is_deleted = snap_header.is_deleted;
+
+        /*
+         * Every string below is copied into a zero-filled buffer with one
+         * byte held back, so the copy stays NUL-terminated even when the
+         * on-disk field is completely filled.
+         */
+        if (snap_header.id_str[0] != '\0') {
+            snap_ptr->id_str = g_malloc0(128);
+            strncpy(snap_ptr->id_str, snap_header.id_str, 128 - 1);
+        }
+        if (snap_header.name[0] != '\0') {
+            snap_ptr->name = g_malloc0(256);
+            strncpy(snap_ptr->name, snap_header.name, 256 - 1);
+        }
+
+        /*
+         * Test the first character.  Comparing the array itself against
+         * '\0' compares its address with a null pointer and is never true.
+         */
+        if (snap_header.btmp_file[0] == '\0') {
+            fprintf(stderr, "Invalid btmp file name. (snapshot #%d)\n", i);
+            goto fail;
+        }
+        snap_ptr->btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+        strncpy(snap_ptr->btmp_file, snap_header.btmp_file,
+                MAX_FILE_NAME_LENGTH - 1);
+
+        if (snap_header.irvd_file[0] == '\0') {
+            fprintf(stderr, "Invalid irvd file name. (snapshot #%d)\n", i);
+            goto fail;
+        }
+        snap_ptr->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+        strncpy(snap_ptr->irvd_file, snap_header.irvd_file,
+                MAX_FILE_NAME_LENGTH - 1);
+
+        if (snap_header.father_btmp_file[0] != '\0') {
+            snap_ptr->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+            strncpy(snap_ptr->father_btmp_file, snap_header.father_btmp_file,
+                    MAX_FILE_NAME_LENGTH - 1);
+        }
+        offset += sizeof(snap_header);
+    }
+    birows->snapshots_is_dirty = 0;
+
+    return 0;
+
+fail:
+    irow_close_snapshots(birows);
+    return -1;
+}
+
+/*
+ * Open and validate the meta file, then fill in birows and bs from it.
+ *
+ * bs        block device being opened; receives total_sectors and
+ *           backing_file
+ * birows    state to fill in
+ * filename  path of the meta file
+ * flags     open flags handed to bdrv_file_open()
+ *
+ * The header fields are converted from big endian and cross-checked (magic,
+ * version, cluster_bits range, cluster_size/cluster_bits agreement,
+ * total_clusters vs. disk_size, sectors_per_cluster).  The file-wide
+ * cluster_cache is created on first use, and the snapshot table is loaded.
+ *
+ * Returns 0 on success, the bdrv_file_open() error when the file cannot be
+ * opened, and -1 for every other failure.  Nothing is released here on
+ * failure; irow_open() closes the state when this function fails.
+ */
+static int irow_open_meta(BlockDriverState *bs, BDRVIrowState *birows,
+                          const char *filename, int flags)
+{
+    IRowMeta meta;
+    int ret;
+
+    /*
+     * bdrv_file_open() allocates the BlockDriverState itself and stores it
+     * through the pointer on success, so pre-allocating one with bdrv_new()
+     * only leaked it.
+     */
+    birows->irow_meta = NULL;
+    ret = bdrv_file_open(&birows->irow_meta, filename, flags);
+    if (ret < 0) {
+        fprintf(stderr, "Failed to open %s\n", filename);
+        return ret;
+    }
+    if (bdrv_pread(birows->irow_meta, 0, &meta, sizeof(meta)) !=
+        sizeof(meta)) {
+        fprintf(stderr, "Failed to read the IROW meta data from %s\n",
+                filename);
+        return -1;
+    }
+    be32_to_cpus(&meta.magic);
+    be32_to_cpus(&meta.version);
+    be32_to_cpus(&meta.copy_on_demand);
+    be32_to_cpus(&meta.cluster_size);
+    be32_to_cpus(&meta.cluster_bits);
+    be64_to_cpus(&meta.total_clusters);
+    be32_to_cpus(&meta.sectors_per_cluster);
+    be64_to_cpus(&meta.disk_size);
+    be32_to_cpus(&meta.nb_snapshots);
+
+    if (meta.magic != IROW_MAGIC || meta.version != IROW_VERSION) {
+        fprintf(stderr, "Invalid magic number or version number!\n");
+        return -1;
+    }
+    if ((meta.cluster_bits < MIN_CLUSTER_BITS) ||
+        (meta.cluster_bits > MAX_CLUSTER_BITS)) {
+        fprintf(stderr, "Invalid cluster_bits!\n");
+        return -1;
+    }
+    if (meta.cluster_bits != get_bits_from_size(meta.cluster_size)) {
+        fprintf(stderr, "cluster_size and cluster_bits do not match!\n");
+        return -1;
+    }
+    if (meta.total_clusters !=
+        ((meta.disk_size + meta.cluster_size - 1) >> meta.cluster_bits)) {
+        fprintf(stderr, "total_clusters and disk_size do not match!\n");
+        return -1;
+    }
+    if (meta.sectors_per_cluster != (meta.cluster_size >> BDRV_SECTOR_BITS)) {
+        fprintf(stderr, "Invalid sectors_per_cluster!\n");
+        return -1;
+    }
+
+    birows->copy_on_demand = meta.copy_on_demand;
+    birows->cluster_size = meta.cluster_size;
+    birows->cluster_bits = meta.cluster_bits;
+    birows->total_clusters = meta.total_clusters;
+    birows->sectors_per_cluster = meta.sectors_per_cluster;
+    birows->disk_size = meta.disk_size;
+    bs->total_sectors = meta.disk_size / BDRV_SECTOR_SIZE;
+    /* One bit per cluster, rounded up to whole bytes. */
+    birows->bitmap_size = (birows->total_clusters + 7) >> 3;
+    birows->nb_snapshots = meta.nb_snapshots;
+
+    /*
+     * Copy the names into zero-filled buffers and hold back the last byte,
+     * so the results are always NUL-terminated; they are later passed to
+     * strcmp()/strcpy() and printed with %s.
+     */
+    birows->meta_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    strncpy(birows->meta_file, filename, MAX_FILE_NAME_LENGTH - 1);
+    birows->current_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    strncpy(birows->current_btmp_file, meta.current_btmp,
+            MAX_FILE_NAME_LENGTH - 1);
+    strncpy(bs->backing_file, meta.backing_file,
+            sizeof(bs->backing_file) - 1);
+    bs->backing_file[sizeof(bs->backing_file) - 1] = '\0';
+
+    if (cluster_cache == NULL) {
+        cluster_cache = g_malloc0(sizeof(ClusterCache));
+        if (cluster_cache == NULL) {
+            fprintf(stderr, "Failed to create father cache\n");
+            return -1;
+        }
+        cluster_cache->cache = qemu_memalign(512, birows->cluster_size);
+        if (cluster_cache->cache == NULL) {
+            fprintf(stderr, "Failed to create father cache\n");
+            return -1;
+        }
+        memset(cluster_cache->cache, 0, birows->cluster_size);
+        cluster_cache->cluster_num = -1;
+    }
+
+    if (irow_open_snapshots(birows) < 0) {
+        fprintf(stderr, "Failed to read snapshots info from %s\n",
+                birows->meta_file);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Open a bitmap (btmp) file and read its bitmap into memory.
+ *
+ * On success the bitmap/VM-state dirty flags are cleared and complete_image
+ * is set to 1 when the bitmap is all ones, 0 otherwise.
+ *
+ * Returns the bdrv_file_open() result (0 on success), or -1 when the bitmap
+ * cannot be read completely.  On a read failure the file and the buffer stay
+ * attached to birows.
+ */
+static int irow_open_btmp(BDRVIrowState *birows, const char *filename,
+                          int flags)
+{
+    int ret;
+
+    /*
+     * bdrv_file_open() creates the BlockDriverState on its own; allocating
+     * one beforehand with bdrv_new() leaked it on every successful open.
+     */
+    birows->irow_btmp = NULL;
+    ret = bdrv_file_open(&birows->irow_btmp, filename, flags);
+    if (ret < 0) {
+        return ret;
+    }
+
+    birows->bitmap = qemu_memalign(512, birows->bitmap_size);
+    if (bdrv_pread(birows->irow_btmp, 0, birows->bitmap,
+                   birows->bitmap_size) != birows->bitmap_size) {
+        fprintf(stderr, "Failed to read bitmap from %s\n", filename);
+        return -1;
+    }
+
+    birows->bitmap_is_dirty = 0;
+    birows->vmstate_is_saved = 0;
+    birows->complete_image = irow_check_bitmap(birows) ? 1 : 0;
+
+    return ret;
+}
+
+/*
+ * Open the data (irvd) file and store its handle in birows->irow_irvd.
+ *
+ * Returns the bdrv_file_open() result; the handle is NULL when opening fails.
+ */
+static int irow_open_vd(BDRVIrowState *birows, const char *filename,
+                        int flags)
+{
+    /*
+     * bdrv_file_open() allocates the BlockDriverState itself, so the state
+     * previously created here with bdrv_new() was simply leaked.
+     */
+    birows->irow_irvd = NULL;
+    return bdrv_file_open(&birows->irow_irvd, filename, flags);
+}
+
+/*
+ * Open the bitmap file and then the data file whose names are already stored
+ * in birows (opened_btmp_file / irvd_file).
+ *
+ * Returns 0 on success, -1 when a name is missing or empty or when either
+ * file cannot be opened.
+ */
+static int irow_open_data(BDRVIrowState *birows, int flags)
+{
+    const char *btmp_name = birows->opened_btmp_file;
+    const char *irvd_name;
+
+    if (!btmp_name || btmp_name[0] == '\0') {
+        fprintf(stderr, "Void btmp file name\n");
+        return -1;
+    }
+    if (irow_open_btmp(birows, btmp_name, flags) < 0) {
+        fprintf(stderr, "Failed to open %s\n", birows->opened_btmp_file);
+        return -1;
+    }
+
+    irvd_name = birows->irvd_file;
+    if (!irvd_name || irvd_name[0] == '\0') {
+        fprintf(stderr, "Void irvd file name\n");
+        return -1;
+    }
+    if (irow_open_vd(birows, irvd_name, flags) < 0) {
+        fprintf(stderr, "Failed to open %s\n", birows->irvd_file);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Return the index of the first snapshot whose btmp file name equals btmp,
+ * or -1 when there is none.  Entries without a btmp name are skipped.
+ */
+static int irow_find_snapshot_by_btmp(BDRVIrowState *birows, const char *btmp)
+{
+    int idx;
+
+    for (idx = 0; idx < birows->nb_snapshots; idx++) {
+        const char *candidate = birows->snapshots[idx].btmp_file;
+
+        if (candidate == NULL) {
+            continue;
+        }
+        if (strcmp(candidate, btmp) == 0) {
+            return idx;
+        }
+    }
+    return -1;
+}
+
+/*
+ * Copy the file names and VM state size of one snapshot into birows
+ * (opened_btmp_file, irvd_file, father_btmp_file, vm_state_size).
+ *
+ * snapshot_index is an index into birows->snapshots; callers typically pass
+ * the result of irow_find_snapshot_by_btmp(), which is -1 when nothing
+ * matched.
+ *
+ * Returns 0 on success, -1 when the index is out of range, the snapshot
+ * table is missing, or the snapshot lacks a btmp or irvd file name.
+ */
+static int irow_load_info_from_snapshot(BDRVIrowState *birows,
+                                        int snapshot_index)
+{
+    IRowSnapshot *snap;
+
+    /* Reject indexes past the table and a table that failed to load. */
+    if (snapshot_index < 0 || snapshot_index >= birows->nb_snapshots ||
+        birows->snapshots == NULL) {
+        fprintf(stderr, "Invalid snapshot index.\n");
+        return -1;
+    }
+    snap = birows->snapshots + snapshot_index;
+    if (snap->btmp_file == NULL) {
+        fprintf(stderr, "Void btmp file name in snap info\n");
+        return -1;
+    }
+    if (snap->irvd_file == NULL) {
+        fprintf(stderr, "Void irvd file name in snap info\n");
+        return -1;
+    }
+
+    /*
+     * The destination buffers are zero-filled and the last byte is never
+     * written, so the copies are always NUL-terminated.
+     */
+    birows->opened_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    birows->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    strncpy(birows->opened_btmp_file, snap->btmp_file,
+            MAX_FILE_NAME_LENGTH - 1);
+    strncpy(birows->irvd_file, snap->irvd_file, MAX_FILE_NAME_LENGTH - 1);
+    if (snap->father_btmp_file) {
+        birows->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+        strncpy(birows->father_btmp_file, snap->father_btmp_file,
+                MAX_FILE_NAME_LENGTH - 1);
+    }
+    birows->vm_state_size = snap->vm_state_size;
+
+    return 0;
+}
+
+/*
+ * Build a separate state for snapshot snap_index of the image described by
+ * birows and open that snapshot's bitmap and data files.
+ *
+ * The geometry, the current btmp name and the open flags are copied from
+ * birows.  The meta BlockDriverState is shared with birows rather than
+ * reopened, and the snapshot table is loaded again through it.
+ *
+ * Returns the new state, to be released with irow_close_previous_state(),
+ * or NULL on failure.
+ */
+static BDRVIrowState *irow_open_previous_state(BDRVIrowState *birows,
+                                               int snap_index)
+{
+    BDRVIrowState *new_birows = g_malloc0(sizeof(BDRVIrowState));
+
+    new_birows->cluster_size = birows->cluster_size;
+    new_birows->cluster_bits = birows->cluster_bits;
+    new_birows->total_clusters = birows->total_clusters;
+    new_birows->sectors_per_cluster = birows->sectors_per_cluster;
+    new_birows->disk_size = birows->disk_size;
+    new_birows->bitmap_size = birows->bitmap_size;
+
+    /* Bounded copy into a zero-filled buffer: always NUL-terminated. */
+    new_birows->current_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    strncpy(new_birows->current_btmp_file, birows->current_btmp_file,
+            MAX_FILE_NAME_LENGTH - 1);
+
+    new_birows->nb_snapshots = birows->nb_snapshots;
+    new_birows->irow_meta = birows->irow_meta;
+
+    /*
+     * A failed load leaves the snapshot table NULL; carrying on would
+     * dereference it when the snapshot is looked up below.
+     */
+    if (irow_open_snapshots(new_birows) < 0) {
+        goto fail;
+    }
+
+    if (irow_load_info_from_snapshot(new_birows, snap_index) < 0) {
+        goto fail;
+    }
+    new_birows->open_flags = birows->open_flags;
+    if (irow_open_data(new_birows, new_birows->open_flags) < 0) {
+        goto fail;
+    }
+
+    return new_birows;
+
+fail:
+    /* Drops the borrowed meta handle before closing and freeing the state. */
+    irow_close_previous_state(new_birows);
+    return NULL;
+}
+
+/*
+ * Allocate the zero-filled global cache of previous states, one slot per
+ * snapshot.  Returns 0 on success, -1 when the allocation yields NULL.
+ */
+static int irow_init_birows_cache(BDRVIrowState *birows)
+{
+    birows_cache = g_malloc0(sizeof(BDRVIrowState *) * birows->nb_snapshots);
+
+    return birows_cache == NULL ? -1 : 0;
+}
+
+/*
+ * Open an iROW image: read the meta file, locate the snapshot that holds the
+ * current state, open its bitmap and data files and set up the cache of
+ * previous states.
+ *
+ * Returns 0 on success.  On any failure the image is closed again and -1 is
+ * returned.
+ */
+static int irow_open(BlockDriverState *bs, int flags)
+{
+    BDRVIrowState *state = bs->opaque;
+    int current_index;
+
+    state->open_flags = flags;
+
+    if (irow_open_meta(bs, state, bs->filename, flags) < 0) {
+        fprintf(stderr, "Failed to open %s\n", bs->filename);
+    } else {
+        /* The current state is itself stored as one of the snapshots. */
+        current_index = irow_find_snapshot_by_btmp(state,
+                                                   state->current_btmp_file);
+        if (irow_load_info_from_snapshot(state, current_index) < 0) {
+            fprintf(stderr, "Failed to load filename from snapshot\n");
+        } else if (irow_open_data(state, flags) < 0) {
+            /* irow_open_data() has already reported the problem. */
+        } else if (irow_init_birows_cache(state) < 0) {
+            fprintf(stderr, "Failed to create birows_cache\n");
+        } else {
+            return 0;
+        }
+    }
+
+    irow_close(bs);
+    return -1;
+}
+
+/*
+ * Return the bitmap bit (0 or 1) for cluster_index.  Bits are numbered from
+ * the least significant bit of each byte.  No range check is done.
+ */
+static int irow_get_bit(BDRVIrowState *birows, int64_t cluster_index)
+{
+    int64_t byte_pos = cluster_index >> 3;
+    int64_t bit_pos = cluster_index & 0x7;
+
+    if (birows->bitmap[byte_pos] & (1 << bit_pos)) {
+        return 1;
+    }
+    return 0;
+}
+
+/*
+ * Set the bitmap bit for cluster_index and mark the bitmap dirty if the bit
+ * was clear before.  If the global cluster cache currently holds this
+ * cluster, its cluster_num is reset to -1 first.
+ */
+static void irow_set_bit(BDRVIrowState *birows, int64_t cluster_index)
+{
+    int64_t byte_pos = cluster_index >> 3;
+    int64_t bit_pos = cluster_index & 0x7;
+
+    if (cluster_cache != NULL &&
+        cluster_index == cluster_cache->cluster_num) {
+        cluster_cache->cluster_num = -1;
+    }
+
+    /* Already set: leave the bitmap and its dirty flag alone. */
+    if ((birows->bitmap[byte_pos] >> bit_pos) & 1) {
+        return;
+    }
+
+    birows->bitmap[byte_pos] |= (1 << bit_pos);
+    birows->bitmap_is_dirty = 1;
+}
+
+static int irow_read_missing_clusters2(BlockDriverState *bs, BDRVIrowState 
*birows, int64_t start_cluster, int64_t nb_clusters, uint8_t *buf, uint8_t 
*buf_bitmap, uint64_t buf_start) {
+       int64_t continuous_missing_clusters, continuous_appearing_clusters, i, 
cluster_index, buf_index;
+       int64_t backing_len, backing_sector_num, backing_nb_sectors;
+       uint8_t *backing_buf;
+       int snap_index, ret = 0;
+       BlockDriver *drv;
+
+       continuous_missing_clusters = 0;
+       continuous_appearing_clusters = 0;
+       for(i = 0; i < nb_clusters; i++) {
+               if(irow_get_bit(birows, start_cluster + i) == 0) {
+                       buf_bitmap[buf_start + i] = 1;
+                       continuous_missing_clusters += 1;
+                       if(continuous_appearing_clusters != 0) {
+                               if(strcmp(birows->current_btmp_file, 
birows->opened_btmp_file) != 0) {
+                                       cluster_index = start_cluster + i - 
continuous_appearing_clusters;
+                                       buf_index = buf_start + i - 
continuous_appearing_clusters;
+                                       if(cluster_cache != NULL) {
+                                               if(cluster_cache->cache != 
NULL) {
+                                                       if(cluster_index == 
cluster_cache->cluster_num) {
+                                                               memcpy(buf + 
buf_index * birows->cluster_size, cluster_cache->cache, birows->cluster_size);
+                                                               cluster_index 
+= 1;
+                                                               buf_index += 1;
+                                                               
continuous_appearing_clusters -= 1;
+                                                               
if(continuous_appearing_clusters == 0) {
+                                                                       
continue;
+                                                               }
+                                                       }
+                                               }
+                                       }
+                                       drv = birows->irow_irvd->drv;
+                                       if(bdrv_read(birows->irow_irvd,
+                                                               cluster_index * 
birows->sectors_per_cluster,
+                                                               buf + buf_index 
* birows->cluster_size,
+                                                               
continuous_appearing_clusters * birows->sectors_per_cluster) < 0) {
+                                                       fprintf(stderr, "Failed 
to read clusters from %s\n", birows->irvd_file);
+                                                       ret = -1;
+                                                       goto end;
+                                               }
+                                       if(cluster_cache != NULL) {
+                                               if(cluster_cache->cache != 
NULL) {
+                                                       
memcpy(cluster_cache->cache, buf + (buf_start + i - 1) * birows->cluster_size, 
birows->cluster_size);
+                                                       
cluster_cache->cluster_num = start_cluster + i - 1;
+                                               }
+                                       }
+                                       }
+                               continuous_appearing_clusters = 0;
+                       }
+               } else {
+                       continuous_appearing_clusters += 1;
+                       if(continuous_missing_clusters != 0) {
+                               if(birows->father_btmp_file != NULL) {
+                                       snap_index = 
irow_find_snapshot_by_btmp(birows, birows->father_btmp_file);
+                                       if(birows_cache[snap_index] == NULL) {
+                                               birows_cache[snap_index] = 
irow_open_previous_state(birows, snap_index);
+                                               if(birows_cache[snap_index] == 
NULL) {
+                                                       ret = -1;
+                                                       goto end;
+                                               }
+                                       }
+                                       ret = irow_read_missing_clusters2(bs,
+                                                                               
                                                birows_cache[snap_index],
+                                                                               
                                                start_cluster + i - 
continuous_missing_clusters,
+                                                                               
                                                continuous_missing_clusters,
+                                                                               
                                                buf,
+                                                                               
                                                buf_bitmap,
+                                                                               
                                                buf_start + i - 
continuous_missing_clusters);
+
+                               } else {
+                                       if(bs->backing_hd) {
+                                           backing_len = 
bdrv_getlength(bs->backing_hd) / 512;
+                                           backing_sector_num = (start_cluster 
+ i - continuous_missing_clusters) * birows->sectors_per_cluster;
+                                           backing_nb_sectors = 
continuous_missing_clusters * birows->sectors_per_cluster;
+                                           backing_buf = buf + (buf_start + i 
- continuous_missing_clusters) * birows->cluster_size;
+                                           if(backing_sector_num < 
backing_len) {
+                                               if(backing_nb_sectors  > 
backing_len - backing_sector_num) {
+                                                       backing_nb_sectors = 
backing_len - backing_sector_num;
+                                               }
+                                               if(bdrv_read(bs->backing_hd, 
backing_sector_num, backing_buf, backing_nb_sectors)<0) {
+                                                       fprintf(stderr, "failed 
to read base image: %s\n", bs->backing_file);
+                                                       ret = -1;
+                                                               goto end;
+                                               }
+                                           }
+                                       }
+                               }
+                               continuous_missing_clusters = 0;
+                       }
+               }
+       }
+       if(continuous_missing_clusters != 0) {
+               if(birows->father_btmp_file != NULL) {
+                       snap_index = irow_find_snapshot_by_btmp(birows, 
birows->father_btmp_file);
+                       if(birows_cache[snap_index] == NULL) {
+                               birows_cache[snap_index] = 
irow_open_previous_state(birows, snap_index);
+                               if(birows_cache[snap_index] == NULL) {
+                                       ret = -1;
+                                       goto end;
+                               }
+                       }
+                       ret = irow_read_missing_clusters2(bs,
+                                                                               
                                birows_cache[snap_index],
+                                                                               
                                start_cluster + i - continuous_missing_clusters,
+                                                                               
                                continuous_missing_clusters,
+                                                                               
                                buf,
+                                                                               
                                buf_bitmap,
+                                                                               
                                buf_start + i - continuous_missing_clusters);
+
+               } else {
+                       if(bs->backing_hd) {
+                           backing_len = bdrv_getlength(bs->backing_hd) / 512;
+                           backing_sector_num = (start_cluster + i - 
continuous_missing_clusters) * birows->sectors_per_cluster;
+                           backing_nb_sectors = continuous_missing_clusters * 
birows->sectors_per_cluster;
+                           backing_buf = buf + (buf_start + i - 
continuous_missing_clusters) * birows->cluster_size;
+                           if(backing_sector_num  < backing_len) {
+                               if(backing_nb_sectors > backing_len - 
backing_sector_num) {
+                                       backing_nb_sectors = backing_len - 
backing_sector_num;
+                               }
+                               if(bdrv_read(bs->backing_hd, 
backing_sector_num, backing_buf, backing_nb_sectors)<0) {
+                                       fprintf(stderr, "failed to read base 
image: %s\n", bs->backing_file);
+                                       ret = -1;
+                                               goto end;
+                               }
+                           }
+                       }
+               }
+               continuous_missing_clusters = 0;
+       }
+
+       if(continuous_appearing_clusters != 0) {
+               if(strcmp(birows->current_btmp_file, birows->opened_btmp_file) 
!= 0) {
+                       cluster_index = start_cluster + i - 
continuous_appearing_clusters;
+                       buf_index = buf_start + i - 
continuous_appearing_clusters;
+                       if(cluster_cache != NULL) {
+                               if(cluster_cache->cache != NULL) {
+
+                                       if(cluster_index == 
cluster_cache->cluster_num) {
+                                               memcpy(buf + buf_index * 
birows->cluster_size, cluster_cache->cache, birows->cluster_size);
+                                               cluster_index += 1;
+                                               buf_index += 1;
+                                               continuous_appearing_clusters 
-= 1;
+                                               
if(continuous_appearing_clusters == 0) {
+                                                       goto end;
+                                               }
+                                       }
+                               }
+                       }
+                       drv = birows->irow_irvd->drv;
+                       if(bdrv_read(birows->irow_irvd,
+                                               cluster_index * 
birows->sectors_per_cluster,
+                                               buf + buf_index * 
birows->cluster_size,
+                                               continuous_appearing_clusters * 
birows->sectors_per_cluster) < 0) {
+                                       fprintf(stderr, "Failed to read 
clusters from %s\n", birows->irvd_file);
+                                       ret = -1;
+                               }
+                       if(cluster_cache != NULL) {
+                               if(cluster_cache->cache != NULL) {
+                                       memcpy(cluster_cache->cache, buf + 
(buf_start + i - 1) * birows->cluster_size, birows->cluster_size);
+                                       cluster_cache->cluster_num = 
start_cluster + i - 1;
+                               }
+                       }
+               }
+               continuous_appearing_clusters = 0;
+       }
+
+end:
+       return ret;
+}
+
+/*
+ * Fetch clusters into buf through irow_read_missing_clusters2().
+ *
+ * Both cluster indices are checked against birows->total_clusters first.
+ * When is_read is non-zero the whole range [first_cluster, last_cluster] is
+ * requested in one call, starting at cluster slot 0 of buf.  Otherwise only
+ * the boundary clusters are requested: first_cluster into slot 0 and, when
+ * it is a different cluster, last_cluster into slot 1.
+ *
+ * buf_bitmap is passed through unchanged to irow_read_missing_clusters2().
+ *
+ * Returns -1 for an out-of-range index, otherwise the value returned by the
+ * last irow_read_missing_clusters2() call that was made.
+ */
+static int irow_read_missing_clusters(BlockDriverState *bs, int64_t first_cluster, int64_t last_cluster, uint8_t *buf, uint8_t *buf_bitmap, int is_read)
+{
+    BDRVIrowState *state = bs->opaque;
+    int rc;
+
+    if (first_cluster >= state->total_clusters) {
+        fprintf (stderr, "Invalid first_cluster!\n");
+        return -1;
+    }
+    if (last_cluster >= state->total_clusters) {
+        fprintf (stderr, "Invalid last_cluster!\n");
+        return -1;
+    }
+
+    if (is_read) {
+        /* Read path: one request covering every cluster of the range. */
+        return irow_read_missing_clusters2(bs, state, first_cluster, last_cluster - first_cluster + 1, buf, buf_bitmap, 0);
+    }
+
+    /* Write path: only the (possibly partial) edge clusters are needed. */
+    rc = irow_read_missing_clusters2(bs, state, first_cluster, 1, buf, buf_bitmap, 0);
+    if (rc < 0 || first_cluster == last_cluster) {
+        return rc;
+    }
+    return irow_read_missing_clusters2(bs, state, last_cluster, 1, buf, buf_bitmap, 1);
+}
+
+/*
+ * Write nb_clusters whole clusters from buf to the irvd image, starting at
+ * cluster cluster_index.
+ *
+ * The range must lie inside [0, birows->total_clusters); anything else is
+ * reported on stderr and rejected with -1.  Otherwise the return value is
+ * that of bdrv_write().
+ */
+static int irow_write_clusters(BDRVIrowState *birows, int64_t cluster_index, const uint8_t *buf, int nb_clusters) {
+    if (cluster_index < 0 || cluster_index >= birows->total_clusters) {
+        fprintf (stderr, "Invalid cluster_index!\n");
+        return -1;
+    }
+    /*
+     * Same condition as "cluster_index + nb_clusters - 1 >= total_clusters",
+     * rearranged so the sum cannot overflow; negative counts are refused too.
+     */
+    if (nb_clusters < 0 || nb_clusters > birows->total_clusters - cluster_index) {
+        fprintf (stderr, "Invalid cluster_index or nb_clusters!\n");
+        return -1;
+    }
+
+    return bdrv_write(birows->irow_irvd, birows->sectors_per_cluster * cluster_index, buf, birows->sectors_per_cluster * nb_clusters);
+}
+
+/* Return the number of the first sector belonging to cluster cluster_index. */
+static int64_t first_sector_in_cluster(BDRVIrowState *birows, int64_t cluster_index)
+{
+    int64_t first_sector = cluster_index * birows->sectors_per_cluster;
+
+    return first_sector;
+}
+
+/* Return the number of the last sector belonging to cluster cluster_index. */
+static int64_t last_sector_in_cluster(BDRVIrowState *birows, int64_t cluster_index)
+{
+    /* One sector before the start of the following cluster. */
+    int64_t next_cluster_start = (cluster_index + 1) * birows->sectors_per_cluster;
+
+    return next_cluster_start - 1;
+}
+
+/*
+ * If slot `slot` of cbuf is flagged in cbuf->read_from_father, write that
+ * cluster-sized piece of cbuf->buf to cluster `cluster` of the irvd image and
+ * set the cluster's bit.  Returns -1 when the write fails, 0 otherwise.
+ */
+static int irow_store_fetched_cluster(BDRVIrowState *state, ClusterBuffer *cbuf, int slot, int64_t cluster)
+{
+    if (cbuf->read_from_father[slot] != 1) {
+        return 0;
+    }
+    if (irow_write_clusters(state, cluster, cbuf->buf + slot * state->cluster_size, 1) < 0) {
+        return -1;
+    }
+    irow_set_bit(state, cluster);
+    return 0;
+}
+
+/*
+ * Prepare the clusters touched by the request [sector_num, sector_num +
+ * nb_sectors) for the operation op_type.
+ *
+ * IROW_READ / IROW_AIO_READ: every cluster of the range is requested through
+ * irow_read_missing_clusters() into cbuf.  When birows->copy_on_demand is
+ * set, each run of clusters flagged in cbuf->read_from_father is written to
+ * the irvd image and the corresponding bits are set.  The loop deliberately
+ * visits one index past the range: callers allocate nb_clusters + 1 zeroed
+ * flags, and that trailing zero flushes a run ending on the last cluster.
+ *
+ * IROW_WRITE / IROW_AIO_WRITE: only clusters that the request covers
+ * partially (an unaligned start and/or end) are fetched, and the flagged
+ * ones are written back so the new data can be laid over a full cluster.
+ *
+ * Any other op_type is a no-op.  Returns 0 on success, -1 on failure.
+ */
+static int irow_assert_clusters(BlockDriverState *bs, ClusterBuffer *cbuf, int64_t sector_num, int nb_sectors, int op_type)
+{
+    BDRVIrowState *state = bs->opaque;
+    int64_t head = sector_num / state->sectors_per_cluster;
+    int64_t tail = (sector_num + nb_sectors - 1) / state->sectors_per_cluster;
+    int64_t count = tail - head + 1;
+    int64_t idx, run;
+    int starts_aligned, ends_aligned;
+
+    if (op_type == IROW_READ || op_type == IROW_AIO_READ) {
+        if (irow_read_missing_clusters(bs, head, tail, cbuf->buf, cbuf->read_from_father, 1) < 0) {
+            return -1;
+        }
+        if (!state->copy_on_demand) {
+            return 0;
+        }
+
+        run = 0;
+        for (idx = 0; idx <= count; idx++) {
+            if (cbuf->read_from_father[idx] != 0) {
+                /* Extend the current run of fetched clusters. */
+                run += 1;
+                irow_set_bit(state, head + idx);
+                continue;
+            }
+            if (run == 0) {
+                continue;
+            }
+            /* A run just ended: persist it in a single write. */
+            if (irow_write_clusters(state, head + idx - run, cbuf->buf + (idx - run) * state->cluster_size, run) < 0) {
+                return -1;
+            }
+            run = 0;
+        }
+        return 0;
+    }
+
+    if (op_type != IROW_WRITE && op_type != IROW_AIO_WRITE) {
+        return 0;
+    }
+
+    starts_aligned = (sector_num == first_sector_in_cluster(state, head));
+    ends_aligned = ((sector_num + nb_sectors - 1) == last_sector_in_cluster(state, tail));
+
+    if (starts_aligned && ends_aligned) {
+        /* Only whole clusters are overwritten; nothing has to be fetched. */
+        return 0;
+    }
+
+    if (starts_aligned) {
+        /* Only the last cluster is partially covered. */
+        if (irow_read_missing_clusters(bs, tail, tail, cbuf->buf, cbuf->read_from_father, 0) < 0) {
+            return -1;
+        }
+        return irow_store_fetched_cluster(state, cbuf, 0, tail);
+    }
+
+    if (ends_aligned) {
+        /* Only the first cluster is partially covered. */
+        if (irow_read_missing_clusters(bs, head, head, cbuf->buf, cbuf->read_from_father, 0) < 0) {
+            return -1;
+        }
+        return irow_store_fetched_cluster(state, cbuf, 0, head);
+    }
+
+    /* Both edges are partial: slot 0 holds the first cluster, slot 1 the last. */
+    if (irow_read_missing_clusters(bs, head, tail, cbuf->buf, cbuf->read_from_father, 0) < 0) {
+        return -1;
+    }
+    if (irow_store_fetched_cluster(state, cbuf, 0, head) < 0) {
+        return -1;
+    }
+    return irow_store_fetched_cluster(state, cbuf, 1, tail);
+}
+
+/*
+ * Synchronous read of nb_sectors sectors starting at sector_num into buf.
+ *
+ * Steps:
+ *  1. Work out the touched cluster range and validate it against
+ *     s->total_clusters.
+ *  2. Build a zeroed, cluster-aligned staging buffer (temp_buf).  If the
+ *     first cluster is the one held in cluster_cache it is taken from there,
+ *     the remaining clusters are read from s->irow_irvd.
+ *  3. Copy the requested sector window from temp_buf to buf.
+ *  4. If anything was read from the irvd image: refresh cluster_cache with
+ *     the last cluster when its bit is set, and, unless s->complete_image is
+ *     1, let irow_assert_clusters() fetch the clusters missing from this
+ *     state; sectors of clusters flagged in cbuf.read_from_father then
+ *     replace the corresponding part of buf.
+ *
+ * Returns a negative value on failure; otherwise the (non-negative) result
+ * of the last bdrv_read(), or 0 when no read was needed.
+ */
+static int irow_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) {
+    BDRVIrowState *s = bs->opaque;
+    int64_t first_cluster, last_cluster, nb_clusters, sector_index, cluster_index, buf_offset, temp_buf_offset, temp_buf_index;
+    int first_cluster_copied = 0;
+    ClusterBuffer cbuf;
+    int remain_sectors, cbuf_offset, len, ret = 0;
+    uint8_t *temp_buf = NULL;
+
+    first_cluster = sector_num / s->sectors_per_cluster;
+    last_cluster = (sector_num + nb_sectors - 1) / s->sectors_per_cluster;
+    nb_clusters = last_cluster - first_cluster + 1;
+    /* Byte offset of the first requested sector inside its cluster. */
+    temp_buf_offset = (sector_num & (s->sectors_per_cluster - 1)) * BDRV_SECTOR_SIZE;
+    temp_buf_index = 0;
+    cbuf.buf = NULL;
+    cbuf.read_from_father = NULL;
+
+    if (first_cluster >= s->total_clusters) {
+        fprintf (stderr, "Invalid sector_num.\n");
+        ret = -1;
+        goto end;
+    }
+    if (last_cluster >= s->total_clusters) {
+        fprintf (stderr, "Invalid nb_sectors.\n");
+        ret = -1;
+        goto end;
+    }
+
+    temp_buf = qemu_memalign(512, nb_clusters * s->cluster_size);
+    /* Check the allocation before touching it (the memset used to come first). */
+    if (temp_buf == NULL) {
+        fprintf (stderr, "Failed to create temp_buf.\n");
+        ret = -1;
+        goto end;
+    }
+    memset(temp_buf, 0, nb_clusters * s->cluster_size);
+
+    if (cluster_cache != NULL && cluster_cache->cache != NULL) {
+        if (first_cluster == cluster_cache->cluster_num) {
+            /* Serve the first cluster from the one-cluster cache. */
+            memcpy(temp_buf, cluster_cache->cache, s->cluster_size);
+            first_cluster_copied = 1;
+            first_cluster += 1;
+            nb_clusters -= 1;
+            temp_buf_index += 1;
+        }
+    }
+
+    if (nb_clusters != 0) {
+        ret = bdrv_read(s->irow_irvd, first_cluster * s->sectors_per_cluster, temp_buf + temp_buf_index * s->cluster_size, nb_clusters * s->sectors_per_cluster);
+        if (ret < 0) {
+            goto end;
+        }
+    }
+
+    memcpy(buf, temp_buf + temp_buf_offset, nb_sectors * BDRV_SECTOR_SIZE);
+
+    if (nb_clusters != 0) {
+        if (first_cluster_copied) {
+            /* Undo the cache shortcut so the range covers the whole request. */
+            first_cluster -= 1;
+            nb_clusters += 1;
+        }
+        if (cluster_cache != NULL && cluster_cache->cache != NULL) {
+            if (irow_get_bit(s, last_cluster)) {
+                memcpy(cluster_cache->cache, temp_buf + (nb_clusters - 1) * s->cluster_size, s->cluster_size);
+                cluster_cache->cluster_num = last_cluster;
+            }
+        }
+
+        if (s->complete_image != 1) {
+            cbuf.buf = qemu_memalign(512, nb_clusters * s->cluster_size);
+            memset(cbuf.buf, 0, nb_clusters * s->cluster_size);
+            /* One extra zero flag: irow_assert_clusters() reads index nb_clusters. */
+            cbuf.read_from_father = g_malloc0(nb_clusters + 1);
+
+            if (irow_assert_clusters(bs, &cbuf, first_sector_in_cluster(s, first_cluster), nb_clusters * s->sectors_per_cluster, IROW_READ) < 0) {
+                fprintf (stderr, "irow_assert_clusters() failed.\n");
+                ret = -1;
+                goto end;
+            }
+
+            /*
+             * The result is ignored here, as in the original code; the data
+             * returned to the caller does not depend on it.
+             * NOTE(review): irow_write() treats the same failure as fatal -
+             * confirm that ignoring it on the read path is intended.
+             */
+            irow_update_btmp(s);
+
+            sector_index = sector_num;
+            remain_sectors = nb_sectors;
+            buf_offset = 0;
+
+            /* Walk the request cluster by cluster and patch in fetched data. */
+            while (remain_sectors > 0) {
+                cluster_index = sector_index / s->sectors_per_cluster;
+                len = last_sector_in_cluster(s, cluster_index) - sector_index + 1;
+                if (len > remain_sectors) {
+                    len = remain_sectors;
+                }
+
+                if (cbuf.read_from_father[cluster_index - first_cluster] == 1) {
+                    cbuf_offset = (sector_index & (s->sectors_per_cluster - 1)) + (cluster_index - first_cluster) * s->sectors_per_cluster;
+                    memcpy(buf + buf_offset, cbuf.buf + cbuf_offset * BDRV_SECTOR_SIZE, len * BDRV_SECTOR_SIZE);
+                }
+                sector_index = first_sector_in_cluster(s, cluster_index + 1);
+                remain_sectors -= len;
+                buf_offset += len * BDRV_SECTOR_SIZE;
+            }
+        }
+    }
+
+end:
+    /* Buffers from qemu_memalign() must be released with qemu_vfree(). */
+    if (cbuf.buf != NULL) {
+        qemu_vfree(cbuf.buf);
+        cbuf.buf = NULL;
+    }
+    g_free(cbuf.read_from_father);
+    cbuf.read_from_father = NULL;
+    if (temp_buf != NULL) {
+        qemu_vfree(temp_buf);
+        temp_buf = NULL;
+    }
+    return ret;
+}
+
+/*
+ * Synchronous write of nb_sectors sectors from buf, starting at sector_num.
+ *
+ * Unless s->complete_image is 1, irow_assert_clusters() is first asked to
+ * bring in the edge clusters that the request covers only partially (at most
+ * two, hence the two-cluster scratch buffer).  Then the bit of every touched
+ * cluster is set, the data is written to s->irow_irvd and the bitmap file is
+ * updated through irow_update_btmp().
+ *
+ * Returns a negative value on failure, otherwise the result of bdrv_write().
+ */
+static int irow_write(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors) {
+    BDRVIrowState *s = bs->opaque;
+    int64_t first_cluster, last_cluster, current_cluster;
+    ClusterBuffer cbuf;
+    int ret = 0;
+
+    /*
+     * Must be initialised before the first "goto end": the cleanup code
+     * below inspects both pointers on every exit path.
+     */
+    cbuf.buf = NULL;
+    cbuf.read_from_father = NULL;
+
+    first_cluster = sector_num / s->sectors_per_cluster;
+    last_cluster = (sector_num + nb_sectors - 1) / s->sectors_per_cluster;
+
+    if (first_cluster >= s->total_clusters) {
+        fprintf (stderr, "Invalid sector_num!\n");
+        ret = -1;
+        goto end;
+    }
+    if (last_cluster >= s->total_clusters) {
+        fprintf (stderr, "Invalid nb_sectors!\n");
+        ret = -1;
+        goto end;
+    }
+
+    if (s->complete_image != 1) {
+        cbuf.buf = qemu_memalign(512, 2 * s->cluster_size);
+        memset(cbuf.buf, 0, 2 * s->cluster_size);
+        cbuf.read_from_father = g_malloc0(2);
+        if (irow_assert_clusters(bs, &cbuf, sector_num, nb_sectors, IROW_WRITE) < 0) {
+            ret = -1;
+            goto end;
+        }
+    }
+
+    for (current_cluster = first_cluster; current_cluster <= last_cluster; current_cluster++) {
+        irow_set_bit(s, current_cluster);
+    }
+
+    ret = bdrv_write(s->irow_irvd, sector_num, buf, nb_sectors);
+    if (ret < 0) {
+        goto end;
+    }
+
+    if (irow_update_btmp(s) < 0) {
+        fprintf (stderr, "Failed to update btmp file. (%s)\n", s->opened_btmp_file);
+        ret = -1;
+        goto end;
+    }
+
+end:
+    /* Buffers from qemu_memalign() must be released with qemu_vfree(). */
+    if (cbuf.buf != NULL) {
+        qemu_vfree(cbuf.buf);
+        cbuf.buf = NULL;
+    }
+    g_free(cbuf.read_from_father);
+    cbuf.read_from_father = NULL;
+
+    return ret;
+}
+
+/*
+ * Build "<prefix>-<body>.<suffix>" in dest.
+ *
+ * The result, including its terminating NUL, has to fit into
+ * MAX_FILE_NAME_LENGTH bytes; otherwise a message is printed on stderr, dest
+ * is left untouched and -1 is returned.  Returns 0 on success.
+ */
+static int irow_generate_filename(char *dest, const char *prefix, const char *body, const char *suffix)
+{
+    /* Two extra characters: the '-' and the '.' separators. */
+    size_t needed = strlen(prefix) + strlen(body) + strlen(suffix) + 2;
+
+    if (needed >= MAX_FILE_NAME_LENGTH) {
+        fprintf(stderr, "Invalid filename length, max is %d\n", MAX_FILE_NAME_LENGTH);
+        return -1;
+    }
+
+    snprintf(dest, MAX_FILE_NAME_LENGTH, "%s-%s.%s", prefix, body, suffix);
+    return 0;
+}
+
+/*
+ * Create the meta file cs->meta_file for a new image.
+ *
+ * Validates cs->disk_size and cs->cluster_size, derives cs->cluster_bits,
+ * generates the bitmap and irvd file names (stored in cs->btmp_file and
+ * cs->irvd_file) and writes an IRowMeta header followed by one
+ * IRowSnapshotHeader with id "0" and name "current state".
+ *
+ * The descriptor is closed on every exit path, and a short write is treated
+ * as a failure so that a truncated meta file is never reported as success.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int irow_create_meta(IRowCreateState *cs) {
+    IRowMeta meta;
+    IRowSnapshotHeader snap_header;
+    uint32_t cluster_size, copy_on_demand;
+    uint64_t disk_size;
+    qemu_timeval tv;
+    int fd = -1, cluster_bits, ret = 0;
+
+    if (cs->disk_size == 0) {
+        fprintf(stderr, "Invalid disk_size\n");
+        ret = -1;
+        goto end;
+    }
+    disk_size = cs->disk_size;
+
+    if (cs->cluster_size == 0) {
+        fprintf(stderr, "Invalid cluster_size\n");
+        ret = -1;
+        goto end;
+    }
+    cluster_size = cs->cluster_size;
+
+    cluster_bits = get_bits_from_size(cluster_size);
+    cs->cluster_bits = cluster_bits;
+    if ((cluster_bits < MIN_CLUSTER_BITS) || (cluster_bits > MAX_CLUSTER_BITS)) {
+        fprintf(stderr, "Cluster size must be a power of two between %d and %dk\n",
+                1 << MIN_CLUSTER_BITS,
+                1 << (MAX_CLUSTER_BITS - 10));
+        ret = -1;
+        goto end;
+    }
+    copy_on_demand = cs->copy_on_demand;
+
+    if (cs->meta_file[0] == '\0') {
+        fprintf(stderr, "Void meta file name\n");
+        ret = -1;
+        goto end;
+    }
+    fd = open(cs->meta_file, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0) {
+        fprintf(stderr, "Can not open %s\n", cs->meta_file);
+        ret = -1;
+        goto end;
+    }
+
+    /* Header fields are stored big-endian. */
+    memset(&meta, 0, sizeof(meta));
+    meta.magic = cpu_to_be32(IROW_MAGIC);
+    meta.version = cpu_to_be32(IROW_VERSION);
+    meta.copy_on_demand = cpu_to_be32(copy_on_demand);
+    meta.cluster_size = cpu_to_be32(cluster_size);
+    meta.cluster_bits = cpu_to_be32(cluster_bits);
+    /* Round up so a trailing partial cluster is counted. */
+    meta.total_clusters = cpu_to_be64((disk_size + cluster_size - 1) >> cluster_bits);
+    meta.sectors_per_cluster = cpu_to_be32(cluster_size >> BDRV_SECTOR_BITS);
+    meta.disk_size = cpu_to_be64(disk_size);
+    meta.nb_snapshots = cpu_to_be32(1);
+
+    if (irow_generate_filename(meta.current_btmp, cs->meta_file, cs->time_value, "btmp") < 0) {
+        ret = -1;
+        goto end;
+    }
+
+    if (irow_generate_filename(cs->irvd_file, cs->meta_file, cs->time_value, "irvd") < 0) {
+        ret = -1;
+        goto end;
+    }
+
+    if (cs->backing_file != NULL) {
+        /*
+         * strncpy() does not terminate the destination when the source has
+         * MAX_FILE_NAME_LENGTH or more characters, so refuse such names.
+         */
+        if (strlen(cs->backing_file) >= MAX_FILE_NAME_LENGTH) {
+            fprintf(stderr, "Invalid filename length, max is %d\n", MAX_FILE_NAME_LENGTH);
+            ret = -1;
+            goto end;
+        }
+        strncpy(meta.backing_file, cs->backing_file, MAX_FILE_NAME_LENGTH);
+    }
+
+    strncpy(cs->btmp_file, meta.current_btmp, MAX_FILE_NAME_LENGTH);
+
+    memset(&snap_header, 0, sizeof(snap_header));
+
+    snap_header.snap_magic = cpu_to_be32(IROW_SNAPHEADER_MAGIC);
+    sprintf(snap_header.id_str, "0");
+    sprintf(snap_header.name, "current state");
+    strncpy(snap_header.btmp_file, cs->btmp_file, MAX_FILE_NAME_LENGTH);
+    strncpy(snap_header.irvd_file, cs->irvd_file, MAX_FILE_NAME_LENGTH);
+    qemu_gettimeofday(&tv);
+    /*
+     * NOTE(review): unlike the fields above, the next four are stored in
+     * host byte order - confirm this matches the code that reads them.
+     */
+    snap_header.date_sec = tv.tv_sec;
+    snap_header.date_nsec = tv.tv_usec * 1000;
+    snap_header.nb_children = 0;
+    snap_header.is_deleted = 0;
+
+    if (write(fd, &meta, sizeof(meta)) != (ssize_t)sizeof(meta)) {
+        ret = -1;
+        goto end;
+    }
+    if (write(fd, &snap_header, sizeof(snap_header)) != (ssize_t)sizeof(snap_header)) {
+        ret = -1;
+        goto end;
+    }
+
+end:
+    /* Single place where the descriptor is released, success or failure. */
+    if (fd >= 0 && close(fd) != 0) {
+        ret = -1;
+    }
+    return ret;
+}
+
+/*
+ * Create the bitmap file cs->btmp_file and fill it with zero bytes.
+ *
+ * The file holds one bit per cluster, rounded up to whole bytes.  The
+ * descriptor is closed on every exit path and a short write counts as a
+ * failure.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int irow_create_btmp(IRowCreateState *cs) {
+    char *bitmap = NULL;
+    int fd = -1, bitmap_size, ret = 0;
+
+    if (cs->btmp_file[0] == '\0') {
+        fprintf(stderr, "Void btmp file name\n");
+        ret = -1;
+        goto end;
+    }
+    fd = open(cs->btmp_file, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0) {
+        fprintf(stderr, "Can not open %s\n", cs->btmp_file);
+        ret = -1;
+        goto end;
+    }
+
+    /* clusters = ceil(disk_size / cluster_size); bytes = ceil(clusters / 8) */
+    bitmap_size = (((cs->disk_size + cs->cluster_size - 1) >> cs->cluster_bits) + 7) >> 3;
+    bitmap = g_malloc0(bitmap_size);
+
+    if (write(fd, bitmap, bitmap_size) != bitmap_size) {
+        ret = -1;
+        goto end;
+    }
+
+end:
+    if (fd >= 0 && close(fd) != 0) {
+        ret = -1;
+    }
+    g_free(bitmap);
+    return ret;
+}
+
+/*
+ * Create the data file cs->irvd_file and size it to cs->disk_size bytes.
+ *
+ * The fallocate() call is best effort: its result is ignored.  A failing
+ * ftruncate() or close() makes the function return -1.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int irow_create_vd(IRowCreateState *cs)
+{
+    int status = 0;
+    int fd;
+
+    if (cs->irvd_file[0] == '\0') {
+        fprintf(stderr, "Void irvd file name\n");
+        return -1;
+    }
+
+    fd = open(cs->irvd_file, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0) {
+        fprintf(stderr, "Can not open %s\n", cs->irvd_file);
+        return -1;
+    }
+
+    /* Result deliberately ignored, exactly as before. */
+    (void)fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, cs->disk_size);
+
+    if (ftruncate(fd, cs->disk_size) != 0) {
+        fprintf(stderr, "Can not truncate %s to %" PRId64 " bytes\n", cs->irvd_file, cs->disk_size);
+        status = -1;
+    }
+    if (close(fd) != 0) {
+        status = -1;
+    }
+
+    return status;
+}
+
+/*
+ * Allocate a zeroed IRowCreateState together with its five
+ * MAX_FILE_NAME_LENGTH-byte string buffers, and fill cs->time_value with the
+ * current time (seconds followed by microseconds, both in hexadecimal).
+ *
+ * The result is released with irow_create_state_delete().
+ */
+static IRowCreateState *irow_create_state_new(void) {
+    IRowCreateState *cs = g_malloc0(sizeof(IRowCreateState));
+    qemu_timeval tv;
+
+    cs->meta_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    cs->btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    cs->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    cs->time_value = g_malloc0(MAX_FILE_NAME_LENGTH);
+    cs->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+
+    qemu_gettimeofday(&tv);
+    /*
+     * "%lx" requires unsigned long arguments; the timeval members are not
+     * guaranteed to have that type, so cast them explicitly.  snprintf()
+     * keeps the write inside the buffer allocated above.
+     */
+    snprintf(cs->time_value, MAX_FILE_NAME_LENGTH, "%lx%lx", (unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec);
+    return cs;
+}
+
+/*
+ * Release an IRowCreateState and the string buffers it owns.
+ * cs itself must not be NULL; g_free() ignores NULL members.
+ */
+static void irow_create_state_delete(IRowCreateState *cs)
+{
+    g_free(cs->meta_file);
+    g_free(cs->btmp_file);
+    g_free(cs->irvd_file);
+    g_free(cs->time_value);
+    g_free(cs->father_btmp_file);
+    g_free(cs);
+}
+
+/*
+ * Create a new iROW image whose meta file is `filename`.
+ *
+ * Recognised options: BLOCK_OPT_SIZE, BLOCK_OPT_CLUSTER_SIZE (a value of 0
+ * keeps the 65536-byte default), BLOCK_OPT_BACKING_FILE and
+ * "copy_on_demand".  The meta file, the bitmap file and the irvd data file
+ * are then created in that order.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int irow_create(const char *filename, QEMUOptionParameter *options) {
+    IRowCreateState *cs = irow_create_state_new();
+    int ret = 0;
+
+    if (cs == NULL) {
+        ret = -1;
+        goto end;
+    }
+
+    /*
+     * cs->meta_file holds exactly MAX_FILE_NAME_LENGTH bytes.  A longer name
+     * would be copied without a terminating NUL and later handed to open(),
+     * so reject it here instead.
+     */
+    if (strlen(filename) >= MAX_FILE_NAME_LENGTH) {
+        fprintf(stderr, "Invalid filename length, max is %d\n", MAX_FILE_NAME_LENGTH);
+        ret = -1;
+        goto end;
+    }
+
+    cs->cluster_size = 65536;
+    cs->copy_on_demand = 0;
+    cs->backing_file = NULL;
+    /* The buffer was zero-filled on allocation, so the copy stays terminated. */
+    strncpy(cs->meta_file, filename, MAX_FILE_NAME_LENGTH - 1);
+
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            cs->disk_size = options->value.n;
+        } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+            if (options->value.n) {
+                cs->cluster_size = options->value.n;
+            }
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+            cs->backing_file = options->value.s;
+        } else if (!strcmp(options->name, "copy_on_demand")) {
+            cs->copy_on_demand = options->value.n;
+        }
+        options++;
+    }
+
+    if (irow_create_meta(cs) < 0) {
+        fprintf(stderr, "Fail to create meta file of %s\n", filename);
+        ret = -1;
+        goto end;
+    }
+
+    if (irow_create_btmp(cs) < 0) {
+        fprintf(stderr, "Fail to create bitmap file of %s\n", filename);
+        ret = -1;
+        goto end;
+    }
+
+    if (irow_create_vd(cs) < 0) {
+        fprintf(stderr, "Fail to create virtual machine disk file of %s\n", filename);
+        ret = -1;
+        goto end;
+    }
+
+end:
+    if (cs != NULL) {
+        irow_create_state_delete(cs);
+    }
+    return ret;
+}
+
+/* Flush the irvd image of this state; returns the result of bdrv_flush(). */
+static int coroutine_fn irow_flush(BlockDriverState *bs)
+{
+    BDRVIrowState *state = bs->opaque;
+    int rc = bdrv_flush(state->irow_irvd);
+
+    return rc;
+}
+
+typedef struct IRowAIOCB { /* Per-request state of an iROW AIO operation. */
+    BlockDriverAIOCB common; /* generic AIOCB part; common.bs is read back in the completion callback */
+    int64_t sector_num; /* first sector of the request, stored by irow_aio_setup() */
+    QEMUIOVector *qiov; /* caller's I/O vector, stored by irow_aio_setup() */
+    int nb_sectors; /* request length in sectors, stored by irow_aio_setup() */
+    BlockDriverAIOCB *irvd_aiocb; /* nested request that irow_aio_cancel() cancels; NULL after setup */
+
+} IRowAIOCB;
+
+/*
+ * Cancel callback for iROW requests: cancel the nested request, if one is
+ * recorded in irvd_aiocb, then release the iROW control block.
+ */
+static void irow_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    IRowAIOCB *request = (IRowAIOCB *)blockacb;
+    BlockDriverAIOCB *nested = request->irvd_aiocb;
+
+    if (nested != NULL) {
+        bdrv_aio_cancel(nested);
+    }
+    qemu_aio_release(request);
+}
+
+static AIOCBInfo irow_aio_pool = { /* passed to qemu_aio_get() by irow_aio_setup() */
+    .aiocb_size         = sizeof(IRowAIOCB), /* size of one iROW control block */
+    .cancel             = irow_aio_cancel, /* cancel handler for these control blocks */
+};
+
+
+/*
+ * Obtain an IRowAIOCB from irow_aio_pool and record the request parameters
+ * in it.  irvd_aiocb starts out as NULL.
+ *
+ * Returns the control block, or NULL if qemu_aio_get() returned NULL.
+ */
+static IRowAIOCB *irow_aio_setup(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    IRowAIOCB *request = qemu_aio_get(&irow_aio_pool, bs, cb, opaque);
+
+    if (request != NULL) {
+        request->irvd_aiocb = NULL;
+        request->sector_num = sector_num;
+        request->qiov = qiov;
+        request->nb_sectors = nb_sectors;
+    }
+    return request;
+}
+
+/*
+ * Completion callback for the read that irow_aio_readv() issued on the
+ * irvd file.
+ *
+ * On success, and when the image is not marked complete, the clusters that
+ * irow_assert_clusters() flagged in cbuf.read_from_father are copied from
+ * cbuf.buf over the corresponding part of the caller's I/O vector.
+ * The caller's completion function is always invoked with the final status
+ * and the control block is released afterwards.
+ *
+ * @opaque: the IRowAIOCB created by irow_aio_setup()
+ * @ret:    status of the underlying irvd read (negative on failure)
+ */
+static void irow_aio_readv_cb(void *opaque, int ret)
+{
+    IRowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVIrowState *birows = bs->opaque;
+    int64_t first_cluster, last_cluster, nb_clusters, sector_index,
+            cluster_index, buf_offset;
+    ClusterBuffer cbuf;
+    void *buf = NULL;
+    int remain_sectors, cbuf_offset, len;
+
+    /*
+     * Must be initialised before the first "goto end": the cleanup code
+     * inspects both pointers on every path, including the early error path.
+     */
+    cbuf.buf = NULL;
+    cbuf.read_from_father = NULL;
+
+    if (ret < 0) {
+        fprintf(stderr, "aio_readv failed\n");
+        goto end;
+    }
+
+    first_cluster = acb->sector_num / birows->sectors_per_cluster;
+    last_cluster = (acb->sector_num + acb->nb_sectors - 1) /
+                   birows->sectors_per_cluster;
+
+    if (first_cluster >= birows->total_clusters) {
+        fprintf(stderr, "Invalid sector_num.\n");
+        ret = -1;
+        goto end;
+    }
+    if (last_cluster >= birows->total_clusters) {
+        fprintf(stderr, "Invalid nb_sectors.\n");
+        ret = -1;
+        goto end;
+    }
+
+    if (birows->complete_image != 1) {
+        nb_clusters = last_cluster - first_cluster + 1;
+        cbuf.buf = qemu_memalign(512, nb_clusters * birows->cluster_size);
+        memset(cbuf.buf, 0, nb_clusters * birows->cluster_size);
+        cbuf.read_from_father = g_malloc0(nb_clusters + 1);
+        if (irow_assert_clusters(bs, &cbuf, acb->sector_num,
+                                 acb->nb_sectors, IROW_AIO_READ) < 0) {
+            fprintf(stderr, "irow_assert_clusters() failed.\n");
+            ret = -1;
+            goto end;
+        }
+        /* NOTE(review): result is ignored here, as in the original code. */
+        irow_update_btmp(birows);
+
+        /* Work on a linear copy of the data the irvd read produced. */
+        buf = g_malloc(acb->qiov->size);
+        qemu_iovec_to_buf(acb->qiov, 0, buf, acb->qiov->size);
+
+        sector_index = acb->sector_num;
+        remain_sectors = acb->nb_sectors;
+        buf_offset = 0;
+        while (remain_sectors > 0) {
+            cluster_index = sector_index / birows->sectors_per_cluster;
+            /* Sectors of this request that fall into the current cluster. */
+            len = last_sector_in_cluster(birows, cluster_index) -
+                  sector_index + 1;
+            if (len > remain_sectors) {
+                len = remain_sectors;
+            }
+            if (cbuf.read_from_father[cluster_index - first_cluster] == 1) {
+                /* Sector offset inside cbuf.buf: cluster slot + offset in cluster. */
+                cbuf_offset = (sector_index & (birows->sectors_per_cluster - 1)) +
+                              (cluster_index - first_cluster) *
+                              birows->sectors_per_cluster;
+                memcpy(buf + buf_offset,
+                       cbuf.buf + cbuf_offset * BDRV_SECTOR_SIZE,
+                       len * BDRV_SECTOR_SIZE);
+            }
+            sector_index = first_sector_in_cluster(birows, cluster_index + 1);
+            remain_sectors -= len;
+            buf_offset += len * BDRV_SECTOR_SIZE;
+        }
+
+        qemu_iovec_from_buf(acb->qiov, 0, buf, acb->qiov->size);
+    }
+
+end:
+    g_free(buf);
+    if (cbuf.buf != NULL) {
+        /* Allocated with qemu_memalign(), so it must go back via qemu_vfree(). */
+        qemu_vfree(cbuf.buf);
+        cbuf.buf = NULL;
+    }
+    g_free(cbuf.read_from_father);
+    cbuf.read_from_father = NULL;
+
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Asynchronous read entry point. Sets up an IRowAIOCB and issues the read
+ * on the irvd file through that file's driver, with irow_aio_readv_cb() as
+ * completion callback. Returns the control block, or NULL if either the
+ * setup or the submission fails.
+ */
+static BlockDriverAIOCB *irow_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVIrowState *state = bs->opaque;
+    BlockDriverState *data_bs;
+    IRowAIOCB *req;
+
+    req = irow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque);
+    if (req == NULL) {
+        return NULL;
+    }
+
+    data_bs = state->irow_irvd;
+    req->irvd_aiocb = data_bs->drv->bdrv_aio_readv(data_bs, sector_num, qiov,
+                                                   nb_sectors,
+                                                   irow_aio_readv_cb, req);
+    if (req->irvd_aiocb != NULL) {
+        return &req->common;
+    }
+
+    qemu_aio_release(req);
+    return NULL;
+}
+
+/*
+ * Asynchronous write entry point.
+ *
+ * Validates the cluster range, lets irow_assert_clusters() prepare the
+ * affected clusters when the image is not marked complete, sets the bitmap
+ * bit of every touched cluster, submits the write to the irvd file with the
+ * caller's completion function, and finally writes back the bitmap.
+ *
+ * Returns the control block of the irvd write, or NULL on failure.
+ */
+static BlockDriverAIOCB *irow_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVIrowState *s = bs->opaque;
+    int64_t first_cluster, last_cluster, current_cluster;
+    ClusterBuffer cbuf;
+    BlockDriver *drv;
+    BlockDriverAIOCB *ret = NULL;
+
+    /*
+     * Must be initialised before the first "goto end": the cleanup code
+     * inspects both pointers even when the range checks below fail.
+     */
+    cbuf.buf = NULL;
+    cbuf.read_from_father = NULL;
+
+    first_cluster = sector_num / s->sectors_per_cluster;
+    last_cluster = (sector_num + nb_sectors - 1) / s->sectors_per_cluster;
+
+    if (first_cluster >= s->total_clusters) {
+        fprintf(stderr, "Invalid sector_num!\n");
+        goto end;
+    }
+    if (last_cluster >= s->total_clusters) {
+        fprintf(stderr, "Invalid nb_sectors!\n");
+        goto end;
+    }
+
+    if (s->complete_image != 1) {
+        /* Room for two clusters and two flags, as in the original code. */
+        cbuf.buf = qemu_memalign(512, 2 * s->cluster_size);
+        cbuf.read_from_father = g_malloc0(2);
+        if (irow_assert_clusters(bs, &cbuf, sector_num, nb_sectors,
+                                 IROW_AIO_WRITE) < 0) {
+            fprintf(stderr, "irow_assert_clusters() failed.\n");
+            goto end;
+        }
+    }
+
+    for (current_cluster = first_cluster; current_cluster <= last_cluster;
+         current_cluster++) {
+        irow_set_bit(s, current_cluster);
+    }
+
+    drv = s->irow_irvd->drv;
+    ret = drv->bdrv_aio_writev(s->irow_irvd, sector_num, qiov, nb_sectors,
+                               cb, opaque);
+    if (ret == NULL) {
+        goto end;
+    }
+
+    if (irow_update_btmp(s) < 0) {
+        fprintf(stderr, "Failed to update btmp file. (%s)\n",
+                s->opened_btmp_file);
+        /*
+         * NOTE(review): the irvd write has already been submitted at this
+         * point, yet NULL is returned - confirm callers cope with that.
+         */
+        ret = NULL;
+        goto end;
+    }
+
+end:
+    if (cbuf.buf != NULL) {
+        /* Allocated with qemu_memalign(), so it must go back via qemu_vfree(). */
+        qemu_vfree(cbuf.buf);
+        cbuf.buf = NULL;
+    }
+    g_free(cbuf.read_from_father);
+    cbuf.read_from_father = NULL;
+    return ret;
+}
+
+/* Asynchronous flush: delegated unchanged to the irvd data file. */
+static BlockDriverAIOCB *irow_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVIrowState *state = bs->opaque;
+
+    return bdrv_aio_flush(state->irow_irvd, cb, opaque);
+}
+
+/*
+ * Pick the smallest numeric id (starting at 1) that no existing snapshot
+ * uses and write it, in decimal, into id_str.
+ *
+ * @birows:      driver state holding the snapshot table
+ * @id_str:      output buffer
+ * @id_str_size: size of the output buffer in bytes
+ */
+static void irow_new_snapshot_id(BDRVIrowState *birows, char *id_str,
+                                 int id_str_size)
+{
+    IRowSnapshot *snap_ptr;
+    unsigned int i, id, is_free;
+
+    for (id = 1; id < 0xffffffff; id++) {
+        is_free = 1;
+        for (i = 0; i < birows->nb_snapshots; i++) {
+            snap_ptr = birows->snapshots + i;
+            if (snap_ptr->id_str != NULL &&
+                id == strtoul(snap_ptr->id_str, NULL, 10)) {
+                is_free = 0;
+                break;
+            }
+        }
+        if (is_free) {
+            break;
+        }
+    }
+    /* id is unsigned, so it has to be formatted with %u rather than %d. */
+    snprintf(id_str, id_str_size, "%u", id);
+}
+
+/*
+ * Return the index of the first snapshot whose id string equals id_str,
+ * or -1 if there is none. Entries without an id string are skipped.
+ */
+static int irow_find_snapshot_by_id(BDRVIrowState *birows, const char *id_str)
+{
+    int idx = 0;
+
+    while (idx < birows->nb_snapshots) {
+        const char *candidate = birows->snapshots[idx].id_str;
+
+        if (candidate != NULL && strcmp(candidate, id_str) == 0) {
+            return idx;
+        }
+        idx++;
+    }
+    return -1;
+}
+
+/*
+ * Return the index of the first snapshot whose name equals the given name,
+ * or -1 if there is none. Entries without a name are skipped.
+ */
+static int irow_find_snapshot_by_name(BDRVIrowState *birows, const char *name)
+{
+    int idx = 0;
+
+    while (idx < birows->nb_snapshots) {
+        const char *candidate = birows->snapshots[idx].name;
+
+        if (candidate != NULL && strcmp(candidate, name) == 0) {
+            return idx;
+        }
+        idx++;
+    }
+    return -1;
+}
+
+/*
+ * Return the index of the first snapshot that is marked deleted and has
+ * no children left, or -1 if no such snapshot exists.
+ */
+static int irow_find_free_snapshot(BDRVIrowState *birows)
+{
+    int idx;
+
+    for (idx = 0; idx < birows->nb_snapshots; idx++) {
+        IRowSnapshot *entry = &birows->snapshots[idx];
+
+        if (entry->nb_children != 0) {
+            continue;
+        }
+        if (entry->is_deleted == 1) {
+            return idx;
+        }
+    }
+    return -1;
+}
+
+/*
+ * Add value to snap's child counter. If the counter thereby reaches zero
+ * and the snapshot is marked deleted, the same adjustment is applied to its
+ * father snapshot (looked up through father_btmp_file), recursively.
+ *
+ * Returns 0 on success, -1 if a father snapshot named by father_btmp_file
+ * cannot be found anywhere along the chain.
+ */
+static int irow_update_nb_children(BDRVIrowState *birows, IRowSnapshot *snap,
+                                   int value)
+{
+    IRowSnapshot *father_snap;
+    int snap_index;
+
+    snap->nb_children += value;
+    if (snap->nb_children != 0 || snap->is_deleted != 1) {
+        return 0;
+    }
+    if (!snap->father_btmp_file) {
+        return 0;
+    }
+
+    snap_index = irow_find_snapshot_by_btmp(birows, snap->father_btmp_file);
+    if (snap_index < 0) {
+        fprintf(stderr, "Failed to find father snapshot\n");
+        return -1;
+    }
+    father_snap = birows->snapshots + snap_index;
+
+    /* Propagate a failure further up the chain instead of dropping it. */
+    return irow_update_nb_children(birows, father_snap, value);
+}
+
+/*
+ * Update the in-memory snapshot table for a newly taken snapshot.
+ *
+ * The entry that currently represents the running state (found through
+ * birows->current_btmp_file) receives the id, name and timing data from
+ * sn_info and gains one child. A fresh entry named "current state" with id
+ * "0" is appended, pointing at the btmp/irvd files described by cs.
+ * birows_cache is resized to the new table length and cleared.
+ *
+ * Returns 0 on success, -1 if the current state's entry cannot be found.
+ */
+static int irow_snapshot_add(BDRVIrowState *birows, IRowCreateState *cs,
+                             QEMUSnapshotInfo *sn_info)
+{
+    IRowSnapshot *new_snap, *snap;
+    qemu_timeval tv;
+    int snap_index;
+
+    birows->snapshots = g_realloc(birows->snapshots,
+                                  (birows->nb_snapshots + 1) * sizeof(IRowSnapshot));
+
+    snap_index = irow_find_snapshot_by_btmp(birows, birows->current_btmp_file);
+    if (snap_index < 0) {
+        return -1;
+    }
+    snap = birows->snapshots + snap_index;
+
+    new_snap = birows->snapshots + birows->nb_snapshots;
+    memset(new_snap, 0, sizeof(IRowSnapshot));
+
+    snap->date_sec = sn_info->date_sec;
+    snap->date_nsec = sn_info->date_nsec;
+    snap->vm_clock_nsec = sn_info->vm_clock_nsec;
+    snap->vm_state_size = sn_info->vm_state_size;
+    irow_update_nb_children(birows, snap, 1);
+
+    /*
+     * String buffers are 128 (id) and 256 (name) bytes, as elsewhere in this
+     * file. pstrcpy() always NUL-terminates, unlike strncpy() with a bound
+     * equal to the buffer size.
+     */
+    if (snap->id_str == NULL) {
+        snap->id_str = g_malloc0(128);
+    } else {
+        memset(snap->id_str, 0, 128);
+    }
+    pstrcpy(snap->id_str, 128, sn_info->id_str);
+
+    if (snap->name == NULL) {
+        snap->name = g_malloc0(256);
+    } else {
+        memset(snap->name, 0, 256);
+    }
+    pstrcpy(snap->name, 256, sn_info->name);
+
+    new_snap->id_str = g_malloc0(128);
+    pstrcpy(new_snap->id_str, 128, "0");
+    new_snap->name = g_malloc0(256);
+    pstrcpy(new_snap->name, 256, "current state");
+    new_snap->btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    pstrcpy(new_snap->btmp_file, MAX_FILE_NAME_LENGTH, cs->btmp_file);
+    new_snap->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+    pstrcpy(new_snap->irvd_file, MAX_FILE_NAME_LENGTH, cs->irvd_file);
+    if (cs->father_btmp_file != NULL) {
+        new_snap->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+        pstrcpy(new_snap->father_btmp_file, MAX_FILE_NAME_LENGTH,
+                cs->father_btmp_file);
+    }
+    qemu_gettimeofday(&tv);
+    new_snap->date_sec = tv.tv_sec;
+    new_snap->date_nsec = tv.tv_usec * 1000;
+
+    birows->nb_snapshots += 1;
+    birows_cache = g_realloc(birows_cache,
+                             sizeof(BDRVIrowState *) * birows->nb_snapshots);
+    memset(birows_cache, 0, sizeof(BDRVIrowState *) * birows->nb_snapshots);
+    birows->snapshots_is_dirty = 1;
+
+    return 0;
+}
+
+/*
+ * Deep-copy one snapshot entry into dst.
+ *
+ * Every string that is present in src is duplicated into a newly allocated,
+ * zero-filled buffer (128 bytes for the id, 256 for the name,
+ * MAX_FILE_NAME_LENGTH for file names); the scalar fields are copied as-is.
+ * String pointers of dst that have no counterpart in src are left untouched.
+ */
+static void irow_snapshot_copy(IRowSnapshot *dst, IRowSnapshot *src)
+{
+    /* pstrcpy() guarantees NUL termination even for over-long sources. */
+    if (src->id_str) {
+        dst->id_str = g_malloc0(128);
+        pstrcpy(dst->id_str, 128, src->id_str);
+    }
+    if (src->name) {
+        dst->name = g_malloc0(256);
+        pstrcpy(dst->name, 256, src->name);
+    }
+    if (src->btmp_file) {
+        dst->btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+        pstrcpy(dst->btmp_file, MAX_FILE_NAME_LENGTH, src->btmp_file);
+    }
+    if (src->irvd_file) {
+        dst->irvd_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+        pstrcpy(dst->irvd_file, MAX_FILE_NAME_LENGTH, src->irvd_file);
+    }
+    if (src->father_btmp_file) {
+        dst->father_btmp_file = g_malloc0(MAX_FILE_NAME_LENGTH);
+        pstrcpy(dst->father_btmp_file, MAX_FILE_NAME_LENGTH,
+                src->father_btmp_file);
+    }
+
+    dst->date_sec = src->date_sec;
+    dst->date_nsec = src->date_nsec;
+    dst->vm_clock_nsec = src->vm_clock_nsec;
+    dst->vm_state_size = src->vm_state_size;
+    dst->nb_children = src->nb_children;
+    dst->is_deleted = src->is_deleted;
+}
+
+/*
+ * Snapshot-create callback.
+ *
+ * Steps, in order:
+ *  1. generate an id if sn_info has none, and reject duplicate ids/names;
+ *  2. describe the new current state in an IRowCreateState whose father is
+ *     the present current btmp file;
+ *  3. either reuse the files of a deleted, child-less snapshot (removing its
+ *     entry from the table) or create fresh btmp/irvd files;
+ *  4. update the in-memory table (irow_snapshot_add) and the meta file;
+ *  5. write back and close the old btmp/irvd, switch to the new ones, and
+ *     start them with an all-zero bitmap.
+ *
+ * Returns 0 (or the result of irow_open_data()) on success, negative on
+ * failure.
+ */
+static int irow_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
+{
+    BDRVIrowState *s = bs->opaque;
+    IRowCreateState *cs = NULL;
+    IRowSnapshot *free_snap, *old_snap, *snap;
+    int snap_index, offset, ret = 0;
+
+    if (sn_info->id_str[0] == '\0') {
+        irow_new_snapshot_id(s, sn_info->id_str, sizeof(sn_info->id_str));
+    }
+
+    if (irow_find_snapshot_by_id(s, sn_info->id_str) >= 0) {
+        fprintf(stderr, "Duplicated snapshot id\n");
+        ret = -1;
+        goto end;
+    }
+
+    if (irow_find_snapshot_by_name(s, sn_info->name) >= 0) {
+        fprintf(stderr, "Duplicated snapshot name\n");
+        ret = -1;
+        goto end;
+    }
+
+    cs = irow_create_state_new();
+    cs->cluster_bits = s->cluster_bits;
+    cs->cluster_size = s->cluster_size;
+    cs->disk_size = s->disk_size;
+    strncpy(cs->meta_file, s->meta_file, MAX_FILE_NAME_LENGTH);
+    /* The father of the new state is the old current image. */
+    strncpy(cs->father_btmp_file, s->current_btmp_file, MAX_FILE_NAME_LENGTH);
+
+    snap_index = irow_find_free_snapshot(s);
+    if (snap_index >= 0) {
+        /* Recycle the files of a deleted snapshot and drop its table entry. */
+        free_snap = s->snapshots + snap_index;
+        strcpy(cs->btmp_file, free_snap->btmp_file);
+        strcpy(cs->irvd_file, free_snap->irvd_file);
+        old_snap = s->snapshots;
+        s->snapshots = g_malloc0((s->nb_snapshots - 1) * sizeof(IRowSnapshot));
+        offset = 0;
+        for (snap_index = 0; snap_index < s->nb_snapshots; snap_index++) {
+            snap = old_snap + snap_index;
+            if (snap != free_snap) {
+                irow_snapshot_copy(s->snapshots + offset, snap);
+                offset += 1;
+            }
+        }
+
+        irow_close_snapshots2(old_snap, s->nb_snapshots);
+        s->nb_snapshots -= 1;
+    } else {
+        irow_generate_filename(cs->btmp_file, cs->meta_file, cs->time_value, "btmp");
+        irow_generate_filename(cs->irvd_file, cs->meta_file, cs->time_value, "irvd");
+
+        if (irow_create_btmp(cs) < 0) {
+            fprintf(stderr, "Failed to create new btmp file (%s)\n", cs->btmp_file);
+            ret = -1;
+            goto end;
+        }
+
+        if (irow_create_vd(cs) < 0) {
+            fprintf(stderr, "Failed to create new irvd file (%s)\n", cs->irvd_file);
+            ret = -1;
+            goto end;
+        }
+    }
+
+    if (irow_snapshot_add(s, cs, sn_info) < 0) {
+        fprintf(stderr, "Failed to add new snapshot in mem\n");
+        ret = -1;
+        goto end;
+    }
+
+    if (irow_update_meta(s, cs->btmp_file, 0) < 0) {
+        fprintf(stderr, "Failed to update meta file (%s)\n", s->meta_file);
+        ret = -1;
+        goto end;
+    }
+
+    /* Persist the old state's bitmap before its files are closed. */
+    s->vm_state_size = sn_info->vm_state_size;
+    irow_update_btmp(s);
+
+    irow_close_btmp(s);
+    irow_close_irvd(s);
+
+    strncpy(s->current_btmp_file, cs->btmp_file, MAX_FILE_NAME_LENGTH);
+    snap_index = irow_find_snapshot_by_btmp(s, s->current_btmp_file);
+    if (snap_index < 0) {
+        /* Never hand a negative index to irow_load_info_from_snapshot(). */
+        fprintf(stderr, "Failed to find current state.\n");
+        ret = -1;
+        goto end;
+    }
+    if (irow_load_info_from_snapshot(s, snap_index) < 0) {
+        ret = -1;
+        goto end;
+    }
+    ret = irow_open_data(s, s->open_flags);
+    if (ret < 0) {
+        /*
+         * Assumes the file-wide "negative means failure" convention also
+         * holds for irow_open_data(); the bitmap must not be touched then.
+         */
+        fprintf(stderr, "Failed to open data files of the new current state\n");
+        goto end;
+    }
+    memset(s->bitmap, 0, s->bitmap_size);
+    s->bitmap_is_dirty = 1;
+    if (irow_update_btmp(s) < 0) {
+        fprintf(stderr, "Failed to update btmp file\n");
+        ret = -1;
+        goto end;
+    }
+
+end:
+    if (cs != NULL) {
+        irow_create_state_delete(cs);
+        cs = NULL;
+    }
+    return ret;
+}
+
+/*
+ * Offset used for VM state data inside the btmp file: equal to the bitmap
+ * size recorded in the driver state.
+ */
+static int64_t irow_vm_state_offset(BDRVIrowState *birows)
+{
+    int64_t offset = birows->bitmap_size;
+
+    return offset;
+}
+
+/*
+ * Read size bytes of VM state, starting pos bytes into the VM state area
+ * of the given state's btmp file, into buf. Returns bdrv_pread()'s result.
+ */
+static int irow_load_vmstate2(BDRVIrowState *birows, uint8_t *buf,
+                              int64_t pos, int size)
+{
+    int64_t file_offset = irow_vm_state_offset(birows) + pos;
+
+    return bdrv_pread(birows->irow_btmp, file_offset, buf, size);
+}
+
+/*
+ * Mark the VM state as saved and write size bytes from buf, pos bytes into
+ * the VM state area of the btmp file. Returns bdrv_pwrite()'s result.
+ */
+static int irow_save_vmstate2(BDRVIrowState *birows, const uint8_t *buf,
+                              int64_t pos, int size)
+{
+    int64_t file_offset;
+
+    birows->vmstate_is_saved = 1;
+    file_offset = irow_vm_state_offset(birows) + pos;
+
+    return bdrv_pwrite(birows->irow_btmp, file_offset, buf, size);
+}
+
+/*
+ * Snapshot-goto callback: re-parent the current state onto the snapshot
+ * identified by snapshot_id (matched first as an id, then as a name).
+ *
+ * The current state keeps its own btmp/irvd files; only its father link,
+ * its timing fields and its bitmap (cleared to all zero) change. The child
+ * counters of the old and the new father are adjusted, then the bitmap and
+ * the meta file are written back.
+ *
+ * Returns 0 on success or when asked to go to the current state itself,
+ * -1 on failure.
+ */
+static int irow_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) {
+
+       BDRVIrowState *s = bs->opaque;
+       IRowSnapshot *target_snap, *current_snap, *father_snap;
+       int snap_index, ret = 0;
+
+       /* "0" / "current state" are the id and name irow_snapshot_add gives the running state. */
+       if(strcmp(snapshot_id, "0") == 0 || strcmp(snapshot_id, "current 
state") == 0) {
+               fprintf(stderr, "No need to goto current state.\n");
+               goto end;
+       }
+
+       /* Look the target up by id first, then fall back to the name. */
+       snap_index = irow_find_snapshot_by_id(s, snapshot_id);
+       if(snap_index < 0) {
+               snap_index = irow_find_snapshot_by_name(s, snapshot_id);
+               if(snap_index < 0) {
+                       fprintf(stderr, "Failed to find snapshot %s\n", 
snapshot_id);
+                       ret = -1;
+                       goto end;
+               }
+       }
+       target_snap = s->snapshots + snap_index;
+
+       if(target_snap->is_deleted) {
+               fprintf(stderr, "Can not go to deleted snapshot %s\n", 
snapshot_id);
+               ret = -1;
+               goto end;
+       }
+
+       snap_index = irow_find_snapshot_by_btmp(s, s->current_btmp_file);
+       if(snap_index < 0) {
+               fprintf(stderr, "Failed to find current state.\n");
+               ret = -1;
+               goto end;
+       }
+       current_snap = s->snapshots + snap_index;
+       snap_index = irow_find_snapshot_by_btmp(s, s->father_btmp_file);
+       if(snap_index < 0) {
+               fprintf(stderr, "Failed to find father snapshot.\n");
+               ret = -1;
+               goto end;
+       }
+       father_snap = s->snapshots + snap_index;
+       /* NOTE(review): current_snap->father_btmp_file is written without a NULL check - confirm it is always allocated here. */
+       strncpy(s->father_btmp_file, target_snap->btmp_file, 
MAX_FILE_NAME_LENGTH);
+       strncpy(current_snap->father_btmp_file, target_snap->btmp_file, 
MAX_FILE_NAME_LENGTH);
+
+       /* The old father loses a child, the target gains one; return values are not checked. */
+       irow_update_nb_children(s, father_snap, -1);
+       irow_update_nb_children(s, target_snap, 1);
+
+       current_snap->date_sec = target_snap->date_sec;
+       current_snap->date_nsec = target_snap->date_nsec;
+       current_snap->vm_clock_nsec = target_snap->vm_clock_nsec;
+       current_snap->vm_state_size = 0;
+
+       /* Clear every bit of the current state's bitmap and write it back. */
+       memset(s->bitmap, 0, s->bitmap_size);
+       s->bitmap_is_dirty = 1;
+       if(irow_update_btmp(s) < 0) {
+               fprintf(stderr, "Failed to update btmp file\n");
+               ret = -1;
+               goto end;
+       }
+
+       s->snapshots_is_dirty = 1;
+       if(irow_update_meta(s, NULL, 0) < 0) {
+               fprintf(stderr, "Failed to update meta file\n");
+               ret = -1;
+       }
+
+
+end:
+       return ret;
+}
+
+/*
+ * Snapshot-delete callback.
+ *
+ * The snapshot identified by snapshot_id (matched first as an id, then as a
+ * name) is only marked deleted and gets "_del" appended to its name; its
+ * table entry stays. If it has no children, its father loses one child.
+ * The meta file is then rewritten.
+ *
+ * Returns 0 on success, -1 if the snapshot is the current state, cannot be
+ * found, is already deleted, or the metadata cannot be updated.
+ */
+static int irow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
+{
+    BDRVIrowState *s = bs->opaque;
+    IRowSnapshot *target_snap, *father_snap;
+    int snap_index, ret = 0;
+
+    if (strcmp(snapshot_id, "0") == 0 ||
+        strcmp(snapshot_id, "current state") == 0) {
+        fprintf(stderr, "Can not delete current state.\n");
+        /* Nothing was deleted, so this must not be reported as success. */
+        ret = -1;
+        goto end;
+    }
+
+    snap_index = irow_find_snapshot_by_id(s, snapshot_id);
+    if (snap_index < 0) {
+        snap_index = irow_find_snapshot_by_name(s, snapshot_id);
+        if (snap_index < 0) {
+            fprintf(stderr, "Failed to find snapshot %s\n", snapshot_id);
+            ret = -1;
+            goto end;
+        }
+    }
+    target_snap = s->snapshots + snap_index;
+
+    if (target_snap->is_deleted) {
+        fprintf(stderr, "Can not delete deleted snapshot %s\n", snapshot_id);
+        ret = -1;
+        goto end;
+    }
+
+    target_snap->is_deleted = 1;
+    /* A snapshot matched by id may have no name buffer at all. */
+    if (target_snap->name != NULL) {
+        /* Name buffers are 256 bytes; leave room for the terminator. */
+        strncat(target_snap->name, "_del", 255 - strlen(target_snap->name));
+    }
+
+    if (target_snap->nb_children == 0 && target_snap->father_btmp_file) {
+        snap_index = irow_find_snapshot_by_btmp(s, target_snap->father_btmp_file);
+        if (snap_index < 0) {
+            fprintf(stderr, "Failed to find father snapshot\n");
+            ret = -1;
+            goto end;
+        }
+        father_snap = s->snapshots + snap_index;
+        irow_update_nb_children(s, father_snap, -1);
+    }
+
+    s->snapshots_is_dirty = 1;
+    if (irow_update_meta(s, NULL, 0) < 0) {
+        fprintf(stderr, "Failed to update meta file\n");
+        ret = -1;
+    }
+end:
+    return ret;
+}
+
+/*
+ * Snapshot-list callback: build a newly allocated QEMUSnapshotInfo array
+ * describing the snapshots that are not marked deleted and store it in
+ * *psn_tab. Returns the number of snapshots minus the deleted ones; with
+ * an empty table *psn_tab is set to NULL.
+ */
+static int irow_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
+{
+    BDRVIrowState *s = bs->opaque;
+    QEMUSnapshotInfo *table, *entry;
+    IRowSnapshot *cur;
+    int idx, out = 0, deleted = 0;
+
+    if (s->nb_snapshots == 0) {
+        *psn_tab = NULL;
+        return s->nb_snapshots;
+    }
+
+    /* First pass: count deleted entries to size the result. */
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        if (s->snapshots[idx].is_deleted) {
+            deleted += 1;
+        }
+    }
+    table = g_malloc0((s->nb_snapshots - deleted) * sizeof(QEMUSnapshotInfo));
+
+    /* Second pass: fill one slot per entry that is not flagged deleted. */
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        cur = &s->snapshots[idx];
+        if (cur->is_deleted == 1) {
+            continue;
+        }
+        entry = &table[out];
+        if (cur->id_str != NULL) {
+            pstrcpy(entry->id_str, sizeof(entry->id_str), cur->id_str);
+        }
+        if (cur->name != NULL) {
+            pstrcpy(entry->name, sizeof(entry->name), cur->name);
+        }
+        entry->vm_state_size = cur->vm_state_size;
+        entry->date_sec = cur->date_sec;
+        entry->date_nsec = cur->date_nsec;
+        entry->vm_clock_nsec = cur->vm_clock_nsec;
+        out += 1;
+    }
+
+    *psn_tab = table;
+    return s->nb_snapshots - deleted;
+}
+
+/*
+ * get_info callback: report the cluster size and the VM state offset.
+ * Always returns 0.
+ */
+static int irow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    BDRVIrowState *state = bs->opaque;
+
+    bdi->vm_state_offset = irow_vm_state_offset(state);
+    bdi->cluster_size = state->cluster_size;
+    return 0;
+}
+
+/* save_vmstate callback: thin wrapper around irow_save_vmstate2(). */
+static int irow_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
+                             int64_t pos, int size)
+{
+    return irow_save_vmstate2(bs->opaque, buf, pos, size);
+}
+
+/*
+ * load_vmstate callback: the VM state is read from the father snapshot of
+ * the current state. That snapshot is opened through
+ * irow_open_previous_state(), read with irow_load_vmstate2(), and closed
+ * again. Returns the read result, or -1 if the father cannot be found or
+ * opened.
+ */
+static int irow_load_vmstate(BlockDriverState *bs, uint8_t *buf, int64_t pos,
+                             int size)
+{
+    BDRVIrowState *birows = bs->opaque;
+    BDRVIrowState *father;
+    int father_index, result;
+
+    father_index = irow_find_snapshot_by_btmp(birows, birows->father_btmp_file);
+    if (father_index < 0) {
+        return -1;
+    }
+
+    father = irow_open_previous_state(birows, father_index);
+    if (father == NULL) {
+        return -1;
+    }
+
+    result = irow_load_vmstate2(father, buf, pos, size);
+    irow_close_previous_state(father);
+    return result;
+}
+
+/*
+ * bdrv_check callback. Instead of checking the image it prints the current
+ * copy_on_demand setting and interactively asks on stdin whether to toggle
+ * it; a toggle is persisted through irow_update_meta().
+ *
+ * @res and @fix are not used.
+ * Returns 0 after a y/n answer, 1 if stdin reaches end-of-file first.
+ */
+static int irow_check(BlockDriverState *bs, BdrvCheckResult *res,
+                      BdrvCheckMode fix)
+{
+    BDRVIrowState *birows = bs->opaque;
+    char user_input[100];
+
+    printf("current copy_on_demand state is ");
+    if (birows->copy_on_demand) {
+        printf("ON\n");
+    } else {
+        printf("OFF\n");
+    }
+    while (1) {
+        printf("do you want to change copy_on_demand state? (y/n)");
+        /* Field width keeps over-long input from overflowing user_input. */
+        if (scanf("%99s", user_input) == EOF) {
+            return 1;
+        }
+        /* tolower() needs a value representable as unsigned char. */
+        user_input[0] = tolower((unsigned char)user_input[0]);
+        if (user_input[0] == 'y') {
+            birows->copy_on_demand = birows->copy_on_demand ? 0 : 1;
+            irow_update_meta(birows, NULL, 1);
+            break;
+        }
+        if (user_input[0] == 'n') {
+            break;
+        }
+    }
+    return 0;
+}
+
+/* getlength callback: the virtual disk size recorded in the driver state. */
+static int64_t irow_get_length(BlockDriverState *bs)
+{
+    BDRVIrowState *state = bs->opaque;
+
+    return state->disk_size;
+}
+
+/* Creation options of the irow driver; the table ends with a NULL entry. */
+static QEMUOptionParameter irow_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    {
+        .name = BLOCK_OPT_CLUSTER_SIZE,
+        .type = OPT_SIZE,
+        .help = "irow cluster size"
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a base image"
+    },
+    {
+        .name = "copy_on_demand",
+        .type = OPT_FLAG,
+        .help = "copy clusters to current irvd when needed"
+    },
+    { NULL }
+};
+
+/* Driver description registered by bdrv_irow_init(): maps the block layer callbacks to the irow_* functions of this file. */
+static BlockDriver bdrv_irow = {
+    .format_name       = "irow",
+    .instance_size     = sizeof(BDRVIrowState),
+    .bdrv_probe                = irow_probe,
+    .bdrv_open         = irow_open,
+    .bdrv_read         = irow_read,
+    .bdrv_write                = irow_write,
+    .bdrv_close                = irow_close,
+    .bdrv_create       = irow_create,
+
+    .bdrv_co_flush_to_disk = irow_flush,
+
+    .bdrv_aio_readv            = irow_aio_readv,
+    .bdrv_aio_writev   = irow_aio_writev,
+    .bdrv_aio_flush            = irow_aio_flush,
+
+    .bdrv_snapshot_create   = irow_snapshot_create,
+    .bdrv_snapshot_goto     = irow_snapshot_goto,
+    .bdrv_snapshot_delete   = irow_snapshot_delete,
+    .bdrv_snapshot_list     = irow_snapshot_list,
+
+    .bdrv_get_info     = irow_get_info,
+    .bdrv_getlength = irow_get_length,
+
+    .bdrv_save_vmstate    = irow_save_vmstate,
+    .bdrv_load_vmstate    = irow_load_vmstate,
+
+    .create_options = irow_create_options,
+    .bdrv_check = irow_check, /* NOTE(review): irow_check prompts on stdin rather than checking the image */
+};
+
+/* Registers the irow driver with the block layer; hooked up via block_init() below. */
+static void bdrv_irow_init(void)
+{
+    bdrv_register(&bdrv_irow);
+}
+
+block_init(bdrv_irow_init);
diff --git a/block/irow.h b/block/irow.h
new file mode 100644
index 0000000..131b741
--- /dev/null
+++ b/block/irow.h
@@ -0,0 +1,135 @@
+/* IROW(Improved ROW)Disk Format
+ * */
+/*
+ * iROW (improved Redirect-on-Write) is a disk format supporting high-efficiency VM disk snapshots.
+ * iROW uses a bitmap to reduce the amount of metadata, so that both the performance of the key VM disk snapshot operations
+ * and the VM disk I/O performance are enhanced at the same time.
+ *
+ *The iROW VM disk image consists of a meta file and several snapshots.
+ *
+ *A snapshot consists of 2 files: a bitmap file (btmp file) and a VM disk data file (irvd file).
+ *The current state of the iROW VM disk also occupies a snapshot.
+ *
+ *The meta file consists of the meta header and the snapshots information. The meta header is used to
+ *store basic information about the VM disk image. The snapshots information sequentially stores every snapshot’s name,
+ *id and other related information.
+ *
+ *The btmp file consists of a bitmap and the VM state data. The bitmap is used to indicate whether the
+ *clusters exist in the corresponding irvd file. Each cluster in the VM disk image is mapped to a bit in the bitmap.
+ *
+ *The irvd file is used to store the actual data of the VM disk image. The smallest unit of storage is the cluster.
+ *iROW does not decide the address of the data clusters. It just writes the clusters to the same VM disk image
+ *addresses as the virtual addresses of the clusters. Because the host machine’s file system supports sparse files,
+ *the VM disk image size also grows gradually with the actual disk usage.
+ *
+ */
+/* Magic value: the characters 'I', 'R', 'O', 'W' packed into 32 bits, 'I' in the top byte. */
+#define IROW_MAGIC (('I' << 24) | ('R' << 16) | ('O' << 8) | 'W')
+/* Format version number defined by this header. */
+#define IROW_VERSION 1
+
+/* Magic value: the characters 'S', 'N', 'A', 'P' packed into 32 bits, 'S' in the top byte. */
+#define IROW_SNAPHEADER_MAGIC (('S' << 24) | ('N' << 16) | ('A' << 8) | 'P')
+
+/*
+ * Lower and upper limits for cluster_bits.
+ * NOTE(review): presumably cluster_bits is log2 of the cluster size in bytes
+ * (9 -> 512 bytes, 21 -> 2 MiB) -- confirm in irow.c.
+ */
+#define MIN_CLUSTER_BITS 9
+#define MAX_CLUSTER_BITS 21
+/* Size of the fixed file-name arrays in IRowMeta and IRowSnapshotHeader. */
+#define MAX_FILE_NAME_LENGTH 256
+
+/*
+ * Operation codes.
+ * NOTE(review): their users live in irow.c and are not visible from this
+ * header; the names suggest they tag the kind of I/O request.
+ */
+#define IROW_READ 1
+#define IROW_WRITE 2
+#define IROW_AIO_READ 3
+#define IROW_AIO_WRITE 4
+
+
+/*
+ * Packed (padding-free) meta header record.
+ * IROW_SNAPSHOT_OFFSET is defined as sizeof(IRowMeta); together with the file
+ * comment ("the meta file consists of the meta header and the snapshots
+ * information") this is presumably the header stored at the start of the
+ * meta file -- confirm in irow.c.
+ * Field order and types define the record layout and must not be changed.
+ */
+typedef struct __attribute__((packed)) IRowMeta {
+    uint32_t magic;                 /* presumably IROW_MAGIC -- confirm */
+    uint32_t version;               /* presumably IROW_VERSION -- confirm */
+    uint32_t copy_on_demand;
+    uint32_t nb_snapshots;
+    uint32_t cluster_size;
+    uint32_t cluster_bits;
+    uint32_t sectors_per_cluster;
+    uint64_t total_clusters;
+    uint64_t disk_size;
+    char current_btmp[MAX_FILE_NAME_LENGTH];    /* fixed-size name buffer */
+    char backing_file[MAX_FILE_NAME_LENGTH];    /* fixed-size name buffer */
+} IRowMeta;
+
+/*
+ * Packed (padding-free) per-snapshot record with fixed-size string buffers.
+ * NOTE(review): presumably one of these is stored per snapshot in the meta
+ * file after the IRowMeta header -- confirm in irow.c.
+ * Field order and types define the record layout and must not be changed.
+ */
+typedef struct __attribute__((packed)) IRowSnapshotHeader {
+       uint32_t snap_magic;    /* presumably IROW_SNAPHEADER_MAGIC -- confirm */
+       char id_str[128];
+       char name[256];
+       char btmp_file[MAX_FILE_NAME_LENGTH];
+       char irvd_file[MAX_FILE_NAME_LENGTH];
+       char father_btmp_file[MAX_FILE_NAME_LENGTH];
+       uint32_t vm_state_size;
+       uint32_t date_sec;
+       uint32_t date_nsec;
+       uint64_t vm_clock_nsec;
+       uint32_t nb_children;
+       uint32_t is_deleted;
+} IRowSnapshotHeader;
+
+/*
+ * Snapshot description with the same fields as IRowSnapshotHeader (minus the
+ * magic), but with the strings held through pointers instead of fixed arrays.
+ * NOTE(review): presumably the in-memory form of a snapshot record; who
+ * allocates and frees the strings is not visible here.
+ */
+typedef struct IRowSnapshot {
+       char *id_str;
+       char *name;
+       char *btmp_file;
+       char *irvd_file;
+       char *father_btmp_file;
+       uint32_t vm_state_size;
+       uint32_t date_sec;
+       uint32_t date_nsec;
+       uint64_t vm_clock_nsec;
+       uint32_t nb_children;
+       uint32_t is_deleted;
+} IRowSnapshot;
+
+/*
+ * Bundle of sizes, flags and file names describing an image being created.
+ * NOTE(review): presumably filled in and consumed by the image-creation code
+ * in irow.c -- confirm; string ownership is not visible here.
+ */
+typedef struct IRowCreateState {
+       uint64_t disk_size;
+       uint32_t cluster_size;
+       uint32_t cluster_bits;
+       uint32_t copy_on_demand;
+       char *meta_file;
+       char *father_btmp_file;
+       char *btmp_file;
+       char *irvd_file;
+       char *time_value;
+       char *backing_file;
+} IRowCreateState;
+
+/*
+ * A byte buffer paired with a cluster number.
+ * NOTE(review): presumably caches the contents of cluster cluster_num --
+ * confirm against its users in irow.c.
+ */
+typedef struct ClusterCache {
+       uint8_t *cache;
+       int64_t cluster_num;
+} ClusterCache;
+
+/*
+ * Per-image state of the irow driver: bdrv_irow.instance_size is
+ * sizeof(BDRVIrowState) and irow_get_length() reads it from bs->opaque.
+ */
+typedef struct BDRVIrowState {
+    /* Handles for the three kinds of file named in the file comment: meta, btmp and irvd. */
+    BlockDriverState *irow_meta;
+    BlockDriverState *irow_btmp;
+    BlockDriverState *irow_irvd;
+    uint64_t disk_size;             /* value returned by irow_get_length() */
+    uint64_t bitmap_size;
+    uint32_t cluster_size;
+    uint32_t cluster_bits;
+    uint64_t total_clusters;
+    uint32_t sectors_per_cluster;
+    uint32_t nb_snapshots;
+    uint32_t vm_state_size;
+    uint32_t copy_on_demand;
+    int open_flags;
+    IRowSnapshot *snapshots;        /* presumably nb_snapshots entries -- confirm */
+    uint32_t snapshots_is_dirty;
+    uint8_t *bitmap;
+    uint32_t bitmap_is_dirty;
+    uint32_t vmstate_is_saved;
+    uint32_t complete_image;
+    char *meta_file;
+    char *current_btmp_file;
+    char *father_btmp_file;
+    char *opened_btmp_file;
+    char *irvd_file;
+} BDRVIrowState;
+
+/*
+ * Pair of byte buffers.
+ * NOTE(review): the names suggest buf holds cluster data and
+ * read_from_father marks or holds data taken from the father snapshot --
+ * confirm against the users in irow.c.
+ */
+typedef struct ClusterBuffer {
+       uint8_t *buf;
+       uint8_t *read_from_father;
+} ClusterBuffer;
+
+/*
+ * Size of the IRowMeta header.
+ * NOTE(review): the name suggests this is the offset at which the snapshot
+ * records start in the meta file -- confirm in irow.c.
+ */
+#define IROW_SNAPSHOT_OFFSET (sizeof(IRowMeta))
+/*
+ * 16 * 1024 * 1024 (16 MiB).  The replacement list is parenthesized so the
+ * macro expands as a single value inside larger expressions such as
+ * "x / MAX_MERGE_BUFFER" or "x % MAX_MERGE_BUFFER".
+ */
+#define MAX_MERGE_BUFFER (16 * 1024 * 1024)
-- 
1.7.2.5
The iROW VM disk image consists of a meta file and several snapshots.

A snapshot consists of 2 files: a bitmap file (btmp file) and a VM disk data 
file (irvd file).
The current state of the iROW VM disk also occupies a snapshot.

The meta file consists of the meta header and the snapshots information. The 
meta header is used to
store basic information about the VM disk image. The snapshots information 
sequentially stores every snapshot’s name,
id and other related information.

The btmp file consists of a bitmap and the VM state data. The bitmap is used to 
indicate whether the
clusters exist in the corresponding irvd file. Each cluster in the VM disk image is 
mapped to a bit in the bitmap.

The irvd file is used to store the actual data of the VM disk image. The 
smallest unit of storage is the cluster.
iROW does not decide the address of the data clusters. It just writes the 
clusters to the same VM disk image
addresses as the virtual addresses of the clusters. Because the host machine’s 
file system supports sparse files,
the VM disk image size also grows gradually with the actual 
disk usage.




reply via email to

[Prev in Thread] Current Thread [Next in Thread]