Signed-off-by: Chunqiang Tang <address@hidden>
---
block/fvd-create.c | 475
+++++++++++++++++++++++++++++++++++++++++++++++++++
block/fvd-debug.c | 406 ++++++++++++++++++++++++++++++++++++++++++++
block/fvd-ext.h | 71 ++++++++
block/fvd.c | 127 ++++++++++++++
block/fvd.h | 481
++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 1560 insertions(+), 0 deletions(-)
create mode 100644 block/fvd-create.c
create mode 100644 block/fvd-debug.c
create mode 100644 block/fvd-ext.h
create mode 100644 block/fvd.c
create mode 100644 block/fvd.h
diff --git a/block/fvd-create.c b/block/fvd-create.c
new file mode 100644
index 0000000..b978ecb
--- /dev/null
+++ b/block/fvd-create.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ * Chunqiang Tang<address@hidden>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ * A short description: this module implements bdrv_create() for FVD.
+
*============================================================================*/
+
+static inline int64_t calc_min_journal_size (int64_t table_entries);
+static inline int search_holes(const char *filename, size_t
bitmap_size,
+ int32_t bitmap_start_offset, BlockDriverState * bs,
+ int64_t nb_sectors, int32_t hole_size, int32_t
block_size);
+
+static int fvd_create (const char *filename, QEMUOptionParameter *
options)
+{
+ int fd, ret;
+ FvdHeader *header;
+ int64_t virtual_disk_size = DEF_PAGE_SIZE;
+ int32_t header_size;
+ const char *base_img = NULL;
+ const char *base_img_fmt = NULL;
+ const char *data_file = NULL;
+ const char *data_file_fmt = NULL;
+ int32_t hole_size = 0;
+ int copy_on_read = FALSE;
+ int prefetch_start_delay = -1;
+ int64_t prefetch_profile_size = 0;
+ BlockDriverState *bs = NULL;
+ int bitmap_size = 0;
+ int64_t base_img_size = 0;
+ int64_t table_size = 0;
+ int64_t journal_size = 0;
+ int32_t block_size = 0;
+
+ header_size = sizeof (FvdHeader);
+ header_size = ROUND_UP (header_size, DEF_PAGE_SIZE);
+ header = my_qemu_mallocz (header_size);
+
+ /* Read out options */
+ while (options&& options->name) {
+ if (!strcmp (options->name, BLOCK_OPT_SIZE)) {
+ virtual_disk_size = options->value.n;
+ } else if (!strcmp (options->name,"prefetch_start_delay")) {
+ if (options->value.n<= 0) {
+ prefetch_start_delay = -1;
+ } else {
+ prefetch_start_delay = options->value.n;
+ }
+ } else if (!strcmp (options->name, BLOCK_OPT_BACKING_FILE)) {
+ base_img = options->value.s;
+ } else if (!strcmp (options->name, BLOCK_OPT_BACKING_FMT)) {
+ base_img_fmt = options->value.s;
+ } else if (!strcmp (options->name, "copy_on_read")) {
+ copy_on_read = options->value.n;
+ } else if (!strcmp (options->name, "data_file")) {
+ data_file = options->value.s;
+ } else if (!strcmp (options->name, "data_file_fmt")) {
+ data_file_fmt = options->value.s;
+ } else if (!strcmp (options->name, "detect_sparse_hole")) {
+ hole_size = options->value.n;
+ } else if (!strcmp (options->name, "compact_image")) {
+ header->compact_image = options->value.n;
+ } else if (!strcmp (options->name, "block_size")) {
+ block_size = options->value.n;
+ } else if (!strcmp (options->name, "chunk_size")) {
+ header->chunk_size = options->value.n;
+ } else if (!strcmp (options->name, "journal_size")) {
+ journal_size = options->value.n;
+ } else if (!strcmp (options->name, "storage_grow_unit")) {
+ header->storage_grow_unit = options->value.n;
+ } else if (!strcmp (options->name, "add_storage_cmd")
+&& options->value.s) {
+ pstrcpy (header->add_storage_cmd, sizeof
(header->add_storage_cmd),
+ options->value.s);
+ }
+ options++;
+ }
+
+ virtual_disk_size = ROUND_UP (virtual_disk_size, 512);
+
+ /* Check if arguments are valid. */
+ if (base_img&& strlen (base_img)> 1023) {
+ fprintf (stderr, "The base image name is longer than 1023
characters, "
+ "which is not allowed.\n");
+ return -EINVAL;
+ }
+
+ if (base_img&& hole_size> 0) {
+ if (header->compact_image) {
+ fprintf (stderr, "compact_image and detect_sparse_hole
cannot be "
+ "enabled together. Please disable
detect_sparse_hole. \n");
+ return -EINVAL;
+ }
+ header->need_zero_init = TRUE;
+ } else {
+ header->need_zero_init = FALSE;
+ }
+
+ if (data_file) {
+ pstrcpy (header->data_file, 1024, data_file);
+ if (data_file_fmt) {
+ pstrcpy (header->data_file_fmt, 16, data_file_fmt);
+ }
+ }
+
+ header->magic = FVD_MAGIC;
+ header->version = FVD_VERSION;
+ header->virtual_disk_size = virtual_disk_size;
+ header->clean_shutdown = TRUE;
+
+ if (!base_img) {
+ header->all_data_in_fvd_img = TRUE;
+ } else {
+ int ret;
+
+ bs = bdrv_new ("");
+ if (!bs) {
+ fprintf (stderr, "Failed to create a new block driver\n");
+ return -1;
+ }
+
+ pstrcpy (header->base_img, 1024, base_img);
+ if (base_img_fmt) {
+ pstrcpy (header->base_img_fmt, 16, base_img_fmt);
+ BlockDriver *drv = bdrv_find_format (base_img_fmt);
+ if (!drv) {
+ fprintf (stderr, "Failed to find driver for format
'%s'\n",
+ base_img_fmt);
+ return -1;
+ }
+ ret = bdrv_open (bs, header->data_file, 0, drv);
+ } else {
+ ret = bdrv_open (bs, base_img, 0, NULL);
+ }
+
+ if (ret< 0) {
+ fprintf (stderr, "Failed to open the base image %s\n",
base_img);
+ return -1;
+ }
+
+ base_img_size = bdrv_getlength (bs);
+ base_img_size = MIN (virtual_disk_size, base_img_size);
+ base_img_size = ROUND_UP (base_img_size, 512);
+
+ if (block_size<= 0) {
+ /* No block size is provided. Find the smallest block
size that
+ * does not make the bitmap too big. */
+ block_size = 512;
+ while (1) {
+ int64_t blocks = (base_img_size + block_size - 1) /
block_size;
+ bitmap_size = (blocks + 7) / 8;
+ if (bitmap_size<= MODERATE_BITMAP_SIZE) {
+ break;
+ }
+ block_size *= 2;
+ }
+ } else {
+ block_size = ROUND_UP (block_size, 512);
+ int64_t blocks = (base_img_size + block_size - 1) /
block_size;
+ bitmap_size = (blocks + 7) / 8;
+ }
+
+ bitmap_size = ROUND_UP (bitmap_size, DEF_PAGE_SIZE);
+ header->bitmap_size = bitmap_size;
+ header->block_size = block_size;
+ header->bitmap_offset = header_size;
+
+ prefetch_profile_size = header->prefetch_profile_entries *
+ sizeof (PrefetchProfileEntry);
+ prefetch_profile_size = ROUND_UP (prefetch_profile_size,
DEF_PAGE_SIZE);
+ header->base_img_size = base_img_size;
+ header->max_outstanding_copy_on_read_data =
+ MAX_OUTSTANDING_COPY_ON_READ_DATA;
+ header->copy_on_read = copy_on_read;
+ header->prefetch_start_delay =
+ prefetch_start_delay;
+ header->num_prefetch_slots = NUM_PREFETCH_SLOTS;
+ header->bytes_per_prefetch = ROUND_UP (BYTES_PER_PREFETCH,
block_size);
+ header->prefetch_throttle_time = PREFETCH_THROTTLING_TIME;
+ header->prefetch_read_throughput_measure_time =
+ PREFETCH_MIN_MEASURE_READ_TIME;
+ header->prefetch_write_throughput_measure_time =
+ PREFETCH_MIN_MEASURE_WRITE_TIME;
+ header->prefetch_perf_calc_alpha = PREFETCH_PERF_CALC_ALPHA;
+ header->prefetch_min_read_throughput =
PREFETCH_MIN_READ_THROUGHPUT;
+ header->prefetch_min_write_throughput =
PREFETCH_MIN_WRITE_THROUGHPUT;
+ header->prefetch_max_read_throughput =
PREFETCH_MAX_READ_THROUGHPUT;
+ header->prefetch_max_write_throughput =
PREFETCH_MAX_WRITE_THROUGHPUT;
+ header->all_data_in_fvd_img = FALSE;
+ header->unit_of_PrefetchProfileEntry_len = DEF_PAGE_SIZE;
+ header->generate_prefetch_profile = FALSE; /* To be
implemented. */
+ header->profile_directed_prefetch_start_delay = -1;/*To be
implemented*/
+ }
+
+ /* Set the table size. */
+ if (header->compact_image) {
+ if (header->chunk_size<= 0) {
+ header->chunk_size = CHUNK_SIZE;
+ }
+ header->chunk_size = ROUND_UP (header->chunk_size,
DEF_PAGE_SIZE);
+ if (header->storage_grow_unit<= 0) {
+ header->storage_grow_unit = STORAGE_GROW_UNIT;
+ }
+ if (header->storage_grow_unit< header->chunk_size) {
+ header->storage_grow_unit = header->chunk_size;
+ }
+ int64_t table_entries =
+ (virtual_disk_size + header->chunk_size - 1) /
header->chunk_size;
+ table_size = sizeof (uint32_t) * table_entries;
+ table_size = ROUND_UP (table_size, DEF_PAGE_SIZE);
+ header->table_offset = header_size + bitmap_size;
+ }
+
+ /* Set the journal size. */
+ if (bitmap_size<= 0&& table_size<= 0) {
+ header->journal_size = 0; /* No need to use journal. */
+ } else if (journal_size< 0) {
+ /* Disable the use of journal, which reduces overhead but
may cause
+ * data corruption if the host crashes. This is a valid
configuration
+ * for some use cases, where data integrity is not
critical. */
+ header->journal_size = 0;
+ } else {
+ if (journal_size == 0) {
+ /* No journal size is specified. Use a default size. */
+ journal_size = JOURNAL_SIZE;
+ }
+ if (table_size> 0) {
+ /* Make sure that the journal is at least large enough
to record
+ * all table changes in one shot, which is the extremely
unlikely
+ * worst case. */
+ int64_t vsize = virtual_disk_size + header->chunk_size - 1;
+ int64_t table_entries = vsize / header->chunk_size;
+ int64_t min_journal_size = calc_min_journal_size
(table_entries);
+ if (journal_size< min_journal_size) {
+ journal_size = min_journal_size;
+ }
+ }
+ journal_size = ROUND_UP (journal_size, DEF_PAGE_SIZE);
+ header->journal_size = journal_size;
+ header->journal_offset = header_size + bitmap_size +
table_size;
+ }
+
+ const int64_t metadata_size = header_size + bitmap_size +
table_size +
+ prefetch_profile_size + MAX (0,
journal_size);
+ header->metadata_size = metadata_size;
+
+ fd = open (filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
0644);
+ if (fd< 0) {
+ fprintf (stderr, "Failed to open %s\n", filename);
+ goto fail;
+ }
+ fvd_header_cpu_to_le (header);
+
+ if (qemu_write_full (fd, header, header_size) != header_size) {
+ fprintf (stderr, "Failed to write the header of %s\n",
filename);
+ goto fail;
+ }
+
+ /* Initialize the bitmap. */
+ if (bitmap_size> 0) {
+ uint8_t *bitmap = my_qemu_mallocz (bitmap_size);
+ ret = qemu_write_full (fd, bitmap, bitmap_size);
+ my_qemu_free (bitmap);
+ if (ret != bitmap_size) {
+ fprintf (stderr, "Failed to zero out the bitmap of
%s\n", filename);
+ goto fail;
+ }
+ }
+
+ /* Initialize the table. */
+ if (table_size> 0) {
+ /* Set all entries to EMPTY_TABLE (0xFFFFFFFF). */
+ uint8_t *empty_table = my_qemu_malloc (table_size);
+ memset (empty_table, 0xFF, table_size);
+ ret = qemu_write_full (fd, empty_table, table_size);
+ my_qemu_free (empty_table);
+ if (ret != table_size) {
+ fprintf (stderr, "Failed to write the table of %s\n.",
filename);
+ goto fail;
+ }
+ }
+
+ /* Initialize the journal. */
+ if (journal_size> 0) {
+ uint8_t *empty_journal = my_qemu_mallocz (journal_size);
+ ret = qemu_write_full (fd, empty_journal, journal_size);
+ my_qemu_free (empty_journal);
+ if (ret != journal_size) {
+ fprintf (stderr, "Failed to initialize the journal for
%s\n.",
+ filename);
+ goto fail;
+ }
+ }
+
+ close (fd);
+ ret = 0;
+
+ if (bs&& hole_size> 0) {
+ ret = search_holes (filename, (size_t) bitmap_size,
header_size, bs,
+ base_img_size / 512, hole_size,
block_size);
+ }
+
+ if (bs) {
+ bdrv_close (bs);
+ }
+ my_qemu_free (header);
+ return ret;
+
+ fail:
+ if (bs) {
+ bdrv_close (bs);
+ }
+ close (fd);
+ my_qemu_free (header);
+ return -1;
+}
+
+/* For the optimization called "free write to zero-filled blocks".
See Section
+ * 3.3.3 of the FVD-cow paper. */
+static inline int search_holes (const char *filename, size_t
bitmap_size,
+ int32_t bitmap_start_offset,
+ BlockDriverState * bs, int64_t
nb_sectors,
+ int32_t hole_size, int32_t block_size)
+{
+ const int fd = open (filename, O_RDWR | O_BINARY | O_LARGEFILE, 0);
+ if (fd< 0) {
+ fprintf (stderr, "Failed to open %s for read and write.\n",
filename);
+ return -1;
+ }
+
+ printf ("Searching zero-filled sectors in the base image. Please
wait...");
+ fflush (stdout);
+
+ uint8_t *bitmap =
+ (uint8_t *) mmap (NULL, bitmap_size, PROT_READ | PROT_WRITE,
MAP_SHARED,
+ fd, (off_t) bitmap_start_offset);
+ if (bitmap == MAP_FAILED) {
+ fprintf (stderr, "Failed to mmap() %s\n", filename);
+ close (fd);
+ return -1;
+ }
+
+ if (hole_size< block_size) {
+ hole_size = block_size;
+ }
+ hole_size = ROUND_UP (hole_size, block_size);
+ nb_sectors = ROUND_DOWN (nb_sectors, hole_size);
+ const int sectors_per_hole = hole_size / 512;
+ const int sectors_per_block = block_size / 512;
+ int num_int64_in_hole = hole_size / 8;
+ int64_t hole_count = 0;
+ int i, ret = 0;
+ int64_t sec = 0;
+ uint8_t *p = my_qemu_blockalign (bs, hole_size);
+
+ while (sec< nb_sectors) {
+ int64_t *q;
+
+ if (bdrv_read (bs, sec, p, sectors_per_hole)< 0) {
+ fprintf (stderr, "Error in reading the base image\n");
+ ret = -1;
+ goto done;
+ }
+
+ /* All zeros? */
+ q = (int64_t *) p;
+ for (i = 0; i< num_int64_in_hole; i++) {
+ if (*q != 0) {
+ break;
+ }
+ q++;
+ }
+
+ if (i< num_int64_in_hole) {
+ /* This is not a hole. */
+ sec += sectors_per_hole;
+ } else {
+ /* These sectors consist of only zeros. Set the flag to
+ * indicate that there is no need to read this sector
from the
+ * base image. See Section 3.3.3 of the FVD-cow paper
for the
+ * rationale. */
+ hole_count++;
+ int64_t end = sec + sectors_per_hole;
+ while (sec< end) {
+ int block_num = sec / sectors_per_block;
+ int64_t bitmap_byte_offset = block_num / 8;
+ uint8_t bitmap_bit_offset = block_num % 8;
+ int8_t mask = (uint8_t) (0x01<< bitmap_bit_offset);
+ uint8_t b = bitmap[bitmap_byte_offset];
+ if (!(b& mask)) {
+ b |= mask;
+ bitmap[bitmap_byte_offset] |= mask;
+ }
+ sec += sectors_per_block;
+ }
+ }
+ }
+
+ done:
+ printf ("\nFound %" PRId64
+ " zero-filled hole regions. Image creation done.\n",
hole_count);
+ my_qemu_vfree (p);
+ munmap (bitmap, bitmap_size);
+ close (fd);
+ return ret;
+}
+
+static QEMUOptionParameter fvd_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"},
+ {
+ .name = "compact_image",
+ .type = OPT_FLAG,
+ .help = "compact_image=on|off"},
+ {
+ .name = "block_size",
+ .type = OPT_SIZE,
+ .help = "Block size"},
+ {
+ .name = "chunk_size",
+ .type = OPT_SIZE,
+ .help = "Chunk size"},
+ {
+ .name = "storage_grow_unit",
+ .type = OPT_SIZE,
+ .help = "Storage grow unit"},
+ {
+ .name = "add_storage_cmd",
+ .type = OPT_STRING,
+ .help = "Command to add storage when FSI runs out of space"},
+ {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a backing image"},
+ {
+ .name = BLOCK_OPT_BACKING_FMT,
+ .type = OPT_STRING,
+ .help = "Image format of the backing image"},
+ {
+ .name = "data_file",
+ .type = OPT_STRING,
+ .help = "File name of a separate data file"},
+ {
+ .name = "data_file_fmt",
+ .type = OPT_STRING,
+ .help = "Image format of the separate data file"},
+ {
+ .name = "copy_on_read",
+ .type = OPT_FLAG,
+ .help = "copy_on_read=on|off"},
+ {
+ .name = "prefetch_start_delay",
+ .type = OPT_NUMBER,
+ .help = "Delay in seconds before starting whole image
prefetching. "
+ "Prefetching is disabled if the delay is not a positive
number."},
+ {
+ .name = "detect_sparse_hole",
+ .type = OPT_SIZE,
+ .help = "Minimum size (in bytes) of a continuous zero-filled
region to be "
+ "considered as a sparse file hole in the backing image
(setting it "
+ "to 0 turns off sparse file detection)"},
+ {
+ .name = "journal_size",
+ .type = OPT_SIZE,
+ .help = "Journal size"},
+ {NULL}
+};
diff --git a/block/fvd-debug.c b/block/fvd-debug.c
new file mode 100644
index 0000000..4cef5ec
--- /dev/null
+++ b/block/fvd-debug.c
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ * Chunqiang Tang<address@hidden>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ * A short description: this module implements debugging functions for
+ * the Fast Virtual Disk (FVD) format.
+
*============================================================================*/
+
+#ifndef ENABLE_TRACE_IO
+# define TRACE_REQUEST(...) do {} while (0)
+# define TRACE_STORE_IN_FVD(...) do {} while (0)
+
+#else
+/* Monitor IO on a specific sector that triggers bugs. */
+static inline void debug_sector (int64_t sector_num)
+{
+ if (FALSE) {
+ if (sector_num == ((int64_t) 1023990LL)) {
+ QPAUSE ("right sector");
+ }
+ }
+}
+
/* Log a read or write request and every sector it covers. */
static void TRACE_REQUEST (int do_write, int64_t sector_num, int nb_sectors)
{
    if (do_write) {
        QDEBUG ("TRACE_REQUEST: write sector_num=%" PRId64
                " nb_sectors=%d\n [ ", sector_num, nb_sectors);
    } else {
        QDEBUG ("TRACE_REQUEST: read sector_num=%" PRId64 " nb_sectors=%d\n"
                "[ ", sector_num, nb_sectors);
    }

    const int64_t end = sector_num + nb_sectors;
    int64_t s;
    for (s = sector_num; s < end; s++) {
        QDEBUG ("sec%" PRId64 " ", s);
        debug_sector (s);
    }
    QDEBUG (" ]\n");
}
+
/* Log a store into the FVD image and every sector it covers. */
static void TRACE_STORE_IN_FVD (const char *str, int64_t sector_num,
                                int nb_sectors)
{
    QDEBUG ("TRACE_STORE: %s sector_num=%" PRId64 " nb_sectors=%d\n [ ",
            str, sector_num, nb_sectors);

    const int64_t end = sector_num + nb_sectors;
    int64_t s;
    for (s = sector_num; s < end; s++) {
        QDEBUG ("sec%" PRId64 " ", s);
        debug_sector (s);
    }
    QDEBUG (" ]\n");
}
+#endif
+
+#ifndef FVD_DEBUG
+# define my_qemu_malloc qemu_malloc
+# define my_qemu_mallocz qemu_mallocz
+# define my_qemu_blockalign qemu_blockalign
+# define my_qemu_free qemu_free
+# define my_qemu_vfree qemu_vfree
+# define my_qemu_aio_get qemu_aio_get
+# define my_qemu_aio_release qemu_aio_release
+# define COPY_UUID(to,from) do {} while (0)
+
+#else
+FILE *__fvd_debug_fp;
+static unsigned long long int fvd_uuid = 1;
+static int64_t pending_qemu_malloc = 0;
+static int64_t pending_qemu_aio_get = 0;
+static int64_t pending_local_writes = 0;
+static const char *alloc_file;
+static int alloc_line;
+
+#define my_qemu_malloc(size) \
+ ((void*)(alloc_file=__FILE__, alloc_line=__LINE__,
_my_qemu_malloc(size)))
+
+#define my_qemu_mallocz(size) \
+ ((void*)(alloc_file=__FILE__, alloc_line=__LINE__,
_my_qemu_mallocz(size)))
+
+#define my_qemu_blockalign(bs,size) \
+ ((void*)(alloc_file=__FILE__, \
+ alloc_line=__LINE__, \
+ _my_qemu_blockalign(bs,size)))
+
+#define my_qemu_aio_get(pool,bs,cb,op) \
+ ((void*)(alloc_file=__FILE__, \
+ alloc_line=__LINE__, \
+ _my_qemu_aio_get(pool,bs,cb,op)))
+
+#define my_qemu_free(p) \
+ (alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_free(p))
+
+#define my_qemu_vfree(p) \
+ (alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_vfree(p))
+
+static void COPY_UUID (FvdAIOCB * to, FvdAIOCB * from)
+{
+ if (from) {
+ to->uuid = from->uuid;
+ FVD_DEBUG_ACB (to);
+ }
+}
+
+#ifdef DEBUG_MEMORY_LEAK
+# define MAX_TRACER 10485760
+static int alloc_tracer_used = 1; /* slot 0 is not used. */
+static void **alloc_tracers = NULL;
+
+static void __attribute__ ((constructor)) init_mem_alloc_tracers (void)
+{
+ if (!alloc_tracers) {
+ alloc_tracers = qemu_mallocz (sizeof (void *) * MAX_TRACER);
+ }
+}
+
+static void trace_alloc (void *p, size_t size)
+{
+ alloc_tracer_t *t = p;
+ t->magic = FVD_ALLOC_MAGIC;
+ t->alloc_file = alloc_file;
+ t->alloc_line = alloc_line;
+ t->size = size;
+
+ if (alloc_tracer_used< MAX_TRACER) {
+ t->alloc_tracer = alloc_tracer_used++;
+ alloc_tracers[t->alloc_tracer] = t;
+ QDEBUG ("Allocate memory using tracer%d in %s on line %d.\n",
+ t->alloc_tracer, alloc_file, alloc_line);
+ } else {
+ t->alloc_tracer = 0;
+ }
+
+ /* Set header and footer to detect out-of-range writes. */
+ if (size != (size_t) - 1) {
+ uint8_t *q = (uint8_t *) p;
+ uint64_t *header = (uint64_t *) (q + 512 - sizeof (uint64_t));
+ uint64_t *footer = (uint64_t *) (q + size - 512);
+ *header = FVD_ALLOC_MAGIC;
+ *footer = FVD_ALLOC_MAGIC;
+ }
+}
+
+static void trace_free (void *p)
+{
+ alloc_tracer_t *t = p;
+
+ QDEBUG ("Free memory with tracer%d in %s on line %d.\n",
+ t->alloc_tracer, alloc_file, alloc_line);
+ ASSERT (t->magic == FVD_ALLOC_MAGIC&& t->alloc_tracer>= 0);
+
+ /* Check header and footer to detect out-of-range writes. */
+ if (t->size != (size_t) - 1) {
+ uint8_t *q = (uint8_t *) p;
+ uint64_t *header = (uint64_t *) (q + 512 - sizeof (uint64_t));
+ uint64_t *footer = (uint64_t *) (q + t->size - 512);
+ ASSERT (*header == FVD_ALLOC_MAGIC);
+ ASSERT (*footer == FVD_ALLOC_MAGIC);
+ }
+
+ if (t->alloc_tracer) {
+ ASSERT (alloc_tracers[t->alloc_tracer] == t);
+ alloc_tracers[t->alloc_tracer] = NULL;
+ t->alloc_tracer = -INT_MAX;
+ } else {
+ t->alloc_tracer *= -1; /* Guard against double free. */
+ }
+}
+
+static void dump_alloc_tracers (void)
+{
+ int unfreed = 0;
+ int i;
+ for (i = 1; i< alloc_tracer_used; i++) {
+ if (!alloc_tracers[i]) {
+ continue;
+ }
+
+ unfreed++;
+ alloc_tracer_t *t = alloc_tracers[i];
+
+ if (t->size == (size_t) - 1) {
+ FvdAIOCB *acb = container_of (alloc_tracers[i],
FvdAIOCB, tracer);
+ ASSERT (acb->magic == FVDAIOCB_MAGIC);
+ QDEBUG ("Memory %p with tracer%d allocated in %s on line
%d "
+ "(FvdAIOCB acb%llu-%p) is not freed. magic %s\n",
+ alloc_tracers[i], i, t->alloc_file, t->alloc_line,
+ acb->uuid, acb,
+ t->magic == FVD_ALLOC_MAGIC ? "correct" : "wrong");
+ } else {
+ QDEBUG ("Memory %p with tracer%d allocated in %s on line
%d is "
+ "not freed. magic %s\n",
+ alloc_tracers[i], i, t->alloc_file, t->alloc_line,
+ t->magic == FVD_ALLOC_MAGIC ? "correct" : "wrong");
+
+ uint8_t *q = (uint8_t *) t;
+ uint64_t *header = (uint64_t *) (q + 512 - sizeof
(uint64_t));
+ uint64_t *footer = (uint64_t *) (q + t->size - 512);
+ ASSERT (*header == FVD_ALLOC_MAGIC);
+ ASSERT (*footer == FVD_ALLOC_MAGIC);
+ }
+ }
+
+ QDEBUG ("Unfreed memory allocations: %d\n", unfreed);
+}
+#endif
+
+static inline void *_my_qemu_aio_get (AIOPool * pool,
BlockDriverState * bs,
+ BlockDriverCompletionFunc * cb,
+ void *opaque)
+{
+ pending_qemu_aio_get++;
+ FvdAIOCB *acb = (FvdAIOCB *) qemu_aio_get (&fvd_aio_pool, bs,
cb, opaque);
+ acb->uuid = ++fvd_uuid;
+ acb->magic = FVDAIOCB_MAGIC;
+
+ FVD_DEBUG_ACB (acb);
+
+#ifdef DEBUG_MEMORY_LEAK
+ trace_alloc (&acb->tracer, -1);
+#endif
+
+ return acb;
+}
+
+static inline void my_qemu_aio_release (void *p)
+{
+ pending_qemu_aio_get--;
+ ASSERT (pending_qemu_aio_get>= 0);
+
+#ifdef DEBUG_MEMORY_LEAK
+ FvdAIOCB *acb = p;
+ trace_free (&acb->tracer);
+#endif
+
+ qemu_aio_release (p);
+}
+
+static inline void *_my_qemu_malloc (size_t size)
+{
+ ASSERT (size> 0);
+ pending_qemu_malloc++;
+#ifndef DEBUG_MEMORY_LEAK
+ return qemu_malloc (size);
+#else
+
+ size += 1024; /* 512 bytes header and 512 bytes footer. */
+ uint8_t *ret = qemu_malloc (size);
+ trace_alloc (ret, size);
+ return ret + 512;
+#endif
+}
+
+static inline void *_my_qemu_mallocz (size_t size)
+{
+ ASSERT (size> 0);
+ pending_qemu_malloc++;
+#ifndef DEBUG_MEMORY_LEAK
+ return qemu_mallocz (size);
+#else
+
+ size += 1024; /* 512 bytes header and 512 bytes footer. */
+ uint8_t *ret = qemu_mallocz (size);
+ trace_alloc (ret, size);
+ return ret + 512;
+#endif
+}
+
+static inline void *_my_qemu_blockalign (BlockDriverState * bs,
size_t size)
+{
+ ASSERT (size> 0);
+ pending_qemu_malloc++;
+
+#ifndef DEBUG_MEMORY_LEAK
+ return qemu_blockalign (bs, size);
+#else
+
+ size += 1024; /* 512 bytes header and 512 bytes footer. */
+ uint8_t *ret = qemu_blockalign (bs, size);
+ trace_alloc (ret, size);
+ return ret + 512;
+#endif
+}
+
+static inline void _my_qemu_free (void *ptr)
+{
+ pending_qemu_malloc--;
+ ASSERT (pending_qemu_malloc>= 0);
+#ifndef DEBUG_MEMORY_LEAK
+ qemu_free (ptr);
+#else
+
+ uint8_t *q = ((uint8_t *) ptr) - 512;
+ trace_free (q);
+ qemu_free (q);
+#endif
+}
+
+static inline void _my_qemu_vfree (void *ptr)
+{
+ pending_qemu_malloc--;
+ ASSERT (pending_qemu_malloc>= 0);
+#ifndef DEBUG_MEMORY_LEAK
+ qemu_vfree (ptr);
+#else
+
+ uint8_t *q = ((uint8_t *) ptr) - 512;
+ trace_free (q);
+ qemu_vfree (q);
+#endif
+}
+
+static void count_pending_requests (BDRVFvdState * s)
+{
+ int m = 0, k = 0;
+ FvdAIOCB *w;
+
+ QLIST_FOREACH (w,&s->copy_locks, copy_lock.next) {
+ m++;
+ QDEBUG ("copy_lock: acb%llu-%p\n", w->uuid, w);
+ }
+
+ QLIST_FOREACH (w,&s->write_locks, write.next_write_lock) {
+ k++;
+ QDEBUG ("write_lock: acb%llu-%p\n", w->uuid, w);
+ }
+
+ QDEBUG ("Debug_memory_leak: copy_locks=%d write_locks=%d\n", m,
k);
+}
+
+static void dump_resource_summary (BDRVFvdState * s)
+{
+#ifdef DEBUG_MEMORY_LEAK
+ dump_alloc_tracers ();
+#endif
+
+ QDEBUG ("Resource summary: outstanding_copy_on_read_data=%" PRId64
+ " total_copy_on_read_data=%" PRId64 "
total_prefetch_data=%" PRId64
+ " " " pending_qemu_malloc=%" PRId64 "
pending_qemu_aio_get=%" PRId64
+ " pending_local_writes=%" PRId64 "\n",
+ s->outstanding_copy_on_read_data,
s->total_copy_on_read_data,
+ s->total_prefetch_data, pending_qemu_malloc,
pending_qemu_aio_get,
+ pending_local_writes);
+ count_pending_requests (s);
+}
+
+/* Monitor processing a specific FvdAIOCB that triggers bugs. */
+void FVD_DEBUG_ACB (void *p)
+{
+ if (FALSE) {
+ FvdAIOCB *acb = p;
+
+ /* Is it FvdAIOCB? */
+ if (acb->magic != FVDAIOCB_MAGIC || acb->common.bs->drv
!=&bdrv_fvd) {
+ /* Is it CompactChildCB? */
+ CompactChildCB *child = p;
+ acb = child->acb;
+ if (acb->magic != FVDAIOCB_MAGIC
+ || acb->common.bs->drv !=&bdrv_fvd
+ || (acb->type != OP_LOAD_COMPACT
+&& acb->type != OP_STORE_COMPACT)) {
+ return;
+ }
+ }
+
+ if (acb->uuid == 20ULL) {
+ QPAUSE ("Processing the right acb");
+ }
+ }
+}
+
+void init_fvd_debug_fp (void)
+{
+ char buf[256];
+ sprintf (buf, "/tmp/fvd.log-%d", getpid ());
+ if ((__fvd_debug_fp = fopen (buf, "wt")) == NULL) {
+ __fvd_debug_fp = stdout;
+ }
+}
+#endif
+
+void fvd_check_memory_usage (void)
+{
+ ASSERT (pending_qemu_malloc == 0);
+}
+
+int fvd_get_copy_on_read (BlockDriverState * bs)
+{
+ BDRVFvdState *s = bs->opaque;
+ return s->copy_on_read;
+}
+
+void fvd_set_copy_on_read (BlockDriverState * bs, int copy_on_read)
+{
+ BDRVFvdState *s = bs->opaque;
+ s->copy_on_read = copy_on_read;
+}
diff --git a/block/fvd-ext.h b/block/fvd-ext.h
new file mode 100644
index 0000000..6839e25
--- /dev/null
+++ b/block/fvd-ext.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ * Chunqiang Tang<address@hidden>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ * A short description: this header file contains functions of the
FVD block
+ * device driver that are used by other external modules. These
+ * functions are mainly for testing and debugging purposes.
+
*============================================================================*/
+
+#ifndef __fvd_debug_h__
+#define __fvd_debug_h__
+
+//#define FVD_DEBUG
+
+int fvd_get_copy_on_read (BlockDriverState *bs);
+void fvd_set_copy_on_read (BlockDriverState *bs, int copy_on_read);
+void fvd_check_memory_usage (void);
+void fvd_init_prefetch(void * bs);
+void fvd_enable_host_crash_test (void);
+
+#ifndef TRUE
+# define TRUE 1
+#endif
+#ifndef FALSE
+# define FALSE 0
+#endif
+
+#ifndef FVD_DEBUG
+# define QDEBUG(format,...) do {} while (0)
+# define ASSERT(x) do {} while (0)
+# define FVD_DEBUG_ACB(...) do {} while (0)
+# define QPAUSE(...) do {} while (0)
+
+#else
+
+extern FILE *__fvd_debug_fp;
+void init_fvd_debug_fp (void);
+void FVD_DEBUG_ACB (void *p);
+# define QDEBUG(format,...) \
+ do { \
+ if (__fvd_debug_fp==NULL) init_fvd_debug_fp(); \
+ fprintf (__fvd_debug_fp, format, ##__VA_ARGS__); \
+ fflush(__fvd_debug_fp); \
+ } while(0)
+
/* Abort-style assertion that pauses the process for a debugger instead of
 * exiting immediately.  BUG FIX: the original had a stray trailing '\' after
 * "while (0)", which spliced the following source line into the macro
 * definition. */
# define ASSERT(x) \
    do { \
        if (!(x)) { \
            fprintf (stderr, "Assertion failed in process %d at %s:%d. " \
                     "Waiting for debugging...\n", getpid(), __FILE__, \
                     __LINE__); \
            fgetc (stdin); exit (1); \
        } \
    } while (0)
+
+# define QPAUSE(format,...) \
+ do { \
+ printf (format, ##__VA_ARGS__); \
+ printf (" Pause process %d for debugging...\n", getpid()); \
+ fgetc (stdin); \
+ } while (0)
+
+#endif
+
+#endif
diff --git a/block/fvd.c b/block/fvd.c
new file mode 100644
index 0000000..311ff58
--- /dev/null
+++ b/block/fvd.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ * Chunqiang Tang<address@hidden>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ * A short description: this module implements the QEMU block
device driver
+ * for the Fast Virtual Disk (FVD) format. See the following
companion
+ * papers for a detailed description of FVD:
+ * 1. The so-called "FVD-cow paper":
+ * "FVD: a High-Performance Virtual Machine Image Format
for Cloud",
+ * by Chunqiang Tang, 2010.
+ * 2. The so-called "FVD-compact paper":
+ * "FVD: a High-Performance Virtual Machine Image Format
for Cloud
+ * with Sparse Image Capability", by Chunqiang Tang, 2010.
+
*============================================================================*/
+
+#include "block/fvd.h"
+
+//#define ENABLE_TRACE_IO
+//#define DEBUG_MEMORY_LEAK
+//#define SIMULATED_TEST_WITH_QEMU_IO
+
+#ifndef FVD_DEBUG
+#undef DEBUG_MEMORY_LEAK
+#undef ENABLE_TRACE_IO
+#undef SIMULATED_TEST_WITH_QEMU_IO
+#endif
+
+/* Use include to avoid exposing too many FVD symbols, and to allow
inline
+ * function optimization. */
+#include "block/fvd-utils.c"
+#include "block/fvd-debug.c"
+#include "block/fvd-misc.c"
+#include "block/fvd-create.c"
+#include "block/fvd-open.c"
+#include "block/fvd-read.c"
+#include "block/fvd-write.c"
+#include "block/fvd-load.c"
+#include "block/fvd-store.c"
+#include "block/fvd-journal.c"
+#include "block/fvd-prefetch.c"
+
+static AIOPool fvd_aio_pool = {
+ .aiocb_size = sizeof (FvdAIOCB),
+ .cancel = fvd_aio_cancel,
+};
+
+static BlockDriver bdrv_fvd = {
+ .format_name = "fvd",
+ .instance_size = sizeof (BDRVFvdState),
+ .bdrv_create = fvd_create,
+ .bdrv_probe = fvd_probe,
+ .bdrv_file_open = fvd_open,
+ .bdrv_close = fvd_close,
+ .bdrv_is_allocated = fvd_is_allocated,
+ .bdrv_flush = fvd_flush,
+ .bdrv_aio_readv = fvd_aio_readv,
+ .bdrv_aio_writev = fvd_aio_writev,
+ .bdrv_aio_flush = fvd_aio_flush,
+ .create_options = fvd_create_options,
+ .bdrv_get_info = fvd_get_info,
+ .bdrv_update = fvd_update,
+ .bdrv_has_zero_init = fvd_has_zero_init
+};
+
+static void bdrv_fvd_init (void)
+{
+ bdrv_register (&bdrv_fvd);
+}
+
+block_init (bdrv_fvd_init);
+
+/*
+ * Since bdrv_close may not be properly invoked on a VM shutdown, we
+ * use a destructor to flush metadata to disk. This only affects
+ * performance and does not affect correctness.
+ * See Section 3.3.4 of the FVD-cow paper for the rationale.
+ */
+extern QTAILQ_HEAD (, BlockDriverState) bdrv_states;
+static void __attribute__ ((destructor)) flush_fvd_bitmap_to_disk
(void)
+{
+ BlockDriverState *bs;
+ QTAILQ_FOREACH (bs,&bdrv_states, list) {
+ if (bs->drv ==&bdrv_fvd) {
+ flush_metadata_to_disk_on_exit (bs);
+
+#ifdef FVD_DEBUG
+ dump_resource_summary (bs->opaque);
+#endif
+ }
+ }
+}
+
+/*
+ * TODOs: Below are some potential enhancements for future development:
+ * 1. Handle storage leak on failure.
+ *
+ * 2. Profile-directed prefetch. See Section 3.4.1 of the FVD-cow
paper.
+ * Related metadata are FvdHeader.prefetch_profile_offset and
+ * FvdHeader.prefetch_profile_entries,
+ * FvdHeader.profile_directed_prefetch_start_delay,
+ * FvdHeader.generate_prefetch_profile.
+ *
+ * 3. Cap the prefetch throughput at the upper limit. See Section
3.4.2 of
+ * the FVD-cow paper. Related metadata are
+ * FvdHeader.prefetch_max_read_throughput and
+ * FvdHeader.prefetch_max_write_throughput.
+ *
+ * 4. Support write through to the base image. When a VM issues a write
+ * request, in addition to saving the data in the FVD data file,
also save the
+ * data in the base image if the address of write request is not
beyond the
+ * size of the base image (this of course requires the base image
NOT to be
+ * 'read_only'. This feature changes the semantics of copy-on-write,
but it
+ * suits a different use case, where the base image is stored on a
remote
+ * storage server, and the FVD image is stored on a local disk and
acts as a
+ * write-through cache of the base image. This can be used to cache and
+ * improve the performance of persistent storage on network-attached
storage,
+ * e.g., Amazon EBS. This feature is not described in the FVD-cow
paper as it
+ * would complicate the discussion. Related metadata are
+ * FvdHeader.write_updates_base_img.
+ */
diff --git a/block/fvd.h b/block/fvd.h
new file mode 100644
index 0000000..cce8cc8
--- /dev/null
+++ b/block/fvd.h
@@ -0,0 +1,481 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ * Chunqiang Tang<address@hidden>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ * A short description: this is the header of the FVD block device
driver.
+
*============================================================================*/
+
+#include<sys/vfs.h>
+#include<sys/mman.h>
+#include<pthread.h>
+#include<execinfo.h>
+#include<stdlib.h>
+#include<sys/ioctl.h>
+#include<stdint.h>
+#include<stdio.h>
+#include<inttypes.h>
+#include "block_int.h"
+#include "osdep.h"
+#include "qemu-option.h"
+#include "qemu-timer.h"
+#include "block.h"
+#include "qemu-queue.h"
+#include "qemu-common.h"
+#include "block/blksim.h"
+#include "block/fvd-ext.h"
+
/* Image magic: bytes 'Q', 'C', 0xF5, 0xA9 packed into one 32-bit value. */
#define FVD_MAGIC (('Q' << 24) | ('C' << 16) | (0xF5 << 8) | 0xA9)
#define FVD_VERSION 1
+
/* Profile-directed prefetch. (to be implemented). */
typedef struct __attribute__ ((__packed__)) PrefetchProfileEntry {
    int64_t offset;             /* in bytes */

    /* In the unit of FvdHeader.unit_of_PrefetchProfileEntry_len, i.e.,
     * len_in_bytes = len * FvdHeader.unit_of_PrefetchProfileEntry_len. */
    uint32_t len;
} PrefetchProfileEntry;
+
+/*
+ * The FVD format consists of:
+ * + Header fields of FvdHeader.
+ * + Bitmap, starting on a 4KB page boundary at a location
specified by
+ * FvdHeader.bitmap_offset.
+ * + Table, starting on a 4KB page boundary at a location
specified by
+ * FvdHeader.table_offset.
+ * + Journal, starting on a 4KB page boundary at a location
specified by
+ * FvdHeader.journal_offset.
+ * + Prefetch profile entries, starting on a 4KB page boundary at
a location
+ * specified by FvdHeader.prefetch_profile_offset. (to be
implemented)
+ * + Virtual disk data, starting on a 4KB page boundary.
Optionally, disk
+ * data can be stored in a separate data file specified by
+ * FvdHeader.data_file.
+ */
/*
 * On-disk header of an FVD image.  The struct is packed and uses only
 * fixed-width fields, so field order defines the on-disk format and must
 * never change.
 */
typedef struct __attribute__ ((__packed__)) FvdHeader {
    uint32_t magic;
    uint32_t version;

    /* Set to TRUE after whole-image prefetching finishes. */
    int32_t all_data_in_fvd_img;

    int64_t virtual_disk_size;  /* in bytes. Disk size perceived by the VM. */
    int64_t metadata_size;      /* in bytes. */
    char base_img[1024];
    char base_img_fmt[16];
    int64_t base_img_size;      /* in bytes. */
    int64_t bitmap_offset;      /* in bytes. Aligned on DEF_PAGE_SIZE. */
    int64_t bitmap_size;        /* in bytes. Rounded up to DEF_PAGE_SIZE. */
    int32_t block_size;         /* in bytes. */
    int32_t copy_on_read;       /* TRUE or FALSE */
    int64_t max_outstanding_copy_on_read_data;  /* in bytes. */

    /* If (data_file[0] == 0), the FVD metadata and data are stored in one
     * file. */
    char data_file[1024];
    char data_file_fmt[16];

    /******** Begin: for prefetching. ********************************/
    /* in seconds. -1 means disable whole image prefetching. */
    int32_t prefetch_start_delay;

    /* in bytes. Aligned on DEF_PAGE_SIZE. (to be implemented) */
    int64_t prefetch_profile_offset;

    /* Number of PrefetchProfileEntry. (to be implemented) */
    int64_t prefetch_profile_entries;

    int32_t num_prefetch_slots; /* Max number of outstanding prefetch
                                 * writes. */
    int32_t bytes_per_prefetch; /* For whole image prefetching. */
    int32_t prefetch_read_throughput_measure_time;   /* in milliseconds. */
    int32_t prefetch_write_throughput_measure_time;  /* in milliseconds. */

    /* Controls the calculation of the moving average of throughput.  Must
     * be a value in [0,100]:
     *   actual_normalized_alpha = prefetch_perf_calc_alpha / 100.0 */
    int32_t prefetch_perf_calc_alpha;

    int32_t prefetch_min_read_throughput;   /* in KB/second. */
    int32_t prefetch_min_write_throughput;  /* in KB/second. */
    int32_t prefetch_max_read_throughput;   /* in KB/second. */
    int32_t prefetch_max_write_throughput;  /* in KB/second. */

    /* in milliseconds. When prefetch read/write throughput is low, prefetch
     * pauses for a random time uniformly distributed in
     * [0, prefetch_throttle_time]. */
    int32_t prefetch_throttle_time;
    /******** End: for prefetching. **********************************/

    /******** Begin: for compact image. ******************************/
    int32_t compact_image;      /* TRUE or FALSE */
    int64_t table_offset;       /* in bytes. */
    int64_t chunk_size;         /* in bytes. */
    int64_t storage_grow_unit;  /* in bytes. */
    char add_storage_cmd[2048];
    /******** End: for compact image. ********************************/

    /******** Begin: for journal. ************************************/
    int64_t journal_offset;     /* in bytes. */
    int64_t journal_size;       /* in bytes. */
    int32_t clean_shutdown;     /* TRUE if VM's last shutdown was graceful. */
    /******** End: for journal. **************************************/

    /*
     * TRUE if the image mandates that the storage layer
     * (BDRVFvdState.fvd_data) must return TRUE for bdrv_has_zero_init().
     * This is the case if the optimization described in Section 3.3.3 of
     * the FVD-cow paper is enabled (see function search_holes()).  If
     * 'qemu-img create' sets need_zero_init to TRUE, 'qemu-img update' can
     * be used to manually reset it to FALSE, if the user always manually
     * pre-fills the storage (e.g., a raw partition) with zeros.  If the
     * image is stored on a file system, it already supports zero_init, and
     * hence there is no need to manually manipulate this field.
     */
    int32_t need_zero_init;

    /* If TRUE, FVD dumps a prefetch profile after the VM shuts down.
     * (to be implemented) */
    int32_t generate_prefetch_profile;

    /* See the comment on PrefetchProfileEntry.len. (to be implemented) */
    int32_t unit_of_PrefetchProfileEntry_len;

    /* in seconds. -1 means disable profile-directed prefetching.
     * (to be implemented) */
    int32_t profile_directed_prefetch_start_delay;

    /* Possible values are "no", "writethrough", "writeback", or
     * "writenocache". (to be implemented) */
    char write_updates_base_img[16];
} FvdHeader;
+
+typedef struct BDRVFvdState {
+ BlockDriverState *fvd_metadata;
+ BlockDriverState *fvd_data;
+ int64_t virtual_disk_size; /*in bytes. */
+ int64_t bitmap_offset; /* in sectors */
+ int64_t bitmap_size; /* in bytes. */
+ int64_t data_offset; /* in sectors. Begin of real data. */
+ int64_t nb_sectors_in_base_img;
+ int32_t block_size; /* in sectors. */
+ int copy_on_read; /* TRUE or FALSE */
+ int64_t max_outstanding_copy_on_read_data; /* in bytes. */
+ int64_t outstanding_copy_on_read_data; /* in bytes. */
+ int data_region_prepared; /* TRUE or FALSE */
+ QLIST_HEAD(WriteLocks, FvdAIOCB) write_locks; /* All writes. */
+ QLIST_HEAD(CopyLocks, FvdAIOCB) copy_locks; /* copy-on-read and
CoW. */
+
+ /* Keep two copies of bitmap to reduce the overhead of updating the
+ * on-disk bitmap, i.e., copy-on-read and prefetching do not
update the
+ * on-disk bitmap. See Section 3.3.4 of the FVD-cow paper. */
+ uint8_t *fresh_bitmap;
+ uint8_t *stale_bitmap;
+
+ /******** Begin: for prefetching.
***********************************/
+ struct FvdAIOCB **prefetch_acb;
+ int prefetch_state; /* PREFETCH_STATE_RUNNING, FINISHED, or
DISABLED. */
+ int prefetch_error; /* TRUE or FALSE */
+ int num_prefetch_slots;
+ int num_filled_prefetch_slots;
+ int next_prefetch_read_slot;
+ int prefetch_read_active; /* TRUE or
FALSE */
+ int pause_prefetch_requested; /* TRUE or FALSE */
+ int prefetch_start_delay; /* in seconds */
+ int64_t unclaimed_prefetch_region_start;
+ int64_t prefetch_read_time; /* in
milliseconds. */
+ int64_t prefetch_write_time; /* in milliseconds. */
+ int64_t prefetch_data_read; /* in bytes. */
+ int64_t prefetch_data_written; /* in bytes. */
+ double prefetch_read_throughput; /* in
bytes/millisecond. */
+ double prefetch_write_throughput; /* in
bytes/millisecond. */
+ double prefetch_min_read_throughput; /* in
bytes/millisecond. */
+ double prefetch_min_write_throughput; /* in
bytes/millisecond. */
+ int64_t prefetch_read_throughput_measure_time; /* in
millisecond. */
+ int64_t prefetch_write_throughput_measure_time; /* in
millisecond.*/
+ int prefetch_throttle_time; /* in millisecond. */
+ int sectors_per_prefetch;
+ QEMUTimer *prefetch_timer;
+ /* prefetch_perf_calc_alpha =
FvdHeader.prefetch_perf_calc_alpha/100.0 */
+ double prefetch_perf_calc_alpha;
+ /******** End: for prefetching.
***********************************/
+
+ /******** Begin: for compact image.
*************************************/
+ uint32_t *table; /* Mapping table stored in memory in
little endian. */
+ int64_t data_storage; /* in sectors. */
+ int64_t used_storage; /* in sectors. */
+ int64_t chunk_size; /* in sectors. */
+ int64_t storage_grow_unit; /* in sectors. */
+ int64_t table_offset; /* in sectors. */
+ char *add_storage_cmd;
+ /******** Begin: for compact image.
*************************************/
+
+ /******** Begin: for journal.
*******************************************/
+ int64_t journal_offset; /* in sectors. */
+ int64_t journal_size; /* in sectors. */
+ int64_t next_journal_sector; /* in sector. */
+ int ongoing_journal_updates; /* Number of ongoing journal
updates. */
+ int dirty_image; /* TRUE or FALSE. */
+
+ /* Requests waiting for metadata flush and journal recycle to
finish. */
+ QLIST_HEAD(JournalFlush, FvdAIOCB) wait_for_journal;
+ /******** End: for journal.
********************************************/
+
+#ifdef FVD_DEBUG
+ int64_t total_copy_on_read_data; /* in bytes. */
+ int64_t total_prefetch_data; /* in bytes. */
+#endif
+} BDRVFvdState;
+
+/* Begin of data type definitions. */
+struct FvdAIOCB;
+
+typedef struct JournalCB {
+ BlockDriverAIOCB *hd_acb;
+ QEMUIOVector qiov;
+ struct iovec iov;
+ QLIST_ENTRY(FvdAIOCB) next_wait_for_journal;
+} JournalCB;
+
+/* CopyLock is used by AIOWriteCB and AIOCopyCB. */
+typedef struct CopyLock {
+ QLIST_ENTRY(FvdAIOCB) next;
+ int64_t begin;
+ int64_t end;
+ QLIST_HEAD(DependentWritesHead, FvdAIOCB) dependent_writes;
+} CopyLock;
+
+typedef struct ChildAIOReadCB {
+ BlockDriverAIOCB *hd_acb;
+ struct iovec iov;
+ QEMUIOVector qiov;
+ int64_t sector_num;
+ int nb_sectors;
+ int done;
+} ChildAIOReadCB;
+
+typedef struct AIOReadCB {
+ QEMUIOVector *qiov;
+ int ret;
+ ChildAIOReadCB read_backing;
+ ChildAIOReadCB read_fvd;
+} AIOReadCB;
+
+/* For copy-on-read and prefetching. */
+typedef struct AIOCopyCB {
+ BlockDriverAIOCB *hd_acb;
+ struct iovec iov;
+ QEMUIOVector qiov;
+ uint8_t *buf;
+ int64_t buffered_sector_begin;
+ int64_t buffered_sector_end;
+ int64_t last_prefetch_op_start_time; /* For prefetch
only. */
+} AIOCopyCB;
+
+typedef struct AIOWriteCB {
+ BlockDriverAIOCB *hd_acb;
+ QEMUIOVector *qiov;
+ uint8_t *cow_buf;
+ QEMUIOVector *cow_qiov;
+ int64_t cow_start_sector;
+ int update_table; /* TRUE or FALSE. */
+ int ret;
+ QLIST_ENTRY(FvdAIOCB) next_write_lock; /* See
BDRVFvdState.write_locks */
+
+ /* See FvdAIOCB.write.dependent_writes. */
+ QLIST_ENTRY(FvdAIOCB) next_dependent_write;
+} AIOWriteCB;
+
+/* For AIOStoreCompactCB and AIOLoadCompactCB. */
+typedef struct CompactChildCB {
+ struct FvdAIOCB *acb;
+ BlockDriverAIOCB *hd_acb;
+} CompactChildCB;
+
+/* For storing data to a compact image. */
+typedef struct AIOStoreCompactCB {
+ CompactChildCB one_child;
+ CompactChildCB *children;
+ int update_table;
+ int num_children;
+ int finished_children;
+ struct FvdAIOCB *parent_acb;
+ int ret;
+ int soft_write; /*TRUE if the store is caused by copy-on-read or
prefetch.*/
+ QEMUIOVector *orig_qiov;
+} AIOStoreCompactCB;
+
+/* For loading data from a compact image. */
+typedef struct AIOLoadCompactCB {
+ CompactChildCB *children;
+ CompactChildCB one_child;
+ int num_children;
+ int finished_children;
+ struct FvdAIOCB *parent_acb;
+ int ret;
+ QEMUIOVector *orig_qiov;
+} AIOLoadCompactCB;
+
+typedef struct AIOFlushCB {
+ BlockDriverAIOCB *data_acb;
+ BlockDriverAIOCB *metadata_acb;
+ int num_finished;
+ int ret;
+} AIOFlushCB;
+
+typedef struct AIOWrapperCB {
+ QEMUBH *bh;
+} AIOWrapperCB;
+
/* Request type recorded in FvdAIOCB.type. */
typedef enum {
    OP_READ = 1,
    OP_WRITE,
    OP_COPY,
    OP_STORE_COMPACT,
    OP_LOAD_COMPACT,
    OP_WRAPPER,
    OP_FLUSH
} op_type;

#ifdef FVD_DEBUG
/* For debugging memory leaks. */
typedef struct alloc_tracer_t {
    int64_t magic;
    int alloc_tracer;
    const char *alloc_file;
    int alloc_line;
    size_t size;
} alloc_tracer_t;
#endif
+
+typedef struct FvdAIOCB {
+ BlockDriverAIOCB common;
+ op_type type;
+ int64_t sector_num;
+ int nb_sectors;
+ JournalCB jcb; /* For AIOWriteCB and AIOStoreCompactCB. */
+ CopyLock copy_lock; /* For AIOWriteCB and AIOCopyCB. */
+
+ /* Use a union so that all requests can efficiently share one
big AIOPool.*/
+ union {
+ AIOWrapperCB wrapper;
+ AIOReadCB read;
+ AIOWriteCB write;
+ AIOCopyCB copy;
+ AIOLoadCompactCB load;
+ AIOStoreCompactCB store;
+ AIOFlushCB flush;
+ };
+
+#ifdef FVD_DEBUG
+ int64_t magic;
+ alloc_tracer_t tracer;
+
+ /* Uniquely identifies a request across all processing
activities. */
+ unsigned long long int uuid;
+#endif
+} FvdAIOCB;
+
+static AIOPool fvd_aio_pool;
+static BlockDriver bdrv_fvd;
+static QEMUOptionParameter fvd_create_options[];
+
+/* Function prototypes. */
+static int do_aio_write(struct FvdAIOCB *acb);
+static void finish_write_data(void *opaque, int ret);
+static void restart_dependent_writes(struct FvdAIOCB *acb);
+static void finish_prefetch_read(void *opaque, int ret);
+static int read_fvd_header(BDRVFvdState * s, FvdHeader * header);
+static int update_fvd_header(BDRVFvdState * s, FvdHeader * header);
+static void fvd_aio_cancel(BlockDriverAIOCB * blockacb);
+static BlockDriverAIOCB *store_data_in_compact_image(struct FvdAIOCB
*acb,
+ int soft_write, struct FvdAIOCB *parent_acb,
BlockDriverState * bs,
+ int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque);
+static BlockDriverAIOCB *load_data_from_compact_image(struct
FvdAIOCB *acb,
+ struct FvdAIOCB *parent_acb, BlockDriverState * bs,
+ int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque);
+static void free_write_resource(struct FvdAIOCB *acb);
+static void write_metadata_to_journal(struct FvdAIOCB *acb);
+static void flush_metadata_to_disk(BlockDriverState * bs);
+static void free_journal_sectors(BDRVFvdState * s);
+static int fvd_create(const char *filename, QEMUOptionParameter *
options);
+static int fvd_probe(const uint8_t * buf, int buf_size, const char
*filename);
+static int fvd_open(BlockDriverState * bs, const char *filename, int
flags);
+static void fvd_close(BlockDriverState * bs);
+static int fvd_is_allocated(BlockDriverState * bs, int64_t sector_num,
+ int nb_sectors, int *pnum);
+static int fvd_flush(BlockDriverState * bs);
+static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState * bs,
+ int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque);
+static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState * bs,
+ int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque);
+static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
+ BlockDriverCompletionFunc * cb, void *opaque);
+static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi);
+static int fvd_update(BlockDriverState * bs, int argc, char **argv);
+static int fvd_has_zero_init(BlockDriverState * bs);
+static void fvd_read_cancel(FvdAIOCB * acb);
+static void fvd_write_cancel(FvdAIOCB * acb);
+static void fvd_copy_cancel(FvdAIOCB * acb);
+static void fvd_load_compact_cancel(FvdAIOCB * acb);
+static void fvd_store_compact_cancel(FvdAIOCB * acb);
+static void fvd_wrapper_cancel(FvdAIOCB * acb);
+static void flush_metadata_to_disk_on_exit (BlockDriverState *bs);
+static inline BlockDriverAIOCB *load_data(FvdAIOCB * parent_acb,
+ BlockDriverState * bs, int64_t sector_num, QEMUIOVector
* orig_qiov,
+ int nb_sectors, BlockDriverCompletionFunc * cb, void
*opaque);
+static inline BlockDriverAIOCB *store_data(int soft_write,
+ FvdAIOCB * parent_acb, BlockDriverState * bs, int64_t
sector_num,
+ QEMUIOVector * orig_qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque);
+
/* Default configurations. */
#define DEF_PAGE_SIZE                       4096        /* bytes */
#define BYTES_PER_PREFETCH                  1048576     /* bytes */
#define PREFETCH_THROTTLING_TIME            30000       /* milliseconds */
#define NUM_PREFETCH_SLOTS                  2
#define PREFETCH_MIN_MEASURE_READ_TIME      100         /* milliseconds */
#define PREFETCH_MIN_MEASURE_WRITE_TIME     100         /* milliseconds */
#define PREFETCH_MIN_READ_THROUGHPUT        5120        /* KB/s */
#define PREFETCH_MIN_WRITE_THROUGHPUT       5120        /* KB/s */
#define PREFETCH_MAX_READ_THROUGHPUT        1000000000L /* KB/s */
#define PREFETCH_MAX_WRITE_THROUGHPUT       1000000000L /* KB/s */
#define PREFETCH_PERF_CALC_ALPHA            80          /* in [0,100]. */
#define MAX_OUTSTANDING_COPY_ON_READ_DATA   2000000     /* bytes */
#define MODERATE_BITMAP_SIZE                4194304L    /* bytes */
#define CHUNK_SIZE                          1048576LL   /* bytes */
#define JOURNAL_SIZE                        16777216LL  /* bytes */
#define STORAGE_GROW_UNIT                   104857600LL /* bytes */

/* State of BDRVFvdState.prefetch_state. */
#define PREFETCH_STATE_RUNNING      1
#define PREFETCH_STATE_FINISHED     2
#define PREFETCH_STATE_DISABLED     3
+
/* For convenience. */
#define ROUND_UP(x, base)   ((((x) + (base) - 1) / (base)) * (base))
#define ROUND_DOWN(x, base) ((((x) / (base)) * (base)))
#define BOOL(x)             ((x) ? "true" : "false")
#define EMPTY_TABLE         ((uint32_t)0xFFFFFFFF)
#define DIRTY_TABLE         ((uint32_t)0x80000000)
#define READ_TABLE(entry)   (le32_to_cpu(entry) & ~DIRTY_TABLE)
#define FVDAIOCB_MAGIC      ((uint64_t)0x3A8FCE89325B976DULL)
#define FVD_ALLOC_MAGIC     ((uint64_t)0x4A7dCEF9925B976DULL)
#define IS_EMPTY(entry)     ((entry) == EMPTY_TABLE)
#define IS_DIRTY(entry)     (le32_to_cpu(entry) & DIRTY_TABLE)
#define WRITE_TABLE(entry, id) ((entry) = cpu_to_le32(id))
#define READ_TABLE2(entry) \
    ((entry) == EMPTY_TABLE ? EMPTY_TABLE \
                            : (le32_to_cpu(entry) & ~DIRTY_TABLE))

/* Clear the dirty bit; a no-op for an unallocated (empty) entry. */
#define CLEAN_DIRTY(entry) \
    do { \
        if (!IS_EMPTY(entry)) \
            entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE); \
    } while (0)

/* Clear the dirty bit; the entry must already be allocated. */
#define CLEAN_DIRTY2(entry) \
    do { \
        ASSERT(!IS_EMPTY(entry)); \
        entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE); \
    } while (0)