[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH 10/26] FVD: add impl of interface bdrv_file_open()
From: |
Chunqiang Tang |
Subject: |
[Qemu-devel] [PATCH 10/26] FVD: add impl of interface bdrv_file_open() |
Date: |
Fri, 25 Feb 2011 17:37:50 -0500 |
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds FVD's implementation of the bdrv_file_open() interface.
It supports opening an FVD image.
Signed-off-by: Chunqiang Tang <address@hidden>
---
block/fvd-journal.c | 6 +
block/fvd-open.c | 445 +++++++++++++++++++++++++++++++++++++++++++++++++-
block/fvd-prefetch.c | 17 ++
block/fvd.c | 1 +
4 files changed, 468 insertions(+), 1 deletions(-)
create mode 100644 block/fvd-prefetch.c
diff --git a/block/fvd-journal.c b/block/fvd-journal.c
index 246f425..5ba34bd 100644
--- a/block/fvd-journal.c
+++ b/block/fvd-journal.c
@@ -22,6 +22,12 @@ static inline int64_t calc_min_journal_size(int64_t
table_entries)
return 512;
}
+/*
+ * Placeholder for journal initialization. This version ignores all of its
+ * arguments and always reports -ENOTSUP to the caller.
+ */
+static int init_journal(int read_only, BlockDriverState * bs,
+                        FvdHeader * header)
+{
+    const int status = -ENOTSUP;
+
+    return status;
+}
+
void fvd_emulate_host_crash(bool cond)
{
emulate_host_crash = cond;
diff --git a/block/fvd-open.c b/block/fvd-open.c
index 056b994..8caf8d3 100644
--- a/block/fvd-open.c
+++ b/block/fvd-open.c
@@ -11,7 +11,450 @@
*
*/
+/* Forward declarations of the helpers used by fvd_open(), listed in the
+ * order in which fvd_open() calls them. */
+static int init_data_file(BDRVFvdState * s, FvdHeader * header, int flags);
+static int init_bitmap(BlockDriverState * bs, BDRVFvdState * s,
+                       FvdHeader * header, const char *const filename);
+static int load_table(BDRVFvdState * s, FvdHeader * header,
+                      const char *const filename);
+static int init_journal(int read_only, BlockDriverState * bs,
+                        FvdHeader * header);
+static int init_compact_image(BDRVFvdState * s, FvdHeader * header,
+                              const char *const filename);
+static void init_prefetch_timer(BlockDriverState * bs, BDRVFvdState * s);
+
static int fvd_open(BlockDriverState * bs, const char *filename, int flags)
{
- return -ENOTSUP;
+ BDRVFvdState *s = bs->opaque;
+ int ret;
+ FvdHeader header;
+ BlockDriver *drv;
+ int i;
+
+ const char *protocol = strchr(filename, ':');
+ if (protocol) {
+ drv = bdrv_find_protocol(filename);
+ filename = protocol + 1;
+ } else {
+ /* Use "raw" instead of "file" to allow storing the image on device. */
+ drv = bdrv_find_format("raw");
+ if (!drv) {
+ fprintf(stderr, "Failed to find the block device driver\n");
+ return -EINVAL;
+ }
+ }
+
+ s->fvd_metadata = bdrv_new("");
+ ret = bdrv_open(s->fvd_metadata, filename, flags, drv);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to open %s\n", filename);
+ return ret;
+ }
+
+ /* Initialize so that jumping to 'fail' would do cleanup properly. */
+ s->stale_bitmap = NULL;
+ s->fresh_bitmap = NULL;
+ s->table = NULL;
+ s->outstanding_copy_on_read_data = 0;
+ QLIST_INIT(&s->write_locks);
+ QLIST_INIT(&s->copy_locks);
+ s->prefetch_acb = NULL;
+ s->add_storage_cmd = NULL;
+#ifdef FVD_DEBUG
+ s->total_copy_on_read_data = s->total_prefetch_data = 0;
+#endif
+
+ if (bdrv_pread(s->fvd_metadata, 0, &header, sizeof(header)) !=
+ sizeof(header)) {
+ fprintf(stderr, "Failed to read the header of %s\n", filename);
+ ret = -EIO;
+ goto fail;
+ }
+
+ fvd_header_le_to_cpu(&header);
+
+ if (header.magic != FVD_MAGIC) {
+ fprintf(stderr, "Incorrect magic number in header: %0X\n",
+ header.magic);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* Check incompatible features. */
+ for (i = 0; i < INCOMPATIBLE_FEATURES_SPACE; i++) {
+ if (header.incompatible_features[i] != 0) {
+ fprintf(stderr, "The image was created by FVD version %d "
+ " and uses features not supported by this FVD version
%d\n",
+ header.create_version, FVD_VERSION);
+ ret = -ENOTSUP;
+ }
+ }
+
+ if (header.virtual_disk_size % 512 != 0) {
+ fprintf(stderr, "Disk size %" PRId64 " in the header of %s is not "
+ "a multple of 512.\n", header.virtual_disk_size, filename);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* Initialize the fields of BDRVFvdState. */
+ s->chunks_relocated = header.chunks_relocated;
+ s->dirty_image = false;
+ s->metadata_err_prohibit_write = false;
+ s->block_size = header.block_size / 512;
+ s->bitmap_size = header.bitmap_size;
+ s->prefetch_timer = NULL;
+ s->sectors_per_prefetch = (header.bytes_per_prefetch + 511) / 512;
+ s->prefetch_throttle_time = header.prefetch_throttle_time;
+ s->prefetch_read_throughput_measure_time =
+ header.prefetch_read_throughput_measure_time;
+ s->prefetch_write_throughput_measure_time =
+ header.prefetch_write_throughput_measure_time;
+
+ /* Convert KB/s to bytes/millisec. */
+ s->prefetch_min_read_throughput =
+ ((double)header.prefetch_min_read_throughput) * 1024.0 / 1000.0;
+ s->prefetch_min_write_throughput =
+ ((double)header.prefetch_min_write_throughput) * 1024.0 / 1000.0;
+
+ if (header.base_img[0] != 0 && s->sectors_per_prefetch % s->block_size!=0)
{
+ fprintf(stderr, "sectors_per_prefetch (%" PRIu64 ") is not a multiple "
+ "of block_size (%" PRIu64 ")\n",
+ s->sectors_per_prefetch * 512, s->block_size * 512);
+ }
+ s->max_outstanding_copy_on_read_data =
+ header.max_outstanding_copy_on_read_data;
+ if (s->max_outstanding_copy_on_read_data < header.block_size * 2) {
+ s->max_outstanding_copy_on_read_data = header.block_size;
+ }
+
+ if (header.num_prefetch_slots < 1) {
+ s->num_prefetch_slots = 1;
+ } else {
+ s->num_prefetch_slots = header.num_prefetch_slots;
+ }
+
+ const int read_only = !(flags & BDRV_O_RDWR);
+
+ if (read_only || IN_QEMU_TOOL) {
+ /* Disable prefetching and copy_on_read. */
+ s->prefetch_start_delay = -1;
+ s->copy_on_read = false;
+ } else {
+ s->prefetch_start_delay = header.prefetch_start_delay;
+ s->copy_on_read = header.copy_on_read;
+ }
+ s->virtual_disk_size = header.virtual_disk_size;
+ s->bitmap_offset = header.bitmap_offset / 512;
+ s->base_img_sectors = header.base_img_size / 512;
+ bs->total_sectors = s->virtual_disk_size / 512;
+
+ if ((ret = init_data_file(s, &header, flags))) {
+ goto fail;
+ }
+
+ if ((ret = init_bitmap(bs, s, &header, filename))) {
+ goto fail;
+ }
+
+ if ((ret = load_table(s, &header, filename))) {
+ goto fail;
+ }
+
+ if ((ret = init_journal(read_only, bs, &header))) {
+ goto fail;
+ }
+
+ /* This must be done after init_journal() because it may use metadata
+ * recovered from the journal. */
+ if ((ret = init_compact_image(s, &header, filename))) {
+ goto fail;
+ }
+
+ if (!read_only) {
+ /* This flag will be cleaned when the image is shut down gracefully. */
+ update_clean_shutdown_flag(s, false);
+ init_prefetch_timer(bs, s);
+ }
+
+ QDEBUG("copy_on_read=%s compact_image=%s block_size=%" PRIu64
+ " chunk_size=%"PRId64
+ " journal_size=%" PRId64 " prefetching_delay=%" PRId64
+ " prefetch_slots=%d "
+ "prefetch_read_threshold_KB=%.0lf "
+ "prefetch_write_threshold_KB=%.0lf "
+ "prefetch_throttle_time=%" PRIu64 " bytes_per_prefetch=%" PRIu64
+ " max_outstanding_copy_on_read_data=%" PRId64 "\n",
+ BOOL(s->copy_on_read), BOOL(s->table_offset > 0),
+ s->block_size * 512, s->chunk_size * 512,
+ s->journal_size * 512, s->prefetch_start_delay,
+ s->num_prefetch_slots,
+ s->prefetch_min_read_throughput * 1000.0 / 1024.0,
+ s->prefetch_min_write_throughput * 1000.0 / 1024.0,
+ s->prefetch_throttle_time, s->sectors_per_prefetch * 512,
+ s->max_outstanding_copy_on_read_data);
+
+ return 0;
+
+fail:
+ fprintf(stderr, "Failed to open %s using the FVD format.\n", filename);
+ fvd_close(bs);
+ return ret;
+}
+
+/*
+ * Load the table of a compact image from the metadata file into s->table.
+ *
+ * When header->table_offset is not positive the image has no table and
+ * nothing is done. Otherwise the table offset and chunk size are stored in
+ * 's' in units of 512 bytes, a buffer of header->table_size bytes is
+ * allocated, and the table is read into it.
+ *
+ * Returns 0 on success (or when there is no table) and -EIO when the table
+ * cannot be read in full.
+ */
+static int load_table(BDRVFvdState * s, FvdHeader * header,
+                      const char *const filename)
+{
+    int table_bytes;
+    int nread;
+
+    if (header->table_offset <= 0) {
+        return 0;       /* Not a compact image and no table. */
+    }
+
+    /* Record the table geometry and allocate the in-memory copy. */
+    s->table_offset = header->table_offset / 512;
+    s->table_size = header->table_size;
+    s->chunk_size = header->chunk_size / 512;
+    s->table = my_qemu_blockalign(s->fvd_metadata, s->table_size);
+
+    table_bytes = (int)s->table_size;
+    nread = bdrv_pread(s->fvd_metadata, header->table_offset, s->table,
+                       table_bytes);
+    if (nread == table_bytes) {
+        return 0;
+    }
+
+    fprintf(stderr, "Failed to read the table of %s\n", filename);
+    return -EIO;
+}
+
+/*
+ * Prepare the storage-allocation state of a compact image.
+ *
+ * For an image without a table (header->table_offset <= 0) this only clears
+ * the leaked-chunk fields and marks the data region as not prepared.
+ * Otherwise it scans s->table to find the highest physical chunk in use and
+ * the physical chunks below it that no virtual chunk maps to ("leaked"
+ * chunks), records them in 's', determines how much storage the data file
+ * currently has, and, when the image lives on a block or character device,
+ * builds the shell command used to grow that device.
+ *
+ * Returns 0 on success and -EIO on a corrupted table, on stat() or
+ * bdrv_getlength() failure, or when the device is too small for the data
+ * already allocated.
+ */
+static int init_compact_image(BDRVFvdState * s, FvdHeader * header,
+                              const char *const filename)
+{
+    s->leaked_chunks = NULL;
+    s->num_leaked_chunks = 0;
+    s->next_avail_leaked_chunk = 0;
+
+    if (header->table_offset <= 0) {
+        /* Not a compact image. */
+        s->data_region_prepared = false;
+        return 0;
+    }
+
+    if (header->chunk_size == 0) {
+        /* Guards the divisions below against a corrupted header. */
+        fprintf(stderr, "ERROR: corrupted image %s with chunk_size 0\n",
+                filename);
+        return -EIO;
+    }
+
+    /* Scan the table to find the max used chunk and leaked chunks. */
+    uint32_t i;
+    uint32_t max_chunk = 0;
+    uint32_t table_entries = ROUND_UP(header->virtual_disk_size,
+                                      header->chunk_size) / header->chunk_size;
+
+    /* The loaded table holds s->table_size bytes; refuse to scan past it. */
+    if ((uint64_t)table_entries * sizeof(s->table[0]) >
+        (uint64_t)s->table_size) {
+        fprintf(stderr, "ERROR: corrupted image %s whose table is too small "
+                "for the virtual disk size\n", filename);
+        return -EIO;
+    }
+
+    uint8_t *used_chunks = my_qemu_mallocz(table_entries);
+    for (i = 0; i < table_entries; i++) {
+        if (!IS_EMPTY(s->table[i])) {
+            uint32_t id = READ_TABLE(s->table[i]);
+            if (id >= table_entries) {
+                /* used_chunks[] has table_entries elements, so a larger id
+                 * read from disk must not be used as an index. */
+                fprintf(stderr, "ERROR: corrupted image with virtual chunk "
+                        "%u mapped to out-of-range physical chunk %u\n",
+                        i, id);
+                my_qemu_free(used_chunks);
+                return -EIO;
+            }
+            if (id >= max_chunk) {
+                max_chunk = id + 1;
+            }
+            if (used_chunks[id]) {
+                fprintf(stderr, "ERROR: corrupted image with multiple "
+                        "virtual chunks mapped to physical chunk %u\n", id);
+                my_qemu_free(used_chunks);
+                return -EIO;
+            }
+            used_chunks[id] = true;
+        }
+    }
+
+    /* Count the number of leaked chunks. */
+    uint32_t num_leaked_chunks = 0;
+    for (i = 0; i < max_chunk; i++) {
+        if (!used_chunks[i]) {
+            num_leaked_chunks++;
+        }
+    }
+    QDEBUG("leaked_chunks=%u max_chunk=%u\n", num_leaked_chunks, max_chunk);
+
+    /* Record leaked chunks, which will be used later. */
+    if (num_leaked_chunks > 0) {
+        s->num_leaked_chunks = num_leaked_chunks;
+        s->leaked_chunks = my_qemu_malloc(sizeof(uint32_t) *
+                                          num_leaked_chunks);
+        num_leaked_chunks = 0;
+        for (i = 0; i < max_chunk; i++) {
+            if (!used_chunks[i]) {
+                s->leaked_chunks[num_leaked_chunks++] = i;
+                QDEBUG("Recover leaked physical chunk %u\n", i);
+            }
+        }
+    }
+    s->used_storage = max_chunk * s->chunk_size;
+    s->storage_grow_unit = header->storage_grow_unit / 512;
+    my_qemu_free(used_chunks);
+
+    /* Check if the image is directly stored on a raw device, including
+     * logical volume. If so, figure out the size of the device. */
+    struct stat stat_buf;
+    if (stat(filename, &stat_buf) != 0) {
+        fprintf(stderr, "Failed to stat() %s\n", filename);
+        return -EIO;
+    }
+
+    /* Check how much storage space is already allocated. */
+    int64_t size = bdrv_getlength(s->fvd_data);
+    if (size < 0) {
+        fprintf(stderr, "Failed in bdrv_getlength(%s)\n", filename);
+        return -EIO;
+    }
+
+    if (S_ISBLK(stat_buf.st_mode) || S_ISCHR(stat_buf.st_mode)) {
+        const int64_t min_size = (s->data_offset + s->used_storage) * 512;
+        if (size < min_size) {
+            fprintf(stderr, "The size of device %s is not even big enough to "
+                    "store already allocated data.\n", filename);
+            return -EIO;
+        }
+
+        /* Initialize the command to grow storage space.
+         * NOTE(review): the command line is assembled from strings stored in
+         * the image header and from the file name without any quoting --
+         * confirm these are trusted before the command is ever executed. */
+        char cmd[2048];
+        if (header->add_storage_cmd[0] == 0) {
+            s->add_storage_cmd = NULL;
+        } else {
+            if (strcmp(header->add_storage_cmd, "builtin:lvextend") == 0) {
+                /* Note the following:
+                 * 1. lvextend may generate warning messages like "File
+                 *    descriptor...leaked...", which is fine. See the
+                 *    following from LVM manual: "On invocation, lvm requires
+                 *    that only the standard file descriptors stdin,
+                 *    stdout and stderr are available. If others are
+                 *    found, they get closed and messages are issued warning
+                 *    about the leak."
+                 * 2. Instead of using the lvextend command line, one
+                 *    option is to use liblvm directly, which avoids creating
+                 *    a process to resize a LV.
+                 * 3. On Ubuntu, /bin/sh is linked to /bin/dash, which
+                 *    does not support ">&" for stdout and stderr
+                 *    redirection. */
+                snprintf(cmd, sizeof(cmd) - 1, "/sbin/lvextend -L+%" PRIu64
+                         "B %s >/dev/null 2>/dev/null",
+                         header->storage_grow_unit,
+                         header->data_file[0] ? header->data_file : filename);
+            } else {
+                snprintf(cmd, sizeof(cmd) - 1, "%s %" PRIu64
+                         " %s >/dev/null 2>/dev/null",
+                         header->add_storage_cmd, header->storage_grow_unit,
+                         header->data_file[0] ? header->data_file : filename);
+            }
+            /* Keep a heap copy, including the terminating NUL. */
+            size_t len = strlen(cmd);
+            s->add_storage_cmd = my_qemu_malloc(len + 1);
+            memcpy(s->add_storage_cmd, cmd, len + 1);
+        }
+    }
+
+    s->avail_storage = size / 512 - s->data_offset;
+    s->fvd_data->growable = true;
+    s->data_region_prepared = true;
+
+    return 0;
+}
+
+/*
+ * Set up s->fvd_data, the block device that holds the image data.
+ *
+ * When the header names a separate data file, that file is opened (with the
+ * format driver named in the header, if any) and the data offset is 0.
+ * Otherwise the data lives in the metadata file, starting at
+ * header->data_offset (stored in 's' in 512-byte sectors).
+ *
+ * Returns 0 on success, -EINVAL when the data file's format driver is
+ * unknown or when required zero-initialization is unavailable outside of a
+ * QEMU tool, and -EIO when the data file cannot be created or opened.
+ */
+static int init_data_file(BDRVFvdState * s, FvdHeader * header, int flags)
+{
+    if (!header->data_file[0]) {
+        /* Data is stored in the same file as the metadata. */
+        s->data_offset = header->data_offset / 512;     /* In sectors. */
+        s->fvd_data = s->fvd_metadata;
+    } else {
+        /* Open a separate data file. A NULL driver is passed to bdrv_open()
+         * when the header does not name a format. */
+        BlockDriver *data_drv = NULL;
+
+        s->data_offset = 0;
+        s->fvd_data = bdrv_new("");
+        if (!s->fvd_data) {
+            fprintf(stderr, "Failed to create a new block device driver.\n");
+            return -EIO;
+        }
+
+        if (header->data_file_fmt[0] != 0) {
+            data_drv = bdrv_find_format(header->data_file_fmt);
+            if (!data_drv) {
+                fprintf(stderr, "Failed to find driver for image format "
+                        "'%s' of data file %s\n",
+                        header->data_file_fmt, header->data_file);
+                return -EINVAL;
+            }
+        }
+
+        if (bdrv_open(s->fvd_data, header->data_file, flags, data_drv) != 0) {
+            fprintf(stderr, "Failed to open data file %s\n",
+                    header->data_file);
+            return -EIO;
+        }
+    }
+
+    if (!header->need_zero_init || bdrv_has_zero_init(s->fvd_data)) {
+        return 0;
+    }
+
+    if (IN_QEMU_TOOL) {
+        /* Only give a warning to allow 'qemu-img update' to modify
+         * need_zero_init if the user manually zero-init the device. */
+        fprintf(stderr, "Warning: image needs zero_init but it is not "
+                "supported by the storage media.\n");
+        return 0;
+    }
+
+    fprintf(stderr, "Error: image needs zero_init but it is not "
+            "supported by the storage media.\n");
+    return -EINVAL;
+}
+
+/*
+ * Set up the prefetch state and the bitmaps of the image.
+ *
+ * When the header says the base image is fully prefetched, prefetching and
+ * copy-on-read are switched off and the backing file name is cleared.
+ * Otherwise the backing file name and format are copied from the header and
+ * the bitmap is read from the metadata file into s->stale_bitmap;
+ * s->fresh_bitmap is either a separate copy of it or an alias for it.
+ *
+ * Returns 0 on success and -EIO when the bitmap cannot be read in full.
+ */
+static int init_bitmap(BlockDriverState * bs, BDRVFvdState * s,
+                       FvdHeader * header, const char *const filename)
+{
+    if (header->base_img_fully_prefetched) {
+        /* This also covers the case of no base image. */
+        s->prefetch_state = PREFETCH_STATE_FINISHED;
+        s->copy_on_read = false;
+        s->prefetch_start_delay = -1;
+
+        if (bs->backing_file[0] != 0) {
+            /* No need to use the base image. It may operate without problem
+             * even if the base image is no longer accessible. */
+            bs->backing_file[0] = 0;
+        }
+        return 0;
+    }
+
+    ASSERT(header->base_img[0] != 0);
+    pstrcpy(bs->backing_file, 1024, header->base_img);
+    pstrcpy(bs->backing_format, 16, header->base_img_fmt);
+
+    /* This will be enabled in init_prefetch() after a timer expires. */
+    s->prefetch_state = PREFETCH_STATE_DISABLED;
+    s->stale_bitmap = my_qemu_blockalign(s->fvd_metadata, s->bitmap_size);
+    if (bdrv_pread(s->fvd_metadata, header->bitmap_offset,
+                   s->stale_bitmap, s->bitmap_size) != s->bitmap_size) {
+        fprintf(stderr, "Failed to read the bitmap of %s.\n", filename);
+        return -EIO;
+    }
+
+    /* Use two bitmaps only if copy_on_read or prefetching is enabled.
+     * See Section 3.3.4 of the FVD-cow paper. */
+    const int need_two_bitmaps =
+        s->copy_on_read || (s->prefetch_state != PREFETCH_STATE_FINISHED &&
+                            s->prefetch_start_delay > 0);
+    if (!need_two_bitmaps) {
+        s->fresh_bitmap = s->stale_bitmap;
+        return 0;
+    }
+
+    s->fresh_bitmap = my_qemu_blockalign(s->fvd_metadata, s->bitmap_size);
+    memcpy(s->fresh_bitmap, s->stale_bitmap, s->bitmap_size);
+    return 0;
+}
+
+/*
+ * Arm the timer that calls fvd_init_prefetch() with 'bs' as its argument.
+ *
+ * Nothing is done inside a QEMU tool, when prefetching has already finished,
+ * or when the configured start delay is not positive.
+ */
+static void init_prefetch_timer(BlockDriverState * bs, BDRVFvdState * s)
+{
+    int64_t expire;
+
+    if (IN_QEMU_TOOL ||
+        s->prefetch_state == PREFETCH_STATE_FINISHED ||
+        s->prefetch_start_delay <= 0) {
+        return;
+    }
+
+    /* Start prefetching after a delay. Times 1000 to convert sec to ms. */
+    expire = qemu_get_clock(rt_clock) + s->prefetch_start_delay * 1000;
+    s->prefetch_timer = qemu_new_timer(rt_clock, fvd_init_prefetch, bs);
+    qemu_mod_timer(s->prefetch_timer, expire);
}
diff --git a/block/fvd-prefetch.c b/block/fvd-prefetch.c
new file mode 100644
index 0000000..5844aa7
--- /dev/null
+++ b/block/fvd-prefetch.c
@@ -0,0 +1,17 @@
+/*
+ * QEMU Fast Virtual Disk Format Adaptive Prefetching
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <address@hidden>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*
+ * Callback registered with the prefetch timer in init_prefetch_timer(),
+ * which passes the BlockDriverState as 'opaque'. Currently an empty stub.
+ */
+void fvd_init_prefetch(void *opaque)
+{
+    /* To be implemented. */
+    (void)opaque;
+}
diff --git a/block/fvd.c b/block/fvd.c
index d6263e7..e41f419 100644
--- a/block/fvd.c
+++ b/block/fvd.c
@@ -33,6 +33,7 @@
#include "block/fvd-read.c"
#include "block/fvd-write.c"
#include "block/fvd-journal.c"
+#include "block/fvd-prefetch.c"
#include "block/fvd-update.c"
static BlockDriver bdrv_fvd = {
--
1.7.0.4
- [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim', Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 10/26] FVD: add impl of interface bdrv_file_open(),
Chunqiang Tang <=
- [Qemu-devel] [PATCH 08/26] FVD: add debugging utilities, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 16/26] FVD: add impl for buffered journal updates, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 20/26] FVD: add impl of interface bdrv_get_info(), Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 24/26] FVD: add impl of interface bdrv_has_zero_init(), Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 21/26] FVD: add impl of interface bdrv_close(), Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 14/26] FVD: add impl of loading data from compact image, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 26/26] FVD: add fully automated test-fvd.sh, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 23/26] FVD: add impl of interface bdrv_is_allocated(), Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 17/26] FVD: add impl of bdrv_flush() and bdrv_aio_flush(), Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 22/26] FVD: add impl of interface bdrv_update(), Chunqiang Tang, 2011/02/25