[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH 17/26] FVD: add impl of bdrv_flush() and bdrv_aio_flush()
From: |
Chunqiang Tang |
Subject: |
[Qemu-devel] [PATCH 17/26] FVD: add impl of bdrv_flush() and bdrv_aio_flush() |
Date: |
Fri, 25 Feb 2011 17:37:57 -0500 |
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds FVD's implementation of the bdrv_flush() and bdrv_aio_flush()
interfaces.
Signed-off-by: Chunqiang Tang <address@hidden>
---
block/fvd-flush.c | 176 +++++++++++++++++++++++++++++++++++++-
block/fvd-journal-buf.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 390 insertions(+), 4 deletions(-)
diff --git a/block/fvd-flush.c b/block/fvd-flush.c
index 34bd5cb..6658d27 100644
--- a/block/fvd-flush.c
+++ b/block/fvd-flush.c
@@ -1,5 +1,5 @@
/*
- * QEMU Fast Virtual Disk Format bdrv_flush() and bdrv_aio_flush()
+ * QEMU Fast Virtual Disk Format Misc Functions of BlockDriver Interface
*
* Copyright IBM, Corp. 2010
*
@@ -11,14 +11,182 @@
*
*/
+static void aio_wrapper_bh(void *opaque);
+static int bjnl_sync_flush(BlockDriverState * bs);
+static bool bjnl_clean_buf_on_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc * cb,
+ void *opaque, BlockDriverAIOCB **p_acb);
+static BlockDriverAIOCB *fvd_aio_flush_start(BlockDriverState * bs,
+ BlockDriverCompletionFunc * cb,
+ void *opaque, FvdAIOCB *parent_acb);
+
+static int fvd_flush(BlockDriverState * bs) /* FVD's bdrv_flush() */
+{
+ BDRVFvdState *s = bs->opaque;
+ int ret;
+
+ QDEBUG("fvd_flush() invoked\n");
+
+ if (s->metadata_err_prohibit_write) {
+ return -EIO;
+ }
+
+ if (!s->fvd_metadata->enable_write_cache) {
+ /* Write cache is off: no need to flush since it uses O_DSYNC. */
+ return 0;
+ }
+
+ if (s->use_bjnl) {
+ return bjnl_sync_flush(bs); /* Buffered journal has its own flush. */
+ }
+
+ /* Unbuffered journal update: flush data, then metadata if separate. */
+ if ((ret = bdrv_flush(s->fvd_data))) {
+ return ret;
+ }
+ if (s->fvd_metadata == s->fvd_data) {
+ return 0;
+ }
+ return bdrv_flush(s->fvd_metadata);
+}
+
static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
BlockDriverCompletionFunc * cb,
void *opaque)
{
- return NULL;
+ BDRVFvdState *s = bs->opaque;
+ BlockDriverAIOCB * pacb;
+ FvdAIOCB *acb;
+
+ QDEBUG("fvd_aio_flush() invoked\n");
+
+ if (s->metadata_err_prohibit_write) {
+ return NULL;
+ }
+
+ if (!s->fvd_data->enable_write_cache) {
+ /* No need to flush since it uses O_DSYNC. Use a QEMUBH to invoke the
+ * callback. NOTE(review): fvd_flush() tests fvd_metadata here -- confirm. */
+
+ if (!(acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque))) {
+ return NULL;
+ }
+
+ acb->type = OP_WRAPPER;
+ acb->cancel_in_progress = false;
+ acb->wrapper.bh = qemu_bh_new(aio_wrapper_bh, acb);
+ qemu_bh_schedule(acb->wrapper.bh);
+ return &acb->common;
+ }
+
+ if (!s->use_bjnl) {
+ QDEBUG("FLUSH: start now for unbuffered journal update");
+ return fvd_aio_flush_start(bs, cb, opaque, NULL);
+ }
+
+ if (bjnl_clean_buf_on_aio_flush(bs, cb, opaque, &pacb)) {
+ /* Buffered journal data is queued ahead of the flush; pacb is NULL if
+ * the request could not be queued. */
+ return pacb;
+ }
+
+ /* No buffered journal data. Start flush now. */
+ QDEBUG("FLUSH: start now as no buffered journal data");
+ return fvd_aio_flush_start(bs, cb, opaque, NULL);
+}
+
+static inline void finish_flush(FvdAIOCB * acb) /* Complete an OP_FLUSH acb. */
+{
+ QDEBUG("FLUSH: acb%llu-%p finish_flush ret=%d\n",
+ acb->uuid, acb, acb->flush.ret);
+ acb->common.cb(acb->common.opaque, acb->flush.ret); /* Report the result. */
+ my_qemu_aio_release(acb);
}
-static int fvd_flush(BlockDriverState * bs)
+static void flush_data_cb(void *opaque, int ret) /* fvd_data flush finished */
{
- return -ENOTSUP;
+ FvdAIOCB *acb = opaque;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ QDEBUG("FLUSH: acb%llu-%p flush_data_cb ret=%d\n", acb->uuid, acb, ret);
+
+ if (acb->flush.ret == 0) { /* Keep the first non-zero result. */
+ acb->flush.ret = ret;
+ }
+
+ acb->flush.data_acb = NULL;
+ acb->flush.num_finished++;
+ if (acb->flush.num_finished == 2) { /* Data and metadata both done. */
+ finish_flush(acb);
+ }
+}
+
+static void flush_metadata_cb(void *opaque, int ret) /* fvd_metadata flush finished */
+{
+ FvdAIOCB *acb = opaque;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ QDEBUG("FLUSH: acb%llu-%p flush_metadata_cb ret=%d\n",
+ acb->uuid, acb, ret);
+
+ if (acb->flush.ret == 0) { /* Keep the first non-zero result. */
+ acb->flush.ret = ret;
+ }
+
+ acb->flush.metadata_acb = NULL;
+ acb->flush.num_finished++;
+ if (acb->flush.num_finished == 2) { /* Data and metadata both done. */
+ finish_flush(acb);
+ }
+}
+
+/* Start async flushes of fvd_data and fvd_metadata; NULL on failure. */
+static BlockDriverAIOCB *fvd_aio_flush_start(BlockDriverState * bs,
+ BlockDriverCompletionFunc * cb,
+ void *opaque, FvdAIOCB *parent_acb)
+{
+ BDRVFvdState *s = bs->opaque;
+ FvdAIOCB *acb;
+
+ if (s->fvd_data == s->fvd_metadata) {
+ if (parent_acb) {
+ QDEBUG("FLUSH: acb%llu-%p started.\n",parent_acb->uuid,parent_acb);
+ }
+ return bdrv_aio_flush(s->fvd_metadata, cb, opaque);
+ }
+
+ acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+ if (!acb) {
+ return NULL;
+ }
+ COPY_UUID(acb, parent_acb); /* UUID helps debugging. */
+
+ /* Separate files: flush both, in any order. If (cache != writethrough &&
+ * bitmap_updated), write_journal_buf() has already flushed fvd_data. */
+
+ acb->type = OP_FLUSH;
+ acb->cancel_in_progress = false;
+ acb->flush.num_finished = 0;
+ acb->flush.ret = 0;
+ acb->flush.data_acb = bdrv_aio_flush(s->fvd_data, flush_data_cb, acb);
+ if (!acb->flush.data_acb) {
+ my_qemu_aio_release(acb);
+ return NULL;
+ }
+
+ acb->flush.metadata_acb = bdrv_aio_flush(s->fvd_metadata,
+ flush_metadata_cb, acb);
+ if (!acb->flush.metadata_acb) {
+ bdrv_aio_cancel(acb->flush.data_acb); /* Undo the data flush. */
+ my_qemu_aio_release(acb);
+ return NULL;
+ }
+
+ QDEBUG("FLUSH: acb%llu-%p started.\n", acb->uuid, acb);
+ return &acb->common;
}
diff --git a/block/fvd-journal-buf.c b/block/fvd-journal-buf.c
index b4077ce..e99a585 100644
--- a/block/fvd-journal-buf.c
+++ b/block/fvd-journal-buf.c
@@ -23,6 +23,48 @@
static inline int bjnl_write_buf(FvdAIOCB *acb);
static void bjnl_send_current_buf_to_write_queue(BlockDriverState *bs);
+/* Return false if no buffered journal data. Otherwise return true with *p_acb
+ * set (NULL on failure). Invoked by fvd_aio_flush(). */
+static bool bjnl_clean_buf_on_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc * cb,
+ void *opaque, BlockDriverAIOCB **p_acb)
+{
+ BDRVFvdState *s = bs->opaque;
+ FvdAIOCB *acb;
+
+ if (!s->bjnl.buf || s->bjnl.buf_used == 0) {
+ /* The current journal buffer is empty. */
+
+ if (QTAILQ_EMPTY(&s->bjnl.queued_bufs)) {
+ return false; /* Indicate no previously buffered journal data. */
+ }
+ } else {
+ QDEBUG("JOURNAL: bjnl_clean_buf_on_aio_flush invoke "
+ "bjnl_send_current_buf_to_write_queue\n");
+ bjnl_send_current_buf_to_write_queue(bs);
+ }
+
+ /* Queue an acb at the tail so the aio_flush callback runs after all pending
+ * journal writes finish. See bjnl_write_next_buf() -> bjnl_write_buf(). */
+
+ acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+ if (!acb) {
+ *p_acb = NULL; /* Indicate failure. */
+ return true;
+ }
+
+ acb->type = OP_BJNL_FLUSH;
+ acb->cancel_in_progress = false;
+ acb->jcb.iov.iov_base = NULL; /* Indicate no data. */
+ acb->jcb.hd_acb = NULL;
+ acb->jcb.bitmap_updated = false;
+ QTAILQ_INSERT_TAIL(&s->bjnl.queued_bufs, acb, jcb.bjnl_next_queued_buf);
+ *p_acb = &acb->common;
+
+ QDEBUG("JOURNAL: inserted OP_BJNL_FLUSH acb%llu-%p\n", acb->uuid, acb);
+ return true;
+}
+
static inline void bjnl_finish_write_buf(FvdAIOCB *acb, int ret)
{
ASSERT (acb->type == OP_BJNL_BUF_WRITE);
@@ -65,6 +107,30 @@ static inline void bjnl_aio_flush_cb(void *opaque, int ret)
my_qemu_aio_release(acb);
}
+/* Start the real flush for an OP_BJNL_FLUSH acb, which was queued by
+ * bjnl_clean_buf_on_aio_flush() on behalf of a pending bdrv_aio_flush(). */
+static inline void bjnl_handle_aio_flush(FvdAIOCB *acb)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ QTAILQ_REMOVE(&s->bjnl.queued_bufs, acb, jcb.bjnl_next_queued_buf);
+
+ if (!s->metadata_err_prohibit_write) {
+ /* Buffered data have been written to journal. Now start flush. */
+ QDEBUG("JOURNAL: bjnl_start_flush for acb%llu-%p\n", acb->uuid, acb);
+ acb->jcb.hd_acb = fvd_aio_flush_start(bs, bjnl_aio_flush_cb, acb, acb);
+ if (acb->jcb.hd_acb) {
+ return;
+ }
+ }
+
+ QDEBUG("JOURNAL: bjnl_handle_aio_flush err acb%llu-%p\n", acb->uuid, acb);
+ /* Writes prohibited or flush not started: complete aio_flush with -EIO. */
+ acb->common.cb(acb->common.opaque, -EIO);
+ my_qemu_aio_release(acb);
+}
+
static inline void bjnl_write_buf_cb(void *opaque, int ret)
{
FvdAIOCB *acb = (FvdAIOCB *) opaque;
@@ -153,6 +219,14 @@ static inline int bjnl_write_buf(FvdAIOCB *acb)
QDEBUG("JOURNAL: bjnl_write_buf acb%llu-%p\n", acb->uuid, acb);
+ if (acb->type == OP_BJNL_FLUSH) {
+ bjnl_handle_aio_flush(acb);
+
+ /* Return -1 to tell bjnl_write_next_buf() to move on to the next
+ * buffer write as no buffered journal data are being written.*/
+ return -1;
+ }
+
if (!acb->jcb.bitmap_updated) {
return bjnl_write_buf_start(acb);
}
@@ -313,6 +387,150 @@ static void bjnl_clean_buf_timer_cb(BlockDriverState * bs)
s->bjnl.timer_scheduled = false;
}
+/* Perform a synchronous flush of all buffered journal data and of the image;
+ * returns 0 or the first error. Invoked by fvd_close() and fvd_flush(). */
+static int bjnl_sync_flush(BlockDriverState * bs)
+{
+ BDRVFvdState *s = bs->opaque;
+ FvdAIOCB *acb, *a;
+ int ret = 0;
+ size_t buf_size;
+ uint8_t *p, *buf = NULL;
+ bool bitmap_updated = false;
+ int nb_sectors;
+ int64_t journal_sec;
+
+ /* Sum up the buffered metadata updates, checking the current buffer first. */
+ if (!s->bjnl.buf || s->bjnl.buf_used == 0) {
+ buf_size = 0;
+ } else {
+ if (s->bjnl.buf_used < s->bjnl.buf_size) {
+ /* Mark the end of the buffer as EMPTY_JRECORD. */
+ *((uint32_t*)(s->bjnl.buf + s->bjnl.buf_used)) = EMPTY_JRECORD;
+ }
+ buf_size = s->bjnl.buf_used = ROUND_UP(s->bjnl.buf_used, 512);
+ bitmap_updated = s->bjnl.buf_contains_bitmap_update;
+ }
+
+ /* Go through the queued buffers. */
+ acb = QTAILQ_FIRST(&s->bjnl.queued_bufs);
+ if (acb) {
+ if (acb->jcb.hd_acb) {
+ /* The first acb is the ongoing operation. Cancel and re-do it
+ * synchronously below. */
+ QDEBUG("JOURNAL: bjnl_sync_flush cancel ongoing buf_write "
+ "acb%llu-%p\n", acb->uuid, acb);
+ bdrv_aio_cancel(acb->jcb.hd_acb);
+ }
+
+ /* Calculate buf_size. */
+ while (acb) {
+ if (acb->type == OP_BJNL_BUF_WRITE) {
+ buf_size += acb->jcb.iov.iov_len;
+ if (acb->jcb.bitmap_updated) {
+ bitmap_updated = true;
+ }
+ }
+ acb = QTAILQ_NEXT(acb, jcb.bjnl_next_queued_buf);
+ }
+ }
+
+ if (buf_size == 0) {
+ QDEBUG("JOURNAL: bjnl_sync_flush no_data\n");
+ goto done; /* No buffered metadata updates. */
+ }
+
+ if (bitmap_updated) {
+ /* Need a flush to ensure the correct semantics of copy-on-write in
+ * the event of a host crash. */
+ QDEBUG("JOURNAL: bjnl_sync_flush bitmap_updated flush_fvd_data\n");
+ if ((ret = bdrv_flush(s->fvd_data))) {
+ goto cleanup;
+ }
+ }
+
+ /* Allocate journal sectors. */
+ ASSERT(buf_size % 512 == 0);
+ nb_sectors = buf_size / 512;
+ if (s->next_journal_sector + nb_sectors > s->journal_size) {
+ QDEBUG("JOURNAL: bjnl_sync_flush recycle_journal\n");
+ /* Journal recycle writes out the entire bitmap and table, so buffered
+ * updates need no journal write. Keep a failure from being overwritten. */
+ if ((ret = recycle_journal(bs)) < 0) {
+ goto cleanup;
+ }
+ goto done;
+ }
+ journal_sec = s->next_journal_sector;
+ s->next_journal_sector += nb_sectors;
+
+ /* Copy all metadata updates into one buffer. */
+ p = buf = my_qemu_blockalign(s->fvd_metadata, buf_size);
+ acb = QTAILQ_FIRST(&s->bjnl.queued_bufs);
+ while (acb) {
+ if (acb->type == OP_BJNL_BUF_WRITE) {
+ QDEBUG("JOURNAL: bjnl_sync_flush takes care buf_write acb%llu-%p\n",
+ acb->uuid, acb);
+ ASSERT(acb->jcb.iov.iov_len > 0);
+ memcpy(p, acb->jcb.iov.iov_base, acb->jcb.iov.iov_len);
+ PRINT_JRECORDS(p, acb->jcb.iov.iov_len);
+ p += acb->jcb.iov.iov_len;
+ }
+ acb = QTAILQ_NEXT(acb, jcb.bjnl_next_queued_buf);
+ }
+
+ if (s->bjnl.buf && s->bjnl.buf_used > 0) {
+ /* Copy the current buffer. */
+ memcpy(p, s->bjnl.buf, s->bjnl.buf_used);
+ PRINT_JRECORDS(p, s->bjnl.buf_used);
+ }
+
+ /* Write all metadata updates synchronously. */
+ QDEBUG("JOURNAL: bjnl_sync_flush write_buffer\n");
+ if ((ret=bdrv_write(s->fvd_metadata, s->journal_offset + journal_sec,
+ buf, nb_sectors)) < 0) {
+ goto cleanup;
+ }
+
+done:
+ /* Flush finally. */
+ QDEBUG("JOURNAL: bjnl_sync_flush do final flush\n");
+ if (s->fvd_data != s->fvd_metadata) {
+ if ((ret = bdrv_flush(s->fvd_data)) != 0) {
+ goto cleanup;
+ }
+ }
+ ret = bdrv_flush(s->fvd_metadata);
+
+cleanup:
+ if (buf) {
+ my_qemu_vfree(buf);
+ }
+ if (s->bjnl.buf) {
+ my_qemu_vfree (s->bjnl.buf);
+ s->bjnl.buf = NULL;
+ }
+
+ acb = QTAILQ_FIRST(&s->bjnl.queued_bufs);
+ QTAILQ_INIT(&s->bjnl.queued_bufs);
+ while (acb) {
+ if (acb->type == OP_BJNL_BUF_WRITE) {
+ my_qemu_vfree(acb->jcb.iov.iov_base);
+ } else {
+ ASSERT(acb->type == OP_BJNL_FLUSH);
+ /* Invoke the callback for bdrv_aio_flush(). */
+ QDEBUG("JOURNAL: aio_flush acb%llu-%p finished by sync_flush\n",
+ acb->uuid, acb);
+ acb->common.cb(acb->common.opaque, ret);
+ }
+ a = acb;
+ acb = QTAILQ_NEXT(acb, jcb.bjnl_next_queued_buf);
+ my_qemu_aio_release(a);
+ }
+
+ return ret;
+}
+
#ifdef ENABLE_QDEBUG
static void print_jrecords(const uint8_t *sector, size_t len)
{
--
1.7.0.4
- [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim', Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 10/26] FVD: add impl of interface bdrv_file_open(), Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 08/26] FVD: add debugging utilities, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 16/26] FVD: add impl for buffered journal updates, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 20/26] FVD: add impl of interface bdrv_get_info(), Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 24/26] FVD: add impl of interface bdrv_has_zero_init(), Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 21/26] FVD: add impl of interface bdrv_close(), Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 14/26] FVD: add impl of loading data from compact image, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 26/26] FVD: add fully automated test-fvd.sh, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 23/26] FVD: add impl of interface bdrv_is_allocated(), Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 17/26] FVD: add impl of bdrv_flush() and bdrv_aio_flush(),
Chunqiang Tang <=
- [Qemu-devel] [PATCH 22/26] FVD: add impl of interface bdrv_update(), Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 13/26] FVD: add impl of storing data in compact image, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 19/26] FVD: add support for aio_cancel, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 25/26] FVD: add impl of interface bdrv_probe(), Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 15/26] FVD: add basic journal functionality, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 18/26] FVD: add support for base image prefetching, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 03/26] FVD: add fully automated test-qcow2.sh, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 02/26] FVD: extend qemu-io to do fully automated testing, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 07/26] FVD: extend FVD header fvd.h to be more complete, Chunqiang Tang, 2011/02/25
- [Qemu-devel] [PATCH 12/26] FVD: add impl of interface bdrv_aio_readv(), Chunqiang Tang, 2011/02/25