[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH 09/30] block: add live block commit functionality
From: |
Kevin Wolf |
Subject: |
[Qemu-devel] [PATCH 09/30] block: add live block commit functionality |
Date: |
Fri, 28 Sep 2012 19:56:52 +0200 |
From: Jeff Cody <address@hidden>
This adds the live commit coroutine. This iteration only supports
committing images below the active layer, not the active layer itself.
The behaviour is similar to block streaming; the sectors are walked
through, and anything that exists above 'base' is committed back down
into base. At the end, intermediate images are deleted, and the
chain stitched together. Images are restored to their original open
flags upon completion.
Signed-off-by: Jeff Cody <address@hidden>
Reviewed-by: Eric Blake <address@hidden>
Signed-off-by: Kevin Wolf <address@hidden>
---
block/Makefile.objs | 1 +
block/commit.c | 267 +++++++++++++++++++++++++++++++++++++++++++++++++++
block_int.h | 16 +++
trace-events | 2 +
4 files changed, 286 insertions(+), 0 deletions(-)
create mode 100644 block/commit.c
diff --git a/block/Makefile.objs b/block/Makefile.objs
index a1ae67f..81fd43c 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -4,6 +4,7 @@ block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o
qed-cluster.o
block-obj-y += qed-check.o
block-obj-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
block-obj-y += stream.o
+block-obj-y += commit.o
block-obj-$(CONFIG_WIN32) += raw-win32.o
block-obj-$(CONFIG_POSIX) += raw-posix.o
block-obj-$(CONFIG_LIBISCSI) += iscsi.o
diff --git a/block/commit.c b/block/commit.c
new file mode 100644
index 0000000..624ec5f
--- /dev/null
+++ b/block/commit.c
@@ -0,0 +1,267 @@
+/*
+ * Live block commit
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ * Jeff Cody <address@hidden>
+ * Based on stream.c by Stefan Hajnoczi
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "block_int.h"
+#include "qemu/ratelimit.h"
+
+enum {
+ /*
+ * Size of the bounce buffer used to copy data from top into base; it also
+ * bounds how many sectors are examined per loop iteration. This should be
+ * large enough to process multiple clusters in a single call, so that
+ * committing contiguous regions of the image is efficient.
+ */
+ COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
+};
+
+#define SLICE_TIME 100000000ULL /* ns; slice passed to ratelimit_set_speed() */
+
+typedef struct CommitBlockJob { /* state of one live commit job */
+ BlockJob common; /* generic job state handed to the block_job_*() calls */
+ RateLimit limit; /* throttling state used by the ratelimit_*() calls */
+ BlockDriverState *active; /* device the job was started on (active layer) */
+ BlockDriverState *top; /* image whose data is read and committed */
+ BlockDriverState *base; /* image the data is written into */
+ BlockErrorAction on_error; /* policy consulted when an iteration fails */
+ int base_flags; /* open flags base had before commit_start() reopened it */
+ int orig_overlay_flags; /* same, for the image directly above top */
+} CommitBlockJob;
+
+/*
+ * Copy one range of sectors from @bs into @base.
+ *
+ * @nb_sectors sectors starting at @sector_num are read from @bs into @buf
+ * and then written from @buf to the same position in @base.  The write is
+ * skipped if the read reports a failure.
+ *
+ * Returns 0 on success, otherwise the non-zero result of the bdrv_read() or
+ * bdrv_write() call that failed.
+ */
+static int coroutine_fn commit_populate(BlockDriverState *bs,
+                                        BlockDriverState *base,
+                                        int64_t sector_num, int nb_sectors,
+                                        void *buf)
+{
+    int rc = bdrv_read(bs, sector_num, buf, nb_sectors);
+
+    if (rc == 0) {
+        rc = bdrv_write(base, sector_num, buf, nb_sectors);
+    }
+
+    return rc;
+}
+
+/*
+ * Coroutine body of the live commit job.
+ *
+ * @opaque: the CommitBlockJob prepared by commit_start().
+ *
+ * If base is shorter than top it is first grown to top's length.  top is
+ * then walked in ranges of at most COMMIT_BUFFER_SIZE bytes, and every range
+ * that bdrv_co_is_allocated_above() reports as allocated above base is
+ * copied down into base.  When the whole image has been processed without
+ * the job being cancelled, bdrv_drop_intermediate() is called for
+ * (active, top, base).
+ *
+ * On every exit path, base and the image directly above top are reopened
+ * with the flags recorded in the job if their current flags differ.
+ *
+ * The value handed to block_job_complete() is 0 on cancellation, the result
+ * of bdrv_drop_intermediate() after a full pass, and otherwise the error
+ * that stopped the job.
+ */
+static void coroutine_fn commit_run(void *opaque)
+{
+    CommitBlockJob *s = opaque;
+    BlockDriverState *active = s->active;
+    BlockDriverState *top = s->top;
+    BlockDriverState *base = s->base;
+    /* Looked up before any early exit so that the cleanup code at the end
+     * can restore the overlay's open flags on every path. */
+    BlockDriverState *overlay_bs = bdrv_find_overlay(active, top);
+    int64_t sector_num, end;
+    int ret = 0;
+    int n = 0;
+    void *buf;
+    int64_t base_len;
+
+    ret = s->common.len = bdrv_getlength(top);
+
+    if (s->common.len < 0) {
+        goto exit_restore_reopen;
+    }
+
+    ret = base_len = bdrv_getlength(base);
+    if (base_len < 0) {
+        goto exit_restore_reopen;
+    }
+
+    /* base must be able to hold everything that is visible through top */
+    if (base_len < s->common.len) {
+        ret = bdrv_truncate(base, s->common.len);
+        if (ret) {
+            goto exit_restore_reopen;
+        }
+    }
+
+    end = s->common.len >> BDRV_SECTOR_BITS;
+    buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);
+
+    for (sector_num = 0; sector_num < end; sector_num += n) {
+        uint64_t delay_ns = 0;
+        bool copy;
+
+wait:
+        /* Note that even when no rate limit is applied we need to yield
+         * with no pending I/O here so that qemu_aio_flush() returns.
+         */
+        block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+        if (block_job_is_cancelled(&s->common)) {
+            break;
+        }
+        /* Copy if allocated above the base */
+        ret = bdrv_co_is_allocated_above(top, base, sector_num,
+                                         COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
+                                         &n);
+        copy = (ret == 1);
+        trace_commit_one_iteration(s, sector_num, n, ret);
+        if (copy) {
+            if (s->common.speed) {
+                /* Over budget for this slice: sleep, then re-check the
+                 * same range */
+                delay_ns = ratelimit_calculate_delay(&s->limit, n);
+                if (delay_ns > 0) {
+                    goto wait;
+                }
+            }
+            ret = commit_populate(top, base, sector_num, n, buf);
+        }
+        if (ret < 0) {
+            if (s->on_error == BLOCK_ERR_STOP_ANY ||
+                s->on_error == BLOCK_ERR_REPORT ||
+                (s->on_error == BLOCK_ERR_STOP_ENOSPC && ret == -ENOSPC)) {
+                goto exit_free_buf;
+            } else {
+                /* Any other policy: stay on this sector and try again */
+                n = 0;
+                continue;
+            }
+        }
+        /* Publish progress */
+        s->common.offset += n * BDRV_SECTOR_SIZE;
+    }
+
+    ret = 0;
+
+    if (!block_job_is_cancelled(&s->common) && sector_num == end) {
+        /* success */
+        ret = bdrv_drop_intermediate(active, top, base);
+    }
+
+exit_free_buf:
+    qemu_vfree(buf);
+
+exit_restore_reopen:
+    /* restore base open flags here if appropriate (e.g., change the base back
+     * to r/o). These reopens do not need to be atomic, since we won't abort
+     * even on failure here */
+    if (s->base_flags != bdrv_get_flags(base)) {
+        bdrv_reopen(base, s->base_flags, NULL);
+    }
+    /* Guard against a failed overlay lookup instead of dereferencing NULL */
+    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
+        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
+    }
+
+    block_job_complete(&s->common, ret);
+}
+
+/*
+ * set_speed callback of the commit job.
+ *
+ * A negative @speed is rejected by setting QERR_INVALID_PARAMETER in @errp.
+ * Otherwise the job's rate limiter is reprogrammed with @speed divided by
+ * BDRV_SECTOR_SIZE, using SLICE_TIME as the slice length.
+ */
+static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+    CommitBlockJob *commit_job;
+
+    if (speed >= 0) {
+        commit_job = container_of(job, CommitBlockJob, common);
+        ratelimit_set_speed(&commit_job->limit, speed / BDRV_SECTOR_SIZE,
+                            SLICE_TIME);
+        return;
+    }
+
+    error_set(errp, QERR_INVALID_PARAMETER, "speed");
+}
+
+static BlockJobType commit_job_type = { /* passed to block_job_create() */
+ .instance_size = sizeof(CommitBlockJob), /* size of the per-job state */
+ .job_type = "commit",
+ .set_speed = commit_set_speed, /* validates and applies a new speed */
+};
+
+/*
+ * Validate the request and start a live commit job that merges @top down
+ * into @base.
+ *
+ * @bs:       active layer; the job is created on this device
+ * @base:     image the data is written into; must be reachable from @top
+ *            as a backing image
+ * @top:      image to commit; must differ from both @bs and @base
+ * @speed:    initial speed, handed to block_job_create()
+ * @on_error: error policy; the two STOP policies are only accepted when
+ *            bdrv_iostatus_is_enabled(@bs)
+ * @cb:       handed to block_job_create() together with @opaque
+ * @opaque:   see @cb
+ * @errp:     receives the error if the job could not be started
+ *
+ * @base and the image directly above @top are reopened read-write if they
+ * are not already.  Their original flags are stored in the job so that
+ * commit_run() can restore them; if the job cannot be created they are
+ * restored here.  On success the job coroutine is entered before this
+ * function returns.
+ */
+void commit_start(BlockDriverState *bs, BlockDriverState *base,
+                  BlockDriverState *top, int64_t speed,
+                  BlockErrorAction on_error, BlockDriverCompletionFunc *cb,
+                  void *opaque, Error **errp)
+{
+    CommitBlockJob *s;
+    BlockReopenQueue *reopen_queue = NULL;
+    int orig_overlay_flags;
+    int orig_base_flags;
+    BlockDriverState *overlay_bs;
+    Error *local_err = NULL;
+
+    if ((on_error == BLOCK_ERR_STOP_ANY ||
+         on_error == BLOCK_ERR_STOP_ENOSPC) &&
+        !bdrv_iostatus_is_enabled(bs)) {
+        error_set(errp, QERR_INVALID_PARAMETER_COMBINATION);
+        return;
+    }
+
+    /* Once we support top == active layer, remove this check */
+    if (top == bs) {
+        error_setg(errp,
+                   "Top image as the active layer is currently unsupported");
+        return;
+    }
+
+    if (top == base) {
+        error_setg(errp, "Invalid files for merge: top and base are the same");
+        return;
+    }
+
+    /* top and base may be valid, but let's make sure that base is reachable
+     * from top */
+    if (bdrv_find_backing_image(top, base->filename) != base) {
+        error_setg(errp,
+                   "Base (%s) is not reachable from top (%s)",
+                   base->filename, top->filename);
+        return;
+    }
+
+    overlay_bs = bdrv_find_overlay(bs, top);
+
+    if (overlay_bs == NULL) {
+        error_setg(errp, "Could not find overlay image for %s:",
+                   top->filename);
+        return;
+    }
+
+    orig_base_flags    = bdrv_get_flags(base);
+    orig_overlay_flags = bdrv_get_flags(overlay_bs);
+
+    /* convert base & overlay_bs to r/w, if necessary */
+    if (!(orig_base_flags & BDRV_O_RDWR)) {
+        reopen_queue = bdrv_reopen_queue(reopen_queue, base,
+                                         orig_base_flags | BDRV_O_RDWR);
+    }
+    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
+        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
+                                         orig_overlay_flags | BDRV_O_RDWR);
+    }
+    if (reopen_queue) {
+        bdrv_reopen_multiple(reopen_queue, &local_err);
+        if (local_err != NULL) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+
+    s = block_job_create(&commit_job_type, bs, speed, cb, opaque, errp);
+    if (!s) {
+        /* No job will run, so nothing else would undo the r/w reopen above.
+         * As in commit_run(), a failure to restore is not reported. */
+        if (orig_base_flags != bdrv_get_flags(base)) {
+            bdrv_reopen(base, orig_base_flags, NULL);
+        }
+        if (orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
+            bdrv_reopen(overlay_bs, orig_overlay_flags, NULL);
+        }
+        return;
+    }
+
+    s->base   = base;
+    s->top    = top;
+    s->active = bs;
+
+    s->base_flags         = orig_base_flags;
+    s->orig_overlay_flags = orig_overlay_flags;
+
+    s->on_error = on_error;
+    s->common.co = qemu_coroutine_create(commit_run);
+
+    trace_commit_start(bs, base, top, s, s->common.co, opaque);
+    qemu_coroutine_enter(s->common.co, s);
+}
diff --git a/block_int.h b/block_int.h
index ac4245c..56164a7 100644
--- a/block_int.h
+++ b/block_int.h
@@ -463,4 +463,20 @@ void stream_start(BlockDriverState *bs, BlockDriverState
*base,
BlockDriverCompletionFunc *cb,
void *opaque, Error **errp);
+/**
+ * commit_start:
+ * @bs: Active block device (the layer the job is started on).
+ * @base: Block device that will be written into, and become the new top.
+ * @top: Block device whose data is committed into @base; must not be @bs.
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @errp: Error object.
+ *
+ * Start a live commit job that copies the data of @top down into @base.
+ */
+void commit_start(BlockDriverState *bs, BlockDriverState *base,
+ BlockDriverState *top, int64_t speed,
+ BlockErrorAction on_error, BlockDriverCompletionFunc *cb,
+ void *opaque, Error **errp);
+
#endif /* BLOCK_INT_H */
diff --git a/trace-events b/trace-events
index f5b5097..dbc3007 100644
--- a/trace-events
+++ b/trace-events
@@ -74,6 +74,8 @@ bdrv_co_do_copy_on_readv(void *bs, int64_t sector_num, int
nb_sectors, int64_t c
# block/stream.c
stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int
is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p
base %p s %p co %p opaque %p"
+commit_one_iteration(void *s, int64_t sector_num, int nb_sectors, int
is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
+commit_start(void *bs, void *base, void *top, void *s, void *co, void *opaque)
"bs %p base %p top %p s %p co %p opaque %p"
# blockdev.c
qmp_block_job_cancel(void *job) "job %p"
--
1.7.6.5
- [Qemu-devel] [PULL 00/30] Block patches, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 02/30] block: after creating a live snapshot, make old image read-only, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 01/30] block-migration: Flush requests in blk_mig_cleanup, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 03/30] aio: Fix qemu_aio_wait() to maintain correct walking_handlers count, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 06/30] configure: Add a config option for GlusterFS as block backend, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 07/30] block: Support GlusterFS as a QEMU block backend., Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 05/30] aio: Another fix to the walking_handlers logic, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 10/30] blockdev: rename block_stream_cb to a generic block_job_cb, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 09/30] block: add live block commit functionality,
Kevin Wolf <=
- [Qemu-devel] [PATCH 11/30] block: helper function, to find the base image of a chain, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 08/30] block: add support functions for live commit, to find and delete images., Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 04/30] qemu: URI parsing library, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 12/30] QAPI: add command for live block commit, 'block-commit', Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 13/30] qemu-iotests: add initial tests for live block commit, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 14/30] qerror/block: introduce QERR_BLOCK_JOB_NOT_ACTIVE, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 15/30] block: fix documentation of block_job_cancel_sync, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 17/30] block: add block_job_query, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 18/30] qmp: add 'busy' member to BlockJobInfo, Kevin Wolf, 2012/09/28
- [Qemu-devel] [PATCH 16/30] block: move job APIs to separate files, Kevin Wolf, 2012/09/28