Re: [PATCH 5/5] block/io: improve savevm performance


From: Denis V. Lunev
Subject: Re: [PATCH 5/5] block/io: improve savevm performance
Date: Thu, 18 Jun 2020 14:07:42 +0300
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Thunderbird/68.8.0

On 6/18/20 1:56 PM, Vladimir Sementsov-Ogievskiy wrote:
> 16.06.2020 19:20, Denis V. Lunev wrote:
>> This patch does two basic things:
>> - it creates an intermediate buffer for all writes from the QEMU
>>    migration code to the block driver,
>> - this buffer is sent to disk asynchronously, allowing several writes to
>>    run in parallel.
>>
>> Thus bdrv_vmstate_write() becomes asynchronous. Completion of all
>> pending operations is performed in the newly introduced
>> bdrv_flush_vmstate().
>>
>> In general, the migration code is fantastically inefficient (by
>> observation): buffers are not aligned and are sent in arbitrary pieces,
>> often less than 100 bytes per chunk, which results in read-modify-write
>> operations if the target file descriptor is opened with O_DIRECT. It
>> should also be noted that all operations go into unallocated image
>> blocks, which also suffer from partial writes to such new clusters even
>> on cached file descriptors.
>>
>> Snapshot creation time (2 GB Fedora-31 VM running over NVME storage):
>>                  original     fixed
>> cached:          1.79s       1.27s
>> non-cached:      3.29s       0.81s
>>
>> The difference over HDD would be more significant :)
>>
>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>> CC: Kevin Wolf <kwolf@redhat.com>
>> CC: Max Reitz <mreitz@redhat.com>
>> CC: Stefan Hajnoczi <stefanha@redhat.com>
>> CC: Fam Zheng <fam@euphon.net>
>> CC: Juan Quintela <quintela@redhat.com>
>> CC: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
>> CC: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> CC: Denis Plotnikov <dplotnikov@virtuozzo.com>
>> ---
>>   block/io.c                | 123 +++++++++++++++++++++++++++++++++++++-
>>   include/block/block_int.h |   8 +++
>>   2 files changed, 129 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/io.c b/block/io.c
>> index 8718df4ea8..1979098c02 100644
>> --- a/block/io.c
>> +++ b/block/io.c
>> @@ -26,6 +26,7 @@
>>   #include "trace.h"
>>   #include "sysemu/block-backend.h"
>>   #include "block/aio-wait.h"
>> +#include "block/aio_task.h"
>>   #include "block/blockjob.h"
>>   #include "block/blockjob_int.h"
>>   #include "block/block_int.h"
>> @@ -33,6 +34,7 @@
>>   #include "qapi/error.h"
>>   #include "qemu/error-report.h"
>>   #include "qemu/main-loop.h"
>> +#include "qemu/units.h"
>>   #include "sysemu/replay.h"
>>
>>   /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
>> @@ -2640,6 +2642,100 @@ typedef struct BdrvVmstateCo {
>>       bool                is_read;
>>   } BdrvVmstateCo;
>>
>> +typedef struct BdrvVMStateTask {
>> +    AioTask task;
>> +
>> +    BlockDriverState *bs;
>> +    int64_t offset;
>> +    void *buf;
>> +    size_t bytes;
>> +} BdrvVMStateTask;
>> +
>> +typedef struct BdrvSaveVMState {
>> +    AioTaskPool *pool;
>> +    BdrvVMStateTask *t;
>> +} BdrvSaveVMState;
>> +
>> +
>> +static coroutine_fn int bdrv_co_vmstate_save_task_entry(AioTask *task)
>> +{
>> +    int err = 0;
>> +    BdrvVMStateTask *t = container_of(task, BdrvVMStateTask, task);
>> +
>> +    if (t->bytes != 0) {
>> +        QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, t->buf, t->bytes);
>> +
>> +        bdrv_inc_in_flight(t->bs);
>> +        err = t->bs->drv->bdrv_save_vmstate(t->bs, &qiov, t->offset);
>> +        bdrv_dec_in_flight(t->bs);
>> +    }
>> +
>> +    qemu_vfree(t->buf);
>> +    return err;
>> +}
>> +
>> +static BdrvVMStateTask *bdrv_vmstate_task_create(BlockDriverState *bs,
>> +                                                 int64_t pos, size_t size)
>> +{
>> +    BdrvVMStateTask *t = g_new(BdrvVMStateTask, 1);
>> +
>> +    *t = (BdrvVMStateTask) {
>> +        .task.func = bdrv_co_vmstate_save_task_entry,
>> +        .buf = qemu_blockalign(bs, size),
>> +        .offset = pos,
>> +        .bs = bs,
>> +    };
>> +
>> +    return t;
>> +}
>> +
>> +static int bdrv_co_do_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
>> +                                   int64_t pos)
>> +{
>> +    BdrvSaveVMState *state = bs->savevm_state;
>> +    BdrvVMStateTask *t;
>> +    size_t buf_size = MAX(bdrv_get_cluster_size(bs), 1 * MiB);
>> +    size_t to_copy, off;
>> +
>> +    if (state == NULL) {
>> +        state = g_new(BdrvSaveVMState, 1);
>> +        *state = (BdrvSaveVMState) {
>> +            .pool = aio_task_pool_new(BDRV_VMSTATE_WORKERS_MAX),
>> +            .t = bdrv_vmstate_task_create(bs, pos, buf_size),
>> +        };
>> +
>> +        bs->savevm_state = state;
>> +    }
>> +
>> +    if (aio_task_pool_status(state->pool) < 0) {
>> +        /* Caller is responsible for cleanup. We should block all further
>> +         * save operations for this exact state */
>> +        return aio_task_pool_status(state->pool);
>> +    }
>> +
>> +    t = state->t;
>> +    if (t->offset + t->bytes != pos) {
>> +        /* Normally this branch is not reachable from migration */
>> +        return bs->drv->bdrv_save_vmstate(bs, qiov, pos);
>> +    }
>> +
>> +    off = 0;
>> +    while (1) {
>
> "while (aio_task_pool_status(state->pool) == 0)" + "return
> aio_task_pool_status(state->pool)" after loop will improve
> interactivity on failure path, but shouldn't be necessary.
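
For reference, that suggestion would make the loop look roughly like this
(a sketch only, with the rest of the loop body elided):

    off = 0;
    while (aio_task_pool_status(state->pool) == 0) {
        to_copy = MIN(qiov->size - off, buf_size - t->bytes);
        qemu_iovec_to_buf(qiov, off, t->buf + t->bytes, to_copy);
        t->bytes += to_copy;
        if (t->bytes < buf_size) {
            return qiov->size;
        }
        /* ... submit the filled buffer and start a new task,
         *     as in the rest of the patch ... */
    }
    /* Report the failure that stopped the loop. */
    return aio_task_pool_status(state->pool);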
>
>> +        to_copy = MIN(qiov->size - off, buf_size - t->bytes);
>> +        qemu_iovec_to_buf(qiov, off, t->buf + t->bytes, to_copy);
>> +        t->bytes += to_copy;
>> +        if (t->bytes < buf_size) {
>> +            return qiov->size;
>
> Interesting: we are substituting .bdrv_save_vmstate with this function.
> There are two existing implementations now:
>
> qcow2_save_vmstate, which doesn't bother to return qiov->size, and
> sd_save_vmstate, which does.
>
> So it seems safe to return qiov->size now, but I'm sure that it's
> actually unused and should be refactored to return 0 on success
> everywhere.

This looks like my mistake now. I have spent some time on the error
codes while working through Eric's suggestions, and I believe that 0
should be returned now.
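
I.e. the early-return path in bdrv_co_do_save_vmstate() would become
something like this (a sketch of the planned change only):

    if (t->bytes < buf_size) {
        /* Data is only buffered so far; report success as 0 rather
         * than qiov->size. */
        return 0;
    }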

Will fix in next revision.

Den


