Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices
From: Sam Li
Subject: Re: [PATCH v4 2/3] block: introduce zone append write for zoned devices
Date: Tue, 18 Oct 2022 16:48:23 +0800
Damien Le Moal <damien.lemoal@opensource.wdc.com> wrote on Mon, Oct 17, 2022 at 13:22:
>
> On 10/16/22 23:56, Sam Li wrote:
> > A zone append command is a write operation that specifies the first
> > logical block of a zone as the write position. When writing to a zoned
> > block device using zone append, the byte offset of the write points
> > to the write pointer of that zone. Upon completion, the device returns
> > the position at which the data has been written in the zone.
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> > block/block-backend.c | 65 ++++++++++++++++++++++
> > block/file-posix.c | 89 +++++++++++++++++++++++++++++--
> > block/io.c | 21 ++++++++
> > block/raw-format.c | 8 +++
> > include/block/block-io.h | 3 ++
> > include/block/block_int-common.h | 5 ++
> > include/block/raw-aio.h | 4 +-
> > include/sysemu/block-backend-io.h | 9 ++++
> > 8 files changed, 198 insertions(+), 6 deletions(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index 1c618e9c68..06931ddd24 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
> > struct {
> > unsigned long op;
> > } zone_mgmt;
> > + struct {
> > + int64_t *append_sector;
>
> As mentioned previously, call this sector. "append" is already in the
> zone_append struct member name....
Right, missed it here. I'll use "offset" then since it's in the block layer.
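Something like this for v5 (just a naming sketch, untested; surrounding
fields elided):

    struct {
        int64_t *offset;   /* in/out byte offset, updated on completion */
    } zone_append;

and the call site in blk_aio_zone_append_entry() then becomes:

    rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.offset,
                                   rwco->iobuf, rwco->flags);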
>
> > + } zone_append;
> > };
> > } BlkRwCo;
> >
> > @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> > return &acb->common;
> > }
> >
> > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> > +{
> > + BlkAioEmAIOCB *acb = opaque;
> > + BlkRwCo *rwco = &acb->rwco;
> > +
> > + rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
>
> ...so you avoid awkward repetitions of "append" like here. You'll have:
> rwco->zone_append.sector, which is shorter and more natural.
>
> > + rwco->iobuf, rwco->flags);
> > + blk_aio_complete(acb);
> > +}
> > +
> > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> > + QEMUIOVector *qiov, BdrvRequestFlags flags,
> > + BlockCompletionFunc *cb, void *opaque) {
> > + BlkAioEmAIOCB *acb;
> > + Coroutine *co;
> > + IO_CODE();
> > +
> > + blk_inc_in_flight(blk);
> > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> > + acb->rwco = (BlkRwCo) {
> > + .blk = blk,
> > + .ret = NOT_DONE,
> > + .flags = flags,
> > + .iobuf = qiov,
> > + .zone_append = {
> > + .append_sector = offset,
> > + },
> > + };
> > + acb->has_returned = false;
> > +
> > + co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> > + bdrv_coroutine_enter(blk_bs(blk), co);
> > + acb->has_returned = true;
> > + if (acb->rwco.ret != NOT_DONE) {
> > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> > + blk_aio_complete_bh, acb);
> > + }
> > +
> > + return &acb->common;
> > +}
> > +
> > /*
> > * Send a zone_report command.
> > * offset is a byte offset from the start of the device. No alignment
> > @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> > return ret;
> > }
> >
> > +/*
> > + * Send a zone_append command.
> > + */
> > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> > + QEMUIOVector *qiov, BdrvRequestFlags flags)
> > +{
> > + int ret;
> > + IO_CODE();
> > +
> > + blk_inc_in_flight(blk);
> > + blk_wait_while_drained(blk);
> > + if (!blk_is_available(blk)) {
> > + blk_dec_in_flight(blk);
> > + return -ENOMEDIUM;
> > + }
> > +
> > + ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> > + blk_dec_in_flight(blk);
> > + return ret;
> > +}
> > +
> > void blk_drain(BlockBackend *blk)
> > {
> > BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 5ff5500301..3d0cc33d02 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData {
> > struct {
> > struct iovec *iov;
> > int niov;
> > + int64_t *offset;
> > } io;
> > struct {
> > uint64_t cmd;
> > @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> > bs->bl.max_active_zones = ret;
> > }
> >
> > + ret = get_sysfs_long_val(&st, "physical_block_size");
> > + if (ret >= 0) {
> > + bs->bl.write_granularity = ret;
> > + }
> > +
> > bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> > if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) {
> > error_report("report wps failed");
> > @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
> > static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
> > {
> > ssize_t len;
> > + BlockZoneWps *wps = aiocb->bs->bl.wps;
> > + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> > +
> > + if (wps) {
> > + qemu_mutex_lock(&wps->lock);
> > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> > + aiocb->aio_offset = wps->wp[index];
> > + }
> > + }
> >
> > do {
> > - if (aiocb->aio_type & QEMU_AIO_WRITE)
> > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
> > len = qemu_pwritev(aiocb->aio_fildes,
> > aiocb->io.iov,
> > aiocb->io.niov,
> > @@ -1660,6 +1675,9 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
> > aiocb->io.niov,
> > aiocb->aio_offset);
> > } while (len == -1 && errno == EINTR);
> > + if (wps) {
> > + qemu_mutex_unlock(&wps->lock);
>
> As commented in the previous patch, you cannot release the lock until you
> update the wp array entry. So this means that you should be taking the wp
> lock at the beginning of handle_aiocb_rw() and releasing it only after the
> wp array is updated. That will also simplify the code and avoid spreading
> lock/unlock of that array to many places.
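Something like this, then (rough sketch only, not the actual change;
error handling elided), so the lock covers both the I/O and the wp update:

    static int handle_aiocb_rw(void *opaque)
    {
        RawPosixAIOData *aiocb = opaque;
        BlockZoneWps *wps = aiocb->bs->bl.wps;
        ssize_t nbytes = 0;

        if (wps) {
            qemu_mutex_lock(&wps->lock);
        }

        /* For QEMU_AIO_ZONE_APPEND, read the write position from
         * wps->wp[index] here, while the lock is held, then do the
         * vectored/linear I/O as before. */

        /* On success, advance wps->wp[index] (and report it back through
         * aiocb->io.offset for zone append); on error, refresh the wp
         * entry with update_zones_wp(). Only then drop the lock. */

        if (wps) {
            qemu_mutex_unlock(&wps->lock);
        }

        return nbytes;
    }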
>
> > + }
> >
> > if (len == -1) {
> > return -errno;
> > @@ -1677,9 +1695,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
> > {
> > ssize_t offset = 0;
> > ssize_t len;
> > + BlockZoneWps *wps = aiocb->bs->bl.wps;
> > + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> >
> > + if (wps) {
> > + qemu_mutex_lock(&wps->lock);
> > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> > + aiocb->aio_offset = wps->wp[index];
> > + }
> > + }
> > while (offset < aiocb->aio_nbytes) {
> > - if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> > len = pwrite(aiocb->aio_fildes,
> > (const char *)buf + offset,
> > aiocb->aio_nbytes - offset,
> > @@ -1709,6 +1735,9 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
> > }
> > offset += len;
> > }
> > + if (wps) {
> > + qemu_mutex_unlock(&wps->lock);
>
> Same comment as above.
>
> > + }
> >
> > return offset;
> > }
> > @@ -1772,7 +1801,7 @@ static int handle_aiocb_rw(void *opaque)
> > }
> >
> > nbytes = handle_aiocb_rw_linear(aiocb, buf);
> > - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
> > + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
> > char *p = buf;
> > size_t count = aiocb->aio_nbytes, copy;
> > int i;
> > @@ -1794,7 +1823,7 @@ static int handle_aiocb_rw(void *opaque)
> > out:
> > if (nbytes == aiocb->aio_nbytes) {
> > #if defined(CONFIG_BLKZONED)
> > - if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> > BlockZoneWps *wps = aiocb->bs->bl.wps;
> > int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> > if (wps) {
> > @@ -1802,6 +1831,9 @@ out:
> > if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> > uint64_t wend_offset =
> > aiocb->aio_offset + aiocb->aio_nbytes;
> > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) {
> > + *aiocb->io.offset = wps->wp[index];
> > + }
>
> The comment above will address the problem here, which is that you are
> accessing the array without it being locked. You cannot do that.
>
> >
> > /* Advance the wp if needed */
> > if (wend_offset > wps->wp[index]) {
> > @@ -1824,7 +1856,7 @@ out:
> > } else {
> > assert(nbytes < 0);
> > #if defined(CONFIG_BLKZONED)
> > - if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> > update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1);
> > }
> > #endif
> > @@ -3478,6 +3510,52 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> > }
> > #endif
> >
> > +#if defined(CONFIG_BLKZONED)
> > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> > + int64_t *offset,
> > + QEMUIOVector *qiov,
> > + BdrvRequestFlags flags) {
> > + BDRVRawState *s = bs->opaque;
> > + int64_t zone_size_mask = bs->bl.zone_size - 1;
> > + int64_t iov_len = 0;
> > + int64_t len = 0;
> > + RawPosixAIOData acb;
> > +
> > + if (*offset & zone_size_mask) {
> > + error_report("sector offset %" PRId64 " is not aligned to zone size "
> > + "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> > + return -EINVAL;
> > + }
> > +
> > + int64_t wg = bs->bl.write_granularity;
> > + int64_t wg_mask = wg - 1;
> > + for (int i = 0; i < qiov->niov; i++) {
> > + iov_len = qiov->iov[i].iov_len;
> > + if (iov_len & wg_mask) {
> > + error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
> > + "block size %" PRId64 "", i, iov_len, wg);
> > + return -EINVAL;
> > + }
> > + len += iov_len;
> > + }
> > +
> > + acb = (RawPosixAIOData) {
> > + .bs = bs,
> > + .aio_fildes = s->fd,
> > + .aio_type = QEMU_AIO_ZONE_APPEND,
> > + .aio_offset = *offset,
> > + .aio_nbytes = len,
> > + .io = {
> > + .iov = qiov->iov,
> > + .niov = qiov->niov,
> > + .offset = offset,
> > + },
> > + };
> > +
> > + return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb);
> > +}
> > +#endif
> > +
> > static coroutine_fn int
> > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
> > bool blkdev)
> > @@ -4253,6 +4331,7 @@ static BlockDriver bdrv_zoned_host_device = {
> > /* zone management operations */
> > .bdrv_co_zone_report = raw_co_zone_report,
> > .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > + .bdrv_co_zone_append = raw_co_zone_append,
> > };
> > #endif
> >
> > diff --git a/block/io.c b/block/io.c
> > index 88f707ea4d..03e1109056 100644
> > --- a/block/io.c
> > +++ b/block/io.c
> > @@ -3230,6 +3230,27 @@ out:
> > return co.ret;
> > }
> >
> > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> > + QEMUIOVector *qiov,
> > + BdrvRequestFlags flags)
> > +{
> > + BlockDriver *drv = bs->drv;
> > + CoroutineIOCompletion co = {
> > + .coroutine = qemu_coroutine_self(),
> > + };
> > + IO_CODE();
> > +
> > + bdrv_inc_in_flight(bs);
> > + if (!drv || !drv->bdrv_co_zone_append) {
> > + co.ret = -ENOTSUP;
> > + goto out;
> > + }
> > + co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
> > +out:
> > + bdrv_dec_in_flight(bs);
> > + return co.ret;
> > +}
> > +
> > void *qemu_blockalign(BlockDriverState *bs, size_t size)
> > {
> > IO_CODE();
> > diff --git a/block/raw-format.c b/block/raw-format.c
> > index 18dc52a150..33bff8516e 100644
> > --- a/block/raw-format.c
> > +++ b/block/raw-format.c
> > @@ -325,6 +325,13 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> > return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
> > }
> >
> > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> > + int64_t *offset,
> > + QEMUIOVector *qiov,
> > + BdrvRequestFlags flags) {
> > + return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
> > +}
> > +
> > static int64_t raw_getlength(BlockDriverState *bs)
> > {
> > int64_t len;
> > @@ -629,6 +636,7 @@ BlockDriver bdrv_raw = {
> > .bdrv_co_pdiscard = &raw_co_pdiscard,
> > .bdrv_co_zone_report = &raw_co_zone_report,
> > .bdrv_co_zone_mgmt = &raw_co_zone_mgmt,
> > + .bdrv_co_zone_append = &raw_co_zone_append,
> > .bdrv_co_block_status = &raw_co_block_status,
> > .bdrv_co_copy_range_from = &raw_co_copy_range_from,
> > .bdrv_co_copy_range_to = &raw_co_copy_range_to,
> > diff --git a/include/block/block-io.h b/include/block/block-io.h
> > index f0cdf67d33..6a54453578 100644
> > --- a/include/block/block-io.h
> > +++ b/include/block/block-io.h
> > @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
> > BlockZoneDescriptor *zones);
> > int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> > int64_t offset, int64_t len);
> > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> > + QEMUIOVector *qiov,
> > + BdrvRequestFlags flags);
> >
> > int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
> > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
> > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> > index e3136146aa..a7e7db5646 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -701,6 +701,9 @@ struct BlockDriver {
> > BlockZoneDescriptor *zones);
> > int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs,
> > BlockZoneOp op,
> > int64_t offset, int64_t len);
> > + int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
> > + int64_t *offset, QEMUIOVector *qiov,
> > + BdrvRequestFlags flags);
> >
> > /* removable device specific */
> > bool (*bdrv_is_inserted)(BlockDriverState *bs);
> > @@ -860,6 +863,8 @@ typedef struct BlockLimits {
> >
> > /* array of write pointers' location of each zone in the zoned device. */
> > BlockZoneWps *wps;
> > +
> > + int64_t write_granularity;
> > } BlockLimits;
> >
> > typedef struct BdrvOpBlocker BdrvOpBlocker;
> > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> > index 877b2240b3..53033a5ca7 100644
> > --- a/include/block/raw-aio.h
> > +++ b/include/block/raw-aio.h
> > @@ -31,6 +31,7 @@
> > #define QEMU_AIO_TRUNCATE 0x0080
> > #define QEMU_AIO_ZONE_REPORT 0x0100
> > #define QEMU_AIO_ZONE_MGMT 0x0200
> > +#define QEMU_AIO_ZONE_APPEND 0x0400
> > #define QEMU_AIO_TYPE_MASK \
> > (QEMU_AIO_READ | \
> > QEMU_AIO_WRITE | \
> > @@ -41,7 +42,8 @@
> > QEMU_AIO_COPY_RANGE | \
> > QEMU_AIO_TRUNCATE | \
> > QEMU_AIO_ZONE_REPORT | \
> > - QEMU_AIO_ZONE_MGMT)
> > + QEMU_AIO_ZONE_MGMT | \
> > + QEMU_AIO_ZONE_APPEND)
> >
> > /* AIO flags */
> > #define QEMU_AIO_MISALIGNED 0x1000
> > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
> > index 1b5fc7db6b..ff9f777f52 100644
> > --- a/include/sysemu/block-backend-io.h
> > +++ b/include/sysemu/block-backend-io.h
> > @@ -52,6 +52,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
> > BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> > int64_t offset, int64_t len,
> > BlockCompletionFunc *cb, void *opaque);
> > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> > + QEMUIOVector *qiov, BdrvRequestFlags flags,
> > + BlockCompletionFunc *cb, void *opaque);
> > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
> > BlockCompletionFunc *cb, void *opaque);
> > void blk_aio_cancel_async(BlockAIOCB *acb);
> > @@ -173,6 +176,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> > int64_t offset, int64_t len);
> > int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> > int64_t offset, int64_t len);
> > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> > + QEMUIOVector *qiov,
> > + BdrvRequestFlags flags);
> > +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset,
> > + QEMUIOVector *qiov,
> > + BdrvRequestFlags flags);
> >
> > int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
> > int64_t bytes);
>
> --
> Damien Le Moal
> Western Digital Research
>