qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v6 2/4] block: introduce zone append write for zoned devices


From: Dmitry Fomichev
Subject: Re: [PATCH v6 2/4] block: introduce zone append write for zoned devices
Date: Tue, 14 Mar 2023 02:55:25 +0000
User-agent: Evolution 3.44.4-0ubuntu1

On Fri, 2023-03-10 at 18:31 +0800, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of writes is pointing
> to the write pointer of that zone.

s/writes is pointing to the write pointer of that zone/the call may point at any
position within the zone to which the data is being appended/

>  Upon completion the device will
> respond with the position the data

s/position the data/position where the data/

>  has been written in the zone.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>

With nits above,

Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>

> ---
>  block/block-backend.c             | 60 +++++++++++++++++++++++++++++++
>  block/file-posix.c                | 54 +++++++++++++++++++++++++---
>  block/io.c                        | 21 +++++++++++
>  block/io_uring.c                  |  4 +++
>  block/linux-aio.c                 |  3 ++
>  block/raw-format.c                |  8 +++++
>  include/block/block-io.h          |  4 +++
>  include/block/block_int-common.h  |  5 +++
>  include/block/raw-aio.h           |  4 ++-
>  include/sysemu/block-backend-io.h |  9 +++++
>  10 files changed, 166 insertions(+), 6 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index f70b08e3f6..28e8f5d778 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1888,6 +1888,45 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>      return &acb->common;
>  }
>  
> +static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
> +{
> +    BlkAioEmAIOCB *acb = opaque;
> +    BlkRwCo *rwco = &acb->rwco;
> +
> +    rwco->ret = blk_co_zone_append(rwco->blk, &acb->bytes,
> +                                   rwco->iobuf, rwco->flags);
> +    blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> +                                BlockCompletionFunc *cb, void *opaque) {
> +    BlkAioEmAIOCB *acb;
> +    Coroutine *co;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +    acb->rwco = (BlkRwCo) {
> +        .blk    = blk,
> +        .ret    = NOT_DONE,
> +        .flags  = flags,
> +        .iobuf  = qiov,
> +    };
> +    acb->bytes = *offset;
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +    aio_co_enter(blk_get_aio_context(blk), co);
> +    acb->has_returned = true;
> +    if (acb->rwco.ret != NOT_DONE) {
> +        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> +                                         blk_aio_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1939,6 +1978,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>      return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +        QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    if (!blk_is_available(blk)) {
> +        blk_dec_in_flight(blk);
> +        return -ENOMEDIUM;
> +    }
> +
> +    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>      BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 61ed769ac8..2ba9174778 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -160,6 +160,7 @@ typedef struct BDRVRawState {
>      bool has_write_zeroes:1;
>      bool use_linux_aio:1;
>      bool use_linux_io_uring:1;
> +    int64_t *offset; /* offset of zone append operation */
>      int page_cache_inconsistent; /* errno from fdatasync failure */
>      bool has_fallocate;
>      bool needs_alignment;
> @@ -1672,7 +1673,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>      ssize_t len;
>  
>      len = RETRY_ON_EINTR(
> -        (aiocb->aio_type & QEMU_AIO_WRITE) ?
> +        (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
>              qemu_pwritev(aiocb->aio_fildes,
>                             aiocb->io.iov,
>                             aiocb->io.niov,
> @@ -1701,7 +1702,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
>      ssize_t len;
>  
>      while (offset < aiocb->aio_nbytes) {
> -        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>              len = pwrite(aiocb->aio_fildes,
>                           (const char *)buf + offset,
>                           aiocb->aio_nbytes - offset,
> @@ -1794,7 +1795,7 @@ static int handle_aiocb_rw(void *opaque)
>      }
>  
>      nbytes = handle_aiocb_rw_linear(aiocb, buf);
> -    if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
> +    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
>          char *p = buf;
>          size_t count = aiocb->aio_nbytes, copy;
>          int i;
> @@ -2431,6 +2432,10 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
>  #if defined(CONFIG_BLKZONED)
>      if (bs->bl.wps) {
>          qemu_co_mutex_lock(&bs->bl.wps->colock);
> +        if (type & QEMU_AIO_ZONE_APPEND && bs->bl.zone_size) {
> +            int index = offset / bs->bl.zone_size;
> +            offset = bs->bl.wps->wp[index];
> +        }
>      }
>  #endif
>  
> @@ -2478,9 +2483,13 @@ out:
>  #if defined(CONFIG_BLKZONED)
>      BlockZoneWps *wps = bs->bl.wps;
>      if (ret == 0) {
> -        if (type & QEMU_AIO_WRITE && wps && bs->bl.zone_size) {
> +        if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
> +            && wps && bs->bl.zone_size) {
>              int index = offset / bs->bl.zone_size;
>              if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +                if (type & QEMU_AIO_ZONE_APPEND) {
> +                    *s->offset = wps->wp[index];
> +                }
>                  /* Advance the wp if needed */
>                  if (offset + bytes > wps->wp[index]) {
>                      wps->wp[index] = offset + bytes;
> @@ -2488,7 +2497,7 @@ out:
>              }
>          }
>      } else {
> -        if (type & QEMU_AIO_WRITE) {
> +        if (type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
>              update_zones_wp(s->fd, bs->bl.wps, 0, 1);
>          }
>      }
> @@ -3498,6 +3507,40 @@ out:
>  }
>  #endif
>  
> +#if defined(CONFIG_BLKZONED)
> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> +                                           int64_t *offset,
> +                                           QEMUIOVector *qiov,
> +                                           BdrvRequestFlags flags) {
> +    assert(flags == 0);
> +    int64_t zone_size_mask = bs->bl.zone_size - 1;
> +    int64_t iov_len = 0;
> +    int64_t len = 0;
> +    BDRVRawState *s = bs->opaque;
> +    s->offset = offset;
> +
> +    if (*offset & zone_size_mask) {
> +        error_report("sector offset %" PRId64 " is not aligned to zone size "
> +                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> +        return -EINVAL;
> +    }
> +
> +    int64_t wg = bs->bl.write_granularity;
> +    int64_t wg_mask = wg - 1;
> +    for (int i = 0; i < qiov->niov; i++) {
> +        iov_len = qiov->iov[i].iov_len;
> +        if (iov_len & wg_mask) {
> +            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
> +                         "block size %" PRId64 "", i, iov_len, wg);
> +            return -EINVAL;
> +        }
> +        len += iov_len;
> +    }
> +
> +    return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND);
> +}
> +#endif
> +
>  static coroutine_fn int
>  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>                  bool blkdev)
> @@ -4259,6 +4302,7 @@ static BlockDriver bdrv_host_device = {
>      /* zone management operations */
>      .bdrv_co_zone_report = raw_co_zone_report,
>      .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> +    .bdrv_co_zone_append = raw_co_zone_append,
>  #endif
>  };
>  
> diff --git a/block/io.c b/block/io.c
> index 5dbf1e50f2..fe9cabaaf6 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -3152,6 +3152,27 @@ out:
>      return co.ret;
>  }
>  
> +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
> +                        QEMUIOVector *qiov,
> +                        BdrvRequestFlags flags)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +            .coroutine = qemu_coroutine_self(),
> +    };
> +    IO_CODE();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_zone_append || bs->bl.zoned == BLK_Z_NONE) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +    co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
>  void *qemu_blockalign(BlockDriverState *bs, size_t size)
>  {
>      IO_CODE();
> diff --git a/block/io_uring.c b/block/io_uring.c
> index 973e15d876..f7488c241a 100644
> --- a/block/io_uring.c
> +++ b/block/io_uring.c
> @@ -345,6 +345,10 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
>          io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
>                               luringcb->qiov->niov, offset);
>          break;
> +    case QEMU_AIO_ZONE_APPEND:
> +        io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
> +                             luringcb->qiov->niov, offset);
> +        break;
>      case QEMU_AIO_READ:
>          io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
>                              luringcb->qiov->niov, offset);
> diff --git a/block/linux-aio.c b/block/linux-aio.c
> index d2cfb7f523..1959834156 100644
> --- a/block/linux-aio.c
> +++ b/block/linux-aio.c
> @@ -389,6 +389,9 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
>      case QEMU_AIO_WRITE:
>          io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
>          break;
> +    case QEMU_AIO_ZONE_APPEND:
> +        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
> +        break;
>      case QEMU_AIO_READ:
>          io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
>          break;
> diff --git a/block/raw-format.c b/block/raw-format.c
> index 72e23e7b55..64e7d48d04 100644
> --- a/block/raw-format.c
> +++ b/block/raw-format.c
> @@ -332,6 +332,13 @@ raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>      return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
>  }
>  
> +static int coroutine_fn GRAPH_RDLOCK
> +raw_co_zone_append(BlockDriverState *bs,int64_t *offset, QEMUIOVector *qiov,
> +                   BdrvRequestFlags flags)
> +{
> +    return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
> +}
> +
>  static int64_t coroutine_fn GRAPH_RDLOCK
>  raw_co_getlength(BlockDriverState *bs)
>  {
> @@ -635,6 +642,7 @@ BlockDriver bdrv_raw = {
>      .bdrv_co_pdiscard     = &raw_co_pdiscard,
>      .bdrv_co_zone_report  = &raw_co_zone_report,
>      .bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
> +    .bdrv_co_zone_append = &raw_co_zone_append,
>      .bdrv_co_block_status = &raw_co_block_status,
>      .bdrv_co_copy_range_from = &raw_co_copy_range_from,
>      .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
> diff --git a/include/block/block-io.h b/include/block/block-io.h
> index 19d1fad9cf..55fca02991 100644
> --- a/include/block/block-io.h
> +++ b/include/block/block-io.h
> @@ -120,6 +120,10 @@ int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_report(BlockDriverState *bs,
>  int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_mgmt(BlockDriverState *bs,
>                                                  BlockZoneOp op,
>                                                  int64_t offset, int64_t len);
> +int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_append(BlockDriverState *bs,
> +                                                  int64_t *offset,
> +                                                  QEMUIOVector *qiov,
> +                                                  BdrvRequestFlags flags);
>  
>  bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
>  int bdrv_block_status(BlockDriverState *bs, int64_t offset,
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 19915b34af..ccd8811919 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -724,6 +724,9 @@ struct BlockDriver {
>              BlockZoneDescriptor *zones);
>      int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
>              int64_t offset, int64_t len);
> +    int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
> +            int64_t *offset, QEMUIOVector *qiov,
> +            BdrvRequestFlags flags);
>  
>      /* removable device specific */
>      bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_is_inserted)(
> @@ -887,6 +890,8 @@ typedef struct BlockLimits {
>  
>      /* array of write pointers' location of each zone in the zoned device. */
>      BlockZoneWps *wps;
> +
> +    int64_t write_granularity;
>  } BlockLimits;
>  
>  typedef struct BdrvOpBlocker BdrvOpBlocker;
> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> index eda6a7a253..fb9c9f5a01 100644
> --- a/include/block/raw-aio.h
> +++ b/include/block/raw-aio.h
> @@ -30,6 +30,7 @@
>  #define QEMU_AIO_TRUNCATE     0x0080
>  #define QEMU_AIO_ZONE_REPORT  0x0100
>  #define QEMU_AIO_ZONE_MGMT    0x0200
> +#define QEMU_AIO_ZONE_APPEND  0x0400
>  #define QEMU_AIO_TYPE_MASK \
>          (QEMU_AIO_READ | \
>           QEMU_AIO_WRITE | \
> @@ -40,7 +41,8 @@
>           QEMU_AIO_COPY_RANGE | \
>           QEMU_AIO_TRUNCATE | \
>           QEMU_AIO_ZONE_REPORT | \
> -         QEMU_AIO_ZONE_MGMT)
> +         QEMU_AIO_ZONE_MGMT | \
> +         QEMU_AIO_ZONE_APPEND)
>  
>  /* AIO flags */
>  #define QEMU_AIO_MISALIGNED   0x1000
> diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
> index f575ab5b6b..e716591a1a 100644
> --- a/include/sysemu/block-backend-io.h
> +++ b/include/sysemu/block-backend-io.h
> @@ -53,6 +53,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
>  BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                int64_t offset, int64_t len,
>                                BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +                                QEMUIOVector *qiov, BdrvRequestFlags flags,
> +                                BlockCompletionFunc *cb, void *opaque);
>  BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
>                               BlockCompletionFunc *cb, void *opaque);
>  void blk_aio_cancel_async(BlockAIOCB *acb);
> @@ -201,6 +204,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                    int64_t offset, int64_t len);
>  int co_wrapper_mixed blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>                                         int64_t offset, int64_t len);
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +                                    QEMUIOVector *qiov,
> +                                    BdrvRequestFlags flags);
> +int co_wrapper_mixed blk_zone_append(BlockBackend *blk, int64_t *offset,
> +                                         QEMUIOVector *qiov,
> +                                         BdrvRequestFlags flags);
>  
>  int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset,
>                                    int64_t bytes);


reply via email to

[Prev in Thread] Current Thread [Next in Thread]