[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlock
From: |
Sam Li |
Subject: |
Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls. |
Date: |
Mon, 27 Jun 2022 15:45:19 +0800 |
Hi Hannes,
Hannes Reinecke <hare@suse.de> 于2022年6月27日周一 15:21写道:
>
> On 6/27/22 02:19, Sam Li wrote:
> > By adding zone management operations in BlockDriver, storage
> > controller emulation can use the new block layer APIs including
> > zone_report and zone_mgmt(open, close, finish, reset).
> > ---
> > block/block-backend.c | 56 ++++++++
> > block/coroutines.h | 5 +
> > block/file-posix.c | 238 +++++++++++++++++++++++++++++++
> > include/block/block-common.h | 43 +++++-
> > include/block/block_int-common.h | 20 +++
> > 5 files changed, 361 insertions(+), 1 deletion(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index e0e1aff4b1..786f964d02 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
> > return ret;
> > }
> >
> > +/*
> > + * Return zone_report from BlockDriver. Offset can be any number within
> > + * the zone size. No alignment for offset and len.
> > + */
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > + int64_t len, int64_t *nr_zones,
> > + BlockZoneDescriptor *zones)
> > +{
> > + int ret;
> > + BlockDriverState *bs;
> > + IO_CODE();
> > +
> > + blk_inc_in_flight(blk); /* increase before waiting */
> > + blk_wait_while_drained(blk);
> > + bs = blk_bs(blk);
> > +
> > + ret = blk_check_byte_request(blk, offset, len);
> > + if (ret < 0) {
> > + return ret;
> > + }
> > +
> > + bdrv_inc_in_flight(bs);
> > + ret = bdrv_co_zone_report(blk->root->bs, offset, len,
> > + nr_zones, zones);
> > + bdrv_dec_in_flight(bs);
> > + blk_dec_in_flight(blk);
> > + return ret;
> > +}
> > +
> > +/*
> > + * Return zone_mgmt from BlockDriver.
> > + * Offset is the start of a zone and len is aligned to zones.
> > + */
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > + int64_t offset, int64_t len)
> > +{
> > + int ret;
> > + BlockDriverState *bs;
> > + IO_CODE();
> > +
> > + blk_inc_in_flight(blk);
> > + blk_wait_while_drained(blk);
> > + bs = blk_bs(blk);
> > +
> > + ret = blk_check_byte_request(blk, offset, len);
> > + if (ret < 0) {
> > + return ret;
> > + }
> > +
> > + bdrv_inc_in_flight(bs);
> > + ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> > + bdrv_dec_in_flight(bs);
> > + blk_dec_in_flight(blk);
> > + return ret;
> > +}
> > +
> > void blk_drain(BlockBackend *blk)
> > {
> > BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/coroutines.h b/block/coroutines.h
> > index 830ecaa733..a114d7bc30 100644
> > --- a/block/coroutines.h
> > +++ b/block/coroutines.h
> > @@ -80,6 +80,11 @@ int coroutine_fn
> > blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
> >
> > int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > + int64_t len, int64_t *nr_zones,
> > + BlockZoneDescriptor *zones);
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > + int64_t offset, int64_t len);
> >
> >
> > /*
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 48cd096624..1b8b0d351f 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -67,6 +67,7 @@
> > #include <sys/param.h>
> > #include <sys/syscall.h>
> > #include <sys/vfs.h>
> > +#include <linux/blkzoned.h>
> > #include <linux/cdrom.h>
> > #include <linux/fd.h>
> > #include <linux/fs.h>
> > @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
> > PreallocMode prealloc;
> > Error **errp;
> > } truncate;
> > + struct {
> > + int64_t *nr_zones;
> > + BlockZoneDescriptor *zones;
> > + } zone_report;
> > + zone_op op;
> > };
> > } RawPosixAIOData;
> >
> > @@ -1801,6 +1807,135 @@ static off_t copy_file_range(int in_fd, off_t
> > *in_off, int out_fd,
> > }
> > #endif
> >
> > +/*
> > + * parse_zone - Fill a zone descriptor
> > + */
> > +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> > + struct blk_zone *blkz) {
> > + zone->start = blkz->start;
> > + zone->length = blkz->len;
> > + zone->cap = blkz->capacity;
> > + zone->wp = blkz->wp - blkz->start;
> > + zone->type = blkz->type;
> > + zone->cond = blkz->cond;
> > +}
> > +
> > +static int handle_aiocb_zone_report(void *opaque) {
> > + RawPosixAIOData *aiocb = opaque;
> > + int fd = aiocb->aio_fildes;
> > + int64_t *nr_zones = aiocb->zone_report.nr_zones;
> > + BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> > + int64_t offset = aiocb->aio_offset;
> > + int64_t len = aiocb->aio_nbytes;
> > +
> > + struct blk_zone *blkz;
> > + int64_t rep_size, nrz;
> > + int ret, n = 0, i = 0;
> > +
> > + nrz = *nr_zones;
> > + if (len == -1) {
> > + return -errno;
> > + }
> > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct
> > blk_zone);
> > + g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report,
> > nrz);
> > + offset = offset / 512; /* get the unit of the start sector: sector
> > size is 512 bytes. */
> > + printf("start to report zone with offset: 0x%lx\n", offset);
> > +
> > + blkz = (struct blk_zone *)(rep + 1);
> > + while (n < nrz) {
> > + memset(rep, 0, rep_size);
> > + rep->sector = offset;
> > + rep->nr_zones = nrz;
> > +
> > + ret = ioctl(fd, BLKREPORTZONE, rep);
> > + if (ret != 0) {
> > + ret = -errno;
> > + error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
> > + fd, offset, errno);
> > + return ret;
> > + }
> > +
> > + if (!rep->nr_zones) {
> > + break;
> > + }
> > +
> > + for (i = 0; i < rep->nr_zones; i++, n++) {
> > + parse_zone(&zones[n], &blkz[i]);
> > + /* The next report should start after the last zone reported */
> > + offset = blkz[i].start + blkz[i].len;
> > + }
>
> Where do you increase 'n' such that the loop can make forward progress?
> Wouldn't it be better to use a for() loop here?
>
'n' increases in this for loop as 'i' increases. I think the for()
loop can serve the same purpose with some modifications.
> > + }
> > +
> > + *nr_zones = n;
> > + return 0;
> > +}
> > +
> > +static int handle_aiocb_zone_mgmt(void *opaque) {
> > + RawPosixAIOData *aiocb = opaque;
> > + int fd = aiocb->aio_fildes;
> > + int64_t offset = aiocb->aio_offset;
> > + int64_t len = aiocb->aio_nbytes;
> > + zone_op op = aiocb->op;
> > +
> > + struct blk_zone_range range;
> > + const char *ioctl_name;
> > + unsigned long ioctl_op;
> > + int64_t zone_size;
> > + int64_t zone_size_mask;
> > + int ret;
> > +
>
> Shouldn't we add a check here if 'fd' points to a zoned device?
> ioctl errors are not _that_ helpful here, as you might get a variety
> of errors and it's not quite obvious which of those errors indicate
> an unsupported feature.
>
Yes, I'll add it in the next patch.
> > + ret = ioctl(fd, BLKGETZONESZ, &zone_size);
> > + if (ret) {
> > + return -1;
> > + }
> > +
> > + zone_size_mask = zone_size - 1;
> > + if (offset & zone_size_mask) {
> > + error_report("offset is not the start of a zone");
> > + return -1;
> > + }
> > +
> > + if (len & zone_size_mask) {
> > + error_report("len is not aligned to zones");
> > + return -1;
> > + }
> > +
> > + switch (op) {
> > + case zone_open:
> > + ioctl_name = "BLKOPENZONE";
> > + ioctl_op = BLKOPENZONE;
> > + break;
> > + case zone_close:
> > + ioctl_name = "BLKCLOSEZONE";
> > + ioctl_op = BLKCLOSEZONE;
> > + break;
> > + case zone_finish:
> > + ioctl_name = "BLKFINISHZONE";
> > + ioctl_op = BLKFINISHZONE;
> > + break;
> > + case zone_reset:
> > + ioctl_name = "BLKRESETZONE";
> > + ioctl_op = BLKRESETZONE;
> > + break;
> > + default:
> > + error_report("Invalid zone operation 0x%x", op);
> > + errno = -EINVAL;
> > + return -1;
> > + }
> > +
> > + /* Execute the operation */
> > + range.sector = offset;
> > + range.nr_sectors = len;
> > + ret = ioctl(fd, ioctl_op, &range);
> > + if (ret != 0) {
> > + error_report("ioctl %s failed %d",
> > + ioctl_name, errno);
> > + return -1;
> > + }
> > +
> > + return 0;
> > +}
> > +
> > static int handle_aiocb_copy_range(void *opaque)
> > {
> > RawPosixAIOData *aiocb = opaque;
> > @@ -2973,6 +3108,58 @@ static void raw_account_discard(BDRVRawState *s,
> > uint64_t nbytes, int ret)
> > }
> > }
> >
> > +/*
> > + * zone report - Get a zone block device's information in the form
> > + * of an array of zone descriptors.
> > + *
> > + * @param bs: passing zone block device file descriptor
> > + * @param zones: an array of zone descriptors to hold zone
> > + * information on reply
> > + * @param offset: offset can be any byte within the zone size.
> > + * @param len: (not sure yet.
> > + * @return 0 on success, -1 on failure
> > + */
> > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t
> > offset,
> > + int64_t len, int64_t *nr_zones,
> > + BlockZoneDescriptor *zones) {
> > + BDRVRawState *s = bs->opaque;
> > + RawPosixAIOData acb;
> > +
> > + acb = (RawPosixAIOData) {
> > + .bs = bs,
> > + .aio_fildes = s->fd,
> > + .aio_type = QEMU_AIO_IOCTL,
> > + .aio_offset = offset,
> > + .aio_nbytes = len,
> > + .zone_report = {
> > + .nr_zones = nr_zones,
> > + .zones = zones,
> > + },
> > + };
> > +
> > + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
> > +}
> > +
> > +/*
> > + * zone management operations - Execute an operation on a zone
> > + */
> > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
> > + int64_t offset, int64_t len) {
> > + BDRVRawState *s = bs->opaque;
> > + RawPosixAIOData acb;
> > +
> > + acb = (RawPosixAIOData) {
> > + .bs = bs,
> > + .aio_fildes = s->fd,
> > + .aio_type = QEMU_AIO_IOCTL,
> > + .aio_offset = offset,
> > + .aio_nbytes = len,
> > + .op = op,
> > + };
> > +
> > + return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> > +}
> > +
> > static coroutine_fn int
> > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
> > bool blkdev)
> > @@ -3324,6 +3511,9 @@ BlockDriver bdrv_file = {
> > .bdrv_abort_perm_update = raw_abort_perm_update,
> > .create_opts = &raw_create_opts,
> > .mutable_opts = mutable_opts,
> > +
> > + .bdrv_co_zone_report = raw_co_zone_report,
> > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > };
> >
> > /***********************************************/
> > @@ -3703,6 +3893,53 @@ static BlockDriver bdrv_host_device = {
> > #endif
> > };
> >
> > +static BlockDriver bdrv_zoned_host_device = {
> > + .format_name = "zoned_host_device",
> > + .protocol_name = "zoned_host_device",
> > + .instance_size = sizeof(BDRVRawState),
> > + .bdrv_needs_filename = true,
> > + .bdrv_probe_device = hdev_probe_device,
> > + .bdrv_parse_filename = hdev_parse_filename,
> > + .bdrv_file_open = hdev_open,
> > + .bdrv_close = raw_close,
> > + .bdrv_reopen_prepare = raw_reopen_prepare,
> > + .bdrv_reopen_commit = raw_reopen_commit,
> > + .bdrv_reopen_abort = raw_reopen_abort,
> > + .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> > + .create_opts = &bdrv_create_opts_simple,
> > + .mutable_opts = mutable_opts,
> > + .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> > + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> > +
> > + .bdrv_co_preadv = raw_co_preadv,
> > + .bdrv_co_pwritev = raw_co_pwritev,
> > + .bdrv_co_flush_to_disk = raw_co_flush_to_disk,
> > + .bdrv_co_pdiscard = hdev_co_pdiscard,
> > + .bdrv_co_copy_range_from = raw_co_copy_range_from,
> > + .bdrv_co_copy_range_to = raw_co_copy_range_to,
> > + .bdrv_refresh_limits = raw_refresh_limits,
> > + .bdrv_io_plug = raw_aio_plug,
> > + .bdrv_io_unplug = raw_aio_unplug,
> > + .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> > +
> > + .bdrv_co_truncate = raw_co_truncate,
> > + .bdrv_getlength = raw_getlength,
> > + .bdrv_get_info = raw_get_info,
> > + .bdrv_get_allocated_file_size
> > + = raw_get_allocated_file_size,
> > + .bdrv_get_specific_stats = hdev_get_specific_stats,
> > + .bdrv_check_perm = raw_check_perm,
> > + .bdrv_set_perm = raw_set_perm,
> > + .bdrv_abort_perm_update = raw_abort_perm_update,
> > + .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> > + .bdrv_probe_geometry = hdev_probe_geometry,
> > + .bdrv_co_ioctl = hdev_co_ioctl,
> > +
> > + /* zone management operations */
> > + .bdrv_co_zone_report = raw_co_zone_report,
> > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > +};
> > +
> > #if defined(__linux__) || defined(__FreeBSD__) ||
> > defined(__FreeBSD_kernel__)
> > static void cdrom_parse_filename(const char *filename, QDict *options,
> > Error **errp)
> > @@ -3964,6 +4201,7 @@ static void bdrv_file_init(void)
> > #if defined(HAVE_HOST_BLOCK_DEVICE)
> > bdrv_register(&bdrv_host_device);
> > #ifdef __linux__
> > + bdrv_register(&bdrv_zoned_host_device);
> > bdrv_register(&bdrv_host_cdrom);
> > #endif
> > #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> > diff --git a/include/block/block-common.h b/include/block/block-common.h
> > index fdb7306e78..78cddeeda5 100644
> > --- a/include/block/block-common.h
> > +++ b/include/block/block-common.h
> > @@ -23,7 +23,6 @@
> > */
> > #ifndef BLOCK_COMMON_H
> > #define BLOCK_COMMON_H
> > -
> > #include "block/aio.h"
> > #include "block/aio-wait.h"
> > #include "qemu/iov.h"
> > @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver;
> > typedef struct BdrvChild BdrvChild;
> > typedef struct BdrvChildClass BdrvChildClass;
> >
> > +typedef enum zone_op {
> > + zone_open,
> > + zone_close,
> > + zone_finish,
> > + zone_reset,
> > +} zone_op;
> > +
> > +typedef enum zone_model {
> > + BLK_Z_HM,
> > + BLK_Z_HA,
> > +} zone_model;
> > +
> > +typedef enum BlkZoneCondition {
> > + BLK_ZS_NOT_WP = 0x0,
> > + BLK_ZS_EMPTY = 0x1,
> > + BLK_ZS_IOPEN = 0x2,
> > + BLK_ZS_EOPEN = 0x3,
> > + BLK_ZS_CLOSED = 0x4,
> > + BLK_ZS_RDONLY = 0xD,
> > + BLK_ZS_FULL = 0xE,
> > + BLK_ZS_OFFLINE = 0xF,
> > +} BlkZoneCondition;
> > +
> > +typedef enum BlkZoneType {
> > + BLK_ZT_CONV = 0x1,
> > + BLK_ZT_SWR = 0x2,
> > + BLK_ZT_SWP = 0x3,
> > +} BlkZoneType;
> > +
> > +/*
> > + * Zone descriptor data structure.
> > + * Provide information on a zone with all position and size values in
> > bytes.
> > + */
> > +typedef struct BlockZoneDescriptor {
> > + uint64_t start;
> > + uint64_t length;
> > + uint64_t cap;
> > + uint64_t wp;
> > + BlkZoneType type;
> > + BlkZoneCondition cond;
> > +} BlockZoneDescriptor;
> > +
> > typedef struct BlockDriverInfo {
> > /* in bytes, 0 if irrelevant */
> > int cluster_size;
> > diff --git a/include/block/block_int-common.h
> > b/include/block/block_int-common.h
> > index 8947abab76..b9ea9db6dc 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest {
> > struct BdrvTrackedRequest *waiting_for;
> > } BdrvTrackedRequest;
> >
> > +/**
> > + * Zone device information data structure.
> > + * Provide information on a device.
> > + */
> > +typedef struct zbd_dev {
> > + uint32_t zone_size;
> > + zone_model model;
> > + uint32_t block_size;
> > + uint32_t write_granularity;
> > + uint32_t nr_zones;
> > + struct BlockZoneDescriptor *zones; /* array of zones */
> > + uint32_t max_nr_open_zones; /* maximum number of explicitly open zones
> > */
> > + uint32_t max_nr_active_zones;
> > +} zbd_dev;
> >
> > struct BlockDriver {
> > /*
> > @@ -691,6 +705,12 @@ struct BlockDriver {
> > QEMUIOVector *qiov,
> > int64_t pos);
> >
> > + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
> > + int64_t offset, int64_t len, int64_t *nr_zones,
> > + BlockZoneDescriptor *zones);
> > + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum
> > zone_op op,
> > + int64_t offset, int64_t len);
> > +
> > /* removable device specific */
> > bool (*bdrv_is_inserted)(BlockDriverState *bs);
> > void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
>
> Other than that: Well done!
>
Thanks for reviewing!
Have a good day!
Sam
> Cheers,
>
> Hannes
> --
> Dr. Hannes Reinecke Kernel Storage Architect
> hare@suse.de +49 911 74053 688
> SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
> HRB 36809 (AG Nürnberg), GF: Felix Imendörffer
- [RFC v3 0/5] *** Add support for zoned device ***, Sam Li, 2022/06/26
- [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls., Sam Li, 2022/06/26
- Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls., Hannes Reinecke, 2022/06/27
- Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.,
Sam Li <=
- Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls., Stefan Hajnoczi, 2022/06/28
- Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls., Sam Li, 2022/06/28
- Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls., Damien Le Moal, 2022/06/28
- Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls., Sam Li, 2022/06/28
- Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls., Damien Le Moal, 2022/06/28
- Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls., Sam Li, 2022/06/28
- Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls., Damien Le Moal, 2022/06/28
- Re: [RFC v3 1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls., Sam Li, 2022/06/28
[RFC v3 2/5] qemu-io: add zoned block device operations., Sam Li, 2022/06/26