[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH v10 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
From: |
Damien Le Moal |
Subject: |
Re: [PATCH v10 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls |
Date: |
Tue, 4 Oct 2022 08:23:15 +0900 |
User-agent: |
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:102.0) Gecko/20100101 Thunderbird/102.3.1 |
On 2022/10/04 2:47, Stefan Hajnoczi wrote:
> On Thu, Sep 29, 2022 at 04:36:27PM +0800, Sam Li wrote:
>> Add a new zoned_host_device BlockDriver. The zoned_host_device option
>> accepts only zoned host block devices. By adding zone management
>> operations in this new BlockDriver, users can use the new block
>> layer APIs including Report Zone and four zone management operations
>> (open, close, finish, reset).
>>
>> Qemu-io uses the new APIs to perform zoned storage commands of the device:
>> zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs),
>> zone_finish(zf).
>>
>> For example, to test zone_report, use following command:
>> $ ./build/qemu-io --image-opts -n driver=zoned_host_device,
>> filename=/dev/nullb0
>> -c "zrp offset nr_zones"
>>
>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>> Reviewed-by: Hannes Reinecke <hare@suse.de>
>> ---
>> block/block-backend.c | 146 +++++++++++++
>> block/file-posix.c | 340 +++++++++++++++++++++++++++++-
>> block/io.c | 41 ++++
>> include/block/block-common.h | 4 +
>> include/block/block-io.h | 7 +
>> include/block/block_int-common.h | 24 +++
>> include/block/raw-aio.h | 6 +-
>> include/sysemu/block-backend-io.h | 17 ++
>> meson.build | 4 +
>> qapi/block-core.json | 8 +-
>> qemu-io-cmds.c | 148 +++++++++++++
>> 11 files changed, 741 insertions(+), 4 deletions(-)
>>
>> diff --git a/block/block-backend.c b/block/block-backend.c
>> index d4a5df2ac2..f7f7acd6f4 100644
>> --- a/block/block-backend.c
>> +++ b/block/block-backend.c
>> @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo {
>> void *iobuf;
>> int ret;
>> BdrvRequestFlags flags;
>> + union {
>> + struct {
>> + unsigned int *nr_zones;
>> + BlockZoneDescriptor *zones;
>> + } zone_report;
>> + struct {
>> + BlockZoneOp op;
>> + } zone_mgmt;
>> + };
>> } BlkRwCo;
>>
>> int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
>> @@ -1775,6 +1784,143 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
>> return ret;
>> }
>>
>> +static void blk_aio_zone_report_entry(void *opaque) {
>
>
> The coroutine_fn annotation is missing:
>
> static void coroutine_fn blk_aio_zone_report_entry(void *opaque) {
>
>> + BlkAioEmAIOCB *acb = opaque;
>> + BlkRwCo *rwco = &acb->rwco;
>> +
>> + rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
>> + rwco->zone_report.nr_zones,
>> + rwco->zone_report.zones);
>> + blk_aio_complete(acb);
>> +}
>> +
>> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
>> + unsigned int *nr_zones,
>> + BlockZoneDescriptor *zones,
>> + BlockCompletionFunc *cb, void *opaque)
>> +{
>> + BlkAioEmAIOCB *acb;
>> + Coroutine *co;
>> + IO_CODE();
>> +
>> + blk_inc_in_flight(blk);
>> + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
>> + acb->rwco = (BlkRwCo) {
>> + .blk = blk,
>> + .offset = offset,
>> + .ret = NOT_DONE,
>> + .zone_report = {
>> + .zones = zones,
>> + .nr_zones = nr_zones,
>> + },
>> + };
>> + acb->has_returned = false;
>> +
>> + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
>> + bdrv_coroutine_enter(blk_bs(blk), co);
>> +
>> + acb->has_returned = true;
>> + if (acb->rwco.ret != NOT_DONE) {
>> + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
>> + blk_aio_complete_bh, acb);
>> + }
>> +
>> + return &acb->common;
>> +}
>> +
>> +static void blk_aio_zone_mgmt_entry(void *opaque) {
>
> coroutine_fn is missing here.
>
>> + BlkAioEmAIOCB *acb = opaque;
>> + BlkRwCo *rwco = &acb->rwco;
>> +
>> + rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op,
>> + rwco->offset, acb->bytes);
>> + blk_aio_complete(acb);
>> +}
>> +
>> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>> + int64_t offset, int64_t len,
>> + BlockCompletionFunc *cb, void *opaque) {
>> + BlkAioEmAIOCB *acb;
>> + Coroutine *co;
>> + IO_CODE();
>> +
>> + blk_inc_in_flight(blk);
>> + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
>> + acb->rwco = (BlkRwCo) {
>> + .blk = blk,
>> + .offset = offset,
>> + .ret = NOT_DONE,
>> + .zone_mgmt = {
>> + .op = op,
>> + },
>> + };
>> + acb->bytes = len;
>> + acb->has_returned = false;
>> +
>> + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
>> + bdrv_coroutine_enter(blk_bs(blk), co);
>> +
>> + acb->has_returned = true;
>> + if (acb->rwco.ret != NOT_DONE) {
>> + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
>> + blk_aio_complete_bh, acb);
>> + }
>> +
>> + return &acb->common;
>> +}
>> +
>> +/*
>> + * Send a zone_report command.
>> + * offset is a byte offset from the start of the device. No alignment
>> + * required for offset.
>> + * nr_zones represents IN maximum and OUT actual.
>> + */
>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>> + unsigned int *nr_zones,
>> + BlockZoneDescriptor *zones)
>> +{
>> + int ret;
>> + IO_CODE();
>> +
>> + blk_inc_in_flight(blk); /* increase before waiting */
>> + blk_wait_while_drained(blk);
>> + if (!blk_is_available(blk)) {
>> + blk_dec_in_flight(blk);
>> + return -ENOMEDIUM;
>> + }
>> + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
>> + blk_dec_in_flight(blk);
>> + return ret;
>> +}
>> +
>> +/*
>> + * Send a zone_management command.
>> + * op is the zone operation;
>> + * offset is the byte offset from the start of the zoned device;
>> + * len is the maximum number of bytes the command should operate on. It
>> + * should be aligned with the device zone size.
>> + */
>> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>> + int64_t offset, int64_t len)
>> +{
>> + int ret;
>> + IO_CODE();
>> +
>> +
>> + blk_inc_in_flight(blk);
>> + blk_wait_while_drained(blk);
>> +
>> + ret = blk_check_byte_request(blk, offset, len);
>> + if (ret < 0) {
>> + blk_dec_in_flight(blk);
>> + return ret;
>> + }
>> +
>> + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
>> + blk_dec_in_flight(blk);
>> + return ret;
>> +}
>> +
>> void blk_drain(BlockBackend *blk)
>> {
>> BlockDriverState *bs = blk_bs(blk);
>> diff --git a/block/file-posix.c b/block/file-posix.c
>> index 0a8b4b426e..0a6c781201 100644
>> --- a/block/file-posix.c
>> +++ b/block/file-posix.c
>> @@ -67,6 +67,9 @@
>> #include <sys/param.h>
>> #include <sys/syscall.h>
>> #include <sys/vfs.h>
>> +#if defined(CONFIG_BLKZONED)
>> +#include <linux/blkzoned.h>
>> +#endif
>> #include <linux/cdrom.h>
>> #include <linux/fd.h>
>> #include <linux/fs.h>
>> @@ -216,6 +219,15 @@ typedef struct RawPosixAIOData {
>> PreallocMode prealloc;
>> Error **errp;
>> } truncate;
>> + struct {
>> + unsigned int *nr_zones;
>> + BlockZoneDescriptor *zones;
>> + } zone_report;
>> + struct {
>> + unsigned long zone_op;
>> + const char *zone_op_name;
>> + bool all;
>
> Please remove this field if it is unused.
>
>> + } zone_mgmt;
>> };
>> } RawPosixAIOData;
>>
>> @@ -1339,7 +1351,7 @@ static void raw_refresh_limits(BlockDriverState *bs,
>> Error **errp)
>> #endif
>>
>> if (bs->sg || S_ISBLK(st.st_mode)) {
>> - int ret = hdev_get_max_hw_transfer(s->fd, &st);
>> + ret = hdev_get_max_hw_transfer(s->fd, &st);
>>
>> if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
>> bs->bl.max_hw_transfer = ret;
>> @@ -1356,6 +1368,41 @@ static void raw_refresh_limits(BlockDriverState *bs,
>> Error **errp)
>> zoned = BLK_Z_NONE;
>> }
>> bs->bl.zoned = zoned;
>> + if (zoned != BLK_Z_NONE) {
>> + ret = get_sysfs_long_val(&st, "chunk_sectors");
>> + if (ret <= 0) {
>> + error_report("Invalid zone size %" PRId32 " sectors ", ret);
>> + bs->bl.zoned = BLK_Z_NONE;
>> + return;
>> + }
>> + bs->bl.zone_size = ret * 512;
>> +
>> + ret = get_sysfs_long_val(&st, "zone_append_max_bytes");
>> + if (ret > 0) {
>> + bs->bl.max_append_sectors = ret / 512;
>> + }
>> +
>> + ret = get_sysfs_long_val(&st, "max_open_zones");
>> + if (ret >= 0) {
>> + bs->bl.max_open_zones = ret;
>> + }
>> +
>> + ret = get_sysfs_long_val(&st, "max_active_zones");
>> + if (ret >= 0) {
>> + bs->bl.max_active_zones = ret;
>> + }
>> +
>> + ret = get_sysfs_long_val(&st, "nr_zones");
>> + if (ret >= 0) {
>> + bs->bl.nr_zones = ret;
>> + }
>> +
>> + ret = ioctl(s->fd, BLKGETSIZE64, &bs->bl.capacity);
>> + if (ret != 0) {
>> + error_report("Invalid device capacity %" PRId64 " bytes ",
>> bs->bl.capacity);
>> + return;
>> + }
>
> The QEMU block layer already knows the capacity of the device. Can
> bdrv_getlength() be used instead of introducing a new
> BlockLimits.capacity field?
>
>> + }
>> }
>>
>> static int check_for_dasd(int fd)
>> @@ -1850,6 +1897,147 @@ static off_t copy_file_range(int in_fd, off_t
>> *in_off, int out_fd,
>> }
>> #endif
>>
>> +/*
>> + * parse_zone - Fill a zone descriptor
>> + */
>> +#if defined(CONFIG_BLKZONED)
>> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
>> + const struct blk_zone *blkz,
>> + const struct blk_zone_report *rep) {
>> + zone->start = blkz->start << BDRV_SECTOR_BITS;
>> + zone->length = blkz->len << BDRV_SECTOR_BITS;
>> + zone->wp = blkz->wp << BDRV_SECTOR_BITS;
>> +
>> + if (rep->flags & BLK_ZONE_REP_CAPACITY) {
>> + zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
>
> #ifdef HAVE_BLK_ZONE_REP_CAPACITY is needed since the rep->flags and
> blkz->capacity fields are missing and would cause a compilation error
> when HAVE_BLK_ZONE_REP_CAPACITY is not defined:
>
> zone->cap = blkz->len << BDRV_SECTOR_BITS;
> #ifdef HAVE_BLK_ZONE_REP_CAPACITY
> /* Replace with the dedicated field on newer kernels */
> if (rep->flags & BLK_ZONE_REP_CAPACITY) {
> zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
> }
> #endif
It would be a lot cleaner to do something like the following.
In the block common header file, add:
#ifndef HAVE_BLK_ZONE_REP_CAPACITY
#define BLK_ZONE_REP_CAPACITY (1 << 0)
struct blk_zone_v2 {
__u64 start; /* Zone start sector */
__u64 len; /* Zone length in number of sectors */
__u64 wp; /* Zone write pointer position */
__u8 type; /* Zone type */
__u8 cond; /* Zone condition */
__u8 non_seq; /* Non-sequential write resources active */
__u8 reset; /* Reset write pointer recommended */
__u8 resv[4];
__u64 capacity; /* Zone capacity in number of sectors */
__u8 reserved[24];
};
#define blk_zone blk_zone_v2
struct blk_zone_report_v2 {
__u64 sector;
__u32 nr_zones;
__u32 flags;
struct blk_zone zones[0];
};
#define blk_zone_report blk_zone_report_v2
#endif
Then the above code becomes:
if (rep->flags & BLK_ZONE_REP_CAPACITY) {
zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
} else {
zone->cap = blkz->len << BDRV_SECTOR_BITS;
}
This leaves no #ifdef in the C code, only in the header, and it compiles and
works for all host kernel versions.
>
>> + } else {
>> + zone->cap = blkz->len << BDRV_SECTOR_BITS;
>> + }
>> +
>> + switch (blkz->type) {
>> + case BLK_ZONE_TYPE_SEQWRITE_REQ:
>> + zone->type = BLK_ZT_SWR;
>> + break;
>> + case BLK_ZONE_TYPE_SEQWRITE_PREF:
>> + zone->type = BLK_ZT_SWP;
>> + break;
>> + case BLK_ZONE_TYPE_CONVENTIONAL:
>> + zone->type = BLK_ZT_CONV;
>> + break;
>> + default:
>> + g_assert_not_reached();
>> + }
>> +
>> + switch (blkz->cond) {
>> + case BLK_ZONE_COND_NOT_WP:
>> + zone->cond = BLK_ZS_NOT_WP;
>> + break;
>> + case BLK_ZONE_COND_EMPTY:
>> + zone->cond = BLK_ZS_EMPTY;
>> + break;
>> + case BLK_ZONE_COND_IMP_OPEN:
>> + zone->cond = BLK_ZS_IOPEN;
>> + break;
>> + case BLK_ZONE_COND_EXP_OPEN:
>> + zone->cond = BLK_ZS_EOPEN;
>> + break;
>> + case BLK_ZONE_COND_CLOSED:
>> + zone->cond = BLK_ZS_CLOSED;
>> + break;
>> + case BLK_ZONE_COND_READONLY:
>> + zone->cond = BLK_ZS_RDONLY;
>> + break;
>> + case BLK_ZONE_COND_FULL:
>> + zone->cond = BLK_ZS_FULL;
>> + break;
>> + case BLK_ZONE_COND_OFFLINE:
>> + zone->cond = BLK_ZS_OFFLINE;
>> + break;
>> + default:
>> + g_assert_not_reached();
>> + }
>> +}
>> +#endif
>> +
>> +static int handle_aiocb_zone_report(void *opaque) {
>> +#if defined(CONFIG_BLKZONED)
>> + RawPosixAIOData *aiocb = opaque;
>> + int fd = aiocb->aio_fildes;
>> + unsigned int *nr_zones = aiocb->zone_report.nr_zones;
>> + BlockZoneDescriptor *zones = aiocb->zone_report.zones;
>> + /* zoned block devices use 512-byte sectors */
>> + int64_t sector = aiocb->aio_offset / 512;
>> +
>> + struct blk_zone *blkz;
>> + int64_t rep_size;
>> + unsigned int nrz;
>> + int ret, n = 0, i = 0;
>> +
>> + nrz = *nr_zones;
>> + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct
>> blk_zone);
>> + g_autofree struct blk_zone_report *rep = NULL;
>> + rep = g_malloc(rep_size);
>> +
>> + blkz = (struct blk_zone *)(rep + 1);
>> + while (n < nrz) {
>> + memset(rep, 0, rep_size);
>> + rep->sector = sector;
>> + rep->nr_zones = nrz - n;
>> +
>> + do {
>> + ret = ioctl(fd, BLKREPORTZONE, rep);
>> + } while (ret != 0 && errno == EINTR);
>> + if (ret != 0) {
>> + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
>> + fd, sector, errno);
>> + return -errno;
>
> Using errno after calling error_report() is risky because functions
> called by error_report() may overwrite errno. It's safer to preserve
> the value:
>
> if (ret != 0) {
> ret = -errno;
> error_report(...);
> return ret;
> }
>
>> + }
>> +
>> + if (!rep->nr_zones) {
>> + break;
>> + }
>> +
>> + for (i = 0; i < rep->nr_zones; i++, n++) {
>> + parse_zone(&zones[n], &blkz[i], rep);
>> + /* The next report should start after the last zone reported */
>> + sector = blkz[i].start + blkz[i].len;
>> + }
>> + }
>> +
>> + *nr_zones = n;
>> + return 0;
>> +#else
>> + return -ENOTSUP;
>> +#endif
>> +}
>> +
>> +static int handle_aiocb_zone_mgmt(void *opaque) {
>> +#if defined(CONFIG_BLKZONED)
>> + RawPosixAIOData *aiocb = opaque;
>> + int fd = aiocb->aio_fildes;
>> + int64_t sector = aiocb->aio_offset / 512;
>> + int64_t nr_sectors = aiocb->aio_nbytes / 512;
>> + struct blk_zone_range range;
>> + int ret;
>> +
>> + /* Execute the operation */
>> + range.sector = sector;
>> + range.nr_sectors = nr_sectors;
>> + do {
>> + ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
>> + } while (ret != 0 && errno == EINTR);
>> +
>> + if (ret != 0) {
>> + error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
>> + errno);
>> + return -errno;
>
> Same errno value preservation thing here.
>
>> + }
>> + return ret;
>> +#else
>> + return -ENOTSUP;
>> +#endif
>> +}
>> +
>> static int handle_aiocb_copy_range(void *opaque)
>> {
>> RawPosixAIOData *aiocb = opaque;
>> @@ -3022,6 +3210,105 @@ static void raw_account_discard(BDRVRawState *s,
>> uint64_t nbytes, int ret)
>> }
>> }
>>
>> +/*
>> + * zone report - Get a zone block device's information in the form
>> + * of an array of zone descriptors.
>> + * zones is an array of zone descriptors to hold zone information on reply;
>> + * offset can be any byte within the entire size of the device;
>> + * nr_zones is the maximum number of sectors the command should operate on.
>> + */
>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t
>> offset,
>> + unsigned int *nr_zones,
>> + BlockZoneDescriptor *zones) {
>> +#if defined(CONFIG_BLKZONED)
>> + BDRVRawState *s = bs->opaque;
>> + RawPosixAIOData acb;
>> +
>> + acb = (RawPosixAIOData) {
>> + .bs = bs,
>> + .aio_fildes = s->fd,
>> + .aio_type = QEMU_AIO_ZONE_REPORT,
>> + .aio_offset = offset,
>> + .zone_report = {
>> + .nr_zones = nr_zones,
>> + .zones = zones,
>> + },
>> + };
>> +
>> + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
>> +#else
>> + return -ENOTSUP;
>> +#endif
>> +}
>> +
>> +/*
>> + * zone management operations - Execute an operation on a zone
>> + */
>> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp
>> op,
>> + int64_t offset, int64_t len) {
>> +#if defined(CONFIG_BLKZONED)
>> + BDRVRawState *s = bs->opaque;
>> + RawPosixAIOData acb;
>> + int64_t zone_size, zone_size_mask;
>> + const char *zone_op_name;
>> + unsigned long zone_op;
>> + bool is_all = false;
>> +
>> + zone_size = bs->bl.zone_size;
>> + zone_size_mask = zone_size - 1;
>> + if (offset & zone_size_mask) {
>> + error_report("sector offset %" PRId64 " is not aligned to zone size
>> "
>> + "%" PRId64 "", offset / 512, zone_size / 512);
>> + return -EINVAL;
>> + }
>> +
>> + if (((offset + len) < bs->bl.capacity && len & zone_size_mask) ||
>> + offset + len > bs->bl.capacity) {
>> + error_report("number of sectors %" PRId64 " is not aligned to zone
>> size"
>> + " %" PRId64 "", len / 512, zone_size / 512);
>> + return -EINVAL;
>> + }
>> +
>> + switch (op) {
>> + case BLK_ZO_OPEN:
>> + zone_op_name = "BLKOPENZONE";
>> + zone_op = BLKOPENZONE;
>> + break;
>> + case BLK_ZO_CLOSE:
>> + zone_op_name = "BLKCLOSEZONE";
>> + zone_op = BLKCLOSEZONE;
>> + break;
>> + case BLK_ZO_FINISH:
>> + zone_op_name = "BLKFINISHZONE";
>> + zone_op = BLKFINISHZONE;
>> + break;
>> + case BLK_ZO_RESET:
>> + zone_op_name = "BLKRESETZONE";
>> + zone_op = BLKRESETZONE;
>> + break;
>> + default:
>> + g_assert_not_reached();
>> + }
>> +
>> + acb = (RawPosixAIOData) {
>> + .bs = bs,
>> + .aio_fildes = s->fd,
>> + .aio_type = QEMU_AIO_ZONE_MGMT,
>> + .aio_offset = offset,
>> + .aio_nbytes = len,
>> + .zone_mgmt = {
>> + .zone_op = zone_op,
>> + .zone_op_name = zone_op_name,
>> + .all = is_all,
>> + },
>> + };
>> +
>> + return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
>> +#else
>> + return -ENOTSUP;
>> +#endif
>> +}
>> +
>> static coroutine_fn int
>> raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>> bool blkdev)
>> @@ -3752,6 +4039,54 @@ static BlockDriver bdrv_host_device = {
>> #endif
>> };
>>
>> +#if defined(CONFIG_BLKZONED)
>
> Have you tried building without CONFIG_BLKZONED? There might be "unused
> function" compiler warnings because raw_co_zone_mgmt() won't be
> referenced by anything if BlockDriver bdrv_zoned_host_device isn't
> compiled in.
>
> It may be necessary to completely #ifdef out raw_co_zone_mgmt() and
> related functions instead of #else return -ENOTSUP.
>
>> +static BlockDriver bdrv_zoned_host_device = {
>> + .format_name = "zoned_host_device",
>> + .protocol_name = "zoned_host_device",
>> + .instance_size = sizeof(BDRVRawState),
>> + .bdrv_needs_filename = true,
>> + .bdrv_probe_device = hdev_probe_device,
>> + .bdrv_file_open = hdev_open,
>> + .bdrv_close = raw_close,
>> + .bdrv_reopen_prepare = raw_reopen_prepare,
>> + .bdrv_reopen_commit = raw_reopen_commit,
>> + .bdrv_reopen_abort = raw_reopen_abort,
>> + .bdrv_co_create_opts = bdrv_co_create_opts_simple,
>> + .create_opts = &bdrv_create_opts_simple,
>> + .mutable_opts = mutable_opts,
>> + .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
>> + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
>> +
>> + .bdrv_co_preadv = raw_co_preadv,
>> + .bdrv_co_pwritev = raw_co_pwritev,
>> + .bdrv_co_flush_to_disk = raw_co_flush_to_disk,
>> + .bdrv_co_pdiscard = hdev_co_pdiscard,
>> + .bdrv_co_copy_range_from = raw_co_copy_range_from,
>> + .bdrv_co_copy_range_to = raw_co_copy_range_to,
>> + .bdrv_refresh_limits = raw_refresh_limits,
>> + .bdrv_io_plug = raw_aio_plug,
>> + .bdrv_io_unplug = raw_aio_unplug,
>> + .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>> +
>> + .bdrv_co_truncate = raw_co_truncate,
>> + .bdrv_getlength = raw_getlength,
>> + .bdrv_get_info = raw_get_info,
>> + .bdrv_get_allocated_file_size
>> + = raw_get_allocated_file_size,
>> + .bdrv_get_specific_stats = hdev_get_specific_stats,
>> + .bdrv_check_perm = raw_check_perm,
>> + .bdrv_set_perm = raw_set_perm,
>> + .bdrv_abort_perm_update = raw_abort_perm_update,
>> + .bdrv_probe_blocksizes = hdev_probe_blocksizes,
>> + .bdrv_probe_geometry = hdev_probe_geometry,
>> + .bdrv_co_ioctl = hdev_co_ioctl,
>> +
>> + /* zone management operations */
>> + .bdrv_co_zone_report = raw_co_zone_report,
>> + .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>> +};
>> +#endif
>> +
>> #if defined(__linux__) || defined(__FreeBSD__) ||
>> defined(__FreeBSD_kernel__)
>> static void cdrom_parse_filename(const char *filename, QDict *options,
>> Error **errp)
>> @@ -4012,6 +4347,9 @@ static void bdrv_file_init(void)
>> bdrv_register(&bdrv_file);
>> #if defined(HAVE_HOST_BLOCK_DEVICE)
>> bdrv_register(&bdrv_host_device);
>> +#if defined(CONFIG_BLKZONED)
>> + bdrv_register(&bdrv_zoned_host_device);
>> +#endif
>> #ifdef __linux__
>> bdrv_register(&bdrv_host_cdrom);
>> #endif
>> diff --git a/block/io.c b/block/io.c
>> index 0a8cbefe86..5ab2d169c8 100644
>> --- a/block/io.c
>> +++ b/block/io.c
>> @@ -3198,6 +3198,47 @@ out:
>> return co.ret;
>> }
>>
>> +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>> + unsigned int *nr_zones,
>> + BlockZoneDescriptor *zones)
>> +{
>> + BlockDriver *drv = bs->drv;
>> + CoroutineIOCompletion co = {
>> + .coroutine = qemu_coroutine_self(),
>> + };
>> + IO_CODE();
>> +
>> + bdrv_inc_in_flight(bs);
>> + if (!drv || !drv->bdrv_co_zone_report) {
>> + co.ret = -ENOTSUP;
>> + goto out;
>> + }
>> + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
>> +out:
>> + bdrv_dec_in_flight(bs);
>> + return co.ret;
>> +}
>> +
>> +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>> + int64_t offset, int64_t len)
>> +{
>> + BlockDriver *drv = bs->drv;
>> + CoroutineIOCompletion co = {
>> + .coroutine = qemu_coroutine_self(),
>> + };
>> + IO_CODE();
>> +
>> + bdrv_inc_in_flight(bs);
>> + if (!drv || !drv->bdrv_co_zone_mgmt) {
>> + co.ret = -ENOTSUP;
>> + goto out;
>> + }
>> + co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
>> +out:
>> + bdrv_dec_in_flight(bs);
>> + return co.ret;
>> +}
>> +
>> void *qemu_blockalign(BlockDriverState *bs, size_t size)
>> {
>> IO_CODE();
>> diff --git a/include/block/block-common.h b/include/block/block-common.h
>> index 36bd0e480e..8efb6b0c43 100644
>> --- a/include/block/block-common.h
>> +++ b/include/block/block-common.h
>> @@ -49,6 +49,10 @@ typedef struct BlockDriver BlockDriver;
>> typedef struct BdrvChild BdrvChild;
>> typedef struct BdrvChildClass BdrvChildClass;
>>
>> +#ifndef HAVE_BLK_ZONE_REP_CAPACITY
>> +#define BLK_ZONE_REP_CAPACITY (1 << 0)
>> +#endif
>
> This constant is defined in <linux/blkzoned.h> and shouldn't be
> redefined by QEMU. I think this was necessary because of the #ifdefs.
> Please see my comment above about HAVE_BLK_ZONE_REP_CAPACITY.
>
>> +
>> typedef enum BlockZoneOp {
>> BLK_ZO_OPEN,
>> BLK_ZO_CLOSE,
>> diff --git a/include/block/block-io.h b/include/block/block-io.h
>> index fd25ffa9be..65463b88d9 100644
>> --- a/include/block/block-io.h
>> +++ b/include/block/block-io.h
>> @@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void
>> *buf);
>> /* Ensure contents are flushed to disk. */
>> int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
>>
>> +/* Report zone information of zone block device. */
>> +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>> + unsigned int *nr_zones,
>> + BlockZoneDescriptor *zones);
>> +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>> + int64_t offset, int64_t len);
>> +
>> int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
>> bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
>> int bdrv_block_status(BlockDriverState *bs, int64_t offset,
>> diff --git a/include/block/block_int-common.h
>> b/include/block/block_int-common.h
>> index 7f7863cc9e..cdc06e77a6 100644
>> --- a/include/block/block_int-common.h
>> +++ b/include/block/block_int-common.h
>> @@ -691,6 +691,12 @@ struct BlockDriver {
>> QEMUIOVector *qiov,
>> int64_t pos);
>>
>> + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
>> + int64_t offset, unsigned int *nr_zones,
>> + BlockZoneDescriptor *zones);
>> + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp
>> op,
>> + int64_t offset, int64_t len);
>> +
>> /* removable device specific */
>> bool (*bdrv_is_inserted)(BlockDriverState *bs);
>> void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
>> @@ -828,6 +834,24 @@ typedef struct BlockLimits {
>>
>> /* device zone model */
>> BlockZoneModel zoned;
>> +
>> + /* zone size expressed in bytes */
>> + uint32_t zone_size;
>> +
>> + /* total number of zones */
>> + unsigned int nr_zones;
>> +
>> + /* maximum sectors of a zone append write operation */
>> + int64_t max_append_sectors;
>> +
>> + /* maximum number of open zones */
>> + int64_t max_open_zones;
>> +
>> + /* maximum number of active zones */
>> + int64_t max_active_zones;
>> +
>> + /* device capacity expressed in bytes */
>> + int64_t capacity;
>> } BlockLimits;
>>
>> typedef struct BdrvOpBlocker BdrvOpBlocker;
>> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
>> index 21fc10c4c9..3d26929cdd 100644
>> --- a/include/block/raw-aio.h
>> +++ b/include/block/raw-aio.h
>> @@ -29,6 +29,8 @@
>> #define QEMU_AIO_WRITE_ZEROES 0x0020
>> #define QEMU_AIO_COPY_RANGE 0x0040
>> #define QEMU_AIO_TRUNCATE 0x0080
>> +#define QEMU_AIO_ZONE_REPORT 0x0100
>> +#define QEMU_AIO_ZONE_MGMT 0x0200
>> #define QEMU_AIO_TYPE_MASK \
>> (QEMU_AIO_READ | \
>> QEMU_AIO_WRITE | \
>> @@ -37,7 +39,9 @@
>> QEMU_AIO_DISCARD | \
>> QEMU_AIO_WRITE_ZEROES | \
>> QEMU_AIO_COPY_RANGE | \
>> - QEMU_AIO_TRUNCATE)
>> + QEMU_AIO_TRUNCATE | \
>> + QEMU_AIO_ZONE_REPORT | \
>> + QEMU_AIO_ZONE_MGMT)
>>
>> /* AIO flags */
>> #define QEMU_AIO_MISALIGNED 0x1000
>> diff --git a/include/sysemu/block-backend-io.h
>> b/include/sysemu/block-backend-io.h
>> index 50f5aa2e07..6835525582 100644
>> --- a/include/sysemu/block-backend-io.h
>> +++ b/include/sysemu/block-backend-io.h
>> @@ -45,6 +45,12 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t
>> offset,
>> BlockCompletionFunc *cb, void *opaque);
>> BlockAIOCB *blk_aio_flush(BlockBackend *blk,
>> BlockCompletionFunc *cb, void *opaque);
>> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
>> + unsigned int *nr_zones, BlockZoneDescriptor
>> *zones,
>> + BlockCompletionFunc *cb, void *opaque);
>> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>> + int64_t offset, int64_t len,
>> + BlockCompletionFunc *cb, void *opaque);
>> BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t
>> bytes,
>> BlockCompletionFunc *cb, void *opaque);
>> void blk_aio_cancel_async(BlockAIOCB *acb);
>> @@ -156,6 +162,17 @@ int generated_co_wrapper blk_pwrite_zeroes(BlockBackend
>> *blk, int64_t offset,
>> int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
>> int64_t bytes, BdrvRequestFlags
>> flags);
>>
>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>> + unsigned int *nr_zones,
>> + BlockZoneDescriptor *zones);
>> +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset,
>> + unsigned int *nr_zones,
>> + BlockZoneDescriptor *zones);
>> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>> + int64_t offset, int64_t len);
>> +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
>> + int64_t offset, int64_t len);
>> +
>> int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
>> int64_t bytes);
>> int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
>> diff --git a/meson.build b/meson.build
>> index 63cfb844cf..9a797388ad 100644
>> --- a/meson.build
>> +++ b/meson.build
>> @@ -1882,6 +1882,7 @@ config_host_data.set('CONFIG_REPLICATION',
>> get_option('replication').allowed())
>> # has_header
>> config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h'))
>> config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h'))
>> +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h'))
>> config_host_data.set('CONFIG_VALGRIND_H',
>> cc.has_header('valgrind/valgrind.h'))
>> config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h'))
>> config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h'))
>> @@ -1975,6 +1976,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID',
>> config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM',
>> cc.has_member('struct stat', 'st_atim',
>> prefix: '#include <sys/stat.h>'))
>> +config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY',
>> + cc.has_member('struct blk_zone', 'capacity',
>> + prefix: '#include <linux/blkzoned.h>'))
>>
>> # has_type
>> config_host_data.set('CONFIG_IOVEC',
>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>> index f21fa235f2..ee87c1df8a 100644
>> --- a/qapi/block-core.json
>> +++ b/qapi/block-core.json
>> @@ -2942,6 +2942,7 @@
>> # @compress: Since 5.0
>> # @copy-before-write: Since 6.2
>> # @snapshot-access: Since 7.0
>> +# @zoned_host_device: Since 7.2
>> #
>> # Since: 2.9
>> ##
>> @@ -2955,7 +2956,8 @@
>> 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme',
>> 'parallels',
>> 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
>> { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
>> - 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
>> + 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat',
>> + { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] }
>>
>> ##
>> # @BlockdevOptionsFile:
>> @@ -4329,7 +4331,9 @@
>> 'vhdx': 'BlockdevOptionsGenericFormat',
>> 'vmdk': 'BlockdevOptionsGenericCOWFormat',
>> 'vpc': 'BlockdevOptionsGenericFormat',
>> - 'vvfat': 'BlockdevOptionsVVFAT'
>> + 'vvfat': 'BlockdevOptionsVVFAT',
>> + 'zoned_host_device': { 'type': 'BlockdevOptionsFile',
>> + 'if': 'CONFIG_BLKZONED' }
>> } }
>>
>> ##
>> diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
>> index 952dc940f1..e56c8d1c30 100644
>> --- a/qemu-io-cmds.c
>> +++ b/qemu-io-cmds.c
>> @@ -1712,6 +1712,149 @@ static const cmdinfo_t flush_cmd = {
>> .oneline = "flush all in-core file state to disk",
>> };
>>
>> +static inline int64_t tosector(int64_t bytes) {
>> + return bytes >> BDRV_SECTOR_BITS;
>> +}
>> +
>> +static int zone_report_f(BlockBackend *blk, int argc, char **argv)
>> +{
>> + int ret;
>> + int64_t offset;
>> + unsigned int nr_zones;
>> +
>> + ++optind;
>> + offset = cvtnum(argv[optind]);
>> + ++optind;
>> + nr_zones = cvtnum(argv[optind]);
>> +
>> + g_autofree BlockZoneDescriptor *zones = NULL;
>> + zones = g_new(BlockZoneDescriptor, nr_zones);
>> + ret = blk_zone_report(blk, offset, &nr_zones, zones);
>> + if (ret < 0) {
>> + printf("zone report failed: %s\n", strerror(-ret));
>> + } else {
>> + for (int i = 0; i < nr_zones; ++i) {
>> + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", "
>> + "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", "
>> + "zcond:%u, [type: %u]\n",
>> + tosector(zones[i].start), tosector(zones[i].length),
>> + tosector(zones[i].cap), tosector(zones[i].wp),
>> + zones[i].cond, zones[i].type);
>> + }
>> + }
>> + return ret;
>> +}
>> +
>> +static const cmdinfo_t zone_report_cmd = {
>> + .name = "zone_report",
>> + .altname = "zrp",
>> + .cfunc = zone_report_f,
>> + .argmin = 2,
>> + .argmax = 2,
>> + .args = "offset number",
>> + .oneline = "report zone information",
>> +};
>> +
>> +static int zone_open_f(BlockBackend *blk, int argc, char **argv)
>> +{
>> + int ret;
>> + int64_t offset, len;
>> + ++optind;
>> + offset = cvtnum(argv[optind]);
>> + ++optind;
>> + len = cvtnum(argv[optind]);
>> + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len);
>> + if (ret < 0) {
>> + printf("zone open failed: %s\n", strerror(-ret));
>> + }
>> + return ret;
>> +}
>> +
>> +static const cmdinfo_t zone_open_cmd = {
>> + .name = "zone_open",
>> + .altname = "zo",
>> + .cfunc = zone_open_f,
>> + .argmin = 2,
>> + .argmax = 2,
>> + .args = "offset len",
>> + .oneline = "explicit open a range of zones in zone block device",
>> +};
>> +
>> +static int zone_close_f(BlockBackend *blk, int argc, char **argv)
>> +{
>> + int ret;
>> + int64_t offset, len;
>> + ++optind;
>> + offset = cvtnum(argv[optind]);
>> + ++optind;
>> + len = cvtnum(argv[optind]);
>> + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len);
>> + if (ret < 0) {
>> + printf("zone close failed: %s\n", strerror(-ret));
>> + }
>> + return ret;
>> +}
>> +
>> +static const cmdinfo_t zone_close_cmd = {
>> + .name = "zone_close",
>> + .altname = "zc",
>> + .cfunc = zone_close_f,
>> + .argmin = 2,
>> + .argmax = 2,
>> + .args = "offset len",
>> + .oneline = "close a range of zones in zone block device",
>> +};
>> +
>> +static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
>> +{
>> + int ret;
>> + int64_t offset, len;
>> + ++optind;
>> + offset = cvtnum(argv[optind]);
>> + ++optind;
>> + len = cvtnum(argv[optind]);
>> + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len);
>> + if (ret < 0) {
>> + printf("zone finish failed: %s\n", strerror(-ret));
>> + }
>> + return ret;
>> +}
>> +
>> +static const cmdinfo_t zone_finish_cmd = {
>> + .name = "zone_finish",
>> + .altname = "zf",
>> + .cfunc = zone_finish_f,
>> + .argmin = 2,
>> + .argmax = 2,
>> + .args = "offset len",
>> + .oneline = "finish a range of zones in zone block device",
>> +};
>> +
>> +static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
>> +{
>> + int ret;
>> + int64_t offset, len;
>> + ++optind;
>> + offset = cvtnum(argv[optind]);
>> + ++optind;
>> + len = cvtnum(argv[optind]);
>> + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len);
>> + if (ret < 0) {
>> + printf("zone reset failed: %s\n", strerror(-ret));
>> + }
>> + return ret;
>> +}
>> +
>> +static const cmdinfo_t zone_reset_cmd = {
>> + .name = "zone_reset",
>> + .altname = "zrs",
>> + .cfunc = zone_reset_f,
>> + .argmin = 2,
>> + .argmax = 2,
>> + .args = "offset len",
>> + .oneline = "reset a zone write pointer in zone block device",
>> +};
>> +
>> static int truncate_f(BlockBackend *blk, int argc, char **argv);
>> static const cmdinfo_t truncate_cmd = {
>> .name = "truncate",
>> @@ -2504,6 +2647,11 @@ static void __attribute((constructor))
>> init_qemuio_commands(void)
>> qemuio_add_command(&aio_write_cmd);
>> qemuio_add_command(&aio_flush_cmd);
>> qemuio_add_command(&flush_cmd);
>> + qemuio_add_command(&zone_report_cmd);
>> + qemuio_add_command(&zone_open_cmd);
>> + qemuio_add_command(&zone_close_cmd);
>> + qemuio_add_command(&zone_finish_cmd);
>> + qemuio_add_command(&zone_reset_cmd);
>> qemuio_add_command(&truncate_cmd);
>> qemuio_add_command(&length_cmd);
>> qemuio_add_command(&info_cmd);
>> --
>> 2.37.3
>>
--
Damien Le Moal
Western Digital Research