Message ID | 20221016145110.171262-4-faithilikerun@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add support for zoned device | expand |
On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote: > Add a new zoned_host_device BlockDriver. The zoned_host_device option > accepts only zoned host block devices. By adding zone management > operations in this new BlockDriver, users can use the new block > layer APIs including Report Zone and four zone management operations > (open, close, finish, reset, reset_all). > > Qemu-io uses the new APIs to perform zoned storage commands of the device: > zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), > zone_finish(zf). > > For example, to test zone_report, use following command: > $ ./build/qemu-io --image-opts -n driver=zoned_host_device, > filename=/dev/nullb0 > -c "zrp offset nr_zones" > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > Reviewed-by: Hannes Reinecke <hare@suse.de> > --- > block/block-backend.c | 148 +++++++++++++ > block/file-posix.c | 335 ++++++++++++++++++++++++++++++ > block/io.c | 41 ++++ > include/block/block-io.h | 7 + > include/block/block_int-common.h | 24 +++ > include/block/raw-aio.h | 6 +- > include/sysemu/block-backend-io.h | 18 ++ > meson.build | 4 + > qapi/block-core.json | 8 +- > qemu-io-cmds.c | 149 +++++++++++++ > 10 files changed, 737 insertions(+), 3 deletions(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index aa4adf06ae..1c618e9c68 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo { > void *iobuf; > int ret; > BdrvRequestFlags flags; > + union { > + struct { > + unsigned int *nr_zones; > + BlockZoneDescriptor *zones; > + } zone_report; > + struct { > + unsigned long op; > + } zone_mgmt; > + }; > } BlkRwCo; > > int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) > @@ -1775,6 +1784,145 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) > return ret; > } > > +static void coroutine_fn blk_aio_zone_report_entry(void *opaque) > +{ > + BlkAioEmAIOCB *acb = opaque; > + BlkRwCo *rwco = &acb->rwco; > + > + rwco->ret = 
blk_co_zone_report(rwco->blk, rwco->offset, > + rwco->zone_report.nr_zones, > + rwco->zone_report.zones); > + blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones, > + BlockCompletionFunc *cb, void *opaque) > +{ > + BlkAioEmAIOCB *acb; > + Coroutine *co; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > + acb->rwco = (BlkRwCo) { > + .blk = blk, > + .offset = offset, > + .ret = NOT_DONE, > + .zone_report = { > + .zones = zones, > + .nr_zones = nr_zones, > + }, > + }; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb); > + bdrv_coroutine_enter(blk_bs(blk), co); > + > + acb->has_returned = true; > + if (acb->rwco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque) > +{ > + BlkAioEmAIOCB *acb = opaque; > + BlkRwCo *rwco = &acb->rwco; > + > + rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op, > + rwco->offset, acb->bytes); > + blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len, > + BlockCompletionFunc *cb, void *opaque) { > + BlkAioEmAIOCB *acb; > + Coroutine *co; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > + acb->rwco = (BlkRwCo) { > + .blk = blk, > + .offset = offset, > + .ret = NOT_DONE, > + .zone_mgmt = { > + .op = op, > + }, > + }; > + acb->bytes = len; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb); > + bdrv_coroutine_enter(blk_bs(blk), co); > + > + acb->has_returned = true; > + if (acb->rwco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + 
blk_aio_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > +/* > + * Send a zone_report command. > + * offset is a byte offset from the start of the device. No alignment > + * required for offset. > + * nr_zones represents IN maximum and OUT actual. > + */ > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones) > +{ > + int ret; > + IO_CODE(); > + > + blk_inc_in_flight(blk); /* increase before waiting */ > + blk_wait_while_drained(blk); > + if (!blk_is_available(blk)) { > + blk_dec_in_flight(blk); > + return -ENOMEDIUM; > + } > + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); > + blk_dec_in_flight(blk); > + return ret; > +} > + > +/* > + * Send a zone_management command. > + * op is the zone operation; > + * offset is the byte offset from the start of the zoned device; > + * len is the maximum number of bytes the command should operate on. It > + * should be aligned with the device zone size. 
> + */ > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len) > +{ > + int ret; > + IO_CODE(); > + > + > + blk_inc_in_flight(blk); > + blk_wait_while_drained(blk); > + > + ret = blk_check_byte_request(blk, offset, len); > + if (ret < 0) { > + blk_dec_in_flight(blk); > + return ret; > + } > + > + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); > + blk_dec_in_flight(blk); > + return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/file-posix.c b/block/file-posix.c > index 8cb07fdb8a..bd28e3eaea 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -67,6 +67,9 @@ > #include <sys/param.h> > #include <sys/syscall.h> > #include <sys/vfs.h> > +#if defined(CONFIG_BLKZONED) > +#include <linux/blkzoned.h> > +#endif > #include <linux/cdrom.h> > #include <linux/fd.h> > #include <linux/fs.h> > @@ -215,6 +218,14 @@ typedef struct RawPosixAIOData { > PreallocMode prealloc; > Error **errp; > } truncate; > + struct { > + unsigned int *nr_zones; > + BlockZoneDescriptor *zones; > + } zone_report; > + struct { > + unsigned long zone_op; > + const char *zone_op_name; > + } zone_mgmt; > }; > } RawPosixAIOData; > > @@ -1350,6 +1361,48 @@ static void raw_refresh_limits(BlockDriverState *bs, > Error **errp) > zoned = BLK_Z_NONE; > } > bs->bl.zoned = zoned; > + if (zoned != BLK_Z_NONE) { > + /* > + * The zoned device must at least have zone size and nr_zones fields. 
> + */ > + ret = get_sysfs_long_val(&st, "chunk_sectors"); > + if (ret <= 0) { > + error_report("Invalid zone size %" PRId32 " sectors ", ret); > + bs->bl.zoned = BLK_Z_NONE; > + return; > + } > + bs->bl.zone_size = ret << BDRV_SECTOR_BITS; > + > + ret = get_sysfs_long_val(&st, "nr_zones"); > + if (ret <= 0) { > + error_report("Invalid number of zones %" PRId32 "", ret); > + bs->bl.zoned = BLK_Z_NONE; > + return; > + } > + bs->bl.nr_zones = ret; > + > + ret = ioctl(s->fd, BLKGETSIZE64, &bs->bl.capacity); > + if (ret != 0) { > + error_report("Failed to get device capacity %" PRId64 " bytes ", > + bs->bl.capacity); > + return; > + } > + > + ret = get_sysfs_long_val(&st, "zone_append_max_bytes"); > + if (ret > 0) { > + bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS; > + } > + > + ret = get_sysfs_long_val(&st, "max_open_zones"); > + if (ret >= 0) { > + bs->bl.max_open_zones = ret; > + } > + > + ret = get_sysfs_long_val(&st, "max_active_zones"); > + if (ret >= 0) { > + bs->bl.max_active_zones = ret; > + } > + } > } > > static int check_for_dasd(int fd) > @@ -1844,6 +1897,144 @@ static off_t copy_file_range(int in_fd, off_t *in_off, > int out_fd, > } > #endif > > +/* > + * parse_zone - Fill a zone descriptor > + */ > +#if defined(CONFIG_BLKZONED) > +static inline void parse_zone(struct BlockZoneDescriptor *zone, > + const struct blk_zone *blkz) { > + zone->start = blkz->start << BDRV_SECTOR_BITS; > + zone->length = blkz->len << BDRV_SECTOR_BITS; > + zone->wp = blkz->wp << BDRV_SECTOR_BITS; > + > +#ifdef HAVE_BLK_ZONE_REP_CAPACITY > + zone->cap = blkz->capacity << BDRV_SECTOR_BITS; > +#else > + zone->cap = blkz->len << BDRV_SECTOR_BITS; > +#endif > + > + switch (blkz->type) { > + case BLK_ZONE_TYPE_SEQWRITE_REQ: > + zone->type = BLK_ZT_SWR; > + break; > + case BLK_ZONE_TYPE_SEQWRITE_PREF: > + zone->type = BLK_ZT_SWP; > + break; > + case BLK_ZONE_TYPE_CONVENTIONAL: > + zone->type = BLK_ZT_CONV; > + break; > + default: > + error_report("Invalid zone type: 0x%x", 
blkz->type); > + } > + > + switch (blkz->cond) { > + case BLK_ZONE_COND_NOT_WP: > + zone->cond = BLK_ZS_NOT_WP; > + break; > + case BLK_ZONE_COND_EMPTY: > + zone->cond = BLK_ZS_EMPTY; > + break; > + case BLK_ZONE_COND_IMP_OPEN: > + zone->cond = BLK_ZS_IOPEN; > + break; > + case BLK_ZONE_COND_EXP_OPEN: > + zone->cond = BLK_ZS_EOPEN; > + break; > + case BLK_ZONE_COND_CLOSED: > + zone->cond = BLK_ZS_CLOSED; > + break; > + case BLK_ZONE_COND_READONLY: > + zone->cond = BLK_ZS_RDONLY; > + break; > + case BLK_ZONE_COND_FULL: > + zone->cond = BLK_ZS_FULL; > + break; > + case BLK_ZONE_COND_OFFLINE: > + zone->cond = BLK_ZS_OFFLINE; > + break; > + default: > + error_report("Invalid zone state: 0x%x", blkz->cond); > + } > +} > +#endif > + > +#if defined(CONFIG_BLKZONED) > +static int handle_aiocb_zone_report(void *opaque) > +{ > + RawPosixAIOData *aiocb = opaque; > + int fd = aiocb->aio_fildes; > + unsigned int *nr_zones = aiocb->zone_report.nr_zones; > + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > + /* zoned block devices use 512-byte sectors */ > + int64_t sector = aiocb->aio_offset / 512; Shouldn't this be uint64_t ? 
> + > + struct blk_zone *blkz; > + int64_t rep_size; size_t type is more appropriate for the variable above > + unsigned int nrz; > + int ret, n = 0, i = 0; > + > + nrz = *nr_zones; > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > + g_autofree struct blk_zone_report *rep = NULL; > + rep = g_malloc(rep_size); > + > + blkz = (struct blk_zone *)(rep + 1); > + while (n < nrz) { > + memset(rep, 0, rep_size); > + rep->sector = sector; > + rep->nr_zones = nrz - n; > + > + do { > + ret = ioctl(fd, BLKREPORTZONE, rep); > + } while (ret != 0 && errno == EINTR); > + if (ret != 0) { > + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", > + fd, sector, errno); > + return -errno; > + } > + > + if (!rep->nr_zones) { > + break; > + } > + > + for (i = 0; i < rep->nr_zones; i++, n++) { > + parse_zone(&zones[n], &blkz[i]); > + /* The next report should start after the last zone reported */ > + sector = blkz[i].start + blkz[i].len; > + } > + } > + > + *nr_zones = n; > + return 0; > +} > +#endif > + > +#if defined(CONFIG_BLKZONED) > +static int handle_aiocb_zone_mgmt(void *opaque) > +{ > + RawPosixAIOData *aiocb = opaque; > + int fd = aiocb->aio_fildes; > + int64_t sector = aiocb->aio_offset / 512; uint64_t? 
> + int64_t nr_sectors = aiocb->aio_nbytes / 512; > + struct blk_zone_range range; > + int ret; > + > + /* Execute the operation */ > + range.sector = sector; > + range.nr_sectors = nr_sectors; > + do { > + ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range); > + } while (ret != 0 && errno == EINTR); > + if (ret != 0) { > + ret = -errno; > + error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name, > + ret); > + return ret; > + } > + return 0; > +} > +#endif > + > static int handle_aiocb_copy_range(void *opaque) > { > RawPosixAIOData *aiocb = opaque; > @@ -3038,6 +3229,99 @@ static void raw_account_discard(BDRVRawState *s, > uint64_t nbytes, int ret) > } > } > > +/* > + * zone report - Get a zone block device's information in the form > + * of an array of zone descriptors. > + * zones is an array of zone descriptors to hold zone information on reply; > + * offset can be any byte within the entire size of the device; > + * nr_zones is the maxium number of sectors the command should operate on. 
> + */ > +#if defined(CONFIG_BLKZONED) > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t > offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones) { > + BDRVRawState *s = bs->opaque; > + RawPosixAIOData acb; > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > + .aio_type = QEMU_AIO_ZONE_REPORT, > + .aio_offset = offset, > + .zone_report = { > + .nr_zones = nr_zones, > + .zones = zones, > + }, > + }; > + > + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); > +} > +#endif > + > +/* > + * zone management operations - Execute an operation on a zone > + */ > +#if defined(CONFIG_BLKZONED) > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len) { > + BDRVRawState *s = bs->opaque; > + RawPosixAIOData acb; > + int64_t zone_size, zone_size_mask; > + const char *zone_op_name; > + unsigned long zone_op; > + > + zone_size = bs->bl.zone_size; > + zone_size_mask = zone_size - 1; > + if (offset & zone_size_mask) { > + error_report("sector offset %" PRId64 " is not aligned to zone size " > + "%" PRId64 "", offset / 512, zone_size / 512); > + return -EINVAL; > + } > + > + if (((offset + len) < bs->bl.capacity && len & zone_size_mask) || > + offset + len > bs->bl.capacity) { > + error_report("number of sectors %" PRId64 " is not aligned to zone > size" > + " %" PRId64 "", len / 512, zone_size / 512); > + return -EINVAL; > + } > + > + switch (op) { > + case BLK_ZO_OPEN: > + zone_op_name = "BLKOPENZONE"; > + zone_op = BLKOPENZONE; > + break; > + case BLK_ZO_CLOSE: > + zone_op_name = "BLKCLOSEZONE"; > + zone_op = BLKCLOSEZONE; > + break; > + case BLK_ZO_FINISH: > + zone_op_name = "BLKFINISHZONE"; > + zone_op = BLKFINISHZONE; > + break; > + case BLK_ZO_RESET: > + zone_op_name = "BLKRESETZONE"; > + zone_op = BLKRESETZONE; > + break; > + default: > + g_assert_not_reached(); > + } > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > 
+ .aio_type = QEMU_AIO_ZONE_MGMT, > + .aio_offset = offset, > + .aio_nbytes = len, > + .zone_mgmt = { > + .zone_op = zone_op, > + .zone_op_name = zone_op_name, > + }, > + }; > + > + return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); > +} > +#endif > + > static coroutine_fn int > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > bool blkdev) > @@ -3768,6 +4052,54 @@ static BlockDriver bdrv_host_device = { > #endif > }; > > +#if defined(CONFIG_BLKZONED) > +static BlockDriver bdrv_zoned_host_device = { > + .format_name = "zoned_host_device", > + .protocol_name = "zoned_host_device", > + .instance_size = sizeof(BDRVRawState), > + .bdrv_needs_filename = true, > + .bdrv_probe_device = hdev_probe_device, > + .bdrv_file_open = hdev_open, > + .bdrv_close = raw_close, > + .bdrv_reopen_prepare = raw_reopen_prepare, > + .bdrv_reopen_commit = raw_reopen_commit, > + .bdrv_reopen_abort = raw_reopen_abort, > + .bdrv_co_create_opts = bdrv_co_create_opts_simple, > + .create_opts = &bdrv_create_opts_simple, > + .mutable_opts = mutable_opts, > + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, > + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, > + > + .bdrv_co_preadv = raw_co_preadv, > + .bdrv_co_pwritev = raw_co_pwritev, > + .bdrv_co_flush_to_disk = raw_co_flush_to_disk, > + .bdrv_co_pdiscard = hdev_co_pdiscard, > + .bdrv_co_copy_range_from = raw_co_copy_range_from, > + .bdrv_co_copy_range_to = raw_co_copy_range_to, > + .bdrv_refresh_limits = raw_refresh_limits, > + .bdrv_io_plug = raw_aio_plug, > + .bdrv_io_unplug = raw_aio_unplug, > + .bdrv_attach_aio_context = raw_aio_attach_aio_context, > + > + .bdrv_co_truncate = raw_co_truncate, > + .bdrv_getlength = raw_getlength, > + .bdrv_get_info = raw_get_info, > + .bdrv_get_allocated_file_size > + = raw_get_allocated_file_size, > + .bdrv_get_specific_stats = hdev_get_specific_stats, > + .bdrv_check_perm = raw_check_perm, > + .bdrv_set_perm = raw_set_perm, > + .bdrv_abort_perm_update = 
raw_abort_perm_update, > + .bdrv_probe_blocksizes = hdev_probe_blocksizes, hdev_probe_blocksizes() only seems to probe blocksizes if the hardware device is DASD. Not sure why it is like that, but it is very important for ZBDs to be virtualized with the correct physical and logical block sizes. To make sure this is the case, you can change the line above to + .bdrv_probe_blocksizes = hdev_zoned_probe_blocksizes, and define the following function elsewhere in this file: +/** + * Try to get @bs's logical and physical block size of a zoned device. + * On success, store them in @bsz and return zero. + * On failure, return negative errno. + */ +static int hdev_zoned_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) +{ + BDRVRawState *s = bs->opaque; + int ret; + + ret = probe_logical_blocksize(s->fd, &bsz->log); + if (ret < 0) { + return ret; + } + return probe_physical_blocksize(s->fd, &bsz->phys); +} + I tested this change with hardware ZBDs and it works fine; no guesswork about what LBA/PBA size to specify on the command line is required.
> + .bdrv_probe_geometry = hdev_probe_geometry, > + .bdrv_co_ioctl = hdev_co_ioctl, > + > + /* zone management operations */ > + .bdrv_co_zone_report = raw_co_zone_report, > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > +}; > +#endif > + > #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > static void cdrom_parse_filename(const char *filename, QDict *options, > Error **errp) > @@ -4028,6 +4360,9 @@ static void bdrv_file_init(void) > bdrv_register(&bdrv_file); > #if defined(HAVE_HOST_BLOCK_DEVICE) > bdrv_register(&bdrv_host_device); > +#if defined(CONFIG_BLKZONED) > + bdrv_register(&bdrv_zoned_host_device); > +#endif > #ifdef __linux__ > bdrv_register(&bdrv_host_cdrom); > #endif > diff --git a/block/io.c b/block/io.c > index d30073036e..88f707ea4d 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -3189,6 +3189,47 @@ out: > return co.ret; > } > > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + IO_CODE(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_zone_report) { > + co.ret = -ENOTSUP; > + goto out; > + } > + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + IO_CODE(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_zone_mgmt) { > + co.ret = -ENOTSUP; > + goto out; > + } > + co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > void *qemu_blockalign(BlockDriverState *bs, size_t size) > { > IO_CODE(); > diff --git a/include/block/block-io.h 
b/include/block/block-io.h > index 492f95fc05..f0cdf67d33 100644 > --- a/include/block/block-io.h > +++ b/include/block/block-io.h > @@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); > /* Ensure contents are flushed to disk. */ > int coroutine_fn bdrv_co_flush(BlockDriverState *bs); > > +/* Report zone information of zone block device. */ > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len); > + > int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > int bdrv_block_status(BlockDriverState *bs, int64_t offset, > diff --git a/include/block/block_int-common.h b/include/block/block_int- > common.h > index 7f7863cc9e..cdc06e77a6 100644 > --- a/include/block/block_int-common.h > +++ b/include/block/block_int-common.h > @@ -691,6 +691,12 @@ struct BlockDriver { > QEMUIOVector *qiov, > int64_t pos); > > + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, > + int64_t offset, unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp > op, > + int64_t offset, int64_t len); > + > /* removable device specific */ > bool (*bdrv_is_inserted)(BlockDriverState *bs); > void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); > @@ -828,6 +834,24 @@ typedef struct BlockLimits { > > /* device zone model */ > BlockZoneModel zoned; > + > + /* zone size expressed in bytes */ > + uint32_t zone_size; > + > + /* total number of zones */ > + unsigned int nr_zones; Use the _t notation since it is used elsewhere in the definition of this struct - + uint32_t nr_zones; > + > + /* maximum sectors of a zone append write operation */ > + int64_t max_append_sectors; > + > + /* maximum number of open zones */ > + 
int64_t max_open_zones; > + > + /* maximum number of active zones */ > + int64_t max_active_zones; > + > + /* device capacity expressed in bytes */ > + int64_t capacity; Can capacity become negative? If not, define as uint64_t? > } BlockLimits; > > typedef struct BdrvOpBlocker BdrvOpBlocker; > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > index 21fc10c4c9..877b2240b3 100644 > --- a/include/block/raw-aio.h > +++ b/include/block/raw-aio.h > @@ -29,6 +29,8 @@ > #define QEMU_AIO_WRITE_ZEROES 0x0020 > #define QEMU_AIO_COPY_RANGE 0x0040 > #define QEMU_AIO_TRUNCATE 0x0080 > +#define QEMU_AIO_ZONE_REPORT 0x0100 > +#define QEMU_AIO_ZONE_MGMT 0x0200 > #define QEMU_AIO_TYPE_MASK \ > (QEMU_AIO_READ | \ > QEMU_AIO_WRITE | \ > @@ -37,7 +39,9 @@ > QEMU_AIO_DISCARD | \ > QEMU_AIO_WRITE_ZEROES | \ > QEMU_AIO_COPY_RANGE | \ > - QEMU_AIO_TRUNCATE) > + QEMU_AIO_TRUNCATE | \ > + QEMU_AIO_ZONE_REPORT | \ > + QEMU_AIO_ZONE_MGMT) > > /* AIO flags */ > #define QEMU_AIO_MISALIGNED 0x1000 > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend- > io.h > index 50f5aa2e07..1b5fc7db6b 100644 > --- a/include/sysemu/block-backend-io.h > +++ b/include/sysemu/block-backend-io.h > @@ -45,6 +45,13 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t > offset, > BlockCompletionFunc *cb, void *opaque); > BlockAIOCB *blk_aio_flush(BlockBackend *blk, > BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones, > + BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len, > + BlockCompletionFunc *cb, void *opaque); > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, > BlockCompletionFunc *cb, void *opaque); > void blk_aio_cancel_async(BlockAIOCB *acb); > @@ -156,6 +163,17 @@ int generated_co_wrapper blk_pwrite_zeroes(BlockBackend > *blk, 
int64_t offset, > int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, > int64_t bytes, BdrvRequestFlags flags); > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len); > +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len); > + > int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, > int64_t bytes); > int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, > diff --git a/meson.build b/meson.build > index 44c1f92697..0aa99b64a0 100644 > --- a/meson.build > +++ b/meson.build > @@ -1928,6 +1928,7 @@ config_host_data.set('CONFIG_REPLICATION', > get_option('replication').allowed()) > # has_header > config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h')) > config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h')) > +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h')) > config_host_data.set('CONFIG_VALGRIND_H', > cc.has_header('valgrind/valgrind.h')) > config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h')) > config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h')) > @@ -2021,6 +2022,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', > config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM', > cc.has_member('struct stat', 'st_atim', > prefix: '#include <sys/stat.h>')) > +config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY', > + cc.has_member('struct blk_zone', 'capacity', > + prefix: '#include <linux/blkzoned.h>')) > > # has_type > config_host_data.set('CONFIG_IOVEC', > diff --git a/qapi/block-core.json b/qapi/block-core.json > index 882b266532..05a3b44731 100644 > --- 
a/qapi/block-core.json > +++ b/qapi/block-core.json > @@ -2942,6 +2942,7 @@ > # @compress: Since 5.0 > # @copy-before-write: Since 6.2 > # @snapshot-access: Since 7.0 > +# @zoned_host_device: Since 7.2 > # > # Since: 2.9 > ## > @@ -2955,7 +2956,8 @@ > 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', > 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', > { 'name': 'replication', 'if': 'CONFIG_REPLICATION' }, > - 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] } > + 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', > + { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] } > > ## > # @BlockdevOptionsFile: > @@ -4329,7 +4331,9 @@ > 'vhdx': 'BlockdevOptionsGenericFormat', > 'vmdk': 'BlockdevOptionsGenericCOWFormat', > 'vpc': 'BlockdevOptionsGenericFormat', > - 'vvfat': 'BlockdevOptionsVVFAT' > + 'vvfat': 'BlockdevOptionsVVFAT', > + 'zoned_host_device': { 'type': 'BlockdevOptionsFile', > + 'if': 'CONFIG_BLKZONED' } > } } > > ## > diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c > index 952dc940f1..c1b28ea108 100644 > --- a/qemu-io-cmds.c > +++ b/qemu-io-cmds.c > @@ -1712,6 +1712,150 @@ static const cmdinfo_t flush_cmd = { > .oneline = "flush all in-core file state to disk", > }; > > +static inline int64_t tosector(int64_t bytes) > +{ > + return bytes >> BDRV_SECTOR_BITS; > +} > + > +static int zone_report_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset; > + unsigned int nr_zones; > + > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + nr_zones = cvtnum(argv[optind]); > + > + g_autofree BlockZoneDescriptor *zones = NULL; > + zones = g_new(BlockZoneDescriptor, nr_zones); > + ret = blk_zone_report(blk, offset, &nr_zones, zones); > + if (ret < 0) { > + printf("zone report failed: %s\n", strerror(-ret)); > + } else { > + for (int i = 0; i < nr_zones; ++i) { > + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", " > + "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", " > + "zcond:%u, 
[type: %u]\n", > + tosector(zones[i].start), tosector(zones[i].length), > + tosector(zones[i].cap), tosector(zones[i].wp), > + zones[i].cond, zones[i].type); > + } > + } > + return ret; > +} > + > +static const cmdinfo_t zone_report_cmd = { > + .name = "zone_report", > + .altname = "zrp", > + .cfunc = zone_report_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset number", > + .oneline = "report zone information", > +}; > + > +static int zone_open_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len); > + if (ret < 0) { > + printf("zone open failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_open_cmd = { > + .name = "zone_open", > + .altname = "zo", > + .cfunc = zone_open_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "explicit open a range of zones in zone block device", > +}; > + > +static int zone_close_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len); > + if (ret < 0) { > + printf("zone close failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_close_cmd = { > + .name = "zone_close", > + .altname = "zc", > + .cfunc = zone_close_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "close a range of zones in zone block device", > +}; > + > +static int zone_finish_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len); > + if (ret < 0) { > + printf("zone finish failed: %s\n", 
strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_finish_cmd = { > + .name = "zone_finish", > + .altname = "zf", > + .cfunc = zone_finish_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "finish a range of zones in zone block device", > +}; > + > +static int zone_reset_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len); > + if (ret < 0) { > + printf("zone reset failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_reset_cmd = { > + .name = "zone_reset", > + .altname = "zrs", > + .cfunc = zone_reset_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "reset a zone write pointer in zone block device", > +}; > + > static int truncate_f(BlockBackend *blk, int argc, char **argv); > static const cmdinfo_t truncate_cmd = { > .name = "truncate", > @@ -2504,6 +2648,11 @@ static void __attribute((constructor)) > init_qemuio_commands(void) > qemuio_add_command(&aio_write_cmd); > qemuio_add_command(&aio_flush_cmd); > qemuio_add_command(&flush_cmd); > + qemuio_add_command(&zone_report_cmd); > + qemuio_add_command(&zone_open_cmd); > + qemuio_add_command(&zone_close_cmd); > + qemuio_add_command(&zone_finish_cmd); > + qemuio_add_command(&zone_reset_cmd); > qemuio_add_command(&truncate_cmd); > qemuio_add_command(&length_cmd); > qemuio_add_command(&info_cmd);
On 10/16/22 23:51, Sam Li wrote: > Add a new zoned_host_device BlockDriver. The zoned_host_device option > accepts only zoned host block devices. By adding zone management > operations in this new BlockDriver, users can use the new block > layer APIs including Report Zone and four zone management operations > (open, close, finish, reset, reset_all). > > Qemu-io uses the new APIs to perform zoned storage commands of the device: > zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), > zone_finish(zf). > > For example, to test zone_report, use following command: > $ ./build/qemu-io --image-opts -n driver=zoned_host_device, filename=/dev/nullb0 > -c "zrp offset nr_zones" > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > Reviewed-by: Hannes Reinecke <hare@suse.de> > --- > block/block-backend.c | 148 +++++++++++++ > block/file-posix.c | 335 ++++++++++++++++++++++++++++++ > block/io.c | 41 ++++ > include/block/block-io.h | 7 + > include/block/block_int-common.h | 24 +++ > include/block/raw-aio.h | 6 +- > include/sysemu/block-backend-io.h | 18 ++ > meson.build | 4 + > qapi/block-core.json | 8 +- > qemu-io-cmds.c | 149 +++++++++++++ > 10 files changed, 737 insertions(+), 3 deletions(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index aa4adf06ae..1c618e9c68 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo { > void *iobuf; > int ret; > BdrvRequestFlags flags; > + union { > + struct { > + unsigned int *nr_zones; > + BlockZoneDescriptor *zones; > + } zone_report; > + struct { > + unsigned long op; > + } zone_mgmt; > + }; > } BlkRwCo; > > int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) > @@ -1775,6 +1784,145 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) > return ret; > } > > +static void coroutine_fn blk_aio_zone_report_entry(void *opaque) > +{ > + BlkAioEmAIOCB *acb = opaque; > + BlkRwCo *rwco = &acb->rwco; > + > + rwco->ret = blk_co_zone_report(rwco->blk, 
rwco->offset, > + rwco->zone_report.nr_zones, > + rwco->zone_report.zones); > + blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones, > + BlockCompletionFunc *cb, void *opaque) > +{ > + BlkAioEmAIOCB *acb; > + Coroutine *co; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > + acb->rwco = (BlkRwCo) { > + .blk = blk, > + .offset = offset, > + .ret = NOT_DONE, > + .zone_report = { > + .zones = zones, > + .nr_zones = nr_zones, > + }, > + }; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb); > + bdrv_coroutine_enter(blk_bs(blk), co); > + > + acb->has_returned = true; > + if (acb->rwco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque) > +{ > + BlkAioEmAIOCB *acb = opaque; > + BlkRwCo *rwco = &acb->rwco; > + > + rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op, > + rwco->offset, acb->bytes); > + blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len, > + BlockCompletionFunc *cb, void *opaque) { > + BlkAioEmAIOCB *acb; > + Coroutine *co; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > + acb->rwco = (BlkRwCo) { > + .blk = blk, > + .offset = offset, > + .ret = NOT_DONE, > + .zone_mgmt = { > + .op = op, > + }, > + }; > + acb->bytes = len; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb); > + bdrv_coroutine_enter(blk_bs(blk), co); > + > + acb->has_returned = true; > + if (acb->rwco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > + 
} > + > + return &acb->common; > +} > + > +/* > + * Send a zone_report command. > + * offset is a byte offset from the start of the device. No alignment > + * required for offset. > + * nr_zones represents IN maximum and OUT actual. > + */ > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones) > +{ > + int ret; > + IO_CODE(); > + > + blk_inc_in_flight(blk); /* increase before waiting */ > + blk_wait_while_drained(blk); > + if (!blk_is_available(blk)) { > + blk_dec_in_flight(blk); > + return -ENOMEDIUM; > + } > + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); > + blk_dec_in_flight(blk); > + return ret; > +} > + > +/* > + * Send a zone_management command. > + * op is the zone operation; > + * offset is the byte offset from the start of the zoned device; > + * len is the maximum number of bytes the command should operate on. It > + * should be aligned with the device zone size. > + */ > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len) > +{ > + int ret; > + IO_CODE(); > + > + Nit: extra blank line here. 
> + blk_inc_in_flight(blk); > + blk_wait_while_drained(blk); > + > + ret = blk_check_byte_request(blk, offset, len); > + if (ret < 0) { > + blk_dec_in_flight(blk); > + return ret; > + } > + > + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); > + blk_dec_in_flight(blk); > + return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/file-posix.c b/block/file-posix.c > index 8cb07fdb8a..bd28e3eaea 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -67,6 +67,9 @@ > #include <sys/param.h> > #include <sys/syscall.h> > #include <sys/vfs.h> > +#if defined(CONFIG_BLKZONED) > +#include <linux/blkzoned.h> > +#endif > #include <linux/cdrom.h> > #include <linux/fd.h> > #include <linux/fs.h> > @@ -215,6 +218,14 @@ typedef struct RawPosixAIOData { > PreallocMode prealloc; > Error **errp; > } truncate; > + struct { > + unsigned int *nr_zones; > + BlockZoneDescriptor *zones; > + } zone_report; > + struct { > + unsigned long zone_op; > + const char *zone_op_name; Nit: This could be simplified to "op" and "op_name". No big deal though. > + } zone_mgmt; > }; > } RawPosixAIOData; > > @@ -1350,6 +1361,48 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) > zoned = BLK_Z_NONE; > } > bs->bl.zoned = zoned; > + if (zoned != BLK_Z_NONE) { > + /* > + * The zoned device must at least have zone size and nr_zones fields. 
> + */ > + ret = get_sysfs_long_val(&st, "chunk_sectors"); > + if (ret <= 0) { > + error_report("Invalid zone size %" PRId32 " sectors ", ret); > + bs->bl.zoned = BLK_Z_NONE; > + return; > + } > + bs->bl.zone_size = ret << BDRV_SECTOR_BITS; > + > + ret = get_sysfs_long_val(&st, "nr_zones"); > + if (ret <= 0) { > + error_report("Invalid number of zones %" PRId32 "", ret); > + bs->bl.zoned = BLK_Z_NONE; > + return; > + } > + bs->bl.nr_zones = ret; > + > + ret = ioctl(s->fd, BLKGETSIZE64, &bs->bl.capacity); > + if (ret != 0) { > + error_report("Failed to get device capacity %" PRId64 " bytes ", > + bs->bl.capacity); > + return; > + } > + > + ret = get_sysfs_long_val(&st, "zone_append_max_bytes"); > + if (ret > 0) { > + bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS; > + } > + > + ret = get_sysfs_long_val(&st, "max_open_zones"); > + if (ret >= 0) { > + bs->bl.max_open_zones = ret; > + } > + > + ret = get_sysfs_long_val(&st, "max_active_zones"); > + if (ret >= 0) { > + bs->bl.max_active_zones = ret; > + } > + } > } > > static int check_for_dasd(int fd) > @@ -1844,6 +1897,144 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, > } > #endif > > +/* > + * parse_zone - Fill a zone descriptor > + */ > +#if defined(CONFIG_BLKZONED) > +static inline void parse_zone(struct BlockZoneDescriptor *zone, > + const struct blk_zone *blkz) { > + zone->start = blkz->start << BDRV_SECTOR_BITS; > + zone->length = blkz->len << BDRV_SECTOR_BITS; > + zone->wp = blkz->wp << BDRV_SECTOR_BITS; > + > +#ifdef HAVE_BLK_ZONE_REP_CAPACITY > + zone->cap = blkz->capacity << BDRV_SECTOR_BITS; > +#else > + zone->cap = blkz->len << BDRV_SECTOR_BITS; > +#endif > + > + switch (blkz->type) { > + case BLK_ZONE_TYPE_SEQWRITE_REQ: > + zone->type = BLK_ZT_SWR; > + break; > + case BLK_ZONE_TYPE_SEQWRITE_PREF: > + zone->type = BLK_ZT_SWP; > + break; > + case BLK_ZONE_TYPE_CONVENTIONAL: > + zone->type = BLK_ZT_CONV; > + break; > + default: > + error_report("Invalid zone type: 0x%x", 
blkz->type); The error message here should be: "Unsupported zone type...", as the value may be perfectly valid for Linux but QEMU may not know about it because, e.g., an old version is being used. Also, I would rather have an error return here so that the caller is notified. > + } > + > + switch (blkz->cond) { > + case BLK_ZONE_COND_NOT_WP: > + zone->cond = BLK_ZS_NOT_WP; > + break; > + case BLK_ZONE_COND_EMPTY: > + zone->cond = BLK_ZS_EMPTY; > + break; > + case BLK_ZONE_COND_IMP_OPEN: > + zone->cond = BLK_ZS_IOPEN; > + break; > + case BLK_ZONE_COND_EXP_OPEN: > + zone->cond = BLK_ZS_EOPEN; > + break; > + case BLK_ZONE_COND_CLOSED: > + zone->cond = BLK_ZS_CLOSED; > + break; > + case BLK_ZONE_COND_READONLY: > + zone->cond = BLK_ZS_RDONLY; > + break; > + case BLK_ZONE_COND_FULL: > + zone->cond = BLK_ZS_FULL; > + break; > + case BLK_ZONE_COND_OFFLINE: > + zone->cond = BLK_ZS_OFFLINE; > + break; > + default: > + error_report("Invalid zone state: 0x%x", blkz->cond); Same comment here. And we need an error return. 
> + } > +} > +#endif > + > +#if defined(CONFIG_BLKZONED) > +static int handle_aiocb_zone_report(void *opaque) > +{ > + RawPosixAIOData *aiocb = opaque; > + int fd = aiocb->aio_fildes; > + unsigned int *nr_zones = aiocb->zone_report.nr_zones; > + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > + /* zoned block devices use 512-byte sectors */ > + int64_t sector = aiocb->aio_offset / 512; > + > + struct blk_zone *blkz; > + int64_t rep_size; > + unsigned int nrz; > + int ret, n = 0, i = 0; > + > + nrz = *nr_zones; > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > + g_autofree struct blk_zone_report *rep = NULL; > + rep = g_malloc(rep_size); > + > + blkz = (struct blk_zone *)(rep + 1); > + while (n < nrz) { > + memset(rep, 0, rep_size); > + rep->sector = sector; > + rep->nr_zones = nrz - n; > + > + do { > + ret = ioctl(fd, BLKREPORTZONE, rep); > + } while (ret != 0 && errno == EINTR); > + if (ret != 0) { > + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", > + fd, sector, errno); > + return -errno; > + } > + > + if (!rep->nr_zones) { > + break; > + } > + > + for (i = 0; i < rep->nr_zones; i++, n++) { > + parse_zone(&zones[n], &blkz[i]); Need to check for errors here, and if there is an error reported, fail this aio so that unhandled/unsupported values are not propagated to the guest. 
> + /* The next report should start after the last zone reported */ > + sector = blkz[i].start + blkz[i].len; > + } > + } > + > + *nr_zones = n; > + return 0; > +} > +#endif > + > +#if defined(CONFIG_BLKZONED) > +static int handle_aiocb_zone_mgmt(void *opaque) > +{ > + RawPosixAIOData *aiocb = opaque; > + int fd = aiocb->aio_fildes; > + int64_t sector = aiocb->aio_offset / 512; > + int64_t nr_sectors = aiocb->aio_nbytes / 512; > + struct blk_zone_range range; > + int ret; > + > + /* Execute the operation */ > + range.sector = sector; > + range.nr_sectors = nr_sectors; > + do { > + ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range); > + } while (ret != 0 && errno == EINTR); > + if (ret != 0) { > + ret = -errno; > + error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name, > + ret); > + return ret; > + } > + return 0; > +} > +#endif > + > static int handle_aiocb_copy_range(void *opaque) > { > RawPosixAIOData *aiocb = opaque; > @@ -3038,6 +3229,99 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) > } > } > > +/* > + * zone report - Get a zone block device's information in the form > + * of an array of zone descriptors. > + * zones is an array of zone descriptors to hold zone information on reply; > + * offset can be any byte within the entire size of the device; > + * nr_zones is the maxium number of sectors the command should operate on. 
> + */ > +#if defined(CONFIG_BLKZONED) > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones) { > + BDRVRawState *s = bs->opaque; > + RawPosixAIOData acb; > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > + .aio_type = QEMU_AIO_ZONE_REPORT, > + .aio_offset = offset, > + .zone_report = { > + .nr_zones = nr_zones, > + .zones = zones, > + }, > + }; > + > + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); > +} > +#endif > + > +/* > + * zone management operations - Execute an operation on a zone > + */ > +#if defined(CONFIG_BLKZONED) > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len) { > + BDRVRawState *s = bs->opaque; > + RawPosixAIOData acb; > + int64_t zone_size, zone_size_mask; > + const char *zone_op_name; > + unsigned long zone_op; > + > + zone_size = bs->bl.zone_size; > + zone_size_mask = zone_size - 1; > + if (offset & zone_size_mask) { > + error_report("sector offset %" PRId64 " is not aligned to zone size " > + "%" PRId64 "", offset / 512, zone_size / 512); > + return -EINVAL; > + } > + > + if (((offset + len) < bs->bl.capacity && len & zone_size_mask) || > + offset + len > bs->bl.capacity) { > + error_report("number of sectors %" PRId64 " is not aligned to zone size" > + " %" PRId64 "", len / 512, zone_size / 512); > + return -EINVAL; > + } > + > + switch (op) { > + case BLK_ZO_OPEN: > + zone_op_name = "BLKOPENZONE"; > + zone_op = BLKOPENZONE; > + break; > + case BLK_ZO_CLOSE: > + zone_op_name = "BLKCLOSEZONE"; > + zone_op = BLKCLOSEZONE; > + break; > + case BLK_ZO_FINISH: > + zone_op_name = "BLKFINISHZONE"; > + zone_op = BLKFINISHZONE; > + break; > + case BLK_ZO_RESET: > + zone_op_name = "BLKRESETZONE"; > + zone_op = BLKRESETZONE; > + break; > + default: > + g_assert_not_reached(); > + } > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > + 
.aio_type = QEMU_AIO_ZONE_MGMT, > + .aio_offset = offset, > + .aio_nbytes = len, > + .zone_mgmt = { > + .zone_op = zone_op, > + .zone_op_name = zone_op_name, > + }, > + }; > + > + return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); > +} > +#endif > + > static coroutine_fn int > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > bool blkdev) > @@ -3768,6 +4052,54 @@ static BlockDriver bdrv_host_device = { > #endif > }; > > +#if defined(CONFIG_BLKZONED) > +static BlockDriver bdrv_zoned_host_device = { > + .format_name = "zoned_host_device", > + .protocol_name = "zoned_host_device", > + .instance_size = sizeof(BDRVRawState), > + .bdrv_needs_filename = true, > + .bdrv_probe_device = hdev_probe_device, > + .bdrv_file_open = hdev_open, > + .bdrv_close = raw_close, > + .bdrv_reopen_prepare = raw_reopen_prepare, > + .bdrv_reopen_commit = raw_reopen_commit, > + .bdrv_reopen_abort = raw_reopen_abort, > + .bdrv_co_create_opts = bdrv_co_create_opts_simple, > + .create_opts = &bdrv_create_opts_simple, > + .mutable_opts = mutable_opts, > + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, > + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, > + > + .bdrv_co_preadv = raw_co_preadv, > + .bdrv_co_pwritev = raw_co_pwritev, > + .bdrv_co_flush_to_disk = raw_co_flush_to_disk, > + .bdrv_co_pdiscard = hdev_co_pdiscard, > + .bdrv_co_copy_range_from = raw_co_copy_range_from, > + .bdrv_co_copy_range_to = raw_co_copy_range_to, > + .bdrv_refresh_limits = raw_refresh_limits, > + .bdrv_io_plug = raw_aio_plug, > + .bdrv_io_unplug = raw_aio_unplug, > + .bdrv_attach_aio_context = raw_aio_attach_aio_context, > + > + .bdrv_co_truncate = raw_co_truncate, > + .bdrv_getlength = raw_getlength, > + .bdrv_get_info = raw_get_info, > + .bdrv_get_allocated_file_size > + = raw_get_allocated_file_size, > + .bdrv_get_specific_stats = hdev_get_specific_stats, > + .bdrv_check_perm = raw_check_perm, > + .bdrv_set_perm = raw_set_perm, > + .bdrv_abort_perm_update = 
raw_abort_perm_update, > + .bdrv_probe_blocksizes = hdev_probe_blocksizes, > + .bdrv_probe_geometry = hdev_probe_geometry, > + .bdrv_co_ioctl = hdev_co_ioctl, > + > + /* zone management operations */ > + .bdrv_co_zone_report = raw_co_zone_report, > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > +}; > +#endif > + > #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > static void cdrom_parse_filename(const char *filename, QDict *options, > Error **errp) > @@ -4028,6 +4360,9 @@ static void bdrv_file_init(void) > bdrv_register(&bdrv_file); > #if defined(HAVE_HOST_BLOCK_DEVICE) > bdrv_register(&bdrv_host_device); > +#if defined(CONFIG_BLKZONED) > + bdrv_register(&bdrv_zoned_host_device); > +#endif > #ifdef __linux__ > bdrv_register(&bdrv_host_cdrom); > #endif > diff --git a/block/io.c b/block/io.c > index d30073036e..88f707ea4d 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -3189,6 +3189,47 @@ out: > return co.ret; > } > > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + IO_CODE(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_zone_report) { > + co.ret = -ENOTSUP; > + goto out; > + } > + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + IO_CODE(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_zone_mgmt) { > + co.ret = -ENOTSUP; > + goto out; > + } > + co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > void *qemu_blockalign(BlockDriverState 
*bs, size_t size) > { > IO_CODE(); > diff --git a/include/block/block-io.h b/include/block/block-io.h > index 492f95fc05..f0cdf67d33 100644 > --- a/include/block/block-io.h > +++ b/include/block/block-io.h > @@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); > /* Ensure contents are flushed to disk. */ > int coroutine_fn bdrv_co_flush(BlockDriverState *bs); > > +/* Report zone information of zone block device. */ > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len); > + > int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > int bdrv_block_status(BlockDriverState *bs, int64_t offset, > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h > index 7f7863cc9e..cdc06e77a6 100644 > --- a/include/block/block_int-common.h > +++ b/include/block/block_int-common.h > @@ -691,6 +691,12 @@ struct BlockDriver { > QEMUIOVector *qiov, > int64_t pos); > > + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, > + int64_t offset, unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len); > + > /* removable device specific */ > bool (*bdrv_is_inserted)(BlockDriverState *bs); > void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); > @@ -828,6 +834,24 @@ typedef struct BlockLimits { > > /* device zone model */ > BlockZoneModel zoned; > + > + /* zone size expressed in bytes */ > + uint32_t zone_size; > + > + /* total number of zones */ > + unsigned int nr_zones; > + > + /* maximum sectors of a zone append write operation */ > + int64_t max_append_sectors; > + > + /* maximum number of open zones */ > + int64_t max_open_zones; > + > + 
/* maximum number of active zones */ > + int64_t max_active_zones; > + > + /* device capacity expressed in bytes */ > + int64_t capacity; > } BlockLimits; > > typedef struct BdrvOpBlocker BdrvOpBlocker; > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > index 21fc10c4c9..877b2240b3 100644 > --- a/include/block/raw-aio.h > +++ b/include/block/raw-aio.h > @@ -29,6 +29,8 @@ > #define QEMU_AIO_WRITE_ZEROES 0x0020 > #define QEMU_AIO_COPY_RANGE 0x0040 > #define QEMU_AIO_TRUNCATE 0x0080 > +#define QEMU_AIO_ZONE_REPORT 0x0100 > +#define QEMU_AIO_ZONE_MGMT 0x0200 > #define QEMU_AIO_TYPE_MASK \ > (QEMU_AIO_READ | \ > QEMU_AIO_WRITE | \ > @@ -37,7 +39,9 @@ > QEMU_AIO_DISCARD | \ > QEMU_AIO_WRITE_ZEROES | \ > QEMU_AIO_COPY_RANGE | \ > - QEMU_AIO_TRUNCATE) > + QEMU_AIO_TRUNCATE | \ > + QEMU_AIO_ZONE_REPORT | \ > + QEMU_AIO_ZONE_MGMT) > > /* AIO flags */ > #define QEMU_AIO_MISALIGNED 0x1000 > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h > index 50f5aa2e07..1b5fc7db6b 100644 > --- a/include/sysemu/block-backend-io.h > +++ b/include/sysemu/block-backend-io.h > @@ -45,6 +45,13 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, > BlockCompletionFunc *cb, void *opaque); > BlockAIOCB *blk_aio_flush(BlockBackend *blk, > BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones, > + BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len, > + BlockCompletionFunc *cb, void *opaque); > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, > BlockCompletionFunc *cb, void *opaque); > void blk_aio_cancel_async(BlockAIOCB *acb); > @@ -156,6 +163,17 @@ int generated_co_wrapper blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, > int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, > 
int64_t bytes, BdrvRequestFlags flags); > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len); > +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len); > + > int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, > int64_t bytes); > int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, > diff --git a/meson.build b/meson.build > index 44c1f92697..0aa99b64a0 100644 > --- a/meson.build > +++ b/meson.build > @@ -1928,6 +1928,7 @@ config_host_data.set('CONFIG_REPLICATION', get_option('replication').allowed()) > # has_header > config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h')) > config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h')) > +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h')) > config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h')) > config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h')) > config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h')) > @@ -2021,6 +2022,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', > config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM', > cc.has_member('struct stat', 'st_atim', > prefix: '#include <sys/stat.h>')) > +config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY', > + cc.has_member('struct blk_zone', 'capacity', > + prefix: '#include <linux/blkzoned.h>')) > > # has_type > config_host_data.set('CONFIG_IOVEC', > diff --git a/qapi/block-core.json b/qapi/block-core.json > index 882b266532..05a3b44731 100644 > --- a/qapi/block-core.json > +++ b/qapi/block-core.json > @@ -2942,6 +2942,7 @@ > # @compress: Since 5.0 > # 
@copy-before-write: Since 6.2 > # @snapshot-access: Since 7.0 > +# @zoned_host_device: Since 7.2 > # > # Since: 2.9 > ## > @@ -2955,7 +2956,8 @@ > 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', > 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', > { 'name': 'replication', 'if': 'CONFIG_REPLICATION' }, > - 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] } > + 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', > + { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] } > > ## > # @BlockdevOptionsFile: > @@ -4329,7 +4331,9 @@ > 'vhdx': 'BlockdevOptionsGenericFormat', > 'vmdk': 'BlockdevOptionsGenericCOWFormat', > 'vpc': 'BlockdevOptionsGenericFormat', > - 'vvfat': 'BlockdevOptionsVVFAT' > + 'vvfat': 'BlockdevOptionsVVFAT', > + 'zoned_host_device': { 'type': 'BlockdevOptionsFile', > + 'if': 'CONFIG_BLKZONED' } > } } > > ## > diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c > index 952dc940f1..c1b28ea108 100644 > --- a/qemu-io-cmds.c > +++ b/qemu-io-cmds.c > @@ -1712,6 +1712,150 @@ static const cmdinfo_t flush_cmd = { > .oneline = "flush all in-core file state to disk", > }; > > +static inline int64_t tosector(int64_t bytes) > +{ > + return bytes >> BDRV_SECTOR_BITS; > +} > + > +static int zone_report_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset; > + unsigned int nr_zones; > + > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + nr_zones = cvtnum(argv[optind]); > + > + g_autofree BlockZoneDescriptor *zones = NULL; > + zones = g_new(BlockZoneDescriptor, nr_zones); > + ret = blk_zone_report(blk, offset, &nr_zones, zones); > + if (ret < 0) { > + printf("zone report failed: %s\n", strerror(-ret)); > + } else { > + for (int i = 0; i < nr_zones; ++i) { > + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", " > + "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", " > + "zcond:%u, [type: %u]\n", > + tosector(zones[i].start), tosector(zones[i].length), > + tosector(zones[i].cap), 
tosector(zones[i].wp), > + zones[i].cond, zones[i].type); > + } > + } > + return ret; > +} > + > +static const cmdinfo_t zone_report_cmd = { > + .name = "zone_report", > + .altname = "zrp", > + .cfunc = zone_report_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset number", > + .oneline = "report zone information", > +}; > + > +static int zone_open_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len); > + if (ret < 0) { > + printf("zone open failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_open_cmd = { > + .name = "zone_open", > + .altname = "zo", > + .cfunc = zone_open_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "explicit open a range of zones in zone block device", > +}; > + > +static int zone_close_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len); > + if (ret < 0) { > + printf("zone close failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_close_cmd = { > + .name = "zone_close", > + .altname = "zc", > + .cfunc = zone_close_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "close a range of zones in zone block device", > +}; > + > +static int zone_finish_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len); > + if (ret < 0) { > + printf("zone finish failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_finish_cmd = { > + 
.name = "zone_finish", > + .altname = "zf", > + .cfunc = zone_finish_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "finish a range of zones in zone block device", > +}; > + > +static int zone_reset_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len); > + if (ret < 0) { > + printf("zone reset failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_reset_cmd = { > + .name = "zone_reset", > + .altname = "zrs", > + .cfunc = zone_reset_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "reset a zone write pointer in zone block device", > +}; > + > static int truncate_f(BlockBackend *blk, int argc, char **argv); > static const cmdinfo_t truncate_cmd = { > .name = "truncate", > @@ -2504,6 +2648,11 @@ static void __attribute((constructor)) init_qemuio_commands(void) > qemuio_add_command(&aio_write_cmd); > qemuio_add_command(&aio_flush_cmd); > qemuio_add_command(&flush_cmd); > + qemuio_add_command(&zone_report_cmd); > + qemuio_add_command(&zone_open_cmd); > + qemuio_add_command(&zone_close_cmd); > + qemuio_add_command(&zone_finish_cmd); > + qemuio_add_command(&zone_reset_cmd); > qemuio_add_command(&truncate_cmd); > qemuio_add_command(&length_cmd); > qemuio_add_command(&info_cmd);
On Sun, Oct 16, 2022 at 10:51:06PM +0800, Sam Li wrote: > Add a new zoned_host_device BlockDriver. The zoned_host_device option > accepts only zoned host block devices. By adding zone management > operations in this new BlockDriver, users can use the new block > layer APIs including Report Zone and four zone management operations > (open, close, finish, reset, reset_all). > > Qemu-io uses the new APIs to perform zoned storage commands of the device: > zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), > zone_finish(zf). > > For example, to test zone_report, use following command: > $ ./build/qemu-io --image-opts -n driver=zoned_host_device, filename=/dev/nullb0 > -c "zrp offset nr_zones" > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > Reviewed-by: Hannes Reinecke <hare@suse.de> > --- > block/block-backend.c | 148 +++++++++++++ > block/file-posix.c | 335 ++++++++++++++++++++++++++++++ > block/io.c | 41 ++++ > include/block/block-io.h | 7 + > include/block/block_int-common.h | 24 +++ > include/block/raw-aio.h | 6 +- > include/sysemu/block-backend-io.h | 18 ++ > meson.build | 4 + > qapi/block-core.json | 8 +- > qemu-io-cmds.c | 149 +++++++++++++ > 10 files changed, 737 insertions(+), 3 deletions(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index aa4adf06ae..1c618e9c68 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo { > void *iobuf; > int ret; > BdrvRequestFlags flags; > + union { > + struct { > + unsigned int *nr_zones; > + BlockZoneDescriptor *zones; > + } zone_report; > + struct { > + unsigned long op; > + } zone_mgmt; > + }; > } BlkRwCo; > > int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) > @@ -1775,6 +1784,145 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) > return ret; > } > > +static void coroutine_fn blk_aio_zone_report_entry(void *opaque) > +{ > + BlkAioEmAIOCB *acb = opaque; > + BlkRwCo *rwco = &acb->rwco; > + > + rwco->ret = 
blk_co_zone_report(rwco->blk, rwco->offset, > + rwco->zone_report.nr_zones, > + rwco->zone_report.zones); > + blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones, > + BlockCompletionFunc *cb, void *opaque) > +{ > + BlkAioEmAIOCB *acb; > + Coroutine *co; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > + acb->rwco = (BlkRwCo) { > + .blk = blk, > + .offset = offset, > + .ret = NOT_DONE, > + .zone_report = { > + .zones = zones, > + .nr_zones = nr_zones, > + }, > + }; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb); > + bdrv_coroutine_enter(blk_bs(blk), co); > + > + acb->has_returned = true; > + if (acb->rwco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque) > +{ > + BlkAioEmAIOCB *acb = opaque; > + BlkRwCo *rwco = &acb->rwco; > + > + rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op, > + rwco->offset, acb->bytes); > + blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len, > + BlockCompletionFunc *cb, void *opaque) { > + BlkAioEmAIOCB *acb; > + Coroutine *co; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > + acb->rwco = (BlkRwCo) { > + .blk = blk, > + .offset = offset, > + .ret = NOT_DONE, > + .zone_mgmt = { > + .op = op, > + }, > + }; > + acb->bytes = len; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb); > + bdrv_coroutine_enter(blk_bs(blk), co); > + > + acb->has_returned = true; > + if (acb->rwco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + 
blk_aio_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > +/* > + * Send a zone_report command. > + * offset is a byte offset from the start of the device. No alignment > + * required for offset. > + * nr_zones represents IN maximum and OUT actual. > + */ > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones) > +{ > + int ret; > + IO_CODE(); > + > + blk_inc_in_flight(blk); /* increase before waiting */ > + blk_wait_while_drained(blk); > + if (!blk_is_available(blk)) { > + blk_dec_in_flight(blk); > + return -ENOMEDIUM; > + } > + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); > + blk_dec_in_flight(blk); > + return ret; > +} > + > +/* > + * Send a zone_management command. > + * op is the zone operation; > + * offset is the byte offset from the start of the zoned device; > + * len is the maximum number of bytes the command should operate on. It > + * should be aligned with the device zone size. 
> + */ > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len) > +{ > + int ret; > + IO_CODE(); > + > + > + blk_inc_in_flight(blk); > + blk_wait_while_drained(blk); > + > + ret = blk_check_byte_request(blk, offset, len); > + if (ret < 0) { > + blk_dec_in_flight(blk); > + return ret; > + } > + > + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); > + blk_dec_in_flight(blk); > + return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/file-posix.c b/block/file-posix.c > index 8cb07fdb8a..bd28e3eaea 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -67,6 +67,9 @@ > #include <sys/param.h> > #include <sys/syscall.h> > #include <sys/vfs.h> > +#if defined(CONFIG_BLKZONED) > +#include <linux/blkzoned.h> > +#endif > #include <linux/cdrom.h> > #include <linux/fd.h> > #include <linux/fs.h> > @@ -215,6 +218,14 @@ typedef struct RawPosixAIOData { > PreallocMode prealloc; > Error **errp; > } truncate; > + struct { > + unsigned int *nr_zones; > + BlockZoneDescriptor *zones; > + } zone_report; > + struct { > + unsigned long zone_op; > + const char *zone_op_name; > + } zone_mgmt; > }; > } RawPosixAIOData; > > @@ -1350,6 +1361,48 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) > zoned = BLK_Z_NONE; > } > bs->bl.zoned = zoned; > + if (zoned != BLK_Z_NONE) { > + /* > + * The zoned device must at least have zone size and nr_zones fields. > + */ > + ret = get_sysfs_long_val(&st, "chunk_sectors"); > + if (ret <= 0) { > + error_report("Invalid zone size %" PRId32 " sectors ", ret); The error should be set in raw_refresh_limits()'s Error ** argument. Please use error_setg_errno(errp, -ret, "Unable to read chunk_sectors sysfs attribute") when ret < 0. When ret == 0 you can use error_setg(errp, "Read 0 from chunk_sectors sysfs attribute"). This way the error is propagated to the caller and opening the block device will fail. 
I noticed there are code paths in block.c during reopen where the error is not displayed, but overall I think it's still best to set errp instead of calling error_report() ourselves. The same applies to the other sysfs attributes below. > + bs->bl.zoned = BLK_Z_NONE; > + return; > + } > + bs->bl.zone_size = ret << BDRV_SECTOR_BITS; > + > + ret = get_sysfs_long_val(&st, "nr_zones"); > + if (ret <= 0) { > + error_report("Invalid number of zones %" PRId32 "", ret); > + bs->bl.zoned = BLK_Z_NONE; > + return; > + } > + bs->bl.nr_zones = ret; > + > + ret = ioctl(s->fd, BLKGETSIZE64, &bs->bl.capacity); What is the purpose of this? The size of the device is already queried by raw_getlength(). It is stored in bs->total_sectors (e.g. raw_do_pwrite_zeroes() uses that value). This looks like a Linux-specific ioctl(2), so the code will probably fail to compile on other operating systems. > + if (ret != 0) { > + error_report("Failed to get device capacity %" PRId64 " bytes ", > + bs->bl.capacity); > + return; > + } > + > + ret = get_sysfs_long_val(&st, "zone_append_max_bytes"); > + if (ret > 0) { > + bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS; > + } > + > + ret = get_sysfs_long_val(&st, "max_open_zones"); > + if (ret >= 0) { > + bs->bl.max_open_zones = ret; > + } > + > + ret = get_sysfs_long_val(&st, "max_active_zones"); > + if (ret >= 0) { > + bs->bl.max_active_zones = ret; > + } > + } > } > > static int check_for_dasd(int fd) > @@ -1844,6 +1897,144 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, > } > #endif > > +/* > + * parse_zone - Fill a zone descriptor > + */ > +#if defined(CONFIG_BLKZONED) > +static inline void parse_zone(struct BlockZoneDescriptor *zone, > + const struct blk_zone *blkz) { > + zone->start = blkz->start << BDRV_SECTOR_BITS; > + zone->length = blkz->len << BDRV_SECTOR_BITS; > + zone->wp = blkz->wp << BDRV_SECTOR_BITS; > + > +#ifdef HAVE_BLK_ZONE_REP_CAPACITY > + zone->cap = blkz->capacity << BDRV_SECTOR_BITS; > +#else > + 
zone->cap = blkz->len << BDRV_SECTOR_BITS; > +#endif > + > + switch (blkz->type) { > + case BLK_ZONE_TYPE_SEQWRITE_REQ: > + zone->type = BLK_ZT_SWR; > + break; > + case BLK_ZONE_TYPE_SEQWRITE_PREF: > + zone->type = BLK_ZT_SWP; > + break; > + case BLK_ZONE_TYPE_CONVENTIONAL: > + zone->type = BLK_ZT_CONV; > + break; > + default: > + error_report("Invalid zone type: 0x%x", blkz->type); > + } > + > + switch (blkz->cond) { > + case BLK_ZONE_COND_NOT_WP: > + zone->cond = BLK_ZS_NOT_WP; > + break; > + case BLK_ZONE_COND_EMPTY: > + zone->cond = BLK_ZS_EMPTY; > + break; > + case BLK_ZONE_COND_IMP_OPEN: > + zone->cond = BLK_ZS_IOPEN; > + break; > + case BLK_ZONE_COND_EXP_OPEN: > + zone->cond = BLK_ZS_EOPEN; > + break; > + case BLK_ZONE_COND_CLOSED: > + zone->cond = BLK_ZS_CLOSED; > + break; > + case BLK_ZONE_COND_READONLY: > + zone->cond = BLK_ZS_RDONLY; > + break; > + case BLK_ZONE_COND_FULL: > + zone->cond = BLK_ZS_FULL; > + break; > + case BLK_ZONE_COND_OFFLINE: > + zone->cond = BLK_ZS_OFFLINE; > + break; > + default: > + error_report("Invalid zone state: 0x%x", blkz->cond); > + } > +} > +#endif > + > +#if defined(CONFIG_BLKZONED) > +static int handle_aiocb_zone_report(void *opaque) > +{ > + RawPosixAIOData *aiocb = opaque; > + int fd = aiocb->aio_fildes; > + unsigned int *nr_zones = aiocb->zone_report.nr_zones; > + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > + /* zoned block devices use 512-byte sectors */ > + int64_t sector = aiocb->aio_offset / 512; > + > + struct blk_zone *blkz; > + int64_t rep_size; > + unsigned int nrz; > + int ret, n = 0, i = 0; > + > + nrz = *nr_zones; > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > + g_autofree struct blk_zone_report *rep = NULL; > + rep = g_malloc(rep_size); > + > + blkz = (struct blk_zone *)(rep + 1); > + while (n < nrz) { > + memset(rep, 0, rep_size); > + rep->sector = sector; > + rep->nr_zones = nrz - n; > + > + do { > + ret = ioctl(fd, BLKREPORTZONE, rep); > + } while (ret != 0 
&& errno == EINTR); > + if (ret != 0) { > + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", > + fd, sector, errno); > + return -errno; > + } > + > + if (!rep->nr_zones) { > + break; > + } > + > + for (i = 0; i < rep->nr_zones; i++, n++) { > + parse_zone(&zones[n], &blkz[i]); > + /* The next report should start after the last zone reported */ > + sector = blkz[i].start + blkz[i].len; > + } > + } > + > + *nr_zones = n; > + return 0; > +} > +#endif > + > +#if defined(CONFIG_BLKZONED) > +static int handle_aiocb_zone_mgmt(void *opaque) > +{ > + RawPosixAIOData *aiocb = opaque; > + int fd = aiocb->aio_fildes; > + int64_t sector = aiocb->aio_offset / 512; > + int64_t nr_sectors = aiocb->aio_nbytes / 512; > + struct blk_zone_range range; > + int ret; > + > + /* Execute the operation */ > + range.sector = sector; > + range.nr_sectors = nr_sectors; > + do { > + ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range); > + } while (ret != 0 && errno == EINTR); > + if (ret != 0) { > + ret = -errno; > + error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name, > + ret); > + return ret; > + } > + return 0; > +} > +#endif > + > static int handle_aiocb_copy_range(void *opaque) > { > RawPosixAIOData *aiocb = opaque; > @@ -3038,6 +3229,99 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) > } > } > > +/* > + * zone report - Get a zone block device's information in the form > + * of an array of zone descriptors. > + * zones is an array of zone descriptors to hold zone information on reply; > + * offset can be any byte within the entire size of the device; > + * nr_zones is the maximum number of sectors the command should operate on. 
> + */ > +#if defined(CONFIG_BLKZONED) > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones) { > + BDRVRawState *s = bs->opaque; > + RawPosixAIOData acb; > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > + .aio_type = QEMU_AIO_ZONE_REPORT, > + .aio_offset = offset, > + .zone_report = { > + .nr_zones = nr_zones, > + .zones = zones, > + }, > + }; > + > + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); > +} > +#endif > + > +/* > + * zone management operations - Execute an operation on a zone > + */ > +#if defined(CONFIG_BLKZONED) > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len) { > + BDRVRawState *s = bs->opaque; > + RawPosixAIOData acb; > + int64_t zone_size, zone_size_mask; > + const char *zone_op_name; > + unsigned long zone_op; > + > + zone_size = bs->bl.zone_size; > + zone_size_mask = zone_size - 1; > + if (offset & zone_size_mask) { > + error_report("sector offset %" PRId64 " is not aligned to zone size " > + "%" PRId64 "", offset / 512, zone_size / 512); > + return -EINVAL; > + } > + > + if (((offset + len) < bs->bl.capacity && len & zone_size_mask) || > + offset + len > bs->bl.capacity) { > + error_report("number of sectors %" PRId64 " is not aligned to zone size" > + " %" PRId64 "", len / 512, zone_size / 512); > + return -EINVAL; > + } > + > + switch (op) { > + case BLK_ZO_OPEN: > + zone_op_name = "BLKOPENZONE"; > + zone_op = BLKOPENZONE; > + break; > + case BLK_ZO_CLOSE: > + zone_op_name = "BLKCLOSEZONE"; > + zone_op = BLKCLOSEZONE; > + break; > + case BLK_ZO_FINISH: > + zone_op_name = "BLKFINISHZONE"; > + zone_op = BLKFINISHZONE; > + break; > + case BLK_ZO_RESET: > + zone_op_name = "BLKRESETZONE"; > + zone_op = BLKRESETZONE; > + break; > + default: > + g_assert_not_reached(); > + } > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > + 
.aio_type = QEMU_AIO_ZONE_MGMT, > + .aio_offset = offset, > + .aio_nbytes = len, > + .zone_mgmt = { > + .zone_op = zone_op, > + .zone_op_name = zone_op_name, > + }, > + }; > + > + return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); > +} > +#endif > + > static coroutine_fn int > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > bool blkdev) > @@ -3768,6 +4052,54 @@ static BlockDriver bdrv_host_device = { > #endif > }; > > +#if defined(CONFIG_BLKZONED) > +static BlockDriver bdrv_zoned_host_device = { > + .format_name = "zoned_host_device", > + .protocol_name = "zoned_host_device", > + .instance_size = sizeof(BDRVRawState), > + .bdrv_needs_filename = true, > + .bdrv_probe_device = hdev_probe_device, > + .bdrv_file_open = hdev_open, > + .bdrv_close = raw_close, > + .bdrv_reopen_prepare = raw_reopen_prepare, > + .bdrv_reopen_commit = raw_reopen_commit, > + .bdrv_reopen_abort = raw_reopen_abort, > + .bdrv_co_create_opts = bdrv_co_create_opts_simple, > + .create_opts = &bdrv_create_opts_simple, > + .mutable_opts = mutable_opts, > + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, > + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, > + > + .bdrv_co_preadv = raw_co_preadv, > + .bdrv_co_pwritev = raw_co_pwritev, > + .bdrv_co_flush_to_disk = raw_co_flush_to_disk, > + .bdrv_co_pdiscard = hdev_co_pdiscard, > + .bdrv_co_copy_range_from = raw_co_copy_range_from, > + .bdrv_co_copy_range_to = raw_co_copy_range_to, > + .bdrv_refresh_limits = raw_refresh_limits, > + .bdrv_io_plug = raw_aio_plug, > + .bdrv_io_unplug = raw_aio_unplug, > + .bdrv_attach_aio_context = raw_aio_attach_aio_context, > + > + .bdrv_co_truncate = raw_co_truncate, > + .bdrv_getlength = raw_getlength, > + .bdrv_get_info = raw_get_info, > + .bdrv_get_allocated_file_size > + = raw_get_allocated_file_size, > + .bdrv_get_specific_stats = hdev_get_specific_stats, > + .bdrv_check_perm = raw_check_perm, > + .bdrv_set_perm = raw_set_perm, > + .bdrv_abort_perm_update = 
raw_abort_perm_update, > + .bdrv_probe_blocksizes = hdev_probe_blocksizes, > + .bdrv_probe_geometry = hdev_probe_geometry, > + .bdrv_co_ioctl = hdev_co_ioctl, > + > + /* zone management operations */ > + .bdrv_co_zone_report = raw_co_zone_report, > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > +}; > +#endif > + > #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > static void cdrom_parse_filename(const char *filename, QDict *options, > Error **errp) > @@ -4028,6 +4360,9 @@ static void bdrv_file_init(void) > bdrv_register(&bdrv_file); > #if defined(HAVE_HOST_BLOCK_DEVICE) > bdrv_register(&bdrv_host_device); > +#if defined(CONFIG_BLKZONED) > + bdrv_register(&bdrv_zoned_host_device); > +#endif > #ifdef __linux__ > bdrv_register(&bdrv_host_cdrom); > #endif > diff --git a/block/io.c b/block/io.c > index d30073036e..88f707ea4d 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -3189,6 +3189,47 @@ out: > return co.ret; > } > > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + IO_CODE(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_zone_report) { > + co.ret = -ENOTSUP; > + goto out; > + } > + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + IO_CODE(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_zone_mgmt) { > + co.ret = -ENOTSUP; > + goto out; > + } > + co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > void *qemu_blockalign(BlockDriverState 
*bs, size_t size) > { > IO_CODE(); > diff --git a/include/block/block-io.h b/include/block/block-io.h > index 492f95fc05..f0cdf67d33 100644 > --- a/include/block/block-io.h > +++ b/include/block/block-io.h > @@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); > /* Ensure contents are flushed to disk. */ > int coroutine_fn bdrv_co_flush(BlockDriverState *bs); > > +/* Report zone information of zone block device. */ > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len); > + > int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > int bdrv_block_status(BlockDriverState *bs, int64_t offset, > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h > index 7f7863cc9e..cdc06e77a6 100644 > --- a/include/block/block_int-common.h > +++ b/include/block/block_int-common.h > @@ -691,6 +691,12 @@ struct BlockDriver { > QEMUIOVector *qiov, > int64_t pos); > > + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, > + int64_t offset, unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len); > + > /* removable device specific */ > bool (*bdrv_is_inserted)(BlockDriverState *bs); > void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); > @@ -828,6 +834,24 @@ typedef struct BlockLimits { > > /* device zone model */ > BlockZoneModel zoned; > + > + /* zone size expressed in bytes */ > + uint32_t zone_size; > + > + /* total number of zones */ > + unsigned int nr_zones; > + > + /* maximum sectors of a zone append write operation */ > + int64_t max_append_sectors; > + > + /* maximum number of open zones */ > + int64_t max_open_zones; > + > + 
/* maximum number of active zones */ > + int64_t max_active_zones; > + > + /* device capacity expressed in bytes */ > + int64_t capacity; This field is not necessary. See bs->total_sectors. > } BlockLimits; > > typedef struct BdrvOpBlocker BdrvOpBlocker; > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > index 21fc10c4c9..877b2240b3 100644 > --- a/include/block/raw-aio.h > +++ b/include/block/raw-aio.h > @@ -29,6 +29,8 @@ > #define QEMU_AIO_WRITE_ZEROES 0x0020 > #define QEMU_AIO_COPY_RANGE 0x0040 > #define QEMU_AIO_TRUNCATE 0x0080 > +#define QEMU_AIO_ZONE_REPORT 0x0100 > +#define QEMU_AIO_ZONE_MGMT 0x0200 > #define QEMU_AIO_TYPE_MASK \ > (QEMU_AIO_READ | \ > QEMU_AIO_WRITE | \ > @@ -37,7 +39,9 @@ > QEMU_AIO_DISCARD | \ > QEMU_AIO_WRITE_ZEROES | \ > QEMU_AIO_COPY_RANGE | \ > - QEMU_AIO_TRUNCATE) > + QEMU_AIO_TRUNCATE | \ > + QEMU_AIO_ZONE_REPORT | \ > + QEMU_AIO_ZONE_MGMT) > > /* AIO flags */ > #define QEMU_AIO_MISALIGNED 0x1000 > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h > index 50f5aa2e07..1b5fc7db6b 100644 > --- a/include/sysemu/block-backend-io.h > +++ b/include/sysemu/block-backend-io.h > @@ -45,6 +45,13 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, > BlockCompletionFunc *cb, void *opaque); > BlockAIOCB *blk_aio_flush(BlockBackend *blk, > BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones, > + BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len, > + BlockCompletionFunc *cb, void *opaque); > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, > BlockCompletionFunc *cb, void *opaque); > void blk_aio_cancel_async(BlockAIOCB *acb); > @@ -156,6 +163,17 @@ int generated_co_wrapper blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, > int coroutine_fn 
blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, > int64_t bytes, BdrvRequestFlags flags); > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len); > +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len); > + > int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, > int64_t bytes); > int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, > diff --git a/meson.build b/meson.build > index 44c1f92697..0aa99b64a0 100644 > --- a/meson.build > +++ b/meson.build > @@ -1928,6 +1928,7 @@ config_host_data.set('CONFIG_REPLICATION', get_option('replication').allowed()) > # has_header > config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h')) > config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h')) > +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h')) > config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h')) > config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h')) > config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h')) > @@ -2021,6 +2022,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', > config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM', > cc.has_member('struct stat', 'st_atim', > prefix: '#include <sys/stat.h>')) > +config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY', > + cc.has_member('struct blk_zone', 'capacity', > + prefix: '#include <linux/blkzoned.h>')) > > # has_type > config_host_data.set('CONFIG_IOVEC', > diff --git a/qapi/block-core.json b/qapi/block-core.json > index 882b266532..05a3b44731 100644 > --- a/qapi/block-core.json > +++ 
b/qapi/block-core.json > @@ -2942,6 +2942,7 @@ > # @compress: Since 5.0 > # @copy-before-write: Since 6.2 > # @snapshot-access: Since 7.0 > +# @zoned_host_device: Since 7.2 > # > # Since: 2.9 > ## > @@ -2955,7 +2956,8 @@ > 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', > 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', > { 'name': 'replication', 'if': 'CONFIG_REPLICATION' }, > - 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] } > + 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', > + { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] } > > ## > # @BlockdevOptionsFile: > @@ -4329,7 +4331,9 @@ > 'vhdx': 'BlockdevOptionsGenericFormat', > 'vmdk': 'BlockdevOptionsGenericCOWFormat', > 'vpc': 'BlockdevOptionsGenericFormat', > - 'vvfat': 'BlockdevOptionsVVFAT' > + 'vvfat': 'BlockdevOptionsVVFAT', > + 'zoned_host_device': { 'type': 'BlockdevOptionsFile', > + 'if': 'CONFIG_BLKZONED' } > } } > > ## > diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c > index 952dc940f1..c1b28ea108 100644 > --- a/qemu-io-cmds.c > +++ b/qemu-io-cmds.c > @@ -1712,6 +1712,150 @@ static const cmdinfo_t flush_cmd = { > .oneline = "flush all in-core file state to disk", > }; > > +static inline int64_t tosector(int64_t bytes) > +{ > + return bytes >> BDRV_SECTOR_BITS; > +} > + > +static int zone_report_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset; > + unsigned int nr_zones; > + > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + nr_zones = cvtnum(argv[optind]); > + > + g_autofree BlockZoneDescriptor *zones = NULL; > + zones = g_new(BlockZoneDescriptor, nr_zones); > + ret = blk_zone_report(blk, offset, &nr_zones, zones); > + if (ret < 0) { > + printf("zone report failed: %s\n", strerror(-ret)); > + } else { > + for (int i = 0; i < nr_zones; ++i) { > + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", " > + "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", " > + "zcond:%u, [type: %u]\n", > + 
tosector(zones[i].start), tosector(zones[i].length), > + tosector(zones[i].cap), tosector(zones[i].wp), > + zones[i].cond, zones[i].type); > + } > + } > + return ret; > +} > + > +static const cmdinfo_t zone_report_cmd = { > + .name = "zone_report", > + .altname = "zrp", > + .cfunc = zone_report_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset number", > + .oneline = "report zone information", > +}; > + > +static int zone_open_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len); > + if (ret < 0) { > + printf("zone open failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_open_cmd = { > + .name = "zone_open", > + .altname = "zo", > + .cfunc = zone_open_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "explicit open a range of zones in zone block device", > +}; > + > +static int zone_close_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len); > + if (ret < 0) { > + printf("zone close failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_close_cmd = { > + .name = "zone_close", > + .altname = "zc", > + .cfunc = zone_close_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "close a range of zones in zone block device", > +}; > + > +static int zone_finish_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len); > + if (ret < 0) { > + printf("zone finish failed: %s\n", strerror(-ret)); > 
+ } > + return ret; > +} > + > +static const cmdinfo_t zone_finish_cmd = { > + .name = "zone_finish", > + .altname = "zf", > + .cfunc = zone_finish_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "finish a range of zones in zone block device", > +}; > + > +static int zone_reset_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len); > + if (ret < 0) { > + printf("zone reset failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_reset_cmd = { > + .name = "zone_reset", > + .altname = "zrs", > + .cfunc = zone_reset_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "reset a zone write pointer in zone block device", > +}; > + > static int truncate_f(BlockBackend *blk, int argc, char **argv); > static const cmdinfo_t truncate_cmd = { > .name = "truncate", > @@ -2504,6 +2648,11 @@ static void __attribute((constructor)) init_qemuio_commands(void) > qemuio_add_command(&aio_write_cmd); > qemuio_add_command(&aio_flush_cmd); > qemuio_add_command(&flush_cmd); > + qemuio_add_command(&zone_report_cmd); > + qemuio_add_command(&zone_open_cmd); > + qemuio_add_command(&zone_close_cmd); > + qemuio_add_command(&zone_finish_cmd); > + qemuio_add_command(&zone_reset_cmd); > qemuio_add_command(&truncate_cmd); > qemuio_add_command(&length_cmd); > qemuio_add_command(&info_cmd); > -- > 2.37.3 >
Dmitry Fomichev <Dmitry.Fomichev@wdc.com> 于2022年10月17日周一 08:57写道: > > On Sun, 2022-10-16 at 22:51 +0800, Sam Li wrote: > > Add a new zoned_host_device BlockDriver. The zoned_host_device option > > accepts only zoned host block devices. By adding zone management > > operations in this new BlockDriver, users can use the new block > > layer APIs including Report Zone and four zone management operations > > (open, close, finish, reset, reset_all). > > > > Qemu-io uses the new APIs to perform zoned storage commands of the device: > > zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), > > zone_finish(zf). > > > > For example, to test zone_report, use following command: > > $ ./build/qemu-io --image-opts -n driver=zoned_host_device, > > filename=/dev/nullb0 > > -c "zrp offset nr_zones" > > > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > > Reviewed-by: Hannes Reinecke <hare@suse.de> > > --- > > block/block-backend.c | 148 +++++++++++++ > > block/file-posix.c | 335 ++++++++++++++++++++++++++++++ > > block/io.c | 41 ++++ > > include/block/block-io.h | 7 + > > include/block/block_int-common.h | 24 +++ > > include/block/raw-aio.h | 6 +- > > include/sysemu/block-backend-io.h | 18 ++ > > meson.build | 4 + > > qapi/block-core.json | 8 +- > > qemu-io-cmds.c | 149 +++++++++++++ > > 10 files changed, 737 insertions(+), 3 deletions(-) > > > > diff --git a/block/block-backend.c b/block/block-backend.c > > index aa4adf06ae..1c618e9c68 100644 > > --- a/block/block-backend.c > > +++ b/block/block-backend.c > > @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo { > > void *iobuf; > > int ret; > > BdrvRequestFlags flags; > > + union { > > + struct { > > + unsigned int *nr_zones; > > + BlockZoneDescriptor *zones; > > + } zone_report; > > + struct { > > + unsigned long op; > > + } zone_mgmt; > > + }; > > } BlkRwCo; > > > > int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) > > @@ -1775,6 +1784,145 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) > > return ret; > 
> } > > > > +static void coroutine_fn blk_aio_zone_report_entry(void *opaque) > > +{ > > + BlkAioEmAIOCB *acb = opaque; > > + BlkRwCo *rwco = &acb->rwco; > > + > > + rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset, > > + rwco->zone_report.nr_zones, > > + rwco->zone_report.zones); > > + blk_aio_complete(acb); > > +} > > + > > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones, > > + BlockCompletionFunc *cb, void *opaque) > > +{ > > + BlkAioEmAIOCB *acb; > > + Coroutine *co; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > > + acb->rwco = (BlkRwCo) { > > + .blk = blk, > > + .offset = offset, > > + .ret = NOT_DONE, > > + .zone_report = { > > + .zones = zones, > > + .nr_zones = nr_zones, > > + }, > > + }; > > + acb->has_returned = false; > > + > > + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb); > > + bdrv_coroutine_enter(blk_bs(blk), co); > > + > > + acb->has_returned = true; > > + if (acb->rwco.ret != NOT_DONE) { > > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > > + blk_aio_complete_bh, acb); > > + } > > + > > + return &acb->common; > > +} > > + > > +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque) > > +{ > > + BlkAioEmAIOCB *acb = opaque; > > + BlkRwCo *rwco = &acb->rwco; > > + > > + rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op, > > + rwco->offset, acb->bytes); > > + blk_aio_complete(acb); > > +} > > + > > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len, > > + BlockCompletionFunc *cb, void *opaque) { > > + BlkAioEmAIOCB *acb; > > + Coroutine *co; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > > + acb->rwco = (BlkRwCo) { > > + .blk = blk, > > + .offset = offset, > > + .ret = NOT_DONE, > > + .zone_mgmt = { > > + .op = op, > > + }, 
> > + }; > > + acb->bytes = len; > > + acb->has_returned = false; > > + > > + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb); > > + bdrv_coroutine_enter(blk_bs(blk), co); > > + > > + acb->has_returned = true; > > + if (acb->rwco.ret != NOT_DONE) { > > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > > + blk_aio_complete_bh, acb); > > + } > > + > > + return &acb->common; > > +} > > + > > +/* > > + * Send a zone_report command. > > + * offset is a byte offset from the start of the device. No alignment > > + * required for offset. > > + * nr_zones represents IN maximum and OUT actual. > > + */ > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones) > > +{ > > + int ret; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); /* increase before waiting */ > > + blk_wait_while_drained(blk); > > + if (!blk_is_available(blk)) { > > + blk_dec_in_flight(blk); > > + return -ENOMEDIUM; > > + } > > + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); > > + blk_dec_in_flight(blk); > > + return ret; > > +} > > + > > +/* > > + * Send a zone_management command. > > + * op is the zone operation; > > + * offset is the byte offset from the start of the zoned device; > > + * len is the maximum number of bytes the command should operate on. It > > + * should be aligned with the device zone size. 
> > + */ > > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len) > > +{ > > + int ret; > > + IO_CODE(); > > + > > + > > + blk_inc_in_flight(blk); > > + blk_wait_while_drained(blk); > > + > > + ret = blk_check_byte_request(blk, offset, len); > > + if (ret < 0) { > > + blk_dec_in_flight(blk); > > + return ret; > > + } > > + > > + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); > > + blk_dec_in_flight(blk); > > + return ret; > > +} > > + > > void blk_drain(BlockBackend *blk) > > { > > BlockDriverState *bs = blk_bs(blk); > > diff --git a/block/file-posix.c b/block/file-posix.c > > index 8cb07fdb8a..bd28e3eaea 100644 > > --- a/block/file-posix.c > > +++ b/block/file-posix.c > > @@ -67,6 +67,9 @@ > > #include <sys/param.h> > > #include <sys/syscall.h> > > #include <sys/vfs.h> > > +#if defined(CONFIG_BLKZONED) > > +#include <linux/blkzoned.h> > > +#endif > > #include <linux/cdrom.h> > > #include <linux/fd.h> > > #include <linux/fs.h> > > @@ -215,6 +218,14 @@ typedef struct RawPosixAIOData { > > PreallocMode prealloc; > > Error **errp; > > } truncate; > > + struct { > > + unsigned int *nr_zones; > > + BlockZoneDescriptor *zones; > > + } zone_report; > > + struct { > > + unsigned long zone_op; > > + const char *zone_op_name; > > + } zone_mgmt; > > }; > > } RawPosixAIOData; > > > > @@ -1350,6 +1361,48 @@ static void raw_refresh_limits(BlockDriverState *bs, > > Error **errp) > > zoned = BLK_Z_NONE; > > } > > bs->bl.zoned = zoned; > > + if (zoned != BLK_Z_NONE) { > > + /* > > + * The zoned device must at least have zone size and nr_zones fields. 
> > + */ > > + ret = get_sysfs_long_val(&st, "chunk_sectors"); > > + if (ret <= 0) { > > + error_report("Invalid zone size %" PRId32 " sectors ", ret); > > + bs->bl.zoned = BLK_Z_NONE; > > + return; > > + } > > + bs->bl.zone_size = ret << BDRV_SECTOR_BITS; > > + > > + ret = get_sysfs_long_val(&st, "nr_zones"); > > + if (ret <= 0) { > > + error_report("Invalid number of zones %" PRId32 "", ret); > > + bs->bl.zoned = BLK_Z_NONE; > > + return; > > + } > > + bs->bl.nr_zones = ret; > > + > > + ret = ioctl(s->fd, BLKGETSIZE64, &bs->bl.capacity); > > + if (ret != 0) { > > + error_report("Failed to get device capacity %" PRId64 " bytes ", > > + bs->bl.capacity); > > + return; > > + } > > + > > + ret = get_sysfs_long_val(&st, "zone_append_max_bytes"); > > + if (ret > 0) { > > + bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS; > > + } > > + > > + ret = get_sysfs_long_val(&st, "max_open_zones"); > > + if (ret >= 0) { > > + bs->bl.max_open_zones = ret; > > + } > > + > > + ret = get_sysfs_long_val(&st, "max_active_zones"); > > + if (ret >= 0) { > > + bs->bl.max_active_zones = ret; > > + } > > + } > > } > > > > static int check_for_dasd(int fd) > > @@ -1844,6 +1897,144 @@ static off_t copy_file_range(int in_fd, off_t *in_off, > > int out_fd, > > } > > #endif > > > > +/* > > + * parse_zone - Fill a zone descriptor > > + */ > > +#if defined(CONFIG_BLKZONED) > > +static inline void parse_zone(struct BlockZoneDescriptor *zone, > > + const struct blk_zone *blkz) { > > + zone->start = blkz->start << BDRV_SECTOR_BITS; > > + zone->length = blkz->len << BDRV_SECTOR_BITS; > > + zone->wp = blkz->wp << BDRV_SECTOR_BITS; > > + > > +#ifdef HAVE_BLK_ZONE_REP_CAPACITY > > + zone->cap = blkz->capacity << BDRV_SECTOR_BITS; > > +#else > > + zone->cap = blkz->len << BDRV_SECTOR_BITS; > > +#endif > > + > > + switch (blkz->type) { > > + case BLK_ZONE_TYPE_SEQWRITE_REQ: > > + zone->type = BLK_ZT_SWR; > > + break; > > + case BLK_ZONE_TYPE_SEQWRITE_PREF: > > + zone->type = BLK_ZT_SWP; > > + break; > 
> + case BLK_ZONE_TYPE_CONVENTIONAL: > > + zone->type = BLK_ZT_CONV; > > + break; > > + default: > > + error_report("Invalid zone type: 0x%x", blkz->type); > > + } > > + > > + switch (blkz->cond) { > > + case BLK_ZONE_COND_NOT_WP: > > + zone->cond = BLK_ZS_NOT_WP; > > + break; > > + case BLK_ZONE_COND_EMPTY: > > + zone->cond = BLK_ZS_EMPTY; > > + break; > > + case BLK_ZONE_COND_IMP_OPEN: > > + zone->cond = BLK_ZS_IOPEN; > > + break; > > + case BLK_ZONE_COND_EXP_OPEN: > > + zone->cond = BLK_ZS_EOPEN; > > + break; > > + case BLK_ZONE_COND_CLOSED: > > + zone->cond = BLK_ZS_CLOSED; > > + break; > > + case BLK_ZONE_COND_READONLY: > > + zone->cond = BLK_ZS_RDONLY; > > + break; > > + case BLK_ZONE_COND_FULL: > > + zone->cond = BLK_ZS_FULL; > > + break; > > + case BLK_ZONE_COND_OFFLINE: > > + zone->cond = BLK_ZS_OFFLINE; > > + break; > > + default: > > + error_report("Invalid zone state: 0x%x", blkz->cond); > > + } > > +} > > +#endif > > + > > +#if defined(CONFIG_BLKZONED) > > +static int handle_aiocb_zone_report(void *opaque) > > +{ > > + RawPosixAIOData *aiocb = opaque; > > + int fd = aiocb->aio_fildes; > > + unsigned int *nr_zones = aiocb->zone_report.nr_zones; > > + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > > + /* zoned block devices use 512-byte sectors */ > > + int64_t sector = aiocb->aio_offset / 512; > > Shouldn't this be uint64_t ? Yes, I'll change the type of offset and len to uint64_t. > > > + > > + struct blk_zone *blkz; > > + int64_t rep_size; > > size_t type is more appropriate for the variable above Ok, size_t is semantically correct. 
> > > + unsigned int nrz; > > + int ret, n = 0, i = 0; > > + > > + nrz = *nr_zones; > > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > > + g_autofree struct blk_zone_report *rep = NULL; > > + rep = g_malloc(rep_size); > > + > > + blkz = (struct blk_zone *)(rep + 1); > > + while (n < nrz) { > > + memset(rep, 0, rep_size); > > + rep->sector = sector; > > + rep->nr_zones = nrz - n; > > + > > + do { > > + ret = ioctl(fd, BLKREPORTZONE, rep); > > + } while (ret != 0 && errno == EINTR); > > + if (ret != 0) { > > + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", > > + fd, sector, errno); > > + return -errno; > > + } > > + > > + if (!rep->nr_zones) { > > + break; > > + } > > + > > + for (i = 0; i < rep->nr_zones; i++, n++) { > > + parse_zone(&zones[n], &blkz[i]); > > + /* The next report should start after the last zone reported */ > > + sector = blkz[i].start + blkz[i].len; > > + } > > + } > > + > > + *nr_zones = n; > > + return 0; > > +} > > +#endif > > + > > +#if defined(CONFIG_BLKZONED) > > +static int handle_aiocb_zone_mgmt(void *opaque) > > +{ > > + RawPosixAIOData *aiocb = opaque; > > + int fd = aiocb->aio_fildes; > > + int64_t sector = aiocb->aio_offset / 512; > > uint64_t? 
> > > + int64_t nr_sectors = aiocb->aio_nbytes / 512; > > + struct blk_zone_range range; > > + int ret; > > + > > + /* Execute the operation */ > > + range.sector = sector; > > + range.nr_sectors = nr_sectors; > > + do { > > + ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range); > > + } while (ret != 0 && errno == EINTR); > > + if (ret != 0) { > > + ret = -errno; > > + error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name, > > + ret); > > + return ret; > > + } > > + return 0; > > +} > > +#endif > > + > > static int handle_aiocb_copy_range(void *opaque) > > { > > RawPosixAIOData *aiocb = opaque; > > @@ -3038,6 +3229,99 @@ static void raw_account_discard(BDRVRawState *s, > > uint64_t nbytes, int ret) > > } > > } > > > > +/* > > + * zone report - Get a zone block device's information in the form > > + * of an array of zone descriptors. > > + * zones is an array of zone descriptors to hold zone information on reply; > > + * offset can be any byte within the entire size of the device; > > + * nr_zones is the maximum number of sectors the command should operate on. 
> > + */ > > +#if defined(CONFIG_BLKZONED) > > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t > > offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones) { > > + BDRVRawState *s = bs->opaque; > > + RawPosixAIOData acb; > > + > > + acb = (RawPosixAIOData) { > > + .bs = bs, > > + .aio_fildes = s->fd, > > + .aio_type = QEMU_AIO_ZONE_REPORT, > > + .aio_offset = offset, > > + .zone_report = { > > + .nr_zones = nr_zones, > > + .zones = zones, > > + }, > > + }; > > + > > + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); > > +} > > +#endif > > + > > +/* > > + * zone management operations - Execute an operation on a zone > > + */ > > +#if defined(CONFIG_BLKZONED) > > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len) { > > + BDRVRawState *s = bs->opaque; > > + RawPosixAIOData acb; > > + int64_t zone_size, zone_size_mask; > > + const char *zone_op_name; > > + unsigned long zone_op; > > + > > + zone_size = bs->bl.zone_size; > > + zone_size_mask = zone_size - 1; > > + if (offset & zone_size_mask) { > > + error_report("sector offset %" PRId64 " is not aligned to zone size " > > + "%" PRId64 "", offset / 512, zone_size / 512); > > + return -EINVAL; > > + } > > + > > + if (((offset + len) < bs->bl.capacity && len & zone_size_mask) || > > + offset + len > bs->bl.capacity) { > > + error_report("number of sectors %" PRId64 " is not aligned to zone > > size" > > + " %" PRId64 "", len / 512, zone_size / 512); > > + return -EINVAL; > > + } > > + > > + switch (op) { > > + case BLK_ZO_OPEN: > > + zone_op_name = "BLKOPENZONE"; > > + zone_op = BLKOPENZONE; > > + break; > > + case BLK_ZO_CLOSE: > > + zone_op_name = "BLKCLOSEZONE"; > > + zone_op = BLKCLOSEZONE; > > + break; > > + case BLK_ZO_FINISH: > > + zone_op_name = "BLKFINISHZONE"; > > + zone_op = BLKFINISHZONE; > > + break; > > + case BLK_ZO_RESET: > > + zone_op_name = "BLKRESETZONE"; > > + zone_op = 
BLKRESETZONE; > > + break; > > + default: > > + g_assert_not_reached(); > > + } > > + > > + acb = (RawPosixAIOData) { > > + .bs = bs, > > + .aio_fildes = s->fd, > > + .aio_type = QEMU_AIO_ZONE_MGMT, > > + .aio_offset = offset, > > + .aio_nbytes = len, > > + .zone_mgmt = { > > + .zone_op = zone_op, > > + .zone_op_name = zone_op_name, > > + }, > > + }; > > + > > + return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); > > +} > > +#endif > > + > > static coroutine_fn int > > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > > bool blkdev) > > @@ -3768,6 +4052,54 @@ static BlockDriver bdrv_host_device = { > > #endif > > }; > > > > +#if defined(CONFIG_BLKZONED) > > +static BlockDriver bdrv_zoned_host_device = { > > + .format_name = "zoned_host_device", > > + .protocol_name = "zoned_host_device", > > + .instance_size = sizeof(BDRVRawState), > > + .bdrv_needs_filename = true, > > + .bdrv_probe_device = hdev_probe_device, > > + .bdrv_file_open = hdev_open, > > + .bdrv_close = raw_close, > > + .bdrv_reopen_prepare = raw_reopen_prepare, > > + .bdrv_reopen_commit = raw_reopen_commit, > > + .bdrv_reopen_abort = raw_reopen_abort, > > + .bdrv_co_create_opts = bdrv_co_create_opts_simple, > > + .create_opts = &bdrv_create_opts_simple, > > + .mutable_opts = mutable_opts, > > + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, > > + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, > > + > > + .bdrv_co_preadv = raw_co_preadv, > > + .bdrv_co_pwritev = raw_co_pwritev, > > + .bdrv_co_flush_to_disk = raw_co_flush_to_disk, > > + .bdrv_co_pdiscard = hdev_co_pdiscard, > > + .bdrv_co_copy_range_from = raw_co_copy_range_from, > > + .bdrv_co_copy_range_to = raw_co_copy_range_to, > > + .bdrv_refresh_limits = raw_refresh_limits, > > + .bdrv_io_plug = raw_aio_plug, > > + .bdrv_io_unplug = raw_aio_unplug, > > + .bdrv_attach_aio_context = raw_aio_attach_aio_context, > > + > > + .bdrv_co_truncate = raw_co_truncate, > > + .bdrv_getlength = raw_getlength, > > + 
.bdrv_get_info = raw_get_info, > > + .bdrv_get_allocated_file_size > > + = raw_get_allocated_file_size, > > + .bdrv_get_specific_stats = hdev_get_specific_stats, > > + .bdrv_check_perm = raw_check_perm, > > + .bdrv_set_perm = raw_set_perm, > > + .bdrv_abort_perm_update = raw_abort_perm_update, > > + .bdrv_probe_blocksizes = hdev_probe_blocksizes, > > hdev_probe_blocksizes() only seems to probe blocksizes if the > hardware device is DASD. Not sure why it is like that, but it is very > important for ZBDs to be virtualized with the correct physical and > logical block sizes. To make sure this is the case, you can change > the line above to > > + .bdrv_probe_blocksizes = hdev_zoned_probe_blocksizes, > > and define the following function elsewhere in this file - > > +/** > + * Try to get @bs's logical and physical block size of a zoned device. > + * On success, store them in @bsz and return zero. > + * On failure, return negative errno. > + */ > +static int hdev_zoned_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) > +{ > + BDRVRawState *s = bs->opaque; > + int ret; > + > + ret = probe_logical_blocksize(s->fd, &bsz->log); > + if (ret < 0) { > + return ret; > + } > + return probe_physical_blocksize(s->fd, &bsz->phys); > +} > + > > I tested this change with hardware ZBDs and it works fine, no guesswork > about what LBA/PBA size to specify in the command line is required. Thanks! We talked about getting the correct block size before and came up with setting it in the command line. Pleased to drop that setting. 
> > > + .bdrv_probe_geometry = hdev_probe_geometry, > > + .bdrv_co_ioctl = hdev_co_ioctl, > > + > > + /* zone management operations */ > > + .bdrv_co_zone_report = raw_co_zone_report, > > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > > +}; > > +#endif > > + > > #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > > static void cdrom_parse_filename(const char *filename, QDict *options, > > Error **errp) > > @@ -4028,6 +4360,9 @@ static void bdrv_file_init(void) > > bdrv_register(&bdrv_file); > > #if defined(HAVE_HOST_BLOCK_DEVICE) > > bdrv_register(&bdrv_host_device); > > +#if defined(CONFIG_BLKZONED) > > + bdrv_register(&bdrv_zoned_host_device); > > +#endif > > #ifdef __linux__ > > bdrv_register(&bdrv_host_cdrom); > > #endif > > diff --git a/block/io.c b/block/io.c > > index d30073036e..88f707ea4d 100644 > > --- a/block/io.c > > +++ b/block/io.c > > @@ -3189,6 +3189,47 @@ out: > > return co.ret; > > } > > > > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones) > > +{ > > + BlockDriver *drv = bs->drv; > > + CoroutineIOCompletion co = { > > + .coroutine = qemu_coroutine_self(), > > + }; > > + IO_CODE(); > > + > > + bdrv_inc_in_flight(bs); > > + if (!drv || !drv->bdrv_co_zone_report) { > > + co.ret = -ENOTSUP; > > + goto out; > > + } > > + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones); > > +out: > > + bdrv_dec_in_flight(bs); > > + return co.ret; > > +} > > + > > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len) > > +{ > > + BlockDriver *drv = bs->drv; > > + CoroutineIOCompletion co = { > > + .coroutine = qemu_coroutine_self(), > > + }; > > + IO_CODE(); > > + > > + bdrv_inc_in_flight(bs); > > + if (!drv || !drv->bdrv_co_zone_mgmt) { > > + co.ret = -ENOTSUP; > > + goto out; > > + } > > + co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len); > > +out: > > + bdrv_dec_in_flight(bs); > > + 
return co.ret; > > +} > > + > > void *qemu_blockalign(BlockDriverState *bs, size_t size) > > { > > IO_CODE(); > > diff --git a/include/block/block-io.h b/include/block/block-io.h > > index 492f95fc05..f0cdf67d33 100644 > > --- a/include/block/block-io.h > > +++ b/include/block/block-io.h > > @@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); > > /* Ensure contents are flushed to disk. */ > > int coroutine_fn bdrv_co_flush(BlockDriverState *bs); > > > > +/* Report zone information of zone block device. */ > > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len); > > + > > int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); > > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > > int bdrv_block_status(BlockDriverState *bs, int64_t offset, > > diff --git a/include/block/block_int-common.h b/include/block/block_int- > > common.h > > index 7f7863cc9e..cdc06e77a6 100644 > > --- a/include/block/block_int-common.h > > +++ b/include/block/block_int-common.h > > @@ -691,6 +691,12 @@ struct BlockDriver { > > QEMUIOVector *qiov, > > int64_t pos); > > > > + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, > > + int64_t offset, unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp > > op, > > + int64_t offset, int64_t len); > > + > > /* removable device specific */ > > bool (*bdrv_is_inserted)(BlockDriverState *bs); > > void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); > > @@ -828,6 +834,24 @@ typedef struct BlockLimits { > > > > /* device zone model */ > > BlockZoneModel zoned; > > + > > + /* zone size expressed in bytes */ > > + uint32_t zone_size; > > + > > + /* total number of zones */ > > + unsigned int nr_zones; > 
> Use the _t notation since it is used elsewhere in the definition > of this struct - > > + uint32_t nr_zones; > > > + > > + /* maximum sectors of a zone append write operation */ > > + int64_t max_append_sectors; > > + > > + /* maximum number of open zones */ > > + int64_t max_open_zones; > > + > > + /* maximum number of active zones */ > > + int64_t max_active_zones; > > + > > + /* device capacity expressed in bytes */ > > + int64_t capacity; > > Can capacity become negative? If not, define as uint64_t? No, I will change it to uint64_t. Now that we are talking about uint/int, all those fields related to zones should not be negative, right? If true, then I should change all of those to uint. > > > } BlockLimits; > > > > typedef struct BdrvOpBlocker BdrvOpBlocker; > > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > > index 21fc10c4c9..877b2240b3 100644 > > --- a/include/block/raw-aio.h > > +++ b/include/block/raw-aio.h > > @@ -29,6 +29,8 @@ > > #define QEMU_AIO_WRITE_ZEROES 0x0020 > > #define QEMU_AIO_COPY_RANGE 0x0040 > > #define QEMU_AIO_TRUNCATE 0x0080 > > +#define QEMU_AIO_ZONE_REPORT 0x0100 > > +#define QEMU_AIO_ZONE_MGMT 0x0200 > > #define QEMU_AIO_TYPE_MASK \ > > (QEMU_AIO_READ | \ > > QEMU_AIO_WRITE | \ > > @@ -37,7 +39,9 @@ > > QEMU_AIO_DISCARD | \ > > QEMU_AIO_WRITE_ZEROES | \ > > QEMU_AIO_COPY_RANGE | \ > > - QEMU_AIO_TRUNCATE) > > + QEMU_AIO_TRUNCATE | \ > > + QEMU_AIO_ZONE_REPORT | \ > > + QEMU_AIO_ZONE_MGMT) > > > > /* AIO flags */ > > #define QEMU_AIO_MISALIGNED 0x1000 > > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend- > > io.h > > index 50f5aa2e07..1b5fc7db6b 100644 > > --- a/include/sysemu/block-backend-io.h > > +++ b/include/sysemu/block-backend-io.h > > @@ -45,6 +45,13 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t > > offset, > > BlockCompletionFunc *cb, void *opaque); > > BlockAIOCB *blk_aio_flush(BlockBackend *blk, > > BlockCompletionFunc *cb, void *opaque); > > +BlockAIOCB 
*blk_aio_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones, > > + BlockCompletionFunc *cb, void *opaque); > > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len, > > + BlockCompletionFunc *cb, void *opaque); > > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, > > BlockCompletionFunc *cb, void *opaque); > > void blk_aio_cancel_async(BlockAIOCB *acb); > > @@ -156,6 +163,17 @@ int generated_co_wrapper blk_pwrite_zeroes(BlockBackend > > *blk, int64_t offset, > > int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, > > int64_t bytes, BdrvRequestFlags flags); > > > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len); > > +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len); > > + > > int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, > > int64_t bytes); > > int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, > > diff --git a/meson.build b/meson.build > > index 44c1f92697..0aa99b64a0 100644 > > --- a/meson.build > > +++ b/meson.build > > @@ -1928,6 +1928,7 @@ config_host_data.set('CONFIG_REPLICATION', > > get_option('replication').allowed()) > > # has_header > > config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h')) > > config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h')) > > +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h')) > > config_host_data.set('CONFIG_VALGRIND_H', > > cc.has_header('valgrind/valgrind.h')) > > 
config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h')) > > config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h')) > > @@ -2021,6 +2022,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', > > config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM', > > cc.has_member('struct stat', 'st_atim', > > prefix: '#include <sys/stat.h>')) > > +config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY', > > + cc.has_member('struct blk_zone', 'capacity', > > + prefix: '#include <linux/blkzoned.h>')) > > > > # has_type > > config_host_data.set('CONFIG_IOVEC', > > diff --git a/qapi/block-core.json b/qapi/block-core.json > > index 882b266532..05a3b44731 100644 > > --- a/qapi/block-core.json > > +++ b/qapi/block-core.json > > @@ -2942,6 +2942,7 @@ > > # @compress: Since 5.0 > > # @copy-before-write: Since 6.2 > > # @snapshot-access: Since 7.0 > > +# @zoned_host_device: Since 7.2 > > # > > # Since: 2.9 > > ## > > @@ -2955,7 +2956,8 @@ > > 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', > > 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', > > { 'name': 'replication', 'if': 'CONFIG_REPLICATION' }, > > - 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] } > > + 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', > > + { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] } > > > > ## > > # @BlockdevOptionsFile: > > @@ -4329,7 +4331,9 @@ > > 'vhdx': 'BlockdevOptionsGenericFormat', > > 'vmdk': 'BlockdevOptionsGenericCOWFormat', > > 'vpc': 'BlockdevOptionsGenericFormat', > > - 'vvfat': 'BlockdevOptionsVVFAT' > > + 'vvfat': 'BlockdevOptionsVVFAT', > > + 'zoned_host_device': { 'type': 'BlockdevOptionsFile', > > + 'if': 'CONFIG_BLKZONED' } > > } } > > > > ## > > diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c > > index 952dc940f1..c1b28ea108 100644 > > --- a/qemu-io-cmds.c > > +++ b/qemu-io-cmds.c > > @@ -1712,6 +1712,150 @@ static const cmdinfo_t flush_cmd = { > > .oneline = "flush all in-core file state to disk", > > }; > > > > 
+static inline int64_t tosector(int64_t bytes) > > +{ > > + return bytes >> BDRV_SECTOR_BITS; > > +} > > + > > +static int zone_report_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset; > > + unsigned int nr_zones; > > + > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + nr_zones = cvtnum(argv[optind]); > > + > > + g_autofree BlockZoneDescriptor *zones = NULL; > > + zones = g_new(BlockZoneDescriptor, nr_zones); > > + ret = blk_zone_report(blk, offset, &nr_zones, zones); > > + if (ret < 0) { > > + printf("zone report failed: %s\n", strerror(-ret)); > > + } else { > > + for (int i = 0; i < nr_zones; ++i) { > > + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", " > > + "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", " > > + "zcond:%u, [type: %u]\n", > > + tosector(zones[i].start), tosector(zones[i].length), > > + tosector(zones[i].cap), tosector(zones[i].wp), > > + zones[i].cond, zones[i].type); > > + } > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_report_cmd = { > > + .name = "zone_report", > > + .altname = "zrp", > > + .cfunc = zone_report_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset number", > > + .oneline = "report zone information", > > +}; > > + > > +static int zone_open_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len); > > + if (ret < 0) { > > + printf("zone open failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_open_cmd = { > > + .name = "zone_open", > > + .altname = "zo", > > + .cfunc = zone_open_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "explicit open a range of zones in zone block device", > > +}; > > + > > +static int zone_close_f(BlockBackend *blk, int argc, 
char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len); > > + if (ret < 0) { > > + printf("zone close failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_close_cmd = { > > + .name = "zone_close", > > + .altname = "zc", > > + .cfunc = zone_close_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "close a range of zones in zone block device", > > +}; > > + > > +static int zone_finish_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len); > > + if (ret < 0) { > > + printf("zone finish failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_finish_cmd = { > > + .name = "zone_finish", > > + .altname = "zf", > > + .cfunc = zone_finish_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "finish a range of zones in zone block device", > > +}; > > + > > +static int zone_reset_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len); > > + if (ret < 0) { > > + printf("zone reset failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_reset_cmd = { > > + .name = "zone_reset", > > + .altname = "zrs", > > + .cfunc = zone_reset_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "reset a zone write pointer in zone block device", > > +}; > > + > > static int 
truncate_f(BlockBackend *blk, int argc, char **argv); > > static const cmdinfo_t truncate_cmd = { > > .name = "truncate", > > @@ -2504,6 +2648,11 @@ static void __attribute((constructor)) > > init_qemuio_commands(void) > > qemuio_add_command(&aio_write_cmd); > > qemuio_add_command(&aio_flush_cmd); > > qemuio_add_command(&flush_cmd); > > + qemuio_add_command(&zone_report_cmd); > > + qemuio_add_command(&zone_open_cmd); > > + qemuio_add_command(&zone_close_cmd); > > + qemuio_add_command(&zone_finish_cmd); > > + qemuio_add_command(&zone_reset_cmd); > > qemuio_add_command(&truncate_cmd); > > qemuio_add_command(&length_cmd); > > qemuio_add_command(&info_cmd); >
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月17日周一 12:57写道: > > On 10/16/22 23:51, Sam Li wrote: > > Add a new zoned_host_device BlockDriver. The zoned_host_device option > > accepts only zoned host block devices. By adding zone management > > operations in this new BlockDriver, users can use the new block > > layer APIs including Report Zone and four zone management operations > > (open, close, finish, reset, reset_all). > > > > Qemu-io uses the new APIs to perform zoned storage commands of the device: > > zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), > > zone_finish(zf). > > > > For example, to test zone_report, use following command: > > $ ./build/qemu-io --image-opts -n driver=zoned_host_device, filename=/dev/nullb0 > > -c "zrp offset nr_zones" > > > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > > Reviewed-by: Hannes Reinecke <hare@suse.de> > > --- > > block/block-backend.c | 148 +++++++++++++ > > block/file-posix.c | 335 ++++++++++++++++++++++++++++++ > > block/io.c | 41 ++++ > > include/block/block-io.h | 7 + > > include/block/block_int-common.h | 24 +++ > > include/block/raw-aio.h | 6 +- > > include/sysemu/block-backend-io.h | 18 ++ > > meson.build | 4 + > > qapi/block-core.json | 8 +- > > qemu-io-cmds.c | 149 +++++++++++++ > > 10 files changed, 737 insertions(+), 3 deletions(-) > > > > diff --git a/block/block-backend.c b/block/block-backend.c > > index aa4adf06ae..1c618e9c68 100644 > > --- a/block/block-backend.c > > +++ b/block/block-backend.c > > @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo { > > void *iobuf; > > int ret; > > BdrvRequestFlags flags; > > + union { > > + struct { > > + unsigned int *nr_zones; > > + BlockZoneDescriptor *zones; > > + } zone_report; > > + struct { > > + unsigned long op; > > + } zone_mgmt; > > + }; > > } BlkRwCo; > > > > int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) > > @@ -1775,6 +1784,145 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) > > return ret; > > } > > > > 
+static void coroutine_fn blk_aio_zone_report_entry(void *opaque) > > +{ > > + BlkAioEmAIOCB *acb = opaque; > > + BlkRwCo *rwco = &acb->rwco; > > + > > + rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset, > > + rwco->zone_report.nr_zones, > > + rwco->zone_report.zones); > > + blk_aio_complete(acb); > > +} > > + > > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones, > > + BlockCompletionFunc *cb, void *opaque) > > +{ > > + BlkAioEmAIOCB *acb; > > + Coroutine *co; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > > + acb->rwco = (BlkRwCo) { > > + .blk = blk, > > + .offset = offset, > > + .ret = NOT_DONE, > > + .zone_report = { > > + .zones = zones, > > + .nr_zones = nr_zones, > > + }, > > + }; > > + acb->has_returned = false; > > + > > + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb); > > + bdrv_coroutine_enter(blk_bs(blk), co); > > + > > + acb->has_returned = true; > > + if (acb->rwco.ret != NOT_DONE) { > > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > > + blk_aio_complete_bh, acb); > > + } > > + > > + return &acb->common; > > +} > > + > > +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque) > > +{ > > + BlkAioEmAIOCB *acb = opaque; > > + BlkRwCo *rwco = &acb->rwco; > > + > > + rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op, > > + rwco->offset, acb->bytes); > > + blk_aio_complete(acb); > > +} > > + > > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len, > > + BlockCompletionFunc *cb, void *opaque) { > > + BlkAioEmAIOCB *acb; > > + Coroutine *co; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > > + acb->rwco = (BlkRwCo) { > > + .blk = blk, > > + .offset = offset, > > + .ret = NOT_DONE, > > + .zone_mgmt = { > > + .op = op, > > + }, > > + }; > > 
+ acb->bytes = len; > > + acb->has_returned = false; > > + > > + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb); > > + bdrv_coroutine_enter(blk_bs(blk), co); > > + > > + acb->has_returned = true; > > + if (acb->rwco.ret != NOT_DONE) { > > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > > + blk_aio_complete_bh, acb); > > + } > > + > > + return &acb->common; > > +} > > + > > +/* > > + * Send a zone_report command. > > + * offset is a byte offset from the start of the device. No alignment > > + * required for offset. > > + * nr_zones represents IN maximum and OUT actual. > > + */ > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones) > > +{ > > + int ret; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); /* increase before waiting */ > > + blk_wait_while_drained(blk); > > + if (!blk_is_available(blk)) { > > + blk_dec_in_flight(blk); > > + return -ENOMEDIUM; > > + } > > + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); > > + blk_dec_in_flight(blk); > > + return ret; > > +} > > + > > +/* > > + * Send a zone_management command. > > + * op is the zone operation; > > + * offset is the byte offset from the start of the zoned device; > > + * len is the maximum number of bytes the command should operate on. It > > + * should be aligned with the device zone size. > > + */ > > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len) > > +{ > > + int ret; > > + IO_CODE(); > > + > > + > > Nit: extra blank line here. 
> > > + blk_inc_in_flight(blk); > > + blk_wait_while_drained(blk); > > + > > + ret = blk_check_byte_request(blk, offset, len); > > + if (ret < 0) { > > + blk_dec_in_flight(blk); > > + return ret; > > + } > > + > > + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); > > + blk_dec_in_flight(blk); > > + return ret; > > +} > > + > > void blk_drain(BlockBackend *blk) > > { > > BlockDriverState *bs = blk_bs(blk); > > diff --git a/block/file-posix.c b/block/file-posix.c > > index 8cb07fdb8a..bd28e3eaea 100644 > > --- a/block/file-posix.c > > +++ b/block/file-posix.c > > @@ -67,6 +67,9 @@ > > #include <sys/param.h> > > #include <sys/syscall.h> > > #include <sys/vfs.h> > > +#if defined(CONFIG_BLKZONED) > > +#include <linux/blkzoned.h> > > +#endif > > #include <linux/cdrom.h> > > #include <linux/fd.h> > > #include <linux/fs.h> > > @@ -215,6 +218,14 @@ typedef struct RawPosixAIOData { > > PreallocMode prealloc; > > Error **errp; > > } truncate; > > + struct { > > + unsigned int *nr_zones; > > + BlockZoneDescriptor *zones; > > + } zone_report; > > + struct { > > + unsigned long zone_op; > > + const char *zone_op_name; > > Nit: This could be simplified to "op" and "op_name". No big deal though. > > > + } zone_mgmt; > > }; > > } RawPosixAIOData; > > > > @@ -1350,6 +1361,48 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) > > zoned = BLK_Z_NONE; > > } > > bs->bl.zoned = zoned; > > + if (zoned != BLK_Z_NONE) { > > + /* > > + * The zoned device must at least have zone size and nr_zones fields. 
> > + */ > > + ret = get_sysfs_long_val(&st, "chunk_sectors"); > > + if (ret <= 0) { > > + error_report("Invalid zone size %" PRId32 " sectors ", ret); > > + bs->bl.zoned = BLK_Z_NONE; > > + return; > > + } > > + bs->bl.zone_size = ret << BDRV_SECTOR_BITS; > > + > > + ret = get_sysfs_long_val(&st, "nr_zones"); > > + if (ret <= 0) { > > + error_report("Invalid number of zones %" PRId32 "", ret); > > + bs->bl.zoned = BLK_Z_NONE; > > + return; > > + } > > + bs->bl.nr_zones = ret; > > + > > + ret = ioctl(s->fd, BLKGETSIZE64, &bs->bl.capacity); > > + if (ret != 0) { > > + error_report("Failed to get device capacity %" PRId64 " bytes ", > > + bs->bl.capacity); > > + return; > > + } > > + > > + ret = get_sysfs_long_val(&st, "zone_append_max_bytes"); > > + if (ret > 0) { > > + bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS; > > + } > > + > > + ret = get_sysfs_long_val(&st, "max_open_zones"); > > + if (ret >= 0) { > > + bs->bl.max_open_zones = ret; > > + } > > + > > + ret = get_sysfs_long_val(&st, "max_active_zones"); > > + if (ret >= 0) { > > + bs->bl.max_active_zones = ret; > > + } > > + } > > } > > > > static int check_for_dasd(int fd) > > @@ -1844,6 +1897,144 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, > > } > > #endif > > > > +/* > > + * parse_zone - Fill a zone descriptor > > + */ > > +#if defined(CONFIG_BLKZONED) > > +static inline void parse_zone(struct BlockZoneDescriptor *zone, > > + const struct blk_zone *blkz) { > > + zone->start = blkz->start << BDRV_SECTOR_BITS; > > + zone->length = blkz->len << BDRV_SECTOR_BITS; > > + zone->wp = blkz->wp << BDRV_SECTOR_BITS; > > + > > +#ifdef HAVE_BLK_ZONE_REP_CAPACITY > > + zone->cap = blkz->capacity << BDRV_SECTOR_BITS; > > +#else > > + zone->cap = blkz->len << BDRV_SECTOR_BITS; > > +#endif > > + > > + switch (blkz->type) { > > + case BLK_ZONE_TYPE_SEQWRITE_REQ: > > + zone->type = BLK_ZT_SWR; > > + break; > > + case BLK_ZONE_TYPE_SEQWRITE_PREF: > > + zone->type = BLK_ZT_SWP; > > + break; > > + 
case BLK_ZONE_TYPE_CONVENTIONAL: > > + zone->type = BLK_ZT_CONV; > > + break; > > + default: > > + error_report("Invalid zone type: 0x%x", blkz->type); > > The error message here should be: "Unsupported zone type...", as the value > may be perfectly valid for Linux but QEMU may not know about it because e.g. > an old version is being used. > > Also, I would rather have an error return here so that the caller is notified. Ok. Will change the return type to int and return an unsupported error so that handle_aiocb_zone_report gets notified. > > > + } > > + > > + switch (blkz->cond) { > > + case BLK_ZONE_COND_NOT_WP: > > + zone->cond = BLK_ZS_NOT_WP; > > + break; > > + case BLK_ZONE_COND_EMPTY: > > + zone->cond = BLK_ZS_EMPTY; > > + break; > > + case BLK_ZONE_COND_IMP_OPEN: > > + zone->cond = BLK_ZS_IOPEN; > > + break; > > + case BLK_ZONE_COND_EXP_OPEN: > > + zone->cond = BLK_ZS_EOPEN; > > + break; > > + case BLK_ZONE_COND_CLOSED: > > + zone->cond = BLK_ZS_CLOSED; > > + break; > > + case BLK_ZONE_COND_READONLY: > > + zone->cond = BLK_ZS_RDONLY; > > + break; > > + case BLK_ZONE_COND_FULL: > > + zone->cond = BLK_ZS_FULL; > > + break; > > + case BLK_ZONE_COND_OFFLINE: > > + zone->cond = BLK_ZS_OFFLINE; > > + break; > > + default: > > + error_report("Invalid zone state: 0x%x", blkz->cond); > > Same comment here. And we need an error return. Same as above. 
> > > + } > > +} > > +#endif > > + > > +#if defined(CONFIG_BLKZONED) > > +static int handle_aiocb_zone_report(void *opaque) > > +{ > > + RawPosixAIOData *aiocb = opaque; > > + int fd = aiocb->aio_fildes; > > + unsigned int *nr_zones = aiocb->zone_report.nr_zones; > > + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > > + /* zoned block devices use 512-byte sectors */ > > + int64_t sector = aiocb->aio_offset / 512; > > + > > + struct blk_zone *blkz; > > + int64_t rep_size; > > + unsigned int nrz; > > + int ret, n = 0, i = 0; > > + > > + nrz = *nr_zones; > > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > > + g_autofree struct blk_zone_report *rep = NULL; > > + rep = g_malloc(rep_size); > > + > > + blkz = (struct blk_zone *)(rep + 1); > > + while (n < nrz) { > > + memset(rep, 0, rep_size); > > + rep->sector = sector; > > + rep->nr_zones = nrz - n; > > + > > + do { > > + ret = ioctl(fd, BLKREPORTZONE, rep); > > + } while (ret != 0 && errno == EINTR); > > + if (ret != 0) { > > + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", > > + fd, sector, errno); > > + return -errno; > > + } > > + > > + if (!rep->nr_zones) { > > + break; > > + } > > + > > + for (i = 0; i < rep->nr_zones; i++, n++) { > > + parse_zone(&zones[n], &blkz[i]); > > Need to check for errors here, and if there is an error reported, fail > this aio so that unhandled/unsupported values are not propagated to the guest. Ok. When an error is reported, it breaks the loop and fails the aio at that error value. 
> > > + /* The next report should start after the last zone reported */ > > + sector = blkz[i].start + blkz[i].len; > > + } > > + } > > + > > + *nr_zones = n; > > + return 0; > > +} > > +#endif > > + > > +#if defined(CONFIG_BLKZONED) > > +static int handle_aiocb_zone_mgmt(void *opaque) > > +{ > > + RawPosixAIOData *aiocb = opaque; > > + int fd = aiocb->aio_fildes; > > + int64_t sector = aiocb->aio_offset / 512; > > + int64_t nr_sectors = aiocb->aio_nbytes / 512; > > + struct blk_zone_range range; > > + int ret; > > + > > + /* Execute the operation */ > > + range.sector = sector; > > + range.nr_sectors = nr_sectors; > > + do { > > + ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range); > > + } while (ret != 0 && errno == EINTR); > > + if (ret != 0) { > > + ret = -errno; > > + error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name, > > + ret); > > + return ret; > > + } > > + return 0; > > +} > > +#endif > > + > > static int handle_aiocb_copy_range(void *opaque) > > { > > RawPosixAIOData *aiocb = opaque; > > @@ -3038,6 +3229,99 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) > > } > > } > > > > +/* > > + * zone report - Get a zone block device's information in the form > > + * of an array of zone descriptors. > > + * zones is an array of zone descriptors to hold zone information on reply; > > + * offset can be any byte within the entire size of the device; > > + * nr_zones is the maxium number of sectors the command should operate on. 
> > + */ > > +#if defined(CONFIG_BLKZONED) > > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones) { > > + BDRVRawState *s = bs->opaque; > > + RawPosixAIOData acb; > > + > > + acb = (RawPosixAIOData) { > > + .bs = bs, > > + .aio_fildes = s->fd, > > + .aio_type = QEMU_AIO_ZONE_REPORT, > > + .aio_offset = offset, > > + .zone_report = { > > + .nr_zones = nr_zones, > > + .zones = zones, > > + }, > > + }; > > + > > + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); > > +} > > +#endif > > + > > +/* > > + * zone management operations - Execute an operation on a zone > > + */ > > +#if defined(CONFIG_BLKZONED) > > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len) { > > + BDRVRawState *s = bs->opaque; > > + RawPosixAIOData acb; > > + int64_t zone_size, zone_size_mask; > > + const char *zone_op_name; > > + unsigned long zone_op; > > + > > + zone_size = bs->bl.zone_size; > > + zone_size_mask = zone_size - 1; > > + if (offset & zone_size_mask) { > > + error_report("sector offset %" PRId64 " is not aligned to zone size " > > + "%" PRId64 "", offset / 512, zone_size / 512); > > + return -EINVAL; > > + } > > + > > + if (((offset + len) < bs->bl.capacity && len & zone_size_mask) || > > + offset + len > bs->bl.capacity) { > > + error_report("number of sectors %" PRId64 " is not aligned to zone size" > > + " %" PRId64 "", len / 512, zone_size / 512); > > + return -EINVAL; > > + } > > + > > + switch (op) { > > + case BLK_ZO_OPEN: > > + zone_op_name = "BLKOPENZONE"; > > + zone_op = BLKOPENZONE; > > + break; > > + case BLK_ZO_CLOSE: > > + zone_op_name = "BLKCLOSEZONE"; > > + zone_op = BLKCLOSEZONE; > > + break; > > + case BLK_ZO_FINISH: > > + zone_op_name = "BLKFINISHZONE"; > > + zone_op = BLKFINISHZONE; > > + break; > > + case BLK_ZO_RESET: > > + zone_op_name = "BLKRESETZONE"; > > + zone_op = BLKRESETZONE; > > + 
break; > > + default: > > + g_assert_not_reached(); > > + } > > + > > + acb = (RawPosixAIOData) { > > + .bs = bs, > > + .aio_fildes = s->fd, > > + .aio_type = QEMU_AIO_ZONE_MGMT, > > + .aio_offset = offset, > > + .aio_nbytes = len, > > + .zone_mgmt = { > > + .zone_op = zone_op, > > + .zone_op_name = zone_op_name, > > + }, > > + }; > > + > > + return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); > > +} > > +#endif > > + > > static coroutine_fn int > > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > > bool blkdev) > > @@ -3768,6 +4052,54 @@ static BlockDriver bdrv_host_device = { > > #endif > > }; > > > > +#if defined(CONFIG_BLKZONED) > > +static BlockDriver bdrv_zoned_host_device = { > > + .format_name = "zoned_host_device", > > + .protocol_name = "zoned_host_device", > > + .instance_size = sizeof(BDRVRawState), > > + .bdrv_needs_filename = true, > > + .bdrv_probe_device = hdev_probe_device, > > + .bdrv_file_open = hdev_open, > > + .bdrv_close = raw_close, > > + .bdrv_reopen_prepare = raw_reopen_prepare, > > + .bdrv_reopen_commit = raw_reopen_commit, > > + .bdrv_reopen_abort = raw_reopen_abort, > > + .bdrv_co_create_opts = bdrv_co_create_opts_simple, > > + .create_opts = &bdrv_create_opts_simple, > > + .mutable_opts = mutable_opts, > > + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, > > + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, > > + > > + .bdrv_co_preadv = raw_co_preadv, > > + .bdrv_co_pwritev = raw_co_pwritev, > > + .bdrv_co_flush_to_disk = raw_co_flush_to_disk, > > + .bdrv_co_pdiscard = hdev_co_pdiscard, > > + .bdrv_co_copy_range_from = raw_co_copy_range_from, > > + .bdrv_co_copy_range_to = raw_co_copy_range_to, > > + .bdrv_refresh_limits = raw_refresh_limits, > > + .bdrv_io_plug = raw_aio_plug, > > + .bdrv_io_unplug = raw_aio_unplug, > > + .bdrv_attach_aio_context = raw_aio_attach_aio_context, > > + > > + .bdrv_co_truncate = raw_co_truncate, > > + .bdrv_getlength = raw_getlength, > > + .bdrv_get_info = 
raw_get_info, > > + .bdrv_get_allocated_file_size > > + = raw_get_allocated_file_size, > > + .bdrv_get_specific_stats = hdev_get_specific_stats, > > + .bdrv_check_perm = raw_check_perm, > > + .bdrv_set_perm = raw_set_perm, > > + .bdrv_abort_perm_update = raw_abort_perm_update, > > + .bdrv_probe_blocksizes = hdev_probe_blocksizes, > > + .bdrv_probe_geometry = hdev_probe_geometry, > > + .bdrv_co_ioctl = hdev_co_ioctl, > > + > > + /* zone management operations */ > > + .bdrv_co_zone_report = raw_co_zone_report, > > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > > +}; > > +#endif > > + > > #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > > static void cdrom_parse_filename(const char *filename, QDict *options, > > Error **errp) > > @@ -4028,6 +4360,9 @@ static void bdrv_file_init(void) > > bdrv_register(&bdrv_file); > > #if defined(HAVE_HOST_BLOCK_DEVICE) > > bdrv_register(&bdrv_host_device); > > +#if defined(CONFIG_BLKZONED) > > + bdrv_register(&bdrv_zoned_host_device); > > +#endif > > #ifdef __linux__ > > bdrv_register(&bdrv_host_cdrom); > > #endif > > diff --git a/block/io.c b/block/io.c > > index d30073036e..88f707ea4d 100644 > > --- a/block/io.c > > +++ b/block/io.c > > @@ -3189,6 +3189,47 @@ out: > > return co.ret; > > } > > > > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones) > > +{ > > + BlockDriver *drv = bs->drv; > > + CoroutineIOCompletion co = { > > + .coroutine = qemu_coroutine_self(), > > + }; > > + IO_CODE(); > > + > > + bdrv_inc_in_flight(bs); > > + if (!drv || !drv->bdrv_co_zone_report) { > > + co.ret = -ENOTSUP; > > + goto out; > > + } > > + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones); > > +out: > > + bdrv_dec_in_flight(bs); > > + return co.ret; > > +} > > + > > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len) > > +{ > > + BlockDriver *drv = bs->drv; > > + 
CoroutineIOCompletion co = { > > + .coroutine = qemu_coroutine_self(), > > + }; > > + IO_CODE(); > > + > > + bdrv_inc_in_flight(bs); > > + if (!drv || !drv->bdrv_co_zone_mgmt) { > > + co.ret = -ENOTSUP; > > + goto out; > > + } > > + co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len); > > +out: > > + bdrv_dec_in_flight(bs); > > + return co.ret; > > +} > > + > > void *qemu_blockalign(BlockDriverState *bs, size_t size) > > { > > IO_CODE(); > > diff --git a/include/block/block-io.h b/include/block/block-io.h > > index 492f95fc05..f0cdf67d33 100644 > > --- a/include/block/block-io.h > > +++ b/include/block/block-io.h > > @@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); > > /* Ensure contents are flushed to disk. */ > > int coroutine_fn bdrv_co_flush(BlockDriverState *bs); > > > > +/* Report zone information of zone block device. */ > > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len); > > + > > int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); > > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > > int bdrv_block_status(BlockDriverState *bs, int64_t offset, > > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h > > index 7f7863cc9e..cdc06e77a6 100644 > > --- a/include/block/block_int-common.h > > +++ b/include/block/block_int-common.h > > @@ -691,6 +691,12 @@ struct BlockDriver { > > QEMUIOVector *qiov, > > int64_t pos); > > > > + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, > > + int64_t offset, unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len); > > + > > /* removable device specific */ > > bool (*bdrv_is_inserted)(BlockDriverState 
*bs); > > void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); > > @@ -828,6 +834,24 @@ typedef struct BlockLimits { > > > > /* device zone model */ > > BlockZoneModel zoned; > > + > > + /* zone size expressed in bytes */ > > + uint32_t zone_size; > > + > > + /* total number of zones */ > > + unsigned int nr_zones; > > + > > + /* maximum sectors of a zone append write operation */ > > + int64_t max_append_sectors; > > + > > + /* maximum number of open zones */ > > + int64_t max_open_zones; > > + > > + /* maximum number of active zones */ > > + int64_t max_active_zones; > > + > > + /* device capacity expressed in bytes */ > > + int64_t capacity; > > } BlockLimits; > > > > typedef struct BdrvOpBlocker BdrvOpBlocker; > > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > > index 21fc10c4c9..877b2240b3 100644 > > --- a/include/block/raw-aio.h > > +++ b/include/block/raw-aio.h > > @@ -29,6 +29,8 @@ > > #define QEMU_AIO_WRITE_ZEROES 0x0020 > > #define QEMU_AIO_COPY_RANGE 0x0040 > > #define QEMU_AIO_TRUNCATE 0x0080 > > +#define QEMU_AIO_ZONE_REPORT 0x0100 > > +#define QEMU_AIO_ZONE_MGMT 0x0200 > > #define QEMU_AIO_TYPE_MASK \ > > (QEMU_AIO_READ | \ > > QEMU_AIO_WRITE | \ > > @@ -37,7 +39,9 @@ > > QEMU_AIO_DISCARD | \ > > QEMU_AIO_WRITE_ZEROES | \ > > QEMU_AIO_COPY_RANGE | \ > > - QEMU_AIO_TRUNCATE) > > + QEMU_AIO_TRUNCATE | \ > > + QEMU_AIO_ZONE_REPORT | \ > > + QEMU_AIO_ZONE_MGMT) > > > > /* AIO flags */ > > #define QEMU_AIO_MISALIGNED 0x1000 > > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h > > index 50f5aa2e07..1b5fc7db6b 100644 > > --- a/include/sysemu/block-backend-io.h > > +++ b/include/sysemu/block-backend-io.h > > @@ -45,6 +45,13 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, > > BlockCompletionFunc *cb, void *opaque); > > BlockAIOCB *blk_aio_flush(BlockBackend *blk, > > BlockCompletionFunc *cb, void *opaque); > > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > > + 
unsigned int *nr_zones, > > + BlockZoneDescriptor *zones, > > + BlockCompletionFunc *cb, void *opaque); > > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len, > > + BlockCompletionFunc *cb, void *opaque); > > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, > > BlockCompletionFunc *cb, void *opaque); > > void blk_aio_cancel_async(BlockAIOCB *acb); > > @@ -156,6 +163,17 @@ int generated_co_wrapper blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, > > int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, > > int64_t bytes, BdrvRequestFlags flags); > > > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len); > > +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len); > > + > > int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, > > int64_t bytes); > > int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, > > diff --git a/meson.build b/meson.build > > index 44c1f92697..0aa99b64a0 100644 > > --- a/meson.build > > +++ b/meson.build > > @@ -1928,6 +1928,7 @@ config_host_data.set('CONFIG_REPLICATION', get_option('replication').allowed()) > > # has_header > > config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h')) > > config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h')) > > +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h')) > > config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h')) > > config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h')) > > 
config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h')) > > @@ -2021,6 +2022,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', > > config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM', > > cc.has_member('struct stat', 'st_atim', > > prefix: '#include <sys/stat.h>')) > > +config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY', > > + cc.has_member('struct blk_zone', 'capacity', > > + prefix: '#include <linux/blkzoned.h>')) > > > > # has_type > > config_host_data.set('CONFIG_IOVEC', > > diff --git a/qapi/block-core.json b/qapi/block-core.json > > index 882b266532..05a3b44731 100644 > > --- a/qapi/block-core.json > > +++ b/qapi/block-core.json > > @@ -2942,6 +2942,7 @@ > > # @compress: Since 5.0 > > # @copy-before-write: Since 6.2 > > # @snapshot-access: Since 7.0 > > +# @zoned_host_device: Since 7.2 > > # > > # Since: 2.9 > > ## > > @@ -2955,7 +2956,8 @@ > > 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', > > 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', > > { 'name': 'replication', 'if': 'CONFIG_REPLICATION' }, > > - 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] } > > + 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', > > + { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] } > > > > ## > > # @BlockdevOptionsFile: > > @@ -4329,7 +4331,9 @@ > > 'vhdx': 'BlockdevOptionsGenericFormat', > > 'vmdk': 'BlockdevOptionsGenericCOWFormat', > > 'vpc': 'BlockdevOptionsGenericFormat', > > - 'vvfat': 'BlockdevOptionsVVFAT' > > + 'vvfat': 'BlockdevOptionsVVFAT', > > + 'zoned_host_device': { 'type': 'BlockdevOptionsFile', > > + 'if': 'CONFIG_BLKZONED' } > > } } > > > > ## > > diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c > > index 952dc940f1..c1b28ea108 100644 > > --- a/qemu-io-cmds.c > > +++ b/qemu-io-cmds.c > > @@ -1712,6 +1712,150 @@ static const cmdinfo_t flush_cmd = { > > .oneline = "flush all in-core file state to disk", > > }; > > > > +static inline int64_t tosector(int64_t bytes) > > +{ > > + return bytes 
>> BDRV_SECTOR_BITS; > > +} > > + > > +static int zone_report_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset; > > + unsigned int nr_zones; > > + > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + nr_zones = cvtnum(argv[optind]); > > + > > + g_autofree BlockZoneDescriptor *zones = NULL; > > + zones = g_new(BlockZoneDescriptor, nr_zones); > > + ret = blk_zone_report(blk, offset, &nr_zones, zones); > > + if (ret < 0) { > > + printf("zone report failed: %s\n", strerror(-ret)); > > + } else { > > + for (int i = 0; i < nr_zones; ++i) { > > + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", " > > + "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", " > > + "zcond:%u, [type: %u]\n", > > + tosector(zones[i].start), tosector(zones[i].length), > > + tosector(zones[i].cap), tosector(zones[i].wp), > > + zones[i].cond, zones[i].type); > > + } > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_report_cmd = { > > + .name = "zone_report", > > + .altname = "zrp", > > + .cfunc = zone_report_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset number", > > + .oneline = "report zone information", > > +}; > > + > > +static int zone_open_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len); > > + if (ret < 0) { > > + printf("zone open failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_open_cmd = { > > + .name = "zone_open", > > + .altname = "zo", > > + .cfunc = zone_open_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "explicit open a range of zones in zone block device", > > +}; > > + > > +static int zone_close_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + 
++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len); > > + if (ret < 0) { > > + printf("zone close failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_close_cmd = { > > + .name = "zone_close", > > + .altname = "zc", > > + .cfunc = zone_close_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "close a range of zones in zone block device", > > +}; > > + > > +static int zone_finish_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len); > > + if (ret < 0) { > > + printf("zone finish failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_finish_cmd = { > > + .name = "zone_finish", > > + .altname = "zf", > > + .cfunc = zone_finish_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "finish a range of zones in zone block device", > > +}; > > + > > +static int zone_reset_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len); > > + if (ret < 0) { > > + printf("zone reset failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_reset_cmd = { > > + .name = "zone_reset", > > + .altname = "zrs", > > + .cfunc = zone_reset_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "reset a zone write pointer in zone block device", > > +}; > > + > > static int truncate_f(BlockBackend *blk, int argc, char **argv); > > static const cmdinfo_t 
truncate_cmd = { > > .name = "truncate", > > @@ -2504,6 +2648,11 @@ static void __attribute((constructor)) init_qemuio_commands(void) > > qemuio_add_command(&aio_write_cmd); > > qemuio_add_command(&aio_flush_cmd); > > qemuio_add_command(&flush_cmd); > > + qemuio_add_command(&zone_report_cmd); > > + qemuio_add_command(&zone_open_cmd); > > + qemuio_add_command(&zone_close_cmd); > > + qemuio_add_command(&zone_finish_cmd); > > + qemuio_add_command(&zone_reset_cmd); > > qemuio_add_command(&truncate_cmd); > > qemuio_add_command(&length_cmd); > > qemuio_add_command(&info_cmd); > > -- > Damien Le Moal > Western Digital Research >
Stefan Hajnoczi <stefanha@redhat.com> 于2022年10月17日周一 22:44写道: > > On Sun, Oct 16, 2022 at 10:51:06PM +0800, Sam Li wrote: > > Add a new zoned_host_device BlockDriver. The zoned_host_device option > > accepts only zoned host block devices. By adding zone management > > operations in this new BlockDriver, users can use the new block > > layer APIs including Report Zone and four zone management operations > > (open, close, finish, reset, reset_all). > > > > Qemu-io uses the new APIs to perform zoned storage commands of the device: > > zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), > > zone_finish(zf). > > > > For example, to test zone_report, use following command: > > $ ./build/qemu-io --image-opts -n driver=zoned_host_device, filename=/dev/nullb0 > > -c "zrp offset nr_zones" > > > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > > Reviewed-by: Hannes Reinecke <hare@suse.de> > > --- > > block/block-backend.c | 148 +++++++++++++ > > block/file-posix.c | 335 ++++++++++++++++++++++++++++++ > > block/io.c | 41 ++++ > > include/block/block-io.h | 7 + > > include/block/block_int-common.h | 24 +++ > > include/block/raw-aio.h | 6 +- > > include/sysemu/block-backend-io.h | 18 ++ > > meson.build | 4 + > > qapi/block-core.json | 8 +- > > qemu-io-cmds.c | 149 +++++++++++++ > > 10 files changed, 737 insertions(+), 3 deletions(-) > > > > diff --git a/block/block-backend.c b/block/block-backend.c > > index aa4adf06ae..1c618e9c68 100644 > > --- a/block/block-backend.c > > +++ b/block/block-backend.c > > @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo { > > void *iobuf; > > int ret; > > BdrvRequestFlags flags; > > + union { > > + struct { > > + unsigned int *nr_zones; > > + BlockZoneDescriptor *zones; > > + } zone_report; > > + struct { > > + unsigned long op; > > + } zone_mgmt; > > + }; > > } BlkRwCo; > > > > int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) > > @@ -1775,6 +1784,145 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) > > return ret; > 
> } > > > > +static void coroutine_fn blk_aio_zone_report_entry(void *opaque) > > +{ > > + BlkAioEmAIOCB *acb = opaque; > > + BlkRwCo *rwco = &acb->rwco; > > + > > + rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset, > > + rwco->zone_report.nr_zones, > > + rwco->zone_report.zones); > > + blk_aio_complete(acb); > > +} > > + > > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones, > > + BlockCompletionFunc *cb, void *opaque) > > +{ > > + BlkAioEmAIOCB *acb; > > + Coroutine *co; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > > + acb->rwco = (BlkRwCo) { > > + .blk = blk, > > + .offset = offset, > > + .ret = NOT_DONE, > > + .zone_report = { > > + .zones = zones, > > + .nr_zones = nr_zones, > > + }, > > + }; > > + acb->has_returned = false; > > + > > + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb); > > + bdrv_coroutine_enter(blk_bs(blk), co); > > + > > + acb->has_returned = true; > > + if (acb->rwco.ret != NOT_DONE) { > > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > > + blk_aio_complete_bh, acb); > > + } > > + > > + return &acb->common; > > +} > > + > > +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque) > > +{ > > + BlkAioEmAIOCB *acb = opaque; > > + BlkRwCo *rwco = &acb->rwco; > > + > > + rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op, > > + rwco->offset, acb->bytes); > > + blk_aio_complete(acb); > > +} > > + > > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len, > > + BlockCompletionFunc *cb, void *opaque) { > > + BlkAioEmAIOCB *acb; > > + Coroutine *co; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > > + acb->rwco = (BlkRwCo) { > > + .blk = blk, > > + .offset = offset, > > + .ret = NOT_DONE, > > + .zone_mgmt = { > > + .op = op, > > + }, 
> > + }; > > + acb->bytes = len; > > + acb->has_returned = false; > > + > > + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb); > > + bdrv_coroutine_enter(blk_bs(blk), co); > > + > > + acb->has_returned = true; > > + if (acb->rwco.ret != NOT_DONE) { > > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > > + blk_aio_complete_bh, acb); > > + } > > + > > + return &acb->common; > > +} > > + > > +/* > > + * Send a zone_report command. > > + * offset is a byte offset from the start of the device. No alignment > > + * required for offset. > > + * nr_zones represents IN maximum and OUT actual. > > + */ > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones) > > +{ > > + int ret; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); /* increase before waiting */ > > + blk_wait_while_drained(blk); > > + if (!blk_is_available(blk)) { > > + blk_dec_in_flight(blk); > > + return -ENOMEDIUM; > > + } > > + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); > > + blk_dec_in_flight(blk); > > + return ret; > > +} > > + > > +/* > > + * Send a zone_management command. > > + * op is the zone operation; > > + * offset is the byte offset from the start of the zoned device; > > + * len is the maximum number of bytes the command should operate on. It > > + * should be aligned with the device zone size. 
> > + */ > > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len) > > +{ > > + int ret; > > + IO_CODE(); > > + > > + > > + blk_inc_in_flight(blk); > > + blk_wait_while_drained(blk); > > + > > + ret = blk_check_byte_request(blk, offset, len); > > + if (ret < 0) { > > + blk_dec_in_flight(blk); > > + return ret; > > + } > > + > > + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); > > + blk_dec_in_flight(blk); > > + return ret; > > +} > > + > > void blk_drain(BlockBackend *blk) > > { > > BlockDriverState *bs = blk_bs(blk); > > diff --git a/block/file-posix.c b/block/file-posix.c > > index 8cb07fdb8a..bd28e3eaea 100644 > > --- a/block/file-posix.c > > +++ b/block/file-posix.c > > @@ -67,6 +67,9 @@ > > #include <sys/param.h> > > #include <sys/syscall.h> > > #include <sys/vfs.h> > > +#if defined(CONFIG_BLKZONED) > > +#include <linux/blkzoned.h> > > +#endif > > #include <linux/cdrom.h> > > #include <linux/fd.h> > > #include <linux/fs.h> > > @@ -215,6 +218,14 @@ typedef struct RawPosixAIOData { > > PreallocMode prealloc; > > Error **errp; > > } truncate; > > + struct { > > + unsigned int *nr_zones; > > + BlockZoneDescriptor *zones; > > + } zone_report; > > + struct { > > + unsigned long zone_op; > > + const char *zone_op_name; > > + } zone_mgmt; > > }; > > } RawPosixAIOData; > > > > @@ -1350,6 +1361,48 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) > > zoned = BLK_Z_NONE; > > } > > bs->bl.zoned = zoned; > > + if (zoned != BLK_Z_NONE) { > > + /* > > + * The zoned device must at least have zone size and nr_zones fields. > > + */ > > + ret = get_sysfs_long_val(&st, "chunk_sectors"); > > + if (ret <= 0) { > > + error_report("Invalid zone size %" PRId32 " sectors ", ret); > > The error should be set in raw_refresh_limits()'s Error ** argument. > Please use error_setg_errno(errp, -ret, "Unable to read chunk_sectors > sysfs attribute") when ret < 0. 
When ret == 0 you can use > error_setg(errp, "Read 0 from chunk_sectors sysfs attribute"). > > This way the error is propagated to the caller and opening the block > device will fail. I noticed there are code paths in block.c during > reopen when the error is not displayed, but overall I think it's still > best to set errp instead of calling error_report() ourselves. > > The same applies to the other sysfs attributes below. > > > + bs->bl.zoned = BLK_Z_NONE; > > + return; > > + } > > + bs->bl.zone_size = ret << BDRV_SECTOR_BITS; > > + > > + ret = get_sysfs_long_val(&st, "nr_zones"); > > + if (ret <= 0) { > > + error_report("Invalid number of zones %" PRId32 "", ret); > > + bs->bl.zoned = BLK_Z_NONE; > > + return; > > + } > > + bs->bl.nr_zones = ret; > > + > > + ret = ioctl(s->fd, BLKGETSIZE64, &bs->bl.capacity); > > What is the purpose of this? The size of the device is already queried > by raw_getlength(). It is stored in bs->total_sectors (e.g. > raw_do_pwrite_zeroes() uses that value). Since bs->total_sectors already holds the device capacity, I'll drop this field. > > This looks like a Linux-specific ioctl(2), so the code will probably > fail to compile on other operating systems.
> > > + if (ret != 0) { > > + error_report("Failed to get device capacity %" PRId64 " bytes ", > > + bs->bl.capacity); > > + return; > > + } > > + > > + ret = get_sysfs_long_val(&st, "zone_append_max_bytes"); > > + if (ret > 0) { > > + bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS; > > + } > > + > > + ret = get_sysfs_long_val(&st, "max_open_zones"); > > + if (ret >= 0) { > > + bs->bl.max_open_zones = ret; > > + } > > + > > + ret = get_sysfs_long_val(&st, "max_active_zones"); > > + if (ret >= 0) { > > + bs->bl.max_active_zones = ret; > > + } > > + } > > } > > > > static int check_for_dasd(int fd) > > @@ -1844,6 +1897,144 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, > > } > > #endif > > > > +/* > > + * parse_zone - Fill a zone descriptor > > + */ > > +#if defined(CONFIG_BLKZONED) > > +static inline void parse_zone(struct BlockZoneDescriptor *zone, > > + const struct blk_zone *blkz) { > > + zone->start = blkz->start << BDRV_SECTOR_BITS; > > + zone->length = blkz->len << BDRV_SECTOR_BITS; > > + zone->wp = blkz->wp << BDRV_SECTOR_BITS; > > + > > +#ifdef HAVE_BLK_ZONE_REP_CAPACITY > > + zone->cap = blkz->capacity << BDRV_SECTOR_BITS; > > +#else > > + zone->cap = blkz->len << BDRV_SECTOR_BITS; > > +#endif > > + > > + switch (blkz->type) { > > + case BLK_ZONE_TYPE_SEQWRITE_REQ: > > + zone->type = BLK_ZT_SWR; > > + break; > > + case BLK_ZONE_TYPE_SEQWRITE_PREF: > > + zone->type = BLK_ZT_SWP; > > + break; > > + case BLK_ZONE_TYPE_CONVENTIONAL: > > + zone->type = BLK_ZT_CONV; > > + break; > > + default: > > + error_report("Invalid zone type: 0x%x", blkz->type); > > + } > > + > > + switch (blkz->cond) { > > + case BLK_ZONE_COND_NOT_WP: > > + zone->cond = BLK_ZS_NOT_WP; > > + break; > > + case BLK_ZONE_COND_EMPTY: > > + zone->cond = BLK_ZS_EMPTY; > > + break; > > + case BLK_ZONE_COND_IMP_OPEN: > > + zone->cond = BLK_ZS_IOPEN; > > + break; > > + case BLK_ZONE_COND_EXP_OPEN: > > + zone->cond = BLK_ZS_EOPEN; > > + break; > > + case 
BLK_ZONE_COND_CLOSED: > > + zone->cond = BLK_ZS_CLOSED; > > + break; > > + case BLK_ZONE_COND_READONLY: > > + zone->cond = BLK_ZS_RDONLY; > > + break; > > + case BLK_ZONE_COND_FULL: > > + zone->cond = BLK_ZS_FULL; > > + break; > > + case BLK_ZONE_COND_OFFLINE: > > + zone->cond = BLK_ZS_OFFLINE; > > + break; > > + default: > > + error_report("Invalid zone state: 0x%x", blkz->cond); > > + } > > +} > > +#endif > > + > > +#if defined(CONFIG_BLKZONED) > > +static int handle_aiocb_zone_report(void *opaque) > > +{ > > + RawPosixAIOData *aiocb = opaque; > > + int fd = aiocb->aio_fildes; > > + unsigned int *nr_zones = aiocb->zone_report.nr_zones; > > + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > > + /* zoned block devices use 512-byte sectors */ > > + int64_t sector = aiocb->aio_offset / 512; > > + > > + struct blk_zone *blkz; > > + int64_t rep_size; > > + unsigned int nrz; > > + int ret, n = 0, i = 0; > > + > > + nrz = *nr_zones; > > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > > + g_autofree struct blk_zone_report *rep = NULL; > > + rep = g_malloc(rep_size); > > + > > + blkz = (struct blk_zone *)(rep + 1); > > + while (n < nrz) { > > + memset(rep, 0, rep_size); > > + rep->sector = sector; > > + rep->nr_zones = nrz - n; > > + > > + do { > > + ret = ioctl(fd, BLKREPORTZONE, rep); > > + } while (ret != 0 && errno == EINTR); > > + if (ret != 0) { > > + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", > > + fd, sector, errno); > > + return -errno; > > + } > > + > > + if (!rep->nr_zones) { > > + break; > > + } > > + > > + for (i = 0; i < rep->nr_zones; i++, n++) { > > + parse_zone(&zones[n], &blkz[i]); > > + /* The next report should start after the last zone reported */ > > + sector = blkz[i].start + blkz[i].len; > > + } > > + } > > + > > + *nr_zones = n; > > + return 0; > > +} > > +#endif > > + > > +#if defined(CONFIG_BLKZONED) > > +static int handle_aiocb_zone_mgmt(void *opaque) > > +{ > > + RawPosixAIOData 
*aiocb = opaque; > > + int fd = aiocb->aio_fildes; > > + int64_t sector = aiocb->aio_offset / 512; > > + int64_t nr_sectors = aiocb->aio_nbytes / 512; > > + struct blk_zone_range range; > > + int ret; > > + > > + /* Execute the operation */ > > + range.sector = sector; > > + range.nr_sectors = nr_sectors; > > + do { > > + ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range); > > + } while (ret != 0 && errno == EINTR); > > + if (ret != 0) { > > + ret = -errno; > > + error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name, > > + ret); > > + return ret; > > + } > > + return 0; > > +} > > +#endif > > + > > static int handle_aiocb_copy_range(void *opaque) > > { > > RawPosixAIOData *aiocb = opaque; > > @@ -3038,6 +3229,99 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) > > } > > } > > > > +/* > > + * zone report - Get a zone block device's information in the form > > + * of an array of zone descriptors. > > + * zones is an array of zone descriptors to hold zone information on reply; > > + * offset can be any byte within the entire size of the device; > > + * nr_zones is the maximum number of zones the command should operate on.
> > + */ > > +#if defined(CONFIG_BLKZONED) > > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones) { > > + BDRVRawState *s = bs->opaque; > > + RawPosixAIOData acb; > > + > > + acb = (RawPosixAIOData) { > > + .bs = bs, > > + .aio_fildes = s->fd, > > + .aio_type = QEMU_AIO_ZONE_REPORT, > > + .aio_offset = offset, > > + .zone_report = { > > + .nr_zones = nr_zones, > > + .zones = zones, > > + }, > > + }; > > + > > + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); > > +} > > +#endif > > + > > +/* > > + * zone management operations - Execute an operation on a zone > > + */ > > +#if defined(CONFIG_BLKZONED) > > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len) { > > + BDRVRawState *s = bs->opaque; > > + RawPosixAIOData acb; > > + int64_t zone_size, zone_size_mask; > > + const char *zone_op_name; > > + unsigned long zone_op; > > + > > + zone_size = bs->bl.zone_size; > > + zone_size_mask = zone_size - 1; > > + if (offset & zone_size_mask) { > > + error_report("sector offset %" PRId64 " is not aligned to zone size " > > + "%" PRId64 "", offset / 512, zone_size / 512); > > + return -EINVAL; > > + } > > + > > + if (((offset + len) < bs->bl.capacity && len & zone_size_mask) || > > + offset + len > bs->bl.capacity) { > > + error_report("number of sectors %" PRId64 " is not aligned to zone size" > > + " %" PRId64 "", len / 512, zone_size / 512); > > + return -EINVAL; > > + } > > + > > + switch (op) { > > + case BLK_ZO_OPEN: > > + zone_op_name = "BLKOPENZONE"; > > + zone_op = BLKOPENZONE; > > + break; > > + case BLK_ZO_CLOSE: > > + zone_op_name = "BLKCLOSEZONE"; > > + zone_op = BLKCLOSEZONE; > > + break; > > + case BLK_ZO_FINISH: > > + zone_op_name = "BLKFINISHZONE"; > > + zone_op = BLKFINISHZONE; > > + break; > > + case BLK_ZO_RESET: > > + zone_op_name = "BLKRESETZONE"; > > + zone_op = BLKRESETZONE; > > + 
break; > > + default: > > + g_assert_not_reached(); > > + } > > + > > + acb = (RawPosixAIOData) { > > + .bs = bs, > > + .aio_fildes = s->fd, > > + .aio_type = QEMU_AIO_ZONE_MGMT, > > + .aio_offset = offset, > > + .aio_nbytes = len, > > + .zone_mgmt = { > > + .zone_op = zone_op, > > + .zone_op_name = zone_op_name, > > + }, > > + }; > > + > > + return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); > > +} > > +#endif > > + > > static coroutine_fn int > > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > > bool blkdev) > > @@ -3768,6 +4052,54 @@ static BlockDriver bdrv_host_device = { > > #endif > > }; > > > > +#if defined(CONFIG_BLKZONED) > > +static BlockDriver bdrv_zoned_host_device = { > > + .format_name = "zoned_host_device", > > + .protocol_name = "zoned_host_device", > > + .instance_size = sizeof(BDRVRawState), > > + .bdrv_needs_filename = true, > > + .bdrv_probe_device = hdev_probe_device, > > + .bdrv_file_open = hdev_open, > > + .bdrv_close = raw_close, > > + .bdrv_reopen_prepare = raw_reopen_prepare, > > + .bdrv_reopen_commit = raw_reopen_commit, > > + .bdrv_reopen_abort = raw_reopen_abort, > > + .bdrv_co_create_opts = bdrv_co_create_opts_simple, > > + .create_opts = &bdrv_create_opts_simple, > > + .mutable_opts = mutable_opts, > > + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, > > + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, > > + > > + .bdrv_co_preadv = raw_co_preadv, > > + .bdrv_co_pwritev = raw_co_pwritev, > > + .bdrv_co_flush_to_disk = raw_co_flush_to_disk, > > + .bdrv_co_pdiscard = hdev_co_pdiscard, > > + .bdrv_co_copy_range_from = raw_co_copy_range_from, > > + .bdrv_co_copy_range_to = raw_co_copy_range_to, > > + .bdrv_refresh_limits = raw_refresh_limits, > > + .bdrv_io_plug = raw_aio_plug, > > + .bdrv_io_unplug = raw_aio_unplug, > > + .bdrv_attach_aio_context = raw_aio_attach_aio_context, > > + > > + .bdrv_co_truncate = raw_co_truncate, > > + .bdrv_getlength = raw_getlength, > > + .bdrv_get_info = 
raw_get_info, > > + .bdrv_get_allocated_file_size > > + = raw_get_allocated_file_size, > > + .bdrv_get_specific_stats = hdev_get_specific_stats, > > + .bdrv_check_perm = raw_check_perm, > > + .bdrv_set_perm = raw_set_perm, > > + .bdrv_abort_perm_update = raw_abort_perm_update, > > + .bdrv_probe_blocksizes = hdev_probe_blocksizes, > > + .bdrv_probe_geometry = hdev_probe_geometry, > > + .bdrv_co_ioctl = hdev_co_ioctl, > > + > > + /* zone management operations */ > > + .bdrv_co_zone_report = raw_co_zone_report, > > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > > +}; > > +#endif > > + > > #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > > static void cdrom_parse_filename(const char *filename, QDict *options, > > Error **errp) > > @@ -4028,6 +4360,9 @@ static void bdrv_file_init(void) > > bdrv_register(&bdrv_file); > > #if defined(HAVE_HOST_BLOCK_DEVICE) > > bdrv_register(&bdrv_host_device); > > +#if defined(CONFIG_BLKZONED) > > + bdrv_register(&bdrv_zoned_host_device); > > +#endif > > #ifdef __linux__ > > bdrv_register(&bdrv_host_cdrom); > > #endif > > diff --git a/block/io.c b/block/io.c > > index d30073036e..88f707ea4d 100644 > > --- a/block/io.c > > +++ b/block/io.c > > @@ -3189,6 +3189,47 @@ out: > > return co.ret; > > } > > > > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones) > > +{ > > + BlockDriver *drv = bs->drv; > > + CoroutineIOCompletion co = { > > + .coroutine = qemu_coroutine_self(), > > + }; > > + IO_CODE(); > > + > > + bdrv_inc_in_flight(bs); > > + if (!drv || !drv->bdrv_co_zone_report) { > > + co.ret = -ENOTSUP; > > + goto out; > > + } > > + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones); > > +out: > > + bdrv_dec_in_flight(bs); > > + return co.ret; > > +} > > + > > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len) > > +{ > > + BlockDriver *drv = bs->drv; > > + 
CoroutineIOCompletion co = { > > + .coroutine = qemu_coroutine_self(), > > + }; > > + IO_CODE(); > > + > > + bdrv_inc_in_flight(bs); > > + if (!drv || !drv->bdrv_co_zone_mgmt) { > > + co.ret = -ENOTSUP; > > + goto out; > > + } > > + co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len); > > +out: > > + bdrv_dec_in_flight(bs); > > + return co.ret; > > +} > > + > > void *qemu_blockalign(BlockDriverState *bs, size_t size) > > { > > IO_CODE(); > > diff --git a/include/block/block-io.h b/include/block/block-io.h > > index 492f95fc05..f0cdf67d33 100644 > > --- a/include/block/block-io.h > > +++ b/include/block/block-io.h > > @@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); > > /* Ensure contents are flushed to disk. */ > > int coroutine_fn bdrv_co_flush(BlockDriverState *bs); > > > > +/* Report zone information of zone block device. */ > > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len); > > + > > int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); > > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > > int bdrv_block_status(BlockDriverState *bs, int64_t offset, > > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h > > index 7f7863cc9e..cdc06e77a6 100644 > > --- a/include/block/block_int-common.h > > +++ b/include/block/block_int-common.h > > @@ -691,6 +691,12 @@ struct BlockDriver { > > QEMUIOVector *qiov, > > int64_t pos); > > > > + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, > > + int64_t offset, unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len); > > + > > /* removable device specific */ > > bool (*bdrv_is_inserted)(BlockDriverState 
*bs); > > void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); > > @@ -828,6 +834,24 @@ typedef struct BlockLimits { > > > > /* device zone model */ > > BlockZoneModel zoned; > > + > > + /* zone size expressed in bytes */ > > + uint32_t zone_size; > > + > > + /* total number of zones */ > > + unsigned int nr_zones; > > + > > + /* maximum sectors of a zone append write operation */ > > + int64_t max_append_sectors; > > + > > + /* maximum number of open zones */ > > + int64_t max_open_zones; > > + > > + /* maximum number of active zones */ > > + int64_t max_active_zones; > > + > > + /* device capacity expressed in bytes */ > > + int64_t capacity; > > This field is not necessary. See bs->total_sectors. > > > } BlockLimits; > > > > typedef struct BdrvOpBlocker BdrvOpBlocker; > > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > > index 21fc10c4c9..877b2240b3 100644 > > --- a/include/block/raw-aio.h > > +++ b/include/block/raw-aio.h > > @@ -29,6 +29,8 @@ > > #define QEMU_AIO_WRITE_ZEROES 0x0020 > > #define QEMU_AIO_COPY_RANGE 0x0040 > > #define QEMU_AIO_TRUNCATE 0x0080 > > +#define QEMU_AIO_ZONE_REPORT 0x0100 > > +#define QEMU_AIO_ZONE_MGMT 0x0200 > > #define QEMU_AIO_TYPE_MASK \ > > (QEMU_AIO_READ | \ > > QEMU_AIO_WRITE | \ > > @@ -37,7 +39,9 @@ > > QEMU_AIO_DISCARD | \ > > QEMU_AIO_WRITE_ZEROES | \ > > QEMU_AIO_COPY_RANGE | \ > > - QEMU_AIO_TRUNCATE) > > + QEMU_AIO_TRUNCATE | \ > > + QEMU_AIO_ZONE_REPORT | \ > > + QEMU_AIO_ZONE_MGMT) > > > > /* AIO flags */ > > #define QEMU_AIO_MISALIGNED 0x1000 > > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h > > index 50f5aa2e07..1b5fc7db6b 100644 > > --- a/include/sysemu/block-backend-io.h > > +++ b/include/sysemu/block-backend-io.h > > @@ -45,6 +45,13 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, > > BlockCompletionFunc *cb, void *opaque); > > BlockAIOCB *blk_aio_flush(BlockBackend *blk, > > BlockCompletionFunc *cb, void *opaque); > > +BlockAIOCB 
*blk_aio_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones, > > + BlockCompletionFunc *cb, void *opaque); > > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len, > > + BlockCompletionFunc *cb, void *opaque); > > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, > > BlockCompletionFunc *cb, void *opaque); > > void blk_aio_cancel_async(BlockAIOCB *acb); > > @@ -156,6 +163,17 @@ int generated_co_wrapper blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, > > int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, > > int64_t bytes, BdrvRequestFlags flags); > > > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len); > > +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len); > > + > > int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, > > int64_t bytes); > > int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, > > diff --git a/meson.build b/meson.build > > index 44c1f92697..0aa99b64a0 100644 > > --- a/meson.build > > +++ b/meson.build > > @@ -1928,6 +1928,7 @@ config_host_data.set('CONFIG_REPLICATION', get_option('replication').allowed()) > > # has_header > > config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h')) > > config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h')) > > +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h')) > > config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h')) > > 
config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h')) > > config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h')) > > @@ -2021,6 +2022,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', > > config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM', > > cc.has_member('struct stat', 'st_atim', > > prefix: '#include <sys/stat.h>')) > > +config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY', > > + cc.has_member('struct blk_zone', 'capacity', > > + prefix: '#include <linux/blkzoned.h>')) > > > > # has_type > > config_host_data.set('CONFIG_IOVEC', > > diff --git a/qapi/block-core.json b/qapi/block-core.json > > index 882b266532..05a3b44731 100644 > > --- a/qapi/block-core.json > > +++ b/qapi/block-core.json > > @@ -2942,6 +2942,7 @@ > > # @compress: Since 5.0 > > # @copy-before-write: Since 6.2 > > # @snapshot-access: Since 7.0 > > +# @zoned_host_device: Since 7.2 > > # > > # Since: 2.9 > > ## > > @@ -2955,7 +2956,8 @@ > > 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', > > 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', > > { 'name': 'replication', 'if': 'CONFIG_REPLICATION' }, > > - 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] } > > + 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', > > + { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] } > > > > ## > > # @BlockdevOptionsFile: > > @@ -4329,7 +4331,9 @@ > > 'vhdx': 'BlockdevOptionsGenericFormat', > > 'vmdk': 'BlockdevOptionsGenericCOWFormat', > > 'vpc': 'BlockdevOptionsGenericFormat', > > - 'vvfat': 'BlockdevOptionsVVFAT' > > + 'vvfat': 'BlockdevOptionsVVFAT', > > + 'zoned_host_device': { 'type': 'BlockdevOptionsFile', > > + 'if': 'CONFIG_BLKZONED' } > > } } > > > > ## > > diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c > > index 952dc940f1..c1b28ea108 100644 > > --- a/qemu-io-cmds.c > > +++ b/qemu-io-cmds.c > > @@ -1712,6 +1712,150 @@ static const cmdinfo_t flush_cmd = { > > .oneline = "flush all in-core file state to disk", > > }; > > > > 
+static inline int64_t tosector(int64_t bytes) > > +{ > > + return bytes >> BDRV_SECTOR_BITS; > > +} > > + > > +static int zone_report_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset; > > + unsigned int nr_zones; > > + > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + nr_zones = cvtnum(argv[optind]); > > + > > + g_autofree BlockZoneDescriptor *zones = NULL; > > + zones = g_new(BlockZoneDescriptor, nr_zones); > > + ret = blk_zone_report(blk, offset, &nr_zones, zones); > > + if (ret < 0) { > > + printf("zone report failed: %s\n", strerror(-ret)); > > + } else { > > + for (int i = 0; i < nr_zones; ++i) { > > + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", " > > + "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", " > > + "zcond:%u, [type: %u]\n", > > + tosector(zones[i].start), tosector(zones[i].length), > > + tosector(zones[i].cap), tosector(zones[i].wp), > > + zones[i].cond, zones[i].type); > > + } > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_report_cmd = { > > + .name = "zone_report", > > + .altname = "zrp", > > + .cfunc = zone_report_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset number", > > + .oneline = "report zone information", > > +}; > > + > > +static int zone_open_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len); > > + if (ret < 0) { > > + printf("zone open failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_open_cmd = { > > + .name = "zone_open", > > + .altname = "zo", > > + .cfunc = zone_open_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "explicit open a range of zones in zone block device", > > +}; > > + > > +static int zone_close_f(BlockBackend *blk, int argc, 
char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len); > > + if (ret < 0) { > > + printf("zone close failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_close_cmd = { > > + .name = "zone_close", > > + .altname = "zc", > > + .cfunc = zone_close_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "close a range of zones in zone block device", > > +}; > > + > > +static int zone_finish_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len); > > + if (ret < 0) { > > + printf("zone finish failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_finish_cmd = { > > + .name = "zone_finish", > > + .altname = "zf", > > + .cfunc = zone_finish_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "finish a range of zones in zone block device", > > +}; > > + > > +static int zone_reset_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len); > > + if (ret < 0) { > > + printf("zone reset failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_reset_cmd = { > > + .name = "zone_reset", > > + .altname = "zrs", > > + .cfunc = zone_reset_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "reset a zone write pointer in zone block device", > > +}; > > + > > static int 
truncate_f(BlockBackend *blk, int argc, char **argv); > > static const cmdinfo_t truncate_cmd = { > > .name = "truncate", > > @@ -2504,6 +2648,11 @@ static void __attribute((constructor)) init_qemuio_commands(void) > > qemuio_add_command(&aio_write_cmd); > > qemuio_add_command(&aio_flush_cmd); > > qemuio_add_command(&flush_cmd); > > + qemuio_add_command(&zone_report_cmd); > > + qemuio_add_command(&zone_open_cmd); > > + qemuio_add_command(&zone_close_cmd); > > + qemuio_add_command(&zone_finish_cmd); > > + qemuio_add_command(&zone_reset_cmd); > > qemuio_add_command(&truncate_cmd); > > qemuio_add_command(&length_cmd); > > qemuio_add_command(&info_cmd); > > -- > > 2.37.3 > >
diff --git a/block/block-backend.c b/block/block-backend.c index aa4adf06ae..1c618e9c68 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo { void *iobuf; int ret; BdrvRequestFlags flags; + union { + struct { + unsigned int *nr_zones; + BlockZoneDescriptor *zones; + } zone_report; + struct { + unsigned long op; + } zone_mgmt; + }; } BlkRwCo; int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) @@ -1775,6 +1784,145 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) return ret; } +static void coroutine_fn blk_aio_zone_report_entry(void *opaque) +{ + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + + rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset, + rwco->zone_report.nr_zones, + rwco->zone_report.zones); + blk_aio_complete(acb); +} + +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones, + BlockCompletionFunc *cb, void *opaque) +{ + BlkAioEmAIOCB *acb; + Coroutine *co; + IO_CODE(); + + blk_inc_in_flight(blk); + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); + acb->rwco = (BlkRwCo) { + .blk = blk, + .offset = offset, + .ret = NOT_DONE, + .zone_report = { + .zones = zones, + .nr_zones = nr_zones, + }, + }; + acb->has_returned = false; + + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb); + bdrv_coroutine_enter(blk_bs(blk), co); + + acb->has_returned = true; + if (acb->rwco.ret != NOT_DONE) { + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), + blk_aio_complete_bh, acb); + } + + return &acb->common; +} + +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque) +{ + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + + rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op, + rwco->offset, acb->bytes); + blk_aio_complete(acb); +} + +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len, + BlockCompletionFunc *cb, void *opaque) 
{ + BlkAioEmAIOCB *acb; + Coroutine *co; + IO_CODE(); + + blk_inc_in_flight(blk); + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); + acb->rwco = (BlkRwCo) { + .blk = blk, + .offset = offset, + .ret = NOT_DONE, + .zone_mgmt = { + .op = op, + }, + }; + acb->bytes = len; + acb->has_returned = false; + + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb); + bdrv_coroutine_enter(blk_bs(blk), co); + + acb->has_returned = true; + if (acb->rwco.ret != NOT_DONE) { + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), + blk_aio_complete_bh, acb); + } + + return &acb->common; +} + +/* + * Send a zone_report command. + * offset is a byte offset from the start of the device. No alignment + * required for offset. + * nr_zones represents IN maximum and OUT actual. + */ +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones) +{ + int ret; + IO_CODE(); + + blk_inc_in_flight(blk); /* increase before waiting */ + blk_wait_while_drained(blk); + if (!blk_is_available(blk)) { + blk_dec_in_flight(blk); + return -ENOMEDIUM; + } + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); + blk_dec_in_flight(blk); + return ret; +} + +/* + * Send a zone_management command. + * op is the zone operation; + * offset is the byte offset from the start of the zoned device; + * len is the maximum number of bytes the command should operate on. It + * should be aligned with the device zone size. 
+ */ +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len) +{ + int ret; + IO_CODE(); + + + blk_inc_in_flight(blk); + blk_wait_while_drained(blk); + + ret = blk_check_byte_request(blk, offset, len); + if (ret < 0) { + blk_dec_in_flight(blk); + return ret; + } + + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); + blk_dec_in_flight(blk); + return ret; +} + void blk_drain(BlockBackend *blk) { BlockDriverState *bs = blk_bs(blk); diff --git a/block/file-posix.c b/block/file-posix.c index 8cb07fdb8a..bd28e3eaea 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -67,6 +67,9 @@ #include <sys/param.h> #include <sys/syscall.h> #include <sys/vfs.h> +#if defined(CONFIG_BLKZONED) +#include <linux/blkzoned.h> +#endif #include <linux/cdrom.h> #include <linux/fd.h> #include <linux/fs.h> @@ -215,6 +218,14 @@ typedef struct RawPosixAIOData { PreallocMode prealloc; Error **errp; } truncate; + struct { + unsigned int *nr_zones; + BlockZoneDescriptor *zones; + } zone_report; + struct { + unsigned long zone_op; + const char *zone_op_name; + } zone_mgmt; }; } RawPosixAIOData; @@ -1350,6 +1361,48 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) zoned = BLK_Z_NONE; } bs->bl.zoned = zoned; + if (zoned != BLK_Z_NONE) { + /* + * The zoned device must at least have zone size and nr_zones fields. 
+ */ + ret = get_sysfs_long_val(&st, "chunk_sectors"); + if (ret <= 0) { + error_report("Invalid zone size %" PRId32 " sectors ", ret); + bs->bl.zoned = BLK_Z_NONE; + return; + } + bs->bl.zone_size = ret << BDRV_SECTOR_BITS; + + ret = get_sysfs_long_val(&st, "nr_zones"); + if (ret <= 0) { + error_report("Invalid number of zones %" PRId32 "", ret); + bs->bl.zoned = BLK_Z_NONE; + return; + } + bs->bl.nr_zones = ret; + + ret = ioctl(s->fd, BLKGETSIZE64, &bs->bl.capacity); + if (ret != 0) { + error_report("Failed to get device capacity %" PRId64 " bytes ", + bs->bl.capacity); + return; + } + + ret = get_sysfs_long_val(&st, "zone_append_max_bytes"); + if (ret > 0) { + bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS; + } + + ret = get_sysfs_long_val(&st, "max_open_zones"); + if (ret >= 0) { + bs->bl.max_open_zones = ret; + } + + ret = get_sysfs_long_val(&st, "max_active_zones"); + if (ret >= 0) { + bs->bl.max_active_zones = ret; + } + } } static int check_for_dasd(int fd) @@ -1844,6 +1897,144 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, } #endif +/* + * parse_zone - Fill a zone descriptor + */ +#if defined(CONFIG_BLKZONED) +static inline void parse_zone(struct BlockZoneDescriptor *zone, + const struct blk_zone *blkz) { + zone->start = blkz->start << BDRV_SECTOR_BITS; + zone->length = blkz->len << BDRV_SECTOR_BITS; + zone->wp = blkz->wp << BDRV_SECTOR_BITS; + +#ifdef HAVE_BLK_ZONE_REP_CAPACITY + zone->cap = blkz->capacity << BDRV_SECTOR_BITS; +#else + zone->cap = blkz->len << BDRV_SECTOR_BITS; +#endif + + switch (blkz->type) { + case BLK_ZONE_TYPE_SEQWRITE_REQ: + zone->type = BLK_ZT_SWR; + break; + case BLK_ZONE_TYPE_SEQWRITE_PREF: + zone->type = BLK_ZT_SWP; + break; + case BLK_ZONE_TYPE_CONVENTIONAL: + zone->type = BLK_ZT_CONV; + break; + default: + error_report("Invalid zone type: 0x%x", blkz->type); + } + + switch (blkz->cond) { + case BLK_ZONE_COND_NOT_WP: + zone->cond = BLK_ZS_NOT_WP; + break; + case BLK_ZONE_COND_EMPTY: + zone->cond = 
BLK_ZS_EMPTY; + break; + case BLK_ZONE_COND_IMP_OPEN: + zone->cond = BLK_ZS_IOPEN; + break; + case BLK_ZONE_COND_EXP_OPEN: + zone->cond = BLK_ZS_EOPEN; + break; + case BLK_ZONE_COND_CLOSED: + zone->cond = BLK_ZS_CLOSED; + break; + case BLK_ZONE_COND_READONLY: + zone->cond = BLK_ZS_RDONLY; + break; + case BLK_ZONE_COND_FULL: + zone->cond = BLK_ZS_FULL; + break; + case BLK_ZONE_COND_OFFLINE: + zone->cond = BLK_ZS_OFFLINE; + break; + default: + error_report("Invalid zone state: 0x%x", blkz->cond); + } +} +#endif + +#if defined(CONFIG_BLKZONED) +static int handle_aiocb_zone_report(void *opaque) +{ + RawPosixAIOData *aiocb = opaque; + int fd = aiocb->aio_fildes; + unsigned int *nr_zones = aiocb->zone_report.nr_zones; + BlockZoneDescriptor *zones = aiocb->zone_report.zones; + /* zoned block devices use 512-byte sectors */ + int64_t sector = aiocb->aio_offset / 512; + + struct blk_zone *blkz; + int64_t rep_size; + unsigned int nrz; + int ret, n = 0, i = 0; + + nrz = *nr_zones; + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); + g_autofree struct blk_zone_report *rep = NULL; + rep = g_malloc(rep_size); + + blkz = (struct blk_zone *)(rep + 1); + while (n < nrz) { + memset(rep, 0, rep_size); + rep->sector = sector; + rep->nr_zones = nrz - n; + + do { + ret = ioctl(fd, BLKREPORTZONE, rep); + } while (ret != 0 && errno == EINTR); + if (ret != 0) { + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", + fd, sector, errno); + return -errno; + } + + if (!rep->nr_zones) { + break; + } + + for (i = 0; i < rep->nr_zones; i++, n++) { + parse_zone(&zones[n], &blkz[i]); + /* The next report should start after the last zone reported */ + sector = blkz[i].start + blkz[i].len; + } + } + + *nr_zones = n; + return 0; +} +#endif + +#if defined(CONFIG_BLKZONED) +static int handle_aiocb_zone_mgmt(void *opaque) +{ + RawPosixAIOData *aiocb = opaque; + int fd = aiocb->aio_fildes; + int64_t sector = aiocb->aio_offset / 512; + int64_t nr_sectors = 
aiocb->aio_nbytes / 512; + struct blk_zone_range range; + int ret; + + /* Execute the operation */ + range.sector = sector; + range.nr_sectors = nr_sectors; + do { + ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range); + } while (ret != 0 && errno == EINTR); + if (ret != 0) { + ret = -errno; + error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name, + ret); + return ret; + } + return 0; +} +#endif + static int handle_aiocb_copy_range(void *opaque) { RawPosixAIOData *aiocb = opaque; @@ -3038,6 +3229,99 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) } } +/* + * zone report - Get a zone block device's information in the form + * of an array of zone descriptors. + * zones is an array of zone descriptors to hold zone information on reply; + * offset can be any byte within the entire size of the device; + * nr_zones is the maximum number of zones the command should operate on. + */ +#if defined(CONFIG_BLKZONED) +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones) { + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_ZONE_REPORT, + .aio_offset = offset, + .zone_report = { + .nr_zones = nr_zones, + .zones = zones, + }, + }; + + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); +} +#endif + +/* + * zone management operations - Execute an operation on a zone + */ +#if defined(CONFIG_BLKZONED) +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, + int64_t offset, int64_t len) { + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; + int64_t zone_size, zone_size_mask; + const char *zone_op_name; + unsigned long zone_op; + + zone_size = bs->bl.zone_size; + zone_size_mask = zone_size - 1; + if (offset & zone_size_mask) { + error_report("sector offset %" PRId64 " is not aligned to zone size " + "%" PRId64 "", offset / 512, 
zone_size / 512); + return -EINVAL; + } + + if (((offset + len) < bs->bl.capacity && len & zone_size_mask) || + offset + len > bs->bl.capacity) { + error_report("number of sectors %" PRId64 " is not aligned to zone size" + " %" PRId64 "", len / 512, zone_size / 512); + return -EINVAL; + } + + switch (op) { + case BLK_ZO_OPEN: + zone_op_name = "BLKOPENZONE"; + zone_op = BLKOPENZONE; + break; + case BLK_ZO_CLOSE: + zone_op_name = "BLKCLOSEZONE"; + zone_op = BLKCLOSEZONE; + break; + case BLK_ZO_FINISH: + zone_op_name = "BLKFINISHZONE"; + zone_op = BLKFINISHZONE; + break; + case BLK_ZO_RESET: + zone_op_name = "BLKRESETZONE"; + zone_op = BLKRESETZONE; + break; + default: + g_assert_not_reached(); + } + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_ZONE_MGMT, + .aio_offset = offset, + .aio_nbytes = len, + .zone_mgmt = { + .zone_op = zone_op, + .zone_op_name = zone_op_name, + }, + }; + + return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); +} +#endif + static coroutine_fn int raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, bool blkdev) @@ -3768,6 +4052,54 @@ static BlockDriver bdrv_host_device = { #endif }; +#if defined(CONFIG_BLKZONED) +static BlockDriver bdrv_zoned_host_device = { + .format_name = "zoned_host_device", + .protocol_name = "zoned_host_device", + .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_probe_device = hdev_probe_device, + .bdrv_file_open = hdev_open, + .bdrv_close = raw_close, + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, + .bdrv_co_create_opts = bdrv_co_create_opts_simple, + .create_opts = &bdrv_create_opts_simple, + .mutable_opts = mutable_opts, + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, + + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, + .bdrv_co_flush_to_disk = 
raw_co_flush_to_disk, + .bdrv_co_pdiscard = hdev_co_pdiscard, + .bdrv_co_copy_range_from = raw_co_copy_range_from, + .bdrv_co_copy_range_to = raw_co_copy_range_to, + .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_attach_aio_context = raw_aio_attach_aio_context, + + .bdrv_co_truncate = raw_co_truncate, + .bdrv_getlength = raw_getlength, + .bdrv_get_info = raw_get_info, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, + .bdrv_get_specific_stats = hdev_get_specific_stats, + .bdrv_check_perm = raw_check_perm, + .bdrv_set_perm = raw_set_perm, + .bdrv_abort_perm_update = raw_abort_perm_update, + .bdrv_probe_blocksizes = hdev_probe_blocksizes, + .bdrv_probe_geometry = hdev_probe_geometry, + .bdrv_co_ioctl = hdev_co_ioctl, + + /* zone management operations */ + .bdrv_co_zone_report = raw_co_zone_report, + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, +}; +#endif + #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) static void cdrom_parse_filename(const char *filename, QDict *options, Error **errp) @@ -4028,6 +4360,9 @@ static void bdrv_file_init(void) bdrv_register(&bdrv_file); #if defined(HAVE_HOST_BLOCK_DEVICE) bdrv_register(&bdrv_host_device); +#if defined(CONFIG_BLKZONED) + bdrv_register(&bdrv_zoned_host_device); +#endif #ifdef __linux__ bdrv_register(&bdrv_host_cdrom); #endif diff --git a/block/io.c b/block/io.c index d30073036e..88f707ea4d 100644 --- a/block/io.c +++ b/block/io.c @@ -3189,6 +3189,47 @@ out: return co.ret; } +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones) +{ + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + IO_CODE(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_zone_report) { + co.ret = -ENOTSUP; + goto out; + } + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones); +out: + 
bdrv_dec_in_flight(bs); + return co.ret; +} + +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, + int64_t offset, int64_t len) +{ + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + IO_CODE(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_zone_mgmt) { + co.ret = -ENOTSUP; + goto out; + } + co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + void *qemu_blockalign(BlockDriverState *bs, size_t size) { IO_CODE(); diff --git a/include/block/block-io.h b/include/block/block-io.h index 492f95fc05..f0cdf67d33 100644 --- a/include/block/block-io.h +++ b/include/block/block-io.h @@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); /* Ensure contents are flushed to disk. */ int coroutine_fn bdrv_co_flush(BlockDriverState *bs); +/* Report zone information of zone block device. */ +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones); +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, + int64_t offset, int64_t len); + int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); int bdrv_block_status(BlockDriverState *bs, int64_t offset, diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 7f7863cc9e..cdc06e77a6 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -691,6 +691,12 @@ struct BlockDriver { QEMUIOVector *qiov, int64_t pos); + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, + int64_t offset, unsigned int *nr_zones, + BlockZoneDescriptor *zones); + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op, + int64_t offset, int64_t len); + /* removable device specific */ bool (*bdrv_is_inserted)(BlockDriverState *bs); void 
(*bdrv_eject)(BlockDriverState *bs, bool eject_flag); @@ -828,6 +834,24 @@ typedef struct BlockLimits { /* device zone model */ BlockZoneModel zoned; + + /* zone size expressed in bytes */ + uint32_t zone_size; + + /* total number of zones */ + unsigned int nr_zones; + + /* maximum sectors of a zone append write operation */ + int64_t max_append_sectors; + + /* maximum number of open zones */ + int64_t max_open_zones; + + /* maximum number of active zones */ + int64_t max_active_zones; + + /* device capacity expressed in bytes */ + int64_t capacity; } BlockLimits; typedef struct BdrvOpBlocker BdrvOpBlocker; diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index 21fc10c4c9..877b2240b3 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -29,6 +29,8 @@ #define QEMU_AIO_WRITE_ZEROES 0x0020 #define QEMU_AIO_COPY_RANGE 0x0040 #define QEMU_AIO_TRUNCATE 0x0080 +#define QEMU_AIO_ZONE_REPORT 0x0100 +#define QEMU_AIO_ZONE_MGMT 0x0200 #define QEMU_AIO_TYPE_MASK \ (QEMU_AIO_READ | \ QEMU_AIO_WRITE | \ @@ -37,7 +39,9 @@ QEMU_AIO_DISCARD | \ QEMU_AIO_WRITE_ZEROES | \ QEMU_AIO_COPY_RANGE | \ - QEMU_AIO_TRUNCATE) + QEMU_AIO_TRUNCATE | \ + QEMU_AIO_ZONE_REPORT | \ + QEMU_AIO_ZONE_MGMT) /* AIO flags */ #define QEMU_AIO_MISALIGNED 0x1000 diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h index 50f5aa2e07..1b5fc7db6b 100644 --- a/include/sysemu/block-backend-io.h +++ b/include/sysemu/block-backend-io.h @@ -45,6 +45,13 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, BlockCompletionFunc *cb, void *opaque); BlockAIOCB *blk_aio_flush(BlockBackend *blk, BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len, + BlockCompletionFunc *cb, void *opaque); BlockAIOCB 
*blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, BlockCompletionFunc *cb, void *opaque); void blk_aio_cancel_async(BlockAIOCB *acb); @@ -156,6 +163,17 @@ int generated_co_wrapper blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, int64_t bytes, BdrvRequestFlags flags); +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones); +int generated_co_wrapper blk_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones); +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len); +int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len); + int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes); int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, diff --git a/meson.build b/meson.build index 44c1f92697..0aa99b64a0 100644 --- a/meson.build +++ b/meson.build @@ -1928,6 +1928,7 @@ config_host_data.set('CONFIG_REPLICATION', get_option('replication').allowed()) # has_header config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h')) config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h')) +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h')) config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h')) config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h')) config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h')) @@ -2021,6 +2022,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM', cc.has_member('struct stat', 'st_atim', prefix: '#include <sys/stat.h>')) +config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY', + cc.has_member('struct blk_zone', 'capacity', + prefix: '#include <linux/blkzoned.h>')) # 
has_type config_host_data.set('CONFIG_IOVEC', diff --git a/qapi/block-core.json b/qapi/block-core.json index 882b266532..05a3b44731 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2942,6 +2942,7 @@ # @compress: Since 5.0 # @copy-before-write: Since 6.2 # @snapshot-access: Since 7.0 +# @zoned_host_device: Since 7.2 # # Since: 2.9 ## @@ -2955,7 +2956,8 @@ 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', { 'name': 'replication', 'if': 'CONFIG_REPLICATION' }, - 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] } + 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', + { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] } ## # @BlockdevOptionsFile: @@ -4329,7 +4331,9 @@ 'vhdx': 'BlockdevOptionsGenericFormat', 'vmdk': 'BlockdevOptionsGenericCOWFormat', 'vpc': 'BlockdevOptionsGenericFormat', - 'vvfat': 'BlockdevOptionsVVFAT' + 'vvfat': 'BlockdevOptionsVVFAT', + 'zoned_host_device': { 'type': 'BlockdevOptionsFile', + 'if': 'CONFIG_BLKZONED' } } } ## diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 952dc940f1..c1b28ea108 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -1712,6 +1712,150 @@ static const cmdinfo_t flush_cmd = { .oneline = "flush all in-core file state to disk", }; +static inline int64_t tosector(int64_t bytes) +{ + return bytes >> BDRV_SECTOR_BITS; +} + +static int zone_report_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset; + unsigned int nr_zones; + + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + nr_zones = cvtnum(argv[optind]); + + g_autofree BlockZoneDescriptor *zones = NULL; + zones = g_new(BlockZoneDescriptor, nr_zones); + ret = blk_zone_report(blk, offset, &nr_zones, zones); + if (ret < 0) { + printf("zone report failed: %s\n", strerror(-ret)); + } else { + for (int i = 0; i < nr_zones; ++i) { + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", " + "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", " + 
"zcond:%u, [type: %u]\n", + tosector(zones[i].start), tosector(zones[i].length), + tosector(zones[i].cap), tosector(zones[i].wp), + zones[i].cond, zones[i].type); + } + } + return ret; +} + +static const cmdinfo_t zone_report_cmd = { + .name = "zone_report", + .altname = "zrp", + .cfunc = zone_report_f, + .argmin = 2, + .argmax = 2, + .args = "offset number", + .oneline = "report zone information", +}; + +static int zone_open_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset, len; + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + len = cvtnum(argv[optind]); + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len); + if (ret < 0) { + printf("zone open failed: %s\n", strerror(-ret)); + } + return ret; +} + +static const cmdinfo_t zone_open_cmd = { + .name = "zone_open", + .altname = "zo", + .cfunc = zone_open_f, + .argmin = 2, + .argmax = 2, + .args = "offset len", + .oneline = "explicit open a range of zones in zone block device", +}; + +static int zone_close_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset, len; + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + len = cvtnum(argv[optind]); + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len); + if (ret < 0) { + printf("zone close failed: %s\n", strerror(-ret)); + } + return ret; +} + +static const cmdinfo_t zone_close_cmd = { + .name = "zone_close", + .altname = "zc", + .cfunc = zone_close_f, + .argmin = 2, + .argmax = 2, + .args = "offset len", + .oneline = "close a range of zones in zone block device", +}; + +static int zone_finish_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset, len; + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + len = cvtnum(argv[optind]); + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len); + if (ret < 0) { + printf("zone finish failed: %s\n", strerror(-ret)); + } + return ret; +} + +static const cmdinfo_t zone_finish_cmd = { + .name = "zone_finish", + .altname = "zf", + .cfunc = 
zone_finish_f, + .argmin = 2, + .argmax = 2, + .args = "offset len", + .oneline = "finish a range of zones in zone block device", +}; + +static int zone_reset_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset, len; + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + len = cvtnum(argv[optind]); + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len); + if (ret < 0) { + printf("zone reset failed: %s\n", strerror(-ret)); + } + return ret; +} + +static const cmdinfo_t zone_reset_cmd = { + .name = "zone_reset", + .altname = "zrs", + .cfunc = zone_reset_f, + .argmin = 2, + .argmax = 2, + .args = "offset len", + .oneline = "reset a zone write pointer in zone block device", +}; + static int truncate_f(BlockBackend *blk, int argc, char **argv); static const cmdinfo_t truncate_cmd = { .name = "truncate", @@ -2504,6 +2648,11 @@ static void __attribute((constructor)) init_qemuio_commands(void) qemuio_add_command(&aio_write_cmd); qemuio_add_command(&aio_flush_cmd); qemuio_add_command(&flush_cmd); + qemuio_add_command(&zone_report_cmd); + qemuio_add_command(&zone_open_cmd); + qemuio_add_command(&zone_close_cmd); + qemuio_add_command(&zone_finish_cmd); + qemuio_add_command(&zone_reset_cmd); qemuio_add_command(&truncate_cmd); qemuio_add_command(&length_cmd); qemuio_add_command(&info_cmd);