Message ID | 20230129102850.84731-4-faithilikerun@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add support for zoned device | expand |
On Sun, 29 Jan 2023 at 05:30, Sam Li <faithilikerun@gmail.com> wrote: > > Add zoned device option to host_device BlockDriver. It will be presented only > for zoned host block devices. By adding zone management operations to the > host_block_device BlockDriver, users can use the new block layer APIs > including Report Zone and four zone management operations > (open, close, finish, reset, reset_all). > > Qemu-io uses the new APIs to perform zoned storage commands of the device: > zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), > zone_finish(zf). > > For example, to test zone_report, use following command: > $ ./build/qemu-io --image-opts -n driver=host_device, filename=/dev/nullb0 > -c "zrp offset nr_zones" > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > Reviewed-by: Hannes Reinecke <hare@suse.de> > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> > --- > block/block-backend.c | 147 ++++++++++++++ > block/file-posix.c | 323 ++++++++++++++++++++++++++++++ > block/io.c | 41 ++++ > include/block/block-io.h | 7 + > include/block/block_int-common.h | 21 ++ > include/block/raw-aio.h | 6 +- > include/sysemu/block-backend-io.h | 18 ++ > meson.build | 4 + > qemu-io-cmds.c | 149 ++++++++++++++ > 9 files changed, 715 insertions(+), 1 deletion(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index ba7bf1d6bc..a4847b9131 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1451,6 +1451,15 @@ typedef struct BlkRwCo { > void *iobuf; > int ret; > BdrvRequestFlags flags; > + union { > + struct { > + unsigned int *nr_zones; > + BlockZoneDescriptor *zones; > + } zone_report; > + struct { > + unsigned long op; > + } zone_mgmt; > + }; > } BlkRwCo; > > int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) > @@ -1795,6 +1804,144 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) > return ret; > } > > +static void coroutine_fn blk_aio_zone_report_entry(void *opaque) > +{ > + BlkAioEmAIOCB *acb = opaque; > + BlkRwCo 
*rwco = &acb->rwco; > + > + rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset, > + rwco->zone_report.nr_zones, > + rwco->zone_report.zones); > + blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones, > + BlockCompletionFunc *cb, void *opaque) > +{ > + BlkAioEmAIOCB *acb; > + Coroutine *co; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > + acb->rwco = (BlkRwCo) { > + .blk = blk, > + .offset = offset, > + .ret = NOT_DONE, > + .zone_report = { > + .zones = zones, > + .nr_zones = nr_zones, > + }, > + }; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb); > + bdrv_coroutine_enter(blk_bs(blk), co); > + > + acb->has_returned = true; > + if (acb->rwco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque) > +{ > + BlkAioEmAIOCB *acb = opaque; > + BlkRwCo *rwco = &acb->rwco; > + > + rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op, > + rwco->offset, acb->bytes); > + blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len, > + BlockCompletionFunc *cb, void *opaque) { > + BlkAioEmAIOCB *acb; > + Coroutine *co; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > + acb->rwco = (BlkRwCo) { > + .blk = blk, > + .offset = offset, > + .ret = NOT_DONE, > + .zone_mgmt = { > + .op = op, > + }, > + }; > + acb->bytes = len; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb); > + bdrv_coroutine_enter(blk_bs(blk), co); > + > + acb->has_returned = true; > + if (acb->rwco.ret != NOT_DONE) { > + 
replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > +/* > + * Send a zone_report command. > + * offset is a byte offset from the start of the device. No alignment > + * required for offset. > + * nr_zones represents IN maximum and OUT actual. > + */ > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones) > +{ > + int ret; > + IO_CODE(); > + > + blk_inc_in_flight(blk); /* increase before waiting */ > + blk_wait_while_drained(blk); > + if (!blk_is_available(blk)) { > + blk_dec_in_flight(blk); > + return -ENOMEDIUM; > + } > + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); > + blk_dec_in_flight(blk); > + return ret; > +} > + > +/* > + * Send a zone_management command. > + * op is the zone operation; > + * offset is the byte offset from the start of the zoned device; > + * len is the maximum number of bytes the command should operate on. It > + * should be aligned with the device zone size. 
> + */ > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len) > +{ > + int ret; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + blk_wait_while_drained(blk); > + > + ret = blk_check_byte_request(blk, offset, len); > + if (ret < 0) { > + blk_dec_in_flight(blk); > + return ret; > + } > + > + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); > + blk_dec_in_flight(blk); > + return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/file-posix.c b/block/file-posix.c > index 43c59c6d56..b6d88db208 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -68,6 +68,9 @@ > #include <sys/param.h> > #include <sys/syscall.h> > #include <sys/vfs.h> > +#if defined(CONFIG_BLKZONED) > +#include <linux/blkzoned.h> > +#endif > #include <linux/cdrom.h> > #include <linux/fd.h> > #include <linux/fs.h> > @@ -216,6 +219,13 @@ typedef struct RawPosixAIOData { > PreallocMode prealloc; > Error **errp; > } truncate; > + struct { > + unsigned int *nr_zones; > + BlockZoneDescriptor *zones; > + } zone_report; > + struct { > + unsigned long op; > + } zone_mgmt; > }; > } RawPosixAIOData; > > @@ -1351,6 +1361,50 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) > zoned = BLK_Z_NONE; > } > bs->bl.zoned = zoned; > + if (zoned != BLK_Z_NONE) { > + /* > + * The zoned device must at least have zone size and nr_zones fields. 
> + */ > + ret = get_sysfs_long_val(&st, "chunk_sectors"); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Unable to read chunk_sectors " > + "sysfs attribute"); > + goto out; > + } else if (!ret) { > + error_setg(errp, "Read 0 from chunk_sectors sysfs attribute"); > + goto out; > + } > + bs->bl.zone_size = ret << BDRV_SECTOR_BITS; > + > + ret = get_sysfs_long_val(&st, "nr_zones"); > + if (ret < 0) { > + error_setg_errno(errp, -ret, "Unable to read nr_zones " > + "sysfs attribute"); > + goto out; > + } else if (!ret) { > + error_setg(errp, "Read 0 from nr_zones sysfs attribute"); > + goto out; > + } > + bs->bl.nr_zones = ret; > + > + ret = get_sysfs_long_val(&st, "zone_append_max_bytes"); > + if (ret > 0) { > + bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS; > + } > + > + ret = get_sysfs_long_val(&st, "max_open_zones"); > + if (ret >= 0) { > + bs->bl.max_open_zones = ret; > + } > + > + ret = get_sysfs_long_val(&st, "max_active_zones"); > + if (ret >= 0) { > + bs->bl.max_active_zones = ret; > + } > + return; > + } > +out: > + bs->bl.zoned = BLK_Z_NONE; > } > > static int check_for_dasd(int fd) > @@ -1364,6 +1418,23 @@ static int check_for_dasd(int fd) > #endif > } > > +#if defined(CONFIG_BLKZONED) > +/** > + * Zoned storage needs to be virtualized with the correct physical block size > + * and logical block size. > + */ > +static int hdev_probe_zoned_blocksizes(BlockDriverState *bs, BlockSizes *bsz) The #ifdef approach in this patch won't work because the same BlockDriver now handles both zoned and non-zoned devices at runtime. This function needs to be unified with hdev_probe_blocksizes(): if (check_for_dasd(s->fd) < 0 || bs->bl.zoned == BLK_Z_NONE) { return -ENOTSUP; } ...probe block sizes... 
> +{ > + BDRVRawState *s = bs->opaque; > + int ret; > + > + ret = probe_logical_blocksize(s->fd, &bsz->log); > + if (ret < 0) { > + return ret; > + } > + return probe_physical_blocksize(s->fd, &bsz->phys); > +} > +#else > /** > * Try to get @bs's logical and physical block size. > * On success, store them in @bsz and return zero. > @@ -1384,6 +1455,7 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) > } > return probe_physical_blocksize(s->fd, &bsz->phys); > } > +#endif > > /** > * Try to get @bs's geometry: cyls, heads, sectors. > @@ -1844,6 +1916,146 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, > } > #endif > > +/* > + * parse_zone - Fill a zone descriptor > + */ > +#if defined(CONFIG_BLKZONED) > +static inline int parse_zone(struct BlockZoneDescriptor *zone, > + const struct blk_zone *blkz) { > + zone->start = blkz->start << BDRV_SECTOR_BITS; > + zone->length = blkz->len << BDRV_SECTOR_BITS; > + zone->wp = blkz->wp << BDRV_SECTOR_BITS; > + > +#ifdef HAVE_BLK_ZONE_REP_CAPACITY > + zone->cap = blkz->capacity << BDRV_SECTOR_BITS; > +#else > + zone->cap = blkz->len << BDRV_SECTOR_BITS; > +#endif > + > + switch (blkz->type) { > + case BLK_ZONE_TYPE_SEQWRITE_REQ: > + zone->type = BLK_ZT_SWR; > + break; > + case BLK_ZONE_TYPE_SEQWRITE_PREF: > + zone->type = BLK_ZT_SWP; > + break; > + case BLK_ZONE_TYPE_CONVENTIONAL: > + zone->type = BLK_ZT_CONV; > + break; > + default: > + error_report("Unsupported zone type: 0x%x", blkz->type); > + return -ENOTSUP; > + } > + > + switch (blkz->cond) { > + case BLK_ZONE_COND_NOT_WP: > + zone->state = BLK_ZS_NOT_WP; > + break; > + case BLK_ZONE_COND_EMPTY: > + zone->state = BLK_ZS_EMPTY; > + break; > + case BLK_ZONE_COND_IMP_OPEN: > + zone->state = BLK_ZS_IOPEN; > + break; > + case BLK_ZONE_COND_EXP_OPEN: > + zone->state = BLK_ZS_EOPEN; > + break; > + case BLK_ZONE_COND_CLOSED: > + zone->state = BLK_ZS_CLOSED; > + break; > + case BLK_ZONE_COND_READONLY: > + zone->state = BLK_ZS_RDONLY; > 
+ break; > + case BLK_ZONE_COND_FULL: > + zone->state = BLK_ZS_FULL; > + break; > + case BLK_ZONE_COND_OFFLINE: > + zone->state = BLK_ZS_OFFLINE; > + break; > + default: > + error_report("Unsupported zone state: 0x%x", blkz->cond); > + return -ENOTSUP; > + } > + return 0; > +} > +#endif > + > +#if defined(CONFIG_BLKZONED) > +static int handle_aiocb_zone_report(void *opaque) > +{ > + RawPosixAIOData *aiocb = opaque; > + int fd = aiocb->aio_fildes; > + unsigned int *nr_zones = aiocb->zone_report.nr_zones; > + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > + /* zoned block devices use 512-byte sectors */ > + uint64_t sector = aiocb->aio_offset / 512; > + > + struct blk_zone *blkz; > + size_t rep_size; > + unsigned int nrz; > + int ret, n = 0, i = 0; > + > + nrz = *nr_zones; > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > + g_autofree struct blk_zone_report *rep = NULL; > + rep = g_malloc(rep_size); > + > + blkz = (struct blk_zone *)(rep + 1); > + while (n < nrz) { > + memset(rep, 0, rep_size); > + rep->sector = sector; > + rep->nr_zones = nrz - n; > + > + do { > + ret = ioctl(fd, BLKREPORTZONE, rep); > + } while (ret != 0 && errno == EINTR); > + if (ret != 0) { > + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", > + fd, sector, errno); > + return -errno; > + } > + > + if (!rep->nr_zones) { > + break; > + } > + > + for (i = 0; i < rep->nr_zones; i++, n++) { > + ret = parse_zone(&zones[n], &blkz[i]); > + if (ret != 0) { > + return ret; > + } > + > + /* The next report should start after the last zone reported */ > + sector = blkz[i].start + blkz[i].len; > + } > + } > + > + *nr_zones = n; > + return 0; > +} > +#endif > + > +#if defined(CONFIG_BLKZONED) > +static int handle_aiocb_zone_mgmt(void *opaque) > +{ > + RawPosixAIOData *aiocb = opaque; > + int fd = aiocb->aio_fildes; > + uint64_t sector = aiocb->aio_offset / 512; > + int64_t nr_sectors = aiocb->aio_nbytes / 512; > + struct blk_zone_range range; > + int 
ret; > + > + /* Execute the operation */ > + range.sector = sector; > + range.nr_sectors = nr_sectors; > + do { > + ret = ioctl(fd, aiocb->zone_mgmt.op, &range); > + } while (ret != 0 && errno == EINTR); > + > + return ret; > +} > +#endif > + > static int handle_aiocb_copy_range(void *opaque) > { > RawPosixAIOData *aiocb = opaque; > @@ -3035,6 +3247,107 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) > } > } > > +/* > + * zone report - Get a zone block device's information in the form > + * of an array of zone descriptors. > + * zones is an array of zone descriptors to hold zone information on reply; > + * offset can be any byte within the entire size of the device; > + * nr_zones is the maxium number of sectors the command should operate on. > + */ > +#if defined(CONFIG_BLKZONED) > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones) { > + BDRVRawState *s = bs->opaque; > + RawPosixAIOData acb; > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > + .aio_type = QEMU_AIO_ZONE_REPORT, > + .aio_offset = offset, > + .zone_report = { > + .nr_zones = nr_zones, > + .zones = zones, > + }, > + }; > + > + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); > +} > +#endif > + > +/* > + * zone management operations - Execute an operation on a zone > + */ > +#if defined(CONFIG_BLKZONED) > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len) { > + BDRVRawState *s = bs->opaque; > + RawPosixAIOData acb; > + int64_t zone_size, zone_size_mask; > + const char *op_name; > + unsigned long zo; > + int ret; > + int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS; > + > + zone_size = bs->bl.zone_size; > + zone_size_mask = zone_size - 1; > + if (offset & zone_size_mask) { > + error_report("sector offset %" PRId64 " is not aligned to zone size " > + "%" PRId64 "", offset / 
512, zone_size / 512); > + return -EINVAL; > + } > + > + if (((offset + len) < capacity && len & zone_size_mask) || > + offset + len > capacity) { > + error_report("number of sectors %" PRId64 " is not aligned to zone size" > + " %" PRId64 "", len / 512, zone_size / 512); > + return -EINVAL; > + } > + > + switch (op) { > + case BLK_ZO_OPEN: > + op_name = "BLKOPENZONE"; > + zo = BLKOPENZONE; > + break; > + case BLK_ZO_CLOSE: > + op_name = "BLKCLOSEZONE"; > + zo = BLKCLOSEZONE; > + break; > + case BLK_ZO_FINISH: > + op_name = "BLKFINISHZONE"; > + zo = BLKFINISHZONE; > + break; > + case BLK_ZO_RESET: > + op_name = "BLKRESETZONE"; > + zo = BLKRESETZONE; > + break; > + default: > + error_report("Unsupported zone op: 0x%x", op); > + return -ENOTSUP; > + } > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > + .aio_type = QEMU_AIO_ZONE_MGMT, > + .aio_offset = offset, > + .aio_nbytes = len, > + .zone_mgmt = { > + .op = zo, > + }, > + }; > + > + ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); > + if (ret != 0) { > + ret = -errno; > + error_report("ioctl %s failed %d", op_name, ret); > + } > + > + return ret; > +} > +#endif > + > static coroutine_fn int > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > bool blkdev) > @@ -3756,13 +4069,23 @@ static BlockDriver bdrv_host_device = { > .bdrv_check_perm = raw_check_perm, > .bdrv_set_perm = raw_set_perm, > .bdrv_abort_perm_update = raw_abort_perm_update, > +#ifndef CONFIG_BLKZONED > .bdrv_probe_blocksizes = hdev_probe_blocksizes, > +#endif > .bdrv_probe_geometry = hdev_probe_geometry, > > /* generic scsi device */ > #ifdef __linux__ > .bdrv_co_ioctl = hdev_co_ioctl, > #endif > + > + /* zoned device */ > +#if defined(CONFIG_BLKZONED) > + /* zone management operations */ > + .bdrv_probe_blocksizes = hdev_probe_zoned_blocksizes, > + .bdrv_co_zone_report = raw_co_zone_report, > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > +#endif > }; > > #if defined(__linux__) || 
defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > diff --git a/block/io.c b/block/io.c > index a09a19f7a7..1586e42ab9 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -3099,6 +3099,47 @@ out: > return co.ret; > } > > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + IO_CODE(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_zone_report) { Now that zoned device support is determined at runtime instead of at compile-time, checking for drv->bdrv_co_zone_report isn't enough. The BlockDriverState might have bs->bl.zoned == BLK_Z_NONE. Please add || bs->bl.zoned == BLK_Z_NONE to this if statement to prevent calls when the device is not zoned. The same applies to bdrv_co_zone_mgmt(). > + co.ret = -ENOTSUP; > + goto out; > + } > + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + IO_CODE(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_zone_mgmt) { > + co.ret = -ENOTSUP; > + goto out; > + } > + co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > void *qemu_blockalign(BlockDriverState *bs, size_t size) > { > IO_CODE(); > diff --git a/include/block/block-io.h b/include/block/block-io.h > index 3398351596..10ff212036 100644 > --- a/include/block/block-io.h > +++ b/include/block/block-io.h > @@ -98,6 +98,13 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs); > > int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, > int64_t bytes); > +/* Report zone 
information of zone block device. */ > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len); > + > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > int bdrv_block_status(BlockDriverState *bs, int64_t offset, > int64_t bytes, int64_t *pnum, int64_t *map, > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h > index 57f0612f5e..565228d8dd 100644 > --- a/include/block/block_int-common.h > +++ b/include/block/block_int-common.h > @@ -703,6 +703,12 @@ struct BlockDriver { > int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_load_vmstate)( > BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); > > + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, > + int64_t offset, unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op, > + int64_t offset, int64_t len); > + > /* removable device specific */ > bool (*bdrv_is_inserted)(BlockDriverState *bs); > void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); > @@ -839,6 +845,21 @@ typedef struct BlockLimits { > > /* device zone model */ > BlockZoneModel zoned; > + > + /* zone size expressed in bytes */ > + uint32_t zone_size; > + > + /* total number of zones */ > + uint32_t nr_zones; > + > + /* maximum sectors of a zone append write operation */ > + int64_t max_append_sectors; > + > + /* maximum number of open zones */ > + int64_t max_open_zones; > + > + /* maximum number of active zones */ > + int64_t max_active_zones; > } BlockLimits; > > typedef struct BdrvOpBlocker BdrvOpBlocker; > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > index f8cda9df91..eda6a7a253 100644 > --- a/include/block/raw-aio.h > +++ b/include/block/raw-aio.h > @@ -28,6 +28,8 @@ > #define QEMU_AIO_WRITE_ZEROES 0x0020 > 
#define QEMU_AIO_COPY_RANGE 0x0040 > #define QEMU_AIO_TRUNCATE 0x0080 > +#define QEMU_AIO_ZONE_REPORT 0x0100 > +#define QEMU_AIO_ZONE_MGMT 0x0200 > #define QEMU_AIO_TYPE_MASK \ > (QEMU_AIO_READ | \ > QEMU_AIO_WRITE | \ > @@ -36,7 +38,9 @@ > QEMU_AIO_DISCARD | \ > QEMU_AIO_WRITE_ZEROES | \ > QEMU_AIO_COPY_RANGE | \ > - QEMU_AIO_TRUNCATE) > + QEMU_AIO_TRUNCATE | \ > + QEMU_AIO_ZONE_REPORT | \ > + QEMU_AIO_ZONE_MGMT) > > /* AIO flags */ > #define QEMU_AIO_MISALIGNED 0x1000 > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h > index 031a27ba10..dc8a4368f0 100644 > --- a/include/sysemu/block-backend-io.h > +++ b/include/sysemu/block-backend-io.h > @@ -46,6 +46,13 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, > BlockCompletionFunc *cb, void *opaque); > BlockAIOCB *blk_aio_flush(BlockBackend *blk, > BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones, > + BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len, > + BlockCompletionFunc *cb, void *opaque); > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, > BlockCompletionFunc *cb, void *opaque); > void blk_aio_cancel_async(BlockAIOCB *acb); > @@ -166,6 +173,17 @@ int co_wrapper_mixed blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, > int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, > int64_t bytes, BdrvRequestFlags flags); > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > +int co_wrapper_mixed blk_zone_report(BlockBackend *blk, int64_t offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones); > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len); > +int 
co_wrapper_mixed blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > + int64_t offset, int64_t len); > + > int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset, > int64_t bytes); > int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, > diff --git a/meson.build b/meson.build > index 6d3b665629..a267f74536 100644 > --- a/meson.build > +++ b/meson.build > @@ -1962,6 +1962,7 @@ config_host_data.set('CONFIG_REPLICATION', get_option('replication').allowed()) > # has_header > config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h')) > config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h')) > +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h')) > config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h')) > config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h')) > config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h')) > @@ -2056,6 +2057,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', > config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM', > cc.has_member('struct stat', 'st_atim', > prefix: '#include <sys/stat.h>')) > +config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY', > + cc.has_member('struct blk_zone', 'capacity', > + prefix: '#include <linux/blkzoned.h>')) > > # has_type > config_host_data.set('CONFIG_IOVEC', > diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c > index 952dc940f1..3a3bad77c3 100644 > --- a/qemu-io-cmds.c > +++ b/qemu-io-cmds.c > @@ -1712,6 +1712,150 @@ static const cmdinfo_t flush_cmd = { > .oneline = "flush all in-core file state to disk", > }; > > +static inline int64_t tosector(int64_t bytes) > +{ > + return bytes >> BDRV_SECTOR_BITS; > +} > + > +static int zone_report_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset; > + unsigned int nr_zones; > + > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + nr_zones = cvtnum(argv[optind]); > + > + g_autofree BlockZoneDescriptor *zones = 
NULL; > + zones = g_new(BlockZoneDescriptor, nr_zones); > + ret = blk_zone_report(blk, offset, &nr_zones, zones); > + if (ret < 0) { > + printf("zone report failed: %s\n", strerror(-ret)); > + } else { > + for (int i = 0; i < nr_zones; ++i) { > + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", " > + "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", " > + "zcond:%u, [type: %u]\n", > + tosector(zones[i].start), tosector(zones[i].length), > + tosector(zones[i].cap), tosector(zones[i].wp), > + zones[i].state, zones[i].type); > + } > + } > + return ret; > +} > + > +static const cmdinfo_t zone_report_cmd = { > + .name = "zone_report", > + .altname = "zrp", > + .cfunc = zone_report_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset number", > + .oneline = "report zone information", > +}; > + > +static int zone_open_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len); > + if (ret < 0) { > + printf("zone open failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_open_cmd = { > + .name = "zone_open", > + .altname = "zo", > + .cfunc = zone_open_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "explicit open a range of zones in zone block device", > +}; > + > +static int zone_close_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len); > + if (ret < 0) { > + printf("zone close failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_close_cmd = { > + .name = "zone_close", > + .altname = "zc", > + .cfunc = zone_close_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "close a range of 
zones in zone block device", > +}; > + > +static int zone_finish_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len); > + if (ret < 0) { > + printf("zone finish failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_finish_cmd = { > + .name = "zone_finish", > + .altname = "zf", > + .cfunc = zone_finish_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "finish a range of zones in zone block device", > +}; > + > +static int zone_reset_f(BlockBackend *blk, int argc, char **argv) > +{ > + int ret; > + int64_t offset, len; > + ++optind; > + offset = cvtnum(argv[optind]); > + ++optind; > + len = cvtnum(argv[optind]); > + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len); > + if (ret < 0) { > + printf("zone reset failed: %s\n", strerror(-ret)); > + } > + return ret; > +} > + > +static const cmdinfo_t zone_reset_cmd = { > + .name = "zone_reset", > + .altname = "zrs", > + .cfunc = zone_reset_f, > + .argmin = 2, > + .argmax = 2, > + .args = "offset len", > + .oneline = "reset a zone write pointer in zone block device", > +}; > + > static int truncate_f(BlockBackend *blk, int argc, char **argv); > static const cmdinfo_t truncate_cmd = { > .name = "truncate", > @@ -2504,6 +2648,11 @@ static void __attribute((constructor)) init_qemuio_commands(void) > qemuio_add_command(&aio_write_cmd); > qemuio_add_command(&aio_flush_cmd); > qemuio_add_command(&flush_cmd); > + qemuio_add_command(&zone_report_cmd); > + qemuio_add_command(&zone_open_cmd); > + qemuio_add_command(&zone_close_cmd); > + qemuio_add_command(&zone_finish_cmd); > + qemuio_add_command(&zone_reset_cmd); > qemuio_add_command(&truncate_cmd); > qemuio_add_command(&length_cmd); > qemuio_add_command(&info_cmd); > -- > 2.38.1 > >
Stefan Hajnoczi <stefanha@gmail.com> 于2023年2月6日周一 20:04写道: > > On Sun, 29 Jan 2023 at 05:30, Sam Li <faithilikerun@gmail.com> wrote: > > > > Add zoned device option to host_device BlockDriver. It will be presented only > > for zoned host block devices. By adding zone management operations to the > > host_block_device BlockDriver, users can use the new block layer APIs > > including Report Zone and four zone management operations > > (open, close, finish, reset, reset_all). > > > > Qemu-io uses the new APIs to perform zoned storage commands of the device: > > zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), > > zone_finish(zf). > > > > For example, to test zone_report, use following command: > > $ ./build/qemu-io --image-opts -n driver=host_device, filename=/dev/nullb0 > > -c "zrp offset nr_zones" > > > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > > Reviewed-by: Hannes Reinecke <hare@suse.de> > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> > > --- > > block/block-backend.c | 147 ++++++++++++++ > > block/file-posix.c | 323 ++++++++++++++++++++++++++++++ > > block/io.c | 41 ++++ > > include/block/block-io.h | 7 + > > include/block/block_int-common.h | 21 ++ > > include/block/raw-aio.h | 6 +- > > include/sysemu/block-backend-io.h | 18 ++ > > meson.build | 4 + > > qemu-io-cmds.c | 149 ++++++++++++++ > > 9 files changed, 715 insertions(+), 1 deletion(-) > > > > diff --git a/block/block-backend.c b/block/block-backend.c > > index ba7bf1d6bc..a4847b9131 100644 > > --- a/block/block-backend.c > > +++ b/block/block-backend.c > > @@ -1451,6 +1451,15 @@ typedef struct BlkRwCo { > > void *iobuf; > > int ret; > > BdrvRequestFlags flags; > > + union { > > + struct { > > + unsigned int *nr_zones; > > + BlockZoneDescriptor *zones; > > + } zone_report; > > + struct { > > + unsigned long op; > > + } zone_mgmt; > > + }; > > } BlkRwCo; > > > > int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) > > @@ -1795,6 +1804,144 @@ int coroutine_fn 
blk_co_flush(BlockBackend *blk) > > return ret; > > } > > > > +static void coroutine_fn blk_aio_zone_report_entry(void *opaque) > > +{ > > + BlkAioEmAIOCB *acb = opaque; > > + BlkRwCo *rwco = &acb->rwco; > > + > > + rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset, > > + rwco->zone_report.nr_zones, > > + rwco->zone_report.zones); > > + blk_aio_complete(acb); > > +} > > + > > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones, > > + BlockCompletionFunc *cb, void *opaque) > > +{ > > + BlkAioEmAIOCB *acb; > > + Coroutine *co; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > > + acb->rwco = (BlkRwCo) { > > + .blk = blk, > > + .offset = offset, > > + .ret = NOT_DONE, > > + .zone_report = { > > + .zones = zones, > > + .nr_zones = nr_zones, > > + }, > > + }; > > + acb->has_returned = false; > > + > > + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb); > > + bdrv_coroutine_enter(blk_bs(blk), co); > > + > > + acb->has_returned = true; > > + if (acb->rwco.ret != NOT_DONE) { > > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > > + blk_aio_complete_bh, acb); > > + } > > + > > + return &acb->common; > > +} > > + > > +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque) > > +{ > > + BlkAioEmAIOCB *acb = opaque; > > + BlkRwCo *rwco = &acb->rwco; > > + > > + rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op, > > + rwco->offset, acb->bytes); > > + blk_aio_complete(acb); > > +} > > + > > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len, > > + BlockCompletionFunc *cb, void *opaque) { > > + BlkAioEmAIOCB *acb; > > + Coroutine *co; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > > + acb->rwco = (BlkRwCo) { > > + .blk = blk, > > + .offset = offset, > > + .ret = 
NOT_DONE, > > + .zone_mgmt = { > > + .op = op, > > + }, > > + }; > > + acb->bytes = len; > > + acb->has_returned = false; > > + > > + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb); > > + bdrv_coroutine_enter(blk_bs(blk), co); > > + > > + acb->has_returned = true; > > + if (acb->rwco.ret != NOT_DONE) { > > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > > + blk_aio_complete_bh, acb); > > + } > > + > > + return &acb->common; > > +} > > + > > +/* > > + * Send a zone_report command. > > + * offset is a byte offset from the start of the device. No alignment > > + * required for offset. > > + * nr_zones represents IN maximum and OUT actual. > > + */ > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones) > > +{ > > + int ret; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); /* increase before waiting */ > > + blk_wait_while_drained(blk); > > + if (!blk_is_available(blk)) { > > + blk_dec_in_flight(blk); > > + return -ENOMEDIUM; > > + } > > + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); > > + blk_dec_in_flight(blk); > > + return ret; > > +} > > + > > +/* > > + * Send a zone_management command. > > + * op is the zone operation; > > + * offset is the byte offset from the start of the zoned device; > > + * len is the maximum number of bytes the command should operate on. It > > + * should be aligned with the device zone size. 
> > + */ > > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len) > > +{ > > + int ret; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + blk_wait_while_drained(blk); > > + > > + ret = blk_check_byte_request(blk, offset, len); > > + if (ret < 0) { > > + blk_dec_in_flight(blk); > > + return ret; > > + } > > + > > + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); > > + blk_dec_in_flight(blk); > > + return ret; > > +} > > + > > void blk_drain(BlockBackend *blk) > > { > > BlockDriverState *bs = blk_bs(blk); > > diff --git a/block/file-posix.c b/block/file-posix.c > > index 43c59c6d56..b6d88db208 100644 > > --- a/block/file-posix.c > > +++ b/block/file-posix.c > > @@ -68,6 +68,9 @@ > > #include <sys/param.h> > > #include <sys/syscall.h> > > #include <sys/vfs.h> > > +#if defined(CONFIG_BLKZONED) > > +#include <linux/blkzoned.h> > > +#endif > > #include <linux/cdrom.h> > > #include <linux/fd.h> > > #include <linux/fs.h> > > @@ -216,6 +219,13 @@ typedef struct RawPosixAIOData { > > PreallocMode prealloc; > > Error **errp; > > } truncate; > > + struct { > > + unsigned int *nr_zones; > > + BlockZoneDescriptor *zones; > > + } zone_report; > > + struct { > > + unsigned long op; > > + } zone_mgmt; > > }; > > } RawPosixAIOData; > > > > @@ -1351,6 +1361,50 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) > > zoned = BLK_Z_NONE; > > } > > bs->bl.zoned = zoned; > > + if (zoned != BLK_Z_NONE) { > > + /* > > + * The zoned device must at least have zone size and nr_zones fields. 
> > + */ > > + ret = get_sysfs_long_val(&st, "chunk_sectors"); > > + if (ret < 0) { > > + error_setg_errno(errp, -ret, "Unable to read chunk_sectors " > > + "sysfs attribute"); > > + goto out; > > + } else if (!ret) { > > + error_setg(errp, "Read 0 from chunk_sectors sysfs attribute"); > > + goto out; > > + } > > + bs->bl.zone_size = ret << BDRV_SECTOR_BITS; > > + > > + ret = get_sysfs_long_val(&st, "nr_zones"); > > + if (ret < 0) { > > + error_setg_errno(errp, -ret, "Unable to read nr_zones " > > + "sysfs attribute"); > > + goto out; > > + } else if (!ret) { > > + error_setg(errp, "Read 0 from nr_zones sysfs attribute"); > > + goto out; > > + } > > + bs->bl.nr_zones = ret; > > + > > + ret = get_sysfs_long_val(&st, "zone_append_max_bytes"); > > + if (ret > 0) { > > + bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS; > > + } > > + > > + ret = get_sysfs_long_val(&st, "max_open_zones"); > > + if (ret >= 0) { > > + bs->bl.max_open_zones = ret; > > + } > > + > > + ret = get_sysfs_long_val(&st, "max_active_zones"); > > + if (ret >= 0) { > > + bs->bl.max_active_zones = ret; > > + } > > + return; > > + } > > +out: > > + bs->bl.zoned = BLK_Z_NONE; > > } > > > > static int check_for_dasd(int fd) > > @@ -1364,6 +1418,23 @@ static int check_for_dasd(int fd) > > #endif > > } > > > > +#if defined(CONFIG_BLKZONED) > > +/** > > + * Zoned storage needs to be virtualized with the correct physical block size > > + * and logical block size. > > + */ > > +static int hdev_probe_zoned_blocksizes(BlockDriverState *bs, BlockSizes *bsz) > > The #ifdef approach in this patch won't work because the same > BlockDriver now handles both zoned and non-zoned devices at runtime. > This function needs to be unified with hdev_probe_blocksizes(): > > if (check_for_dasd(s->fd) < 0 || bs->bl.zoned == BLK_Z_NONE) { > return -ENOTSUP; > } > > ...probe block sizes... 
> > > +{ > > + BDRVRawState *s = bs->opaque; > > + int ret; > > + > > + ret = probe_logical_blocksize(s->fd, &bsz->log); > > + if (ret < 0) { > > + return ret; > > + } > > + return probe_physical_blocksize(s->fd, &bsz->phys); > > +} > > +#else > > /** > > * Try to get @bs's logical and physical block size. > > * On success, store them in @bsz and return zero. > > @@ -1384,6 +1455,7 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) > > } > > return probe_physical_blocksize(s->fd, &bsz->phys); > > } > > +#endif > > > > /** > > * Try to get @bs's geometry: cyls, heads, sectors. > > @@ -1844,6 +1916,146 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, > > } > > #endif > > > > +/* > > + * parse_zone - Fill a zone descriptor > > + */ > > +#if defined(CONFIG_BLKZONED) > > +static inline int parse_zone(struct BlockZoneDescriptor *zone, > > + const struct blk_zone *blkz) { > > + zone->start = blkz->start << BDRV_SECTOR_BITS; > > + zone->length = blkz->len << BDRV_SECTOR_BITS; > > + zone->wp = blkz->wp << BDRV_SECTOR_BITS; > > + > > +#ifdef HAVE_BLK_ZONE_REP_CAPACITY > > + zone->cap = blkz->capacity << BDRV_SECTOR_BITS; > > +#else > > + zone->cap = blkz->len << BDRV_SECTOR_BITS; > > +#endif > > + > > + switch (blkz->type) { > > + case BLK_ZONE_TYPE_SEQWRITE_REQ: > > + zone->type = BLK_ZT_SWR; > > + break; > > + case BLK_ZONE_TYPE_SEQWRITE_PREF: > > + zone->type = BLK_ZT_SWP; > > + break; > > + case BLK_ZONE_TYPE_CONVENTIONAL: > > + zone->type = BLK_ZT_CONV; > > + break; > > + default: > > + error_report("Unsupported zone type: 0x%x", blkz->type); > > + return -ENOTSUP; > > + } > > + > > + switch (blkz->cond) { > > + case BLK_ZONE_COND_NOT_WP: > > + zone->state = BLK_ZS_NOT_WP; > > + break; > > + case BLK_ZONE_COND_EMPTY: > > + zone->state = BLK_ZS_EMPTY; > > + break; > > + case BLK_ZONE_COND_IMP_OPEN: > > + zone->state = BLK_ZS_IOPEN; > > + break; > > + case BLK_ZONE_COND_EXP_OPEN: > > + zone->state = BLK_ZS_EOPEN; > > + break; > 
> + case BLK_ZONE_COND_CLOSED: > > + zone->state = BLK_ZS_CLOSED; > > + break; > > + case BLK_ZONE_COND_READONLY: > > + zone->state = BLK_ZS_RDONLY; > > + break; > > + case BLK_ZONE_COND_FULL: > > + zone->state = BLK_ZS_FULL; > > + break; > > + case BLK_ZONE_COND_OFFLINE: > > + zone->state = BLK_ZS_OFFLINE; > > + break; > > + default: > > + error_report("Unsupported zone state: 0x%x", blkz->cond); > > + return -ENOTSUP; > > + } > > + return 0; > > +} > > +#endif > > + > > +#if defined(CONFIG_BLKZONED) > > +static int handle_aiocb_zone_report(void *opaque) > > +{ > > + RawPosixAIOData *aiocb = opaque; > > + int fd = aiocb->aio_fildes; > > + unsigned int *nr_zones = aiocb->zone_report.nr_zones; > > + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > > + /* zoned block devices use 512-byte sectors */ > > + uint64_t sector = aiocb->aio_offset / 512; > > + > > + struct blk_zone *blkz; > > + size_t rep_size; > > + unsigned int nrz; > > + int ret, n = 0, i = 0; > > + > > + nrz = *nr_zones; > > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > > + g_autofree struct blk_zone_report *rep = NULL; > > + rep = g_malloc(rep_size); > > + > > + blkz = (struct blk_zone *)(rep + 1); > > + while (n < nrz) { > > + memset(rep, 0, rep_size); > > + rep->sector = sector; > > + rep->nr_zones = nrz - n; > > + > > + do { > > + ret = ioctl(fd, BLKREPORTZONE, rep); > > + } while (ret != 0 && errno == EINTR); > > + if (ret != 0) { > > + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", > > + fd, sector, errno); > > + return -errno; > > + } > > + > > + if (!rep->nr_zones) { > > + break; > > + } > > + > > + for (i = 0; i < rep->nr_zones; i++, n++) { > > + ret = parse_zone(&zones[n], &blkz[i]); > > + if (ret != 0) { > > + return ret; > > + } > > + > > + /* The next report should start after the last zone reported */ > > + sector = blkz[i].start + blkz[i].len; > > + } > > + } > > + > > + *nr_zones = n; > > + return 0; > > +} > > +#endif > > + > > 
+#if defined(CONFIG_BLKZONED) > > +static int handle_aiocb_zone_mgmt(void *opaque) > > +{ > > + RawPosixAIOData *aiocb = opaque; > > + int fd = aiocb->aio_fildes; > > + uint64_t sector = aiocb->aio_offset / 512; > > + int64_t nr_sectors = aiocb->aio_nbytes / 512; > > + struct blk_zone_range range; > > + int ret; > > + > > + /* Execute the operation */ > > + range.sector = sector; > > + range.nr_sectors = nr_sectors; > > + do { > > + ret = ioctl(fd, aiocb->zone_mgmt.op, &range); > > + } while (ret != 0 && errno == EINTR); > > + > > + return ret; > > +} > > +#endif > > + > > static int handle_aiocb_copy_range(void *opaque) > > { > > RawPosixAIOData *aiocb = opaque; > > @@ -3035,6 +3247,107 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) > > } > > } > > > > +/* > > + * zone report - Get a zone block device's information in the form > > + * of an array of zone descriptors. > > + * zones is an array of zone descriptors to hold zone information on reply; > > + * offset can be any byte within the entire size of the device; > > + * nr_zones is the maximum number of zones the command should operate on.
> > + */ > > +#if defined(CONFIG_BLKZONED) > > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones) { > > + BDRVRawState *s = bs->opaque; > > + RawPosixAIOData acb; > > + > > + acb = (RawPosixAIOData) { > > + .bs = bs, > > + .aio_fildes = s->fd, > > + .aio_type = QEMU_AIO_ZONE_REPORT, > > + .aio_offset = offset, > > + .zone_report = { > > + .nr_zones = nr_zones, > > + .zones = zones, > > + }, > > + }; > > + > > + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); > > +} > > +#endif > > + > > +/* > > + * zone management operations - Execute an operation on a zone > > + */ > > +#if defined(CONFIG_BLKZONED) > > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len) { > > + BDRVRawState *s = bs->opaque; > > + RawPosixAIOData acb; > > + int64_t zone_size, zone_size_mask; > > + const char *op_name; > > + unsigned long zo; > > + int ret; > > + int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS; > > + > > + zone_size = bs->bl.zone_size; > > + zone_size_mask = zone_size - 1; > > + if (offset & zone_size_mask) { > > + error_report("sector offset %" PRId64 " is not aligned to zone size " > > + "%" PRId64 "", offset / 512, zone_size / 512); > > + return -EINVAL; > > + } > > + > > + if (((offset + len) < capacity && len & zone_size_mask) || > > + offset + len > capacity) { > > + error_report("number of sectors %" PRId64 " is not aligned to zone size" > > + " %" PRId64 "", len / 512, zone_size / 512); > > + return -EINVAL; > > + } > > + > > + switch (op) { > > + case BLK_ZO_OPEN: > > + op_name = "BLKOPENZONE"; > > + zo = BLKOPENZONE; > > + break; > > + case BLK_ZO_CLOSE: > > + op_name = "BLKCLOSEZONE"; > > + zo = BLKCLOSEZONE; > > + break; > > + case BLK_ZO_FINISH: > > + op_name = "BLKFINISHZONE"; > > + zo = BLKFINISHZONE; > > + break; > > + case BLK_ZO_RESET: > > + op_name = "BLKRESETZONE"; > > + zo = 
BLKRESETZONE; > > + break; > > + default: > > + error_report("Unsupported zone op: 0x%x", op); > > + return -ENOTSUP; > > + } > > + > > + acb = (RawPosixAIOData) { > > + .bs = bs, > > + .aio_fildes = s->fd, > > + .aio_type = QEMU_AIO_ZONE_MGMT, > > + .aio_offset = offset, > > + .aio_nbytes = len, > > + .zone_mgmt = { > > + .op = zo, > > + }, > > + }; > > + > > + ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); > > + if (ret != 0) { > > + ret = -errno; > > + error_report("ioctl %s failed %d", op_name, ret); > > + } > > + > > + return ret; > > +} > > +#endif > > + > > static coroutine_fn int > > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > > bool blkdev) > > @@ -3756,13 +4069,23 @@ static BlockDriver bdrv_host_device = { > > .bdrv_check_perm = raw_check_perm, > > .bdrv_set_perm = raw_set_perm, > > .bdrv_abort_perm_update = raw_abort_perm_update, > > +#ifndef CONFIG_BLKZONED > > .bdrv_probe_blocksizes = hdev_probe_blocksizes, > > +#endif > > .bdrv_probe_geometry = hdev_probe_geometry, > > > > /* generic scsi device */ > > #ifdef __linux__ > > .bdrv_co_ioctl = hdev_co_ioctl, > > #endif > > + > > + /* zoned device */ > > +#if defined(CONFIG_BLKZONED) > > + /* zone management operations */ > > + .bdrv_probe_blocksizes = hdev_probe_zoned_blocksizes, > > + .bdrv_co_zone_report = raw_co_zone_report, > > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > > +#endif > > }; > > > > #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > > diff --git a/block/io.c b/block/io.c > > index a09a19f7a7..1586e42ab9 100644 > > --- a/block/io.c > > +++ b/block/io.c > > @@ -3099,6 +3099,47 @@ out: > > return co.ret; > > } > > > > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones) > > +{ > > + BlockDriver *drv = bs->drv; > > + CoroutineIOCompletion co = { > > + .coroutine = qemu_coroutine_self(), > > + }; > > + IO_CODE(); > > + > > + 
bdrv_inc_in_flight(bs); > > + if (!drv || !drv->bdrv_co_zone_report) { > > Now that zoned device support is determined at runtime instead of at > compile-time, checking for drv->bdrv_co_zone_report isn't enough. The > BlockDriverState might have bs->bl.zoned == BLK_Z_NONE. > > Please add || bs->bl.zoned == BLK_Z_NONE to this if statement to > prevent calls when the device is not zoned. > > The same applies to bdrv_co_zone_mgmt(). I see. Thanks! > > > + co.ret = -ENOTSUP; > > + goto out; > > + } > > + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones); > > +out: > > + bdrv_dec_in_flight(bs); > > + return co.ret; > > +} > > + > > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len) > > +{ > > + BlockDriver *drv = bs->drv; > > + CoroutineIOCompletion co = { > > + .coroutine = qemu_coroutine_self(), > > + }; > > + IO_CODE(); > > + > > + bdrv_inc_in_flight(bs); > > + if (!drv || !drv->bdrv_co_zone_mgmt) { > > + co.ret = -ENOTSUP; > > + goto out; > > + } > > + co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len); > > +out: > > + bdrv_dec_in_flight(bs); > > + return co.ret; > > +} > > + > > void *qemu_blockalign(BlockDriverState *bs, size_t size) > > { > > IO_CODE(); > > diff --git a/include/block/block-io.h b/include/block/block-io.h > > index 3398351596..10ff212036 100644 > > --- a/include/block/block-io.h > > +++ b/include/block/block-io.h > > @@ -98,6 +98,13 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs); > > > > int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, > > int64_t bytes); > > +/* Report zone information of zone block device. 
*/ > > +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len); > > + > > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > > int bdrv_block_status(BlockDriverState *bs, int64_t offset, > > int64_t bytes, int64_t *pnum, int64_t *map, > > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h > > index 57f0612f5e..565228d8dd 100644 > > --- a/include/block/block_int-common.h > > +++ b/include/block/block_int-common.h > > @@ -703,6 +703,12 @@ struct BlockDriver { > > int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_load_vmstate)( > > BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); > > > > + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, > > + int64_t offset, unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op, > > + int64_t offset, int64_t len); > > + > > /* removable device specific */ > > bool (*bdrv_is_inserted)(BlockDriverState *bs); > > void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); > > @@ -839,6 +845,21 @@ typedef struct BlockLimits { > > > > /* device zone model */ > > BlockZoneModel zoned; > > + > > + /* zone size expressed in bytes */ > > + uint32_t zone_size; > > + > > + /* total number of zones */ > > + uint32_t nr_zones; > > + > > + /* maximum sectors of a zone append write operation */ > > + int64_t max_append_sectors; > > + > > + /* maximum number of open zones */ > > + int64_t max_open_zones; > > + > > + /* maximum number of active zones */ > > + int64_t max_active_zones; > > } BlockLimits; > > > > typedef struct BdrvOpBlocker BdrvOpBlocker; > > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > > index f8cda9df91..eda6a7a253 100644 > > --- a/include/block/raw-aio.h > > +++ 
b/include/block/raw-aio.h > > @@ -28,6 +28,8 @@ > > #define QEMU_AIO_WRITE_ZEROES 0x0020 > > #define QEMU_AIO_COPY_RANGE 0x0040 > > #define QEMU_AIO_TRUNCATE 0x0080 > > +#define QEMU_AIO_ZONE_REPORT 0x0100 > > +#define QEMU_AIO_ZONE_MGMT 0x0200 > > #define QEMU_AIO_TYPE_MASK \ > > (QEMU_AIO_READ | \ > > QEMU_AIO_WRITE | \ > > @@ -36,7 +38,9 @@ > > QEMU_AIO_DISCARD | \ > > QEMU_AIO_WRITE_ZEROES | \ > > QEMU_AIO_COPY_RANGE | \ > > - QEMU_AIO_TRUNCATE) > > + QEMU_AIO_TRUNCATE | \ > > + QEMU_AIO_ZONE_REPORT | \ > > + QEMU_AIO_ZONE_MGMT) > > > > /* AIO flags */ > > #define QEMU_AIO_MISALIGNED 0x1000 > > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h > > index 031a27ba10..dc8a4368f0 100644 > > --- a/include/sysemu/block-backend-io.h > > +++ b/include/sysemu/block-backend-io.h > > @@ -46,6 +46,13 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, > > BlockCompletionFunc *cb, void *opaque); > > BlockAIOCB *blk_aio_flush(BlockBackend *blk, > > BlockCompletionFunc *cb, void *opaque); > > +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones, > > + BlockCompletionFunc *cb, void *opaque); > > +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len, > > + BlockCompletionFunc *cb, void *opaque); > > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, > > BlockCompletionFunc *cb, void *opaque); > > void blk_aio_cancel_async(BlockAIOCB *acb); > > @@ -166,6 +173,17 @@ int co_wrapper_mixed blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, > > int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, > > int64_t bytes, BdrvRequestFlags flags); > > > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > > + unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > +int co_wrapper_mixed blk_zone_report(BlockBackend *blk, int64_t offset, > > + 
unsigned int *nr_zones, > > + BlockZoneDescriptor *zones); > > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len); > > +int co_wrapper_mixed blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > + int64_t offset, int64_t len); > > + > > int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset, > > int64_t bytes); > > int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, > > diff --git a/meson.build b/meson.build > > index 6d3b665629..a267f74536 100644 > > --- a/meson.build > > +++ b/meson.build > > @@ -1962,6 +1962,7 @@ config_host_data.set('CONFIG_REPLICATION', get_option('replication').allowed()) > > # has_header > > config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h')) > > config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h')) > > +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h')) > > config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h')) > > config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h')) > > config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h')) > > @@ -2056,6 +2057,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', > > config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM', > > cc.has_member('struct stat', 'st_atim', > > prefix: '#include <sys/stat.h>')) > > +config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY', > > + cc.has_member('struct blk_zone', 'capacity', > > + prefix: '#include <linux/blkzoned.h>')) > > > > # has_type > > config_host_data.set('CONFIG_IOVEC', > > diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c > > index 952dc940f1..3a3bad77c3 100644 > > --- a/qemu-io-cmds.c > > +++ b/qemu-io-cmds.c > > @@ -1712,6 +1712,150 @@ static const cmdinfo_t flush_cmd = { > > .oneline = "flush all in-core file state to disk", > > }; > > > > +static inline int64_t tosector(int64_t bytes) > > +{ > > + return bytes >> BDRV_SECTOR_BITS; > > +} > > + > > +static int 
zone_report_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset; > > + unsigned int nr_zones; > > + > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + nr_zones = cvtnum(argv[optind]); > > + > > + g_autofree BlockZoneDescriptor *zones = NULL; > > + zones = g_new(BlockZoneDescriptor, nr_zones); > > + ret = blk_zone_report(blk, offset, &nr_zones, zones); > > + if (ret < 0) { > > + printf("zone report failed: %s\n", strerror(-ret)); > > + } else { > > + for (int i = 0; i < nr_zones; ++i) { > > + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", " > > + "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", " > > + "zcond:%u, [type: %u]\n", > > + tosector(zones[i].start), tosector(zones[i].length), > > + tosector(zones[i].cap), tosector(zones[i].wp), > > + zones[i].state, zones[i].type); > > + } > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_report_cmd = { > > + .name = "zone_report", > > + .altname = "zrp", > > + .cfunc = zone_report_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset number", > > + .oneline = "report zone information", > > +}; > > + > > +static int zone_open_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len); > > + if (ret < 0) { > > + printf("zone open failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_open_cmd = { > > + .name = "zone_open", > > + .altname = "zo", > > + .cfunc = zone_open_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "explicit open a range of zones in zone block device", > > +}; > > + > > +static int zone_close_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + 
++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len); > > + if (ret < 0) { > > + printf("zone close failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_close_cmd = { > > + .name = "zone_close", > > + .altname = "zc", > > + .cfunc = zone_close_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "close a range of zones in zone block device", > > +}; > > + > > +static int zone_finish_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len); > > + if (ret < 0) { > > + printf("zone finish failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_finish_cmd = { > > + .name = "zone_finish", > > + .altname = "zf", > > + .cfunc = zone_finish_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "finish a range of zones in zone block device", > > +}; > > + > > +static int zone_reset_f(BlockBackend *blk, int argc, char **argv) > > +{ > > + int ret; > > + int64_t offset, len; > > + ++optind; > > + offset = cvtnum(argv[optind]); > > + ++optind; > > + len = cvtnum(argv[optind]); > > + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len); > > + if (ret < 0) { > > + printf("zone reset failed: %s\n", strerror(-ret)); > > + } > > + return ret; > > +} > > + > > +static const cmdinfo_t zone_reset_cmd = { > > + .name = "zone_reset", > > + .altname = "zrs", > > + .cfunc = zone_reset_f, > > + .argmin = 2, > > + .argmax = 2, > > + .args = "offset len", > > + .oneline = "reset a zone write pointer in zone block device", > > +}; > > + > > static int truncate_f(BlockBackend *blk, int argc, char **argv); > > static const cmdinfo_t truncate_cmd = { > > .name = "truncate", > > @@ -2504,6 
+2648,11 @@ static void __attribute((constructor)) init_qemuio_commands(void) > > qemuio_add_command(&aio_write_cmd); > > qemuio_add_command(&aio_flush_cmd); > > qemuio_add_command(&flush_cmd); > > + qemuio_add_command(&zone_report_cmd); > > + qemuio_add_command(&zone_open_cmd); > > + qemuio_add_command(&zone_close_cmd); > > + qemuio_add_command(&zone_finish_cmd); > > + qemuio_add_command(&zone_reset_cmd); > > qemuio_add_command(&truncate_cmd); > > qemuio_add_command(&length_cmd); > > qemuio_add_command(&info_cmd); > > -- > > 2.38.1 > > > >
Am 29.01.2023 um 11:28 hat Sam Li geschrieben: > Add zoned device option to host_device BlockDriver. It will be presented only > for zoned host block devices. By adding zone management operations to the > host_block_device BlockDriver, users can use the new block layer APIs > including Report Zone and four zone management operations > (open, close, finish, reset, reset_all). > > Qemu-io uses the new APIs to perform zoned storage commands of the device: > zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), > zone_finish(zf). > > For example, to test zone_report, use following command: > $ ./build/qemu-io --image-opts -n driver=host_device, filename=/dev/nullb0 > -c "zrp offset nr_zones" > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > Reviewed-by: Hannes Reinecke <hare@suse.de> > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> > --- > block/block-backend.c | 147 ++++++++++++++ > block/file-posix.c | 323 ++++++++++++++++++++++++++++++ > block/io.c | 41 ++++ > include/block/block-io.h | 7 + > include/block/block_int-common.h | 21 ++ > include/block/raw-aio.h | 6 +- > include/sysemu/block-backend-io.h | 18 ++ > meson.build | 4 + > qemu-io-cmds.c | 149 ++++++++++++++ > 9 files changed, 715 insertions(+), 1 deletion(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index ba7bf1d6bc..a4847b9131 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1451,6 +1451,15 @@ typedef struct BlkRwCo { > void *iobuf; > int ret; > BdrvRequestFlags flags; > + union { > + struct { > + unsigned int *nr_zones; > + BlockZoneDescriptor *zones; > + } zone_report; > + struct { > + unsigned long op; > + } zone_mgmt; > + }; > } BlkRwCo; Should we use a different struct for blk_aio_zone_*() so that we don't need to touch the one for the normal I/O path? My concern is that increasing the size of the struct (currently 32 bytes) might negatively impact the performance even of non-zoned devices. 
Maybe it turns out that it wasn't really necessary in the end (have we done any benchmarks?), but I don't think it can hurt anyway. With this changed, you can add to the series: Acked-by: Kevin Wolf <kwolf@redhat.com> Kevin
On Mon, Feb 27, 2023 at 07:20:14PM +0100, Kevin Wolf wrote: > Am 29.01.2023 um 11:28 hat Sam Li geschrieben: > > Add zoned device option to host_device BlockDriver. It will be presented only > > for zoned host block devices. By adding zone management operations to the > > host_block_device BlockDriver, users can use the new block layer APIs > > including Report Zone and four zone management operations > > (open, close, finish, reset, reset_all). > > > > Qemu-io uses the new APIs to perform zoned storage commands of the device: > > zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), > > zone_finish(zf). > > > > For example, to test zone_report, use following command: > > $ ./build/qemu-io --image-opts -n driver=host_device, filename=/dev/nullb0 > > -c "zrp offset nr_zones" > > > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > > Reviewed-by: Hannes Reinecke <hare@suse.de> > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> > > --- > > block/block-backend.c | 147 ++++++++++++++ > > block/file-posix.c | 323 ++++++++++++++++++++++++++++++ > > block/io.c | 41 ++++ > > include/block/block-io.h | 7 + > > include/block/block_int-common.h | 21 ++ > > include/block/raw-aio.h | 6 +- > > include/sysemu/block-backend-io.h | 18 ++ > > meson.build | 4 + > > qemu-io-cmds.c | 149 ++++++++++++++ > > 9 files changed, 715 insertions(+), 1 deletion(-) > > > > diff --git a/block/block-backend.c b/block/block-backend.c > > index ba7bf1d6bc..a4847b9131 100644 > > --- a/block/block-backend.c > > +++ b/block/block-backend.c > > @@ -1451,6 +1451,15 @@ typedef struct BlkRwCo { > > void *iobuf; > > int ret; > > BdrvRequestFlags flags; > > + union { > > + struct { > > + unsigned int *nr_zones; > > + BlockZoneDescriptor *zones; > > + } zone_report; > > + struct { > > + unsigned long op; > > + } zone_mgmt; > > + }; > > } BlkRwCo; > > Should we use a different struct for blk_aio_zone_*() so that we don't > need to touch the one for the normal I/O path? 
My concern is that > increasing the size of the struct (currently 32 bytes) might negatively > impact the performance even of non-zoned devices. Maybe it turns out > that it wasn't really necessary in the end (have we done any > benchmarks?), but I don't think it can hurt anyway. > > With this changed, you can add to the series: > Acked-by: Kevin Wolf <kwolf@redhat.com> There are unused fields in BlkRwCo and BlkAioEmAIOCB, so changing the size of the struct isn't necessary. ioctl/flush/pdiscard already use BlkAioEmAIOCB/BlkRwCo for non-read/write operations, including using the iobuf field for different types, so it wouldn't be weird: typedef struct BlkRwCo { BlockBackend *blk; int64_t offset; void *iobuf; ^^^^^ used for preadv/pwritev qiov, ioctl buf, and NULL for other request types. zone_report could put the BlockZoneDescriptor pointer here. zone_mgmt could put op here. int ret; BdrvRequestFlags flags; } BlkRwCo; typedef struct BlkAioEmAIOCB { BlockAIOCB common; BlkRwCo rwco; int64_t bytes; ^^^^^ zone_report could put the nr_zones pointer here bool has_returned; } BlkAioEmAIOCB; Does that sound okay? Stefan
Am 27.02.2023 um 20:14 hat Stefan Hajnoczi geschrieben: > On Mon, Feb 27, 2023 at 07:20:14PM +0100, Kevin Wolf wrote: > > Am 29.01.2023 um 11:28 hat Sam Li geschrieben: > > > Add zoned device option to host_device BlockDriver. It will be presented only > > > for zoned host block devices. By adding zone management operations to the > > > host_block_device BlockDriver, users can use the new block layer APIs > > > including Report Zone and four zone management operations > > > (open, close, finish, reset, reset_all). > > > > > > Qemu-io uses the new APIs to perform zoned storage commands of the device: > > > zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), > > > zone_finish(zf). > > > > > > For example, to test zone_report, use following command: > > > $ ./build/qemu-io --image-opts -n driver=host_device, filename=/dev/nullb0 > > > -c "zrp offset nr_zones" > > > > > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > > > Reviewed-by: Hannes Reinecke <hare@suse.de> > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> > > > --- > > > block/block-backend.c | 147 ++++++++++++++ > > > block/file-posix.c | 323 ++++++++++++++++++++++++++++++ > > > block/io.c | 41 ++++ > > > include/block/block-io.h | 7 + > > > include/block/block_int-common.h | 21 ++ > > > include/block/raw-aio.h | 6 +- > > > include/sysemu/block-backend-io.h | 18 ++ > > > meson.build | 4 + > > > qemu-io-cmds.c | 149 ++++++++++++++ > > > 9 files changed, 715 insertions(+), 1 deletion(-) > > > > > > diff --git a/block/block-backend.c b/block/block-backend.c > > > index ba7bf1d6bc..a4847b9131 100644 > > > --- a/block/block-backend.c > > > +++ b/block/block-backend.c > > > @@ -1451,6 +1451,15 @@ typedef struct BlkRwCo { > > > void *iobuf; > > > int ret; > > > BdrvRequestFlags flags; > > > + union { > > > + struct { > > > + unsigned int *nr_zones; > > > + BlockZoneDescriptor *zones; > > > + } zone_report; > > > + struct { > > > + unsigned long op; > > > + } zone_mgmt; > > > + }; > > > } 
BlkRwCo; > > > > Should we use a different struct for blk_aio_zone_*() so that we don't > > need to touch the one for the normal I/O path? My concern is that > > increasing the size of the struct (currently 32 bytes) might negatively > > impact the performance even of non-zoned devices. Maybe it turns out > > that it wasn't really necessary in the end (have we done any > > benchmarks?), but I don't think it can hurt anyway. > > > > With this changed, you can add to the series: > > Acked-by: Kevin Wolf <kwolf@redhat.com> > > There are unused fields in BlkRwCo and BlkAioEmAIOCB, so changing the > size of the struct isn't necessary. ioctl/flush/pdiscard already use > BlkAioEmAIOCB/BlkRwCo for non-read/write operations, including using the > iobuf field for different types, so it wouldn't be weird: > > typedef struct BlkRwCo { > BlockBackend *blk; > int64_t offset; > void *iobuf; > ^^^^^ used for preadv/pwritev qiov, ioctl buf, and NULL for > other request types. zone_report could put the > BlockZoneDescriptor pointer here. zone_mgmt could put > op here. > int ret; > BdrvRequestFlags flags; > } BlkRwCo; > > typedef struct BlkAioEmAIOCB { > BlockAIOCB common; > BlkRwCo rwco; > int64_t bytes; > ^^^^^ zone_report could put the nr_zones pointer here > bool has_returned; > } BlkAioEmAIOCB; > > Does that sound okay? Might not be great for readability, but good enough for me. Kevin
Kevin Wolf <kwolf@redhat.com> 于2023年2月28日周二 19:54写道: > > Am 27.02.2023 um 20:14 hat Stefan Hajnoczi geschrieben: > > On Mon, Feb 27, 2023 at 07:20:14PM +0100, Kevin Wolf wrote: > > > Am 29.01.2023 um 11:28 hat Sam Li geschrieben: > > > > Add zoned device option to host_device BlockDriver. It will be presented only > > > > for zoned host block devices. By adding zone management operations to the > > > > host_block_device BlockDriver, users can use the new block layer APIs > > > > including Report Zone and four zone management operations > > > > (open, close, finish, reset, reset_all). > > > > > > > > Qemu-io uses the new APIs to perform zoned storage commands of the device: > > > > zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), > > > > zone_finish(zf). > > > > > > > > For example, to test zone_report, use following command: > > > > $ ./build/qemu-io --image-opts -n driver=host_device, filename=/dev/nullb0 > > > > -c "zrp offset nr_zones" > > > > > > > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > > > > Reviewed-by: Hannes Reinecke <hare@suse.de> > > > > Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> > > > > --- > > > > block/block-backend.c | 147 ++++++++++++++ > > > > block/file-posix.c | 323 ++++++++++++++++++++++++++++++ > > > > block/io.c | 41 ++++ > > > > include/block/block-io.h | 7 + > > > > include/block/block_int-common.h | 21 ++ > > > > include/block/raw-aio.h | 6 +- > > > > include/sysemu/block-backend-io.h | 18 ++ > > > > meson.build | 4 + > > > > qemu-io-cmds.c | 149 ++++++++++++++ > > > > 9 files changed, 715 insertions(+), 1 deletion(-) > > > > > > > > diff --git a/block/block-backend.c b/block/block-backend.c > > > > index ba7bf1d6bc..a4847b9131 100644 > > > > --- a/block/block-backend.c > > > > +++ b/block/block-backend.c > > > > @@ -1451,6 +1451,15 @@ typedef struct BlkRwCo { > > > > void *iobuf; > > > > int ret; > > > > BdrvRequestFlags flags; > > > > + union { > > > > + struct { > > > > + unsigned int *nr_zones; > > > 
> + BlockZoneDescriptor *zones; > > > > + } zone_report; > > > > + struct { > > > > + unsigned long op; > > > > + } zone_mgmt; > > > > + }; > > > > } BlkRwCo; > > > > > > Should we use a different struct for blk_aio_zone_*() so that we don't > > > need to touch the one for the normal I/O path? My concern is that > > > increasing the size of the struct (currently 32 bytes) might negatively > > > impact the performance even of non-zoned devices. Maybe it turns out > > > that it wasn't really necessary in the end (have we done any > > > benchmarks?), but I don't think it can hurt anyway. > > > > > > With this changed, you can add to the series: > > > Acked-by: Kevin Wolf <kwolf@redhat.com> > > > > There are unused fields in BlkRwCo and BlkAioEmAIOCB, so changing the > > size of the struct isn't necessary. ioctl/flush/pdiscard already use > > BlkAioEmAIOCB/BlkRwCo for non-read/write operations, including using the > > iobuf field for different types, so it wouldn't be weird: > > > > typedef struct BlkRwCo { > > BlockBackend *blk; > > int64_t offset; > > void *iobuf; > > ^^^^^ used for preadv/pwritev qiov, ioctl buf, and NULL for > > other request types. zone_report could put the > > BlockZoneDescriptor pointer here. zone_mgmt could put > > op here. > > int ret; > > BdrvRequestFlags flags; > > } BlkRwCo; > > > > typedef struct BlkAioEmAIOCB { > > BlockAIOCB common; > > BlkRwCo rwco; > > int64_t bytes; > > ^^^^^ zone_report could put the nr_zones pointer here > > bool has_returned; > > } BlkAioEmAIOCB; > > > > Does that sound okay? > > Might not be great for readability, but good enough for me. > > Kevin I see. Will change it accordingly. Thanks! Sam
diff --git a/block/block-backend.c b/block/block-backend.c index ba7bf1d6bc..a4847b9131 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1451,6 +1451,15 @@ typedef struct BlkRwCo { void *iobuf; int ret; BdrvRequestFlags flags; + union { + struct { + unsigned int *nr_zones; + BlockZoneDescriptor *zones; + } zone_report; + struct { + unsigned long op; + } zone_mgmt; + }; } BlkRwCo; int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags) @@ -1795,6 +1804,144 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) return ret; } +static void coroutine_fn blk_aio_zone_report_entry(void *opaque) +{ + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + + rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset, + rwco->zone_report.nr_zones, + rwco->zone_report.zones); + blk_aio_complete(acb); +} + +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones, + BlockCompletionFunc *cb, void *opaque) +{ + BlkAioEmAIOCB *acb; + Coroutine *co; + IO_CODE(); + + blk_inc_in_flight(blk); + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); + acb->rwco = (BlkRwCo) { + .blk = blk, + .offset = offset, + .ret = NOT_DONE, + .zone_report = { + .zones = zones, + .nr_zones = nr_zones, + }, + }; + acb->has_returned = false; + + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb); + bdrv_coroutine_enter(blk_bs(blk), co); + + acb->has_returned = true; + if (acb->rwco.ret != NOT_DONE) { + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), + blk_aio_complete_bh, acb); + } + + return &acb->common; +} + +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque) +{ + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + + rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op, + rwco->offset, acb->bytes); + blk_aio_complete(acb); +} + +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len, + BlockCompletionFunc *cb, void *opaque) 
{ + BlkAioEmAIOCB *acb; + Coroutine *co; + IO_CODE(); + + blk_inc_in_flight(blk); + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); + acb->rwco = (BlkRwCo) { + .blk = blk, + .offset = offset, + .ret = NOT_DONE, + .zone_mgmt = { + .op = op, + }, + }; + acb->bytes = len; + acb->has_returned = false; + + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb); + bdrv_coroutine_enter(blk_bs(blk), co); + + acb->has_returned = true; + if (acb->rwco.ret != NOT_DONE) { + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), + blk_aio_complete_bh, acb); + } + + return &acb->common; +} + +/* + * Send a zone_report command. + * offset is a byte offset from the start of the device. No alignment + * required for offset. + * nr_zones represents IN maximum and OUT actual. + */ +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones) +{ + int ret; + IO_CODE(); + + blk_inc_in_flight(blk); /* increase before waiting */ + blk_wait_while_drained(blk); + if (!blk_is_available(blk)) { + blk_dec_in_flight(blk); + return -ENOMEDIUM; + } + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); + blk_dec_in_flight(blk); + return ret; +} + +/* + * Send a zone_management command. + * op is the zone operation; + * offset is the byte offset from the start of the zoned device; + * len is the maximum number of bytes the command should operate on. It + * should be aligned with the device zone size. 
+ */ +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len) +{ + int ret; + IO_CODE(); + + blk_inc_in_flight(blk); + blk_wait_while_drained(blk); + + ret = blk_check_byte_request(blk, offset, len); + if (ret < 0) { + blk_dec_in_flight(blk); + return ret; + } + + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); + blk_dec_in_flight(blk); + return ret; +} + void blk_drain(BlockBackend *blk) { BlockDriverState *bs = blk_bs(blk); diff --git a/block/file-posix.c b/block/file-posix.c index 43c59c6d56..b6d88db208 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -68,6 +68,9 @@ #include <sys/param.h> #include <sys/syscall.h> #include <sys/vfs.h> +#if defined(CONFIG_BLKZONED) +#include <linux/blkzoned.h> +#endif #include <linux/cdrom.h> #include <linux/fd.h> #include <linux/fs.h> @@ -216,6 +219,13 @@ typedef struct RawPosixAIOData { PreallocMode prealloc; Error **errp; } truncate; + struct { + unsigned int *nr_zones; + BlockZoneDescriptor *zones; + } zone_report; + struct { + unsigned long op; + } zone_mgmt; }; } RawPosixAIOData; @@ -1351,6 +1361,50 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) zoned = BLK_Z_NONE; } bs->bl.zoned = zoned; + if (zoned != BLK_Z_NONE) { + /* + * The zoned device must at least have zone size and nr_zones fields. 
+ */ + ret = get_sysfs_long_val(&st, "chunk_sectors"); + if (ret < 0) { + error_setg_errno(errp, -ret, "Unable to read chunk_sectors " + "sysfs attribute"); + goto out; + } else if (!ret) { + error_setg(errp, "Read 0 from chunk_sectors sysfs attribute"); + goto out; + } + bs->bl.zone_size = ret << BDRV_SECTOR_BITS; + + ret = get_sysfs_long_val(&st, "nr_zones"); + if (ret < 0) { + error_setg_errno(errp, -ret, "Unable to read nr_zones " + "sysfs attribute"); + goto out; + } else if (!ret) { + error_setg(errp, "Read 0 from nr_zones sysfs attribute"); + goto out; + } + bs->bl.nr_zones = ret; + + ret = get_sysfs_long_val(&st, "zone_append_max_bytes"); + if (ret > 0) { + bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS; + } + + ret = get_sysfs_long_val(&st, "max_open_zones"); + if (ret >= 0) { + bs->bl.max_open_zones = ret; + } + + ret = get_sysfs_long_val(&st, "max_active_zones"); + if (ret >= 0) { + bs->bl.max_active_zones = ret; + } + return; + } +out: + bs->bl.zoned = BLK_Z_NONE; } static int check_for_dasd(int fd) @@ -1364,6 +1418,23 @@ static int check_for_dasd(int fd) #endif } +#if defined(CONFIG_BLKZONED) +/** + * Zoned storage needs to be virtualized with the correct physical block size + * and logical block size. + */ +static int hdev_probe_zoned_blocksizes(BlockDriverState *bs, BlockSizes *bsz) +{ + BDRVRawState *s = bs->opaque; + int ret; + + ret = probe_logical_blocksize(s->fd, &bsz->log); + if (ret < 0) { + return ret; + } + return probe_physical_blocksize(s->fd, &bsz->phys); +} +#else /** * Try to get @bs's logical and physical block size. * On success, store them in @bsz and return zero. @@ -1384,6 +1455,7 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) } return probe_physical_blocksize(s->fd, &bsz->phys); } +#endif /** * Try to get @bs's geometry: cyls, heads, sectors. 
@@ -1844,6 +1916,146 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, } #endif +/* + * parse_zone - Fill a zone descriptor + */ +#if defined(CONFIG_BLKZONED) +static inline int parse_zone(struct BlockZoneDescriptor *zone, + const struct blk_zone *blkz) { + zone->start = blkz->start << BDRV_SECTOR_BITS; + zone->length = blkz->len << BDRV_SECTOR_BITS; + zone->wp = blkz->wp << BDRV_SECTOR_BITS; + +#ifdef HAVE_BLK_ZONE_REP_CAPACITY + zone->cap = blkz->capacity << BDRV_SECTOR_BITS; +#else + zone->cap = blkz->len << BDRV_SECTOR_BITS; +#endif + + switch (blkz->type) { + case BLK_ZONE_TYPE_SEQWRITE_REQ: + zone->type = BLK_ZT_SWR; + break; + case BLK_ZONE_TYPE_SEQWRITE_PREF: + zone->type = BLK_ZT_SWP; + break; + case BLK_ZONE_TYPE_CONVENTIONAL: + zone->type = BLK_ZT_CONV; + break; + default: + error_report("Unsupported zone type: 0x%x", blkz->type); + return -ENOTSUP; + } + + switch (blkz->cond) { + case BLK_ZONE_COND_NOT_WP: + zone->state = BLK_ZS_NOT_WP; + break; + case BLK_ZONE_COND_EMPTY: + zone->state = BLK_ZS_EMPTY; + break; + case BLK_ZONE_COND_IMP_OPEN: + zone->state = BLK_ZS_IOPEN; + break; + case BLK_ZONE_COND_EXP_OPEN: + zone->state = BLK_ZS_EOPEN; + break; + case BLK_ZONE_COND_CLOSED: + zone->state = BLK_ZS_CLOSED; + break; + case BLK_ZONE_COND_READONLY: + zone->state = BLK_ZS_RDONLY; + break; + case BLK_ZONE_COND_FULL: + zone->state = BLK_ZS_FULL; + break; + case BLK_ZONE_COND_OFFLINE: + zone->state = BLK_ZS_OFFLINE; + break; + default: + error_report("Unsupported zone state: 0x%x", blkz->cond); + return -ENOTSUP; + } + return 0; +} +#endif + +#if defined(CONFIG_BLKZONED) +static int handle_aiocb_zone_report(void *opaque) +{ + RawPosixAIOData *aiocb = opaque; + int fd = aiocb->aio_fildes; + unsigned int *nr_zones = aiocb->zone_report.nr_zones; + BlockZoneDescriptor *zones = aiocb->zone_report.zones; + /* zoned block devices use 512-byte sectors */ + uint64_t sector = aiocb->aio_offset / 512; + + struct blk_zone *blkz; + size_t rep_size; + 
unsigned int nrz; + int ret, n = 0, i = 0; + + nrz = *nr_zones; + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); + g_autofree struct blk_zone_report *rep = NULL; + rep = g_malloc(rep_size); + + blkz = (struct blk_zone *)(rep + 1); + while (n < nrz) { + memset(rep, 0, rep_size); + rep->sector = sector; + rep->nr_zones = nrz - n; + + do { + ret = ioctl(fd, BLKREPORTZONE, rep); + } while (ret != 0 && errno == EINTR); + if (ret != 0) { + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", + fd, sector, errno); + return -errno; + } + + if (!rep->nr_zones) { + break; + } + + for (i = 0; i < rep->nr_zones; i++, n++) { + ret = parse_zone(&zones[n], &blkz[i]); + if (ret != 0) { + return ret; + } + + /* The next report should start after the last zone reported */ + sector = blkz[i].start + blkz[i].len; + } + } + + *nr_zones = n; + return 0; +} +#endif + +#if defined(CONFIG_BLKZONED) +static int handle_aiocb_zone_mgmt(void *opaque) +{ + RawPosixAIOData *aiocb = opaque; + int fd = aiocb->aio_fildes; + uint64_t sector = aiocb->aio_offset / 512; + int64_t nr_sectors = aiocb->aio_nbytes / 512; + struct blk_zone_range range; + int ret; + + /* Execute the operation */ + range.sector = sector; + range.nr_sectors = nr_sectors; + do { + ret = ioctl(fd, aiocb->zone_mgmt.op, &range); + } while (ret != 0 && errno == EINTR); + + return ret; +} +#endif + static int handle_aiocb_copy_range(void *opaque) { RawPosixAIOData *aiocb = opaque; @@ -3035,6 +3247,107 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) } } +/* + * zone report - Get a zone block device's information in the form + * of an array of zone descriptors. + * zones is an array of zone descriptors to hold zone information on reply; + * offset can be any byte within the entire size of the device; + * nr_zones is the maximum number of zones the command should operate on.
+ */ +#if defined(CONFIG_BLKZONED) +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones) { + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_ZONE_REPORT, + .aio_offset = offset, + .zone_report = { + .nr_zones = nr_zones, + .zones = zones, + }, + }; + + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); +} +#endif + +/* + * zone management operations - Execute an operation on a zone + */ +#if defined(CONFIG_BLKZONED) +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, + int64_t offset, int64_t len) { + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; + int64_t zone_size, zone_size_mask; + const char *op_name; + unsigned long zo; + int ret; + int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS; + + zone_size = bs->bl.zone_size; + zone_size_mask = zone_size - 1; + if (offset & zone_size_mask) { + error_report("sector offset %" PRId64 " is not aligned to zone size " + "%" PRId64 "", offset / 512, zone_size / 512); + return -EINVAL; + } + + if (((offset + len) < capacity && len & zone_size_mask) || + offset + len > capacity) { + error_report("number of sectors %" PRId64 " is not aligned to zone size" + " %" PRId64 "", len / 512, zone_size / 512); + return -EINVAL; + } + + switch (op) { + case BLK_ZO_OPEN: + op_name = "BLKOPENZONE"; + zo = BLKOPENZONE; + break; + case BLK_ZO_CLOSE: + op_name = "BLKCLOSEZONE"; + zo = BLKCLOSEZONE; + break; + case BLK_ZO_FINISH: + op_name = "BLKFINISHZONE"; + zo = BLKFINISHZONE; + break; + case BLK_ZO_RESET: + op_name = "BLKRESETZONE"; + zo = BLKRESETZONE; + break; + default: + error_report("Unsupported zone op: 0x%x", op); + return -ENOTSUP; + } + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_ZONE_MGMT, + .aio_offset = offset, + .aio_nbytes = len, + .zone_mgmt = { + .op = 
zo, + }, + }; + + ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); + if (ret != 0) { + ret = -errno; + error_report("ioctl %s failed %d", op_name, ret); + } + + return ret; +} +#endif + static coroutine_fn int raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, bool blkdev) @@ -3756,13 +4069,23 @@ static BlockDriver bdrv_host_device = { .bdrv_check_perm = raw_check_perm, .bdrv_set_perm = raw_set_perm, .bdrv_abort_perm_update = raw_abort_perm_update, +#ifndef CONFIG_BLKZONED .bdrv_probe_blocksizes = hdev_probe_blocksizes, +#endif .bdrv_probe_geometry = hdev_probe_geometry, /* generic scsi device */ #ifdef __linux__ .bdrv_co_ioctl = hdev_co_ioctl, #endif + + /* zoned device */ +#if defined(CONFIG_BLKZONED) + /* zone management operations */ + .bdrv_probe_blocksizes = hdev_probe_zoned_blocksizes, + .bdrv_co_zone_report = raw_co_zone_report, + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, +#endif }; #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) diff --git a/block/io.c b/block/io.c index a09a19f7a7..1586e42ab9 100644 --- a/block/io.c +++ b/block/io.c @@ -3099,6 +3099,47 @@ out: return co.ret; } +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones) +{ + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + IO_CODE(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_zone_report) { + co.ret = -ENOTSUP; + goto out; + } + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, + int64_t offset, int64_t len) +{ + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + IO_CODE(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_zone_mgmt) { + co.ret = -ENOTSUP; + goto out; + } + co.ret = 
drv->bdrv_co_zone_mgmt(bs, op, offset, len); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + void *qemu_blockalign(BlockDriverState *bs, size_t size) { IO_CODE(); diff --git a/include/block/block-io.h b/include/block/block-io.h index 3398351596..10ff212036 100644 --- a/include/block/block-io.h +++ b/include/block/block-io.h @@ -98,6 +98,13 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs); int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); +/* Report zone information of zone block device. */ +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones); +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, + int64_t offset, int64_t len); + bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 57f0612f5e..565228d8dd 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -703,6 +703,12 @@ struct BlockDriver { int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_load_vmstate)( BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, + int64_t offset, unsigned int *nr_zones, + BlockZoneDescriptor *zones); + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op, + int64_t offset, int64_t len); + /* removable device specific */ bool (*bdrv_is_inserted)(BlockDriverState *bs); void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); @@ -839,6 +845,21 @@ typedef struct BlockLimits { /* device zone model */ BlockZoneModel zoned; + + /* zone size expressed in bytes */ + uint32_t zone_size; + + /* total number of zones */ + uint32_t nr_zones; + + /* maximum sectors of a zone append write operation */ + int64_t max_append_sectors; + + /* maximum 
number of open zones */ + int64_t max_open_zones; + + /* maximum number of active zones */ + int64_t max_active_zones; } BlockLimits; typedef struct BdrvOpBlocker BdrvOpBlocker; diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index f8cda9df91..eda6a7a253 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -28,6 +28,8 @@ #define QEMU_AIO_WRITE_ZEROES 0x0020 #define QEMU_AIO_COPY_RANGE 0x0040 #define QEMU_AIO_TRUNCATE 0x0080 +#define QEMU_AIO_ZONE_REPORT 0x0100 +#define QEMU_AIO_ZONE_MGMT 0x0200 #define QEMU_AIO_TYPE_MASK \ (QEMU_AIO_READ | \ QEMU_AIO_WRITE | \ @@ -36,7 +38,9 @@ QEMU_AIO_DISCARD | \ QEMU_AIO_WRITE_ZEROES | \ QEMU_AIO_COPY_RANGE | \ - QEMU_AIO_TRUNCATE) + QEMU_AIO_TRUNCATE | \ + QEMU_AIO_ZONE_REPORT | \ + QEMU_AIO_ZONE_MGMT) /* AIO flags */ #define QEMU_AIO_MISALIGNED 0x1000 diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h index 031a27ba10..dc8a4368f0 100644 --- a/include/sysemu/block-backend-io.h +++ b/include/sysemu/block-backend-io.h @@ -46,6 +46,13 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, BlockCompletionFunc *cb, void *opaque); BlockAIOCB *blk_aio_flush(BlockBackend *blk, BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len, + BlockCompletionFunc *cb, void *opaque); BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, BlockCompletionFunc *cb, void *opaque); void blk_aio_cancel_async(BlockAIOCB *acb); @@ -166,6 +173,17 @@ int co_wrapper_mixed blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, int64_t bytes, BdrvRequestFlags flags); +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, + unsigned 
int *nr_zones, + BlockZoneDescriptor *zones); +int co_wrapper_mixed blk_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones); +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len); +int co_wrapper_mixed blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len); + int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes); int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, diff --git a/meson.build b/meson.build index 6d3b665629..a267f74536 100644 --- a/meson.build +++ b/meson.build @@ -1962,6 +1962,7 @@ config_host_data.set('CONFIG_REPLICATION', get_option('replication').allowed()) # has_header config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h')) config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h')) +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h')) config_host_data.set('CONFIG_VALGRIND_H', cc.has_header('valgrind/valgrind.h')) config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h')) config_host_data.set('HAVE_DRM_H', cc.has_header('libdrm/drm.h')) @@ -2056,6 +2057,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM', cc.has_member('struct stat', 'st_atim', prefix: '#include <sys/stat.h>')) +config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY', + cc.has_member('struct blk_zone', 'capacity', + prefix: '#include <linux/blkzoned.h>')) # has_type config_host_data.set('CONFIG_IOVEC', diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index 952dc940f1..3a3bad77c3 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -1712,6 +1712,150 @@ static const cmdinfo_t flush_cmd = { .oneline = "flush all in-core file state to disk", }; +static inline int64_t tosector(int64_t bytes) +{ + return bytes >> BDRV_SECTOR_BITS; +} + +static int zone_report_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + 
int64_t offset; + unsigned int nr_zones; + + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + nr_zones = cvtnum(argv[optind]); + + g_autofree BlockZoneDescriptor *zones = NULL; + zones = g_new(BlockZoneDescriptor, nr_zones); + ret = blk_zone_report(blk, offset, &nr_zones, zones); + if (ret < 0) { + printf("zone report failed: %s\n", strerror(-ret)); + } else { + for (int i = 0; i < nr_zones; ++i) { + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", " + "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", " + "zcond:%u, [type: %u]\n", + tosector(zones[i].start), tosector(zones[i].length), + tosector(zones[i].cap), tosector(zones[i].wp), + zones[i].state, zones[i].type); + } + } + return ret; +} + +static const cmdinfo_t zone_report_cmd = { + .name = "zone_report", + .altname = "zrp", + .cfunc = zone_report_f, + .argmin = 2, + .argmax = 2, + .args = "offset number", + .oneline = "report zone information", +}; + +static int zone_open_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset, len; + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + len = cvtnum(argv[optind]); + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len); + if (ret < 0) { + printf("zone open failed: %s\n", strerror(-ret)); + } + return ret; +} + +static const cmdinfo_t zone_open_cmd = { + .name = "zone_open", + .altname = "zo", + .cfunc = zone_open_f, + .argmin = 2, + .argmax = 2, + .args = "offset len", + .oneline = "explicit open a range of zones in zone block device", +}; + +static int zone_close_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset, len; + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + len = cvtnum(argv[optind]); + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len); + if (ret < 0) { + printf("zone close failed: %s\n", strerror(-ret)); + } + return ret; +} + +static const cmdinfo_t zone_close_cmd = { + .name = "zone_close", + .altname = "zc", + .cfunc = zone_close_f, + .argmin = 2, + .argmax = 2, + .args = "offset len", + 
.oneline = "close a range of zones in zone block device", +}; + +static int zone_finish_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset, len; + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + len = cvtnum(argv[optind]); + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len); + if (ret < 0) { + printf("zone finish failed: %s\n", strerror(-ret)); + } + return ret; +} + +static const cmdinfo_t zone_finish_cmd = { + .name = "zone_finish", + .altname = "zf", + .cfunc = zone_finish_f, + .argmin = 2, + .argmax = 2, + .args = "offset len", + .oneline = "finish a range of zones in zone block device", +}; + +static int zone_reset_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset, len; + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + len = cvtnum(argv[optind]); + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len); + if (ret < 0) { + printf("zone reset failed: %s\n", strerror(-ret)); + } + return ret; +} + +static const cmdinfo_t zone_reset_cmd = { + .name = "zone_reset", + .altname = "zrs", + .cfunc = zone_reset_f, + .argmin = 2, + .argmax = 2, + .args = "offset len", + .oneline = "reset a zone write pointer in zone block device", +}; + static int truncate_f(BlockBackend *blk, int argc, char **argv); static const cmdinfo_t truncate_cmd = { .name = "truncate", @@ -2504,6 +2648,11 @@ static void __attribute((constructor)) init_qemuio_commands(void) qemuio_add_command(&aio_write_cmd); qemuio_add_command(&aio_flush_cmd); qemuio_add_command(&flush_cmd); + qemuio_add_command(&zone_report_cmd); + qemuio_add_command(&zone_open_cmd); + qemuio_add_command(&zone_close_cmd); + qemuio_add_command(&zone_finish_cmd); + qemuio_add_command(&zone_reset_cmd); qemuio_add_command(&truncate_cmd); qemuio_add_command(&length_cmd); qemuio_add_command(&info_cmd);