Message ID | 20220627001917.9417-2-faithilikerun@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | *** Add support for zoned device *** | expand |
On 6/27/22 02:19, Sam Li wrote: > By adding zone management operations in BlockDriver, storage > controller emulation can use the new block layer APIs including > zone_report and zone_mgmt(open, close, finish, reset). > --- > block/block-backend.c | 56 ++++++++ > block/coroutines.h | 5 + > block/file-posix.c | 238 +++++++++++++++++++++++++++++++ > include/block/block-common.h | 43 +++++- > include/block/block_int-common.h | 20 +++ > 5 files changed, 361 insertions(+), 1 deletion(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index e0e1aff4b1..786f964d02 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk) > return ret; > } > > +/* > + * Return zone_report from BlockDriver. Offset can be any number within > + * the zone size. No alignment for offset and len. > + */ > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > + int64_t len, int64_t *nr_zones, > + BlockZoneDescriptor *zones) > +{ > + int ret; > + BlockDriverState *bs; > + IO_CODE(); > + > + blk_inc_in_flight(blk); /* increase before waiting */ > + blk_wait_while_drained(blk); > + bs = blk_bs(blk); > + > + ret = blk_check_byte_request(blk, offset, len); > + if (ret < 0) { > + return ret; > + } > + > + bdrv_inc_in_flight(bs); > + ret = bdrv_co_zone_report(blk->root->bs, offset, len, > + nr_zones, zones); > + bdrv_dec_in_flight(bs); > + blk_dec_in_flight(blk); > + return ret; > +} > + > +/* > + * Return zone_mgmt from BlockDriver. > + * Offset is the start of a zone and len is aligned to zones. > + */ > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op, > + int64_t offset, int64_t len) > +{ > + int ret; > + BlockDriverState *bs; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + blk_wait_while_drained(blk); > + bs = blk_bs(blk); > + > + ret = blk_check_byte_request(blk, offset, len); > + if (ret < 0) { > + return ret; > + } > + > + bdrv_inc_in_flight(bs); > + ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len); > + bdrv_dec_in_flight(bs); > + blk_dec_in_flight(blk); > + return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/coroutines.h b/block/coroutines.h > index 830ecaa733..a114d7bc30 100644 > --- a/block/coroutines.h > +++ b/block/coroutines.h > @@ -80,6 +80,11 @@ int coroutine_fn > blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes); > > int coroutine_fn blk_co_do_flush(BlockBackend *blk); > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > + int64_t len, int64_t *nr_zones, > + BlockZoneDescriptor *zones); > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op, > + int64_t offset, int64_t len); > > > /* > diff --git a/block/file-posix.c b/block/file-posix.c > index 48cd096624..1b8b0d351f 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -67,6 +67,7 @@ > #include <sys/param.h> > #include <sys/syscall.h> > #include <sys/vfs.h> > +#include <linux/blkzoned.h> > #include <linux/cdrom.h> > #include <linux/fd.h> > #include <linux/fs.h> > @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData { > PreallocMode prealloc; > Error **errp; > } truncate; > + struct { > + int64_t *nr_zones; > + BlockZoneDescriptor *zones; > + } zone_report; > + zone_op op; > }; > } RawPosixAIOData; > > @@ -1801,6 +1807,135 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, > } > #endif > > +/* > + * parse_zone - Fill a zone descriptor > + */ > +static inline void parse_zone(struct BlockZoneDescriptor *zone, > + struct blk_zone *blkz) { > + zone->start = blkz->start; > + zone->length = blkz->len; > + zone->cap = blkz->capacity; > + zone->wp = blkz->wp - blkz->start; > + zone->type = blkz->type; > + zone->cond = blkz->cond; > +} > + > +static int handle_aiocb_zone_report(void *opaque) { > + RawPosixAIOData *aiocb = opaque; > + int fd = aiocb->aio_fildes; > + int64_t *nr_zones = aiocb->zone_report.nr_zones; > + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > + int64_t offset = aiocb->aio_offset; > + int64_t len = aiocb->aio_nbytes; > + > + struct blk_zone *blkz; > + int64_t rep_size, nrz; > + int ret, n = 0, i = 0; > + > + nrz = *nr_zones; > + if (len == -1) { > + return -errno; > + } > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > + g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz); > + offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */ > + printf("start to report zone with offset: 0x%lx\n", offset); > + > + blkz = (struct blk_zone *)(rep + 1); > + while (n < nrz) { > + memset(rep, 0, rep_size); > + rep->sector = offset; > + rep->nr_zones = nrz; > + > + ret = ioctl(fd, BLKREPORTZONE, rep); > + if (ret != 0) { > + ret = -errno; > + error_report("%d: ioctl BLKREPORTZONE at %ld failed %d", > + fd, offset, errno); > + return ret; > + } > + > + if (!rep->nr_zones) { > + break; > + } > + > + for (i = 0; i < rep->nr_zones; i++, n++) { > + parse_zone(&zones[n], &blkz[i]); > + /* The next report should start after the last zone reported */ > + offset = blkz[i].start + blkz[i].len; > + } Where do you increase 'n' such that the loop can make forward progress? Wouldn't it be better to use a for() loop here? > + } > + > + *nr_zones = n; > + return 0; > +} > + > +static int handle_aiocb_zone_mgmt(void *opaque) { > + RawPosixAIOData *aiocb = opaque; > + int fd = aiocb->aio_fildes; > + int64_t offset = aiocb->aio_offset; > + int64_t len = aiocb->aio_nbytes; > + zone_op op = aiocb->op; > + > + struct blk_zone_range range; > + const char *ioctl_name; > + unsigned long ioctl_op; > + int64_t zone_size; > + int64_t zone_size_mask; > + int ret; > + Shouldn't we add a check here if 'fd' points to a zoned device? ioctl errors are not _that_ helpful here, as you might get a variety of errors and it's not quite obvious which of those errors indicate an unsupported feature. > + ret = ioctl(fd, BLKGETZONESZ, &zone_size); > + if (ret) { > + return -1; > + } > + > + zone_size_mask = zone_size - 1; > + if (offset & zone_size_mask) { > + error_report("offset is not the start of a zone"); > + return -1; > + } > + > + if (len & zone_size_mask) { > + error_report("len is not aligned to zones"); > + return -1; > + } > + > + switch (op) { > + case zone_open: > + ioctl_name = "BLKOPENZONE"; > + ioctl_op = BLKOPENZONE; > + break; > + case zone_close: > + ioctl_name = "BLKCLOSEZONE"; > + ioctl_op = BLKCLOSEZONE; > + break; > + case zone_finish: > + ioctl_name = "BLKFINISHZONE"; > + ioctl_op = BLKFINISHZONE; > + break; > + case zone_reset: > + ioctl_name = "BLKRESETZONE"; > + ioctl_op = BLKRESETZONE; > + break; > + default: > + error_report("Invalid zone operation 0x%x", op); > + errno = -EINVAL; > + return -1; > + } > + > + /* Execute the operation */ > + range.sector = offset; > + range.nr_sectors = len; > + ret = ioctl(fd, ioctl_op, &range); > + if (ret != 0) { > + error_report("ioctl %s failed %d", > + ioctl_name, errno); > + return -1; > + } > + > + return 0; > +} > + > static int handle_aiocb_copy_range(void *opaque) > { > RawPosixAIOData *aiocb = opaque; > @@ -2973,6 +3108,58 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) > } > } > > +/* > + * zone report - Get a zone block device's information in the form > + * of an array of zone descriptors. > + * > + * @param bs: passing zone block device file descriptor > + * @param zones: an array of zone descriptors to hold zone > + * information on reply > + * @param offset: offset can be any byte within the zone size. > + * @param len: (not sure yet. > + * @return 0 on success, -1 on failure > + */ > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, > + int64_t len, int64_t *nr_zones, > + BlockZoneDescriptor *zones) { > + BDRVRawState *s = bs->opaque; > + RawPosixAIOData acb; > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > + .aio_type = QEMU_AIO_IOCTL, > + .aio_offset = offset, > + .aio_nbytes = len, > + .zone_report = { > + .nr_zones = nr_zones, > + .zones = zones, > + }, > + }; > + > + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); > +} > + > +/* > + * zone management operations - Execute an operation on a zone > + */ > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op, > + int64_t offset, int64_t len) { > + BDRVRawState *s = bs->opaque; > + RawPosixAIOData acb; > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > + .aio_type = QEMU_AIO_IOCTL, > + .aio_offset = offset, > + .aio_nbytes = len, > + .op = op, > + }; > + > + return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); > +} > + > static coroutine_fn int > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > bool blkdev) > @@ -3324,6 +3511,9 @@ BlockDriver bdrv_file = { > .bdrv_abort_perm_update = raw_abort_perm_update, > .create_opts = &raw_create_opts, > .mutable_opts = mutable_opts, > + > + .bdrv_co_zone_report = raw_co_zone_report, > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > }; > > /***********************************************/ > @@ -3703,6 +3893,53 @@ static BlockDriver bdrv_host_device = { > #endif > }; > > +static BlockDriver bdrv_zoned_host_device = { > + .format_name = "zoned_host_device", > + .protocol_name = "zoned_host_device", > + .instance_size = sizeof(BDRVRawState), > + .bdrv_needs_filename = true, > + .bdrv_probe_device = hdev_probe_device, > + .bdrv_parse_filename = hdev_parse_filename, > + .bdrv_file_open = hdev_open, > + .bdrv_close = raw_close, > + .bdrv_reopen_prepare = raw_reopen_prepare, > + .bdrv_reopen_commit = raw_reopen_commit, > + .bdrv_reopen_abort = raw_reopen_abort, > + .bdrv_co_create_opts = bdrv_co_create_opts_simple, > + .create_opts = &bdrv_create_opts_simple, > + .mutable_opts = mutable_opts, > + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, > + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, > + > + .bdrv_co_preadv = raw_co_preadv, > + .bdrv_co_pwritev = raw_co_pwritev, > + .bdrv_co_flush_to_disk = raw_co_flush_to_disk, > + .bdrv_co_pdiscard = hdev_co_pdiscard, > + .bdrv_co_copy_range_from = raw_co_copy_range_from, > + .bdrv_co_copy_range_to = raw_co_copy_range_to, > + .bdrv_refresh_limits = raw_refresh_limits, > + .bdrv_io_plug = raw_aio_plug, > + .bdrv_io_unplug = raw_aio_unplug, > + .bdrv_attach_aio_context = raw_aio_attach_aio_context, > + > + .bdrv_co_truncate = raw_co_truncate, > + .bdrv_getlength = raw_getlength, > + .bdrv_get_info = raw_get_info, > + .bdrv_get_allocated_file_size > + = raw_get_allocated_file_size, > + .bdrv_get_specific_stats = hdev_get_specific_stats, > + .bdrv_check_perm = raw_check_perm, > + .bdrv_set_perm = raw_set_perm, > + .bdrv_abort_perm_update = raw_abort_perm_update, > + .bdrv_probe_blocksizes = hdev_probe_blocksizes, > + .bdrv_probe_geometry = hdev_probe_geometry, > + .bdrv_co_ioctl = hdev_co_ioctl, > + > + /* zone management operations */ > + .bdrv_co_zone_report = raw_co_zone_report, > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > +}; > + > #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > static void cdrom_parse_filename(const char *filename, QDict *options, > Error **errp) > @@ -3964,6 +4201,7 @@ static void bdrv_file_init(void) > #if defined(HAVE_HOST_BLOCK_DEVICE) > bdrv_register(&bdrv_host_device); > #ifdef __linux__ > + bdrv_register(&bdrv_zoned_host_device); > bdrv_register(&bdrv_host_cdrom); > #endif > #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > diff --git a/include/block/block-common.h b/include/block/block-common.h > index fdb7306e78..78cddeeda5 100644 > --- a/include/block/block-common.h > +++ b/include/block/block-common.h > @@ -23,7 +23,6 @@ > */ > #ifndef BLOCK_COMMON_H > #define BLOCK_COMMON_H > - > #include "block/aio.h" > #include "block/aio-wait.h" > #include "qemu/iov.h" > @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver; > typedef struct BdrvChild BdrvChild; > typedef struct BdrvChildClass BdrvChildClass; > > +typedef enum zone_op { > + zone_open, > + zone_close, > + zone_finish, > + zone_reset, > +} zone_op; > + > +typedef enum zone_model { > + BLK_Z_HM, > + BLK_Z_HA, > +} zone_model; > + > +typedef enum BlkZoneCondition { > + BLK_ZS_NOT_WP = 0x0, > + BLK_ZS_EMPTY = 0x1, > + BLK_ZS_IOPEN = 0x2, > + BLK_ZS_EOPEN = 0x3, > + BLK_ZS_CLOSED = 0x4, > + BLK_ZS_RDONLY = 0xD, > + BLK_ZS_FULL = 0xE, > + BLK_ZS_OFFLINE = 0xF, > +} BlkZoneCondition; > + > +typedef enum BlkZoneType { > + BLK_ZT_CONV = 0x1, > + BLK_ZT_SWR = 0x2, > + BLK_ZT_SWP = 0x3, > +} BlkZoneType; > + > +/* > + * Zone descriptor data structure. > + * Provide information on a zone with all position and size values in bytes. > + */ > +typedef struct BlockZoneDescriptor { > + uint64_t start; > + uint64_t length; > + uint64_t cap; > + uint64_t wp; > + BlkZoneType type; > + BlkZoneCondition cond; > +} BlockZoneDescriptor; > + > typedef struct BlockDriverInfo { > /* in bytes, 0 if irrelevant */ > int cluster_size; > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h > index 8947abab76..b9ea9db6dc 100644 > --- a/include/block/block_int-common.h > +++ b/include/block/block_int-common.h > @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest { > struct BdrvTrackedRequest *waiting_for; > } BdrvTrackedRequest; > > +/** > + * Zone device information data structure. > + * Provide information on a device. > + */ > +typedef struct zbd_dev { > + uint32_t zone_size; > + zone_model model; > + uint32_t block_size; > + uint32_t write_granularity; > + uint32_t nr_zones; > + struct BlockZoneDescriptor *zones; /* array of zones */ > + uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */ > + uint32_t max_nr_active_zones; > +} zbd_dev; > > struct BlockDriver { > /* > @@ -691,6 +705,12 @@ struct BlockDriver { > QEMUIOVector *qiov, > int64_t pos); > > + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, > + int64_t offset, int64_t len, int64_t *nr_zones, > + BlockZoneDescriptor *zones); > + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op, > + int64_t offset, int64_t len); > + > /* removable device specific */ > bool (*bdrv_is_inserted)(BlockDriverState *bs); > void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); Other than that: Well done! Cheers, Hannes
Hi Hannes, Hannes Reinecke <hare@suse.de> 于2022年6月27日周一 15:21写道: > > On 6/27/22 02:19, Sam Li wrote: > > By adding zone management operations in BlockDriver, storage > > controller emulation can use the new block layer APIs including > > zone_report and zone_mgmt(open, close, finish, reset). > > --- > > block/block-backend.c | 56 ++++++++ > > block/coroutines.h | 5 + > > block/file-posix.c | 238 +++++++++++++++++++++++++++++++ > > include/block/block-common.h | 43 +++++- > > include/block/block_int-common.h | 20 +++ > > 5 files changed, 361 insertions(+), 1 deletion(-) > > > > diff --git a/block/block-backend.c b/block/block-backend.c > > index e0e1aff4b1..786f964d02 100644 > > --- a/block/block-backend.c > > +++ b/block/block-backend.c > > @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk) > > return ret; > > } > > > > +/* > > + * Return zone_report from BlockDriver. Offset can be any number within > > + * the zone size. No alignment for offset and len. > > + */ > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > > + int64_t len, int64_t *nr_zones, > > + BlockZoneDescriptor *zones) > > +{ > > + int ret; > > + BlockDriverState *bs; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); /* increase before waiting */ > > + blk_wait_while_drained(blk); > > + bs = blk_bs(blk); > > + > > + ret = blk_check_byte_request(blk, offset, len); > > + if (ret < 0) { > > + return ret; > > + } > > + > > + bdrv_inc_in_flight(bs); > > + ret = bdrv_co_zone_report(blk->root->bs, offset, len, > > + nr_zones, zones); > > + bdrv_dec_in_flight(bs); > > + blk_dec_in_flight(blk); > > + return ret; > > +} > > + > > +/* > > + * Return zone_mgmt from BlockDriver. > > + * Offset is the start of a zone and len is aligned to zones. > > + */ > > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op, > > + int64_t offset, int64_t len) > > +{ > > + int ret; > > + BlockDriverState *bs; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + blk_wait_while_drained(blk); > > + bs = blk_bs(blk); > > + > > + ret = blk_check_byte_request(blk, offset, len); > > + if (ret < 0) { > > + return ret; > > + } > > + > > + bdrv_inc_in_flight(bs); > > + ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len); > > + bdrv_dec_in_flight(bs); > > + blk_dec_in_flight(blk); > > + return ret; > > +} > > + > > void blk_drain(BlockBackend *blk) > > { > > BlockDriverState *bs = blk_bs(blk); > > diff --git a/block/coroutines.h b/block/coroutines.h > > index 830ecaa733..a114d7bc30 100644 > > --- a/block/coroutines.h > > +++ b/block/coroutines.h > > @@ -80,6 +80,11 @@ int coroutine_fn > > blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes); > > > > int coroutine_fn blk_co_do_flush(BlockBackend *blk); > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > > + int64_t len, int64_t *nr_zones, > > + BlockZoneDescriptor *zones); > > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op, > > + int64_t offset, int64_t len); > > > > > > /* > > diff --git a/block/file-posix.c b/block/file-posix.c > > index 48cd096624..1b8b0d351f 100644 > > --- a/block/file-posix.c > > +++ b/block/file-posix.c > > @@ -67,6 +67,7 @@ > > #include <sys/param.h> > > #include <sys/syscall.h> > > #include <sys/vfs.h> > > +#include <linux/blkzoned.h> > > #include <linux/cdrom.h> > > #include <linux/fd.h> > > #include <linux/fs.h> > > @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData { > > PreallocMode prealloc; > > Error **errp; > > } truncate; > > + struct { > > + int64_t *nr_zones; > > + BlockZoneDescriptor *zones; > > + } zone_report; > > + zone_op op; > > }; > > } RawPosixAIOData; > > > > @@ -1801,6 +1807,135 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, > > } > > #endif > > > > +/* > > + * parse_zone - Fill a zone descriptor > > + */ > > +static inline void parse_zone(struct BlockZoneDescriptor *zone, > > + struct blk_zone *blkz) { > > + zone->start = blkz->start; > > + zone->length = blkz->len; > > + zone->cap = blkz->capacity; > > + zone->wp = blkz->wp - blkz->start; > > + zone->type = blkz->type; > > + zone->cond = blkz->cond; > > +} > > + > > +static int handle_aiocb_zone_report(void *opaque) { > > + RawPosixAIOData *aiocb = opaque; > > + int fd = aiocb->aio_fildes; > > + int64_t *nr_zones = aiocb->zone_report.nr_zones; > > + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > > + int64_t offset = aiocb->aio_offset; > > + int64_t len = aiocb->aio_nbytes; > > + > > + struct blk_zone *blkz; > > + int64_t rep_size, nrz; > > + int ret, n = 0, i = 0; > > + > > + nrz = *nr_zones; > > + if (len == -1) { > > + return -errno; > > + } > > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > > + g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz); > > + offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */ > > + printf("start to report zone with offset: 0x%lx\n", offset); > > + > > + blkz = (struct blk_zone *)(rep + 1); > > + while (n < nrz) { > > + memset(rep, 0, rep_size); > > + rep->sector = offset; > > + rep->nr_zones = nrz; > > + > > + ret = ioctl(fd, BLKREPORTZONE, rep); > > + if (ret != 0) { > > + ret = -errno; > > + error_report("%d: ioctl BLKREPORTZONE at %ld failed %d", > > + fd, offset, errno); > > + return ret; > > + } > > + > > + if (!rep->nr_zones) { > > + break; > > + } > > + > > + for (i = 0; i < rep->nr_zones; i++, n++) { > > + parse_zone(&zones[n], &blkz[i]); > > + /* The next report should start after the last zone reported */ > > + offset = blkz[i].start + blkz[i].len; > > + } > > Where do you increase 'n' such that the loop can make forward progress? > Wouldn't it be better to use a for() loop here? > 'n' increases in this for loop as 'i' increases. I think the for() loop can serve the same purpose with some modifications. > > + } > > + > > + *nr_zones = n; > > + return 0; > > +} > > + > > +static int handle_aiocb_zone_mgmt(void *opaque) { > > + RawPosixAIOData *aiocb = opaque; > > + int fd = aiocb->aio_fildes; > > + int64_t offset = aiocb->aio_offset; > > + int64_t len = aiocb->aio_nbytes; > > + zone_op op = aiocb->op; > > + > > + struct blk_zone_range range; > > + const char *ioctl_name; > > + unsigned long ioctl_op; > > + int64_t zone_size; > > + int64_t zone_size_mask; > > + int ret; > > + > > Shouldn't we add a check here if 'fd' points to a zoned device? > ioctl errors are not _that_ helpful here, as you might get a variety > of errors and it's not quite obvious which of those errors indicate > an unsupported feature. > Yes, I'll add it in the next patch. > > + ret = ioctl(fd, BLKGETZONESZ, &zone_size); > > + if (ret) { > > + return -1; > > + } > > + > > + zone_size_mask = zone_size - 1; > > + if (offset & zone_size_mask) { > > + error_report("offset is not the start of a zone"); > > + return -1; > > + } > > + > > + if (len & zone_size_mask) { > > + error_report("len is not aligned to zones"); > > + return -1; > > + } > > + > > + switch (op) { > > + case zone_open: > > + ioctl_name = "BLKOPENZONE"; > > + ioctl_op = BLKOPENZONE; > > + break; > > + case zone_close: > > + ioctl_name = "BLKCLOSEZONE"; > > + ioctl_op = BLKCLOSEZONE; > > + break; > > + case zone_finish: > > + ioctl_name = "BLKFINISHZONE"; > > + ioctl_op = BLKFINISHZONE; > > + break; > > + case zone_reset: > > + ioctl_name = "BLKRESETZONE"; > > + ioctl_op = BLKRESETZONE; > > + break; > > + default: > > + error_report("Invalid zone operation 0x%x", op); > > + errno = -EINVAL; > > + return -1; > > + } > > + > > + /* Execute the operation */ > > + range.sector = offset; > > + range.nr_sectors = len; > > + ret = ioctl(fd, ioctl_op, &range); > > + if (ret != 0) { > > + error_report("ioctl %s failed %d", > > + ioctl_name, errno); > > + return -1; > > + } > > + > > + return 0; > > +} > > + > > static int handle_aiocb_copy_range(void *opaque) > > { > > RawPosixAIOData *aiocb = opaque; > > @@ -2973,6 +3108,58 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) > > } > > } > > > > +/* > > + * zone report - Get a zone block device's information in the form > > + * of an array of zone descriptors. > > + * > > + * @param bs: passing zone block device file descriptor > > + * @param zones: an array of zone descriptors to hold zone > > + * information on reply > > + * @param offset: offset can be any byte within the zone size. > > + * @param len: (not sure yet. > > + * @return 0 on success, -1 on failure > > + */ > > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, > > + int64_t len, int64_t *nr_zones, > > + BlockZoneDescriptor *zones) { > > + BDRVRawState *s = bs->opaque; > > + RawPosixAIOData acb; > > + > > + acb = (RawPosixAIOData) { > > + .bs = bs, > > + .aio_fildes = s->fd, > > + .aio_type = QEMU_AIO_IOCTL, > > + .aio_offset = offset, > > + .aio_nbytes = len, > > + .zone_report = { > > + .nr_zones = nr_zones, > > + .zones = zones, > > + }, > > + }; > > + > > + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); > > +} > > + > > +/* > > + * zone management operations - Execute an operation on a zone > > + */ > > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op, > > + int64_t offset, int64_t len) { > > + BDRVRawState *s = bs->opaque; > > + RawPosixAIOData acb; > > + > > + acb = (RawPosixAIOData) { > > + .bs = bs, > > + .aio_fildes = s->fd, > > + .aio_type = QEMU_AIO_IOCTL, > > + .aio_offset = offset, > > + .aio_nbytes = len, > > + .op = op, > > + }; > > + > > + return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); > > +} > > + > > static coroutine_fn int > > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > > bool blkdev) > > @@ -3324,6 +3511,9 @@ BlockDriver bdrv_file = { > > .bdrv_abort_perm_update = raw_abort_perm_update, > > .create_opts = &raw_create_opts, > > .mutable_opts = mutable_opts, > > + > > + .bdrv_co_zone_report = raw_co_zone_report, > > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > > }; > > > > /***********************************************/ > > @@ -3703,6 +3893,53 @@ static BlockDriver bdrv_host_device = { > > #endif > > }; > > > > +static BlockDriver bdrv_zoned_host_device = { > > + .format_name = "zoned_host_device", > > + .protocol_name = "zoned_host_device", > > + .instance_size = sizeof(BDRVRawState), > > + .bdrv_needs_filename = true, > > + .bdrv_probe_device = hdev_probe_device, > > + .bdrv_parse_filename = hdev_parse_filename, > > + .bdrv_file_open = hdev_open, > > + .bdrv_close = raw_close, > > + .bdrv_reopen_prepare = raw_reopen_prepare, > > + .bdrv_reopen_commit = raw_reopen_commit, > > + .bdrv_reopen_abort = raw_reopen_abort, > > + .bdrv_co_create_opts = bdrv_co_create_opts_simple, > > + .create_opts = &bdrv_create_opts_simple, > > + .mutable_opts = mutable_opts, > > + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, > > + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, > > + > > + .bdrv_co_preadv = raw_co_preadv, > > + .bdrv_co_pwritev = raw_co_pwritev, > > + .bdrv_co_flush_to_disk = raw_co_flush_to_disk, > > + .bdrv_co_pdiscard = hdev_co_pdiscard, > > + .bdrv_co_copy_range_from = raw_co_copy_range_from, > > + .bdrv_co_copy_range_to = raw_co_copy_range_to, > > + .bdrv_refresh_limits = raw_refresh_limits, > > + .bdrv_io_plug = raw_aio_plug, > > + .bdrv_io_unplug = raw_aio_unplug, > > + .bdrv_attach_aio_context = raw_aio_attach_aio_context, > > + > > + .bdrv_co_truncate = raw_co_truncate, > > + .bdrv_getlength = raw_getlength, > > + .bdrv_get_info = raw_get_info, > > + .bdrv_get_allocated_file_size > > + = raw_get_allocated_file_size, > > + .bdrv_get_specific_stats = hdev_get_specific_stats, > > + .bdrv_check_perm = raw_check_perm, > > + .bdrv_set_perm = raw_set_perm, > > + .bdrv_abort_perm_update = raw_abort_perm_update, > > + .bdrv_probe_blocksizes = hdev_probe_blocksizes, > > + .bdrv_probe_geometry = hdev_probe_geometry, > > + .bdrv_co_ioctl = hdev_co_ioctl, > > + > > + /* zone management operations */ > > + .bdrv_co_zone_report = raw_co_zone_report, > > + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > > +}; > > + > > #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > > static void cdrom_parse_filename(const char *filename, QDict *options, > > Error **errp) > > @@ -3964,6 +4201,7 @@ static void bdrv_file_init(void) > > #if defined(HAVE_HOST_BLOCK_DEVICE) > > bdrv_register(&bdrv_host_device); > > #ifdef __linux__ > > + bdrv_register(&bdrv_zoned_host_device); > > bdrv_register(&bdrv_host_cdrom); > > #endif > > #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) > > diff --git a/include/block/block-common.h b/include/block/block-common.h > > index fdb7306e78..78cddeeda5 100644 > > --- a/include/block/block-common.h > > +++ b/include/block/block-common.h > > @@ -23,7 +23,6 @@ > > */ > > #ifndef BLOCK_COMMON_H > > #define BLOCK_COMMON_H > > - > > #include "block/aio.h" > > #include "block/aio-wait.h" > > #include "qemu/iov.h" > > @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver; > > typedef struct BdrvChild BdrvChild; > > typedef struct BdrvChildClass BdrvChildClass; > > > > +typedef enum zone_op { > > + zone_open, > > + zone_close, > > + zone_finish, > > + zone_reset, > > +} zone_op; > > + > > +typedef enum zone_model { > > + BLK_Z_HM, > > + BLK_Z_HA, > > +} zone_model; > > + > > +typedef enum BlkZoneCondition { > > + BLK_ZS_NOT_WP = 0x0, > > + BLK_ZS_EMPTY = 0x1, > > + BLK_ZS_IOPEN = 0x2, > > + BLK_ZS_EOPEN = 0x3, > > + BLK_ZS_CLOSED = 0x4, > > + BLK_ZS_RDONLY = 0xD, > > + BLK_ZS_FULL = 0xE, > > + BLK_ZS_OFFLINE = 0xF, > > +} BlkZoneCondition; > > + > > +typedef enum BlkZoneType { > > + BLK_ZT_CONV = 0x1, > > + BLK_ZT_SWR = 0x2, > > + BLK_ZT_SWP = 0x3, > > +} BlkZoneType; > > + > > +/* > > + * Zone descriptor data structure. > > + * Provide information on a zone with all position and size values in bytes. > > + */ > > +typedef struct BlockZoneDescriptor { > > + uint64_t start; > > + uint64_t length; > > + uint64_t cap; > > + uint64_t wp; > > + BlkZoneType type; > > + BlkZoneCondition cond; > > +} BlockZoneDescriptor; > > + > > typedef struct BlockDriverInfo { > > /* in bytes, 0 if irrelevant */ > > int cluster_size; > > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h > > index 8947abab76..b9ea9db6dc 100644 > > --- a/include/block/block_int-common.h > > +++ b/include/block/block_int-common.h > > @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest { > > struct BdrvTrackedRequest *waiting_for; > > } BdrvTrackedRequest; > > > > +/** > > + * Zone device information data structure. > > + * Provide information on a device. > > + */ > > +typedef struct zbd_dev { > > + uint32_t zone_size; > > + zone_model model; > > + uint32_t block_size; > > + uint32_t write_granularity; > > + uint32_t nr_zones; > > + struct BlockZoneDescriptor *zones; /* array of zones */ > > + uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */ > > + uint32_t max_nr_active_zones; > > +} zbd_dev; > > > > struct BlockDriver { > > /* > > @@ -691,6 +705,12 @@ struct BlockDriver { > > QEMUIOVector *qiov, > > int64_t pos); > > > > + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, > > + int64_t offset, int64_t len, int64_t *nr_zones, > > + BlockZoneDescriptor *zones); > > + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op, > > + int64_t offset, int64_t len); > > + > > /* removable device specific */ > > bool (*bdrv_is_inserted)(BlockDriverState *bs); > > void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); > > Other than that: Well done! > Thanks for reviewing! Have a good day! Sam > Cheers, > > Hannes > -- > Dr. Hannes Reinecke Kernel Storage Architect > hare@suse.de +49 911 74053 688 > SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg > HRB 36809 (AG Nürnberg), GF: Felix Imendörffer
On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote: > diff --git a/block/block-backend.c b/block/block-backend.c > index e0e1aff4b1..786f964d02 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk) > return ret; > } > > +/* > + * Return zone_report from BlockDriver. Offset can be any number within > + * the zone size. No alignment for offset and len. What is the purpose of len? Is it the maximum number of zones to return in nr_zones[]? How does the caller know how many zones were returned? > + */ > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > + int64_t len, int64_t *nr_zones, > + BlockZoneDescriptor *zones) > +{ > + int ret; > + BlockDriverState *bs; > + IO_CODE(); > + > + blk_inc_in_flight(blk); /* increase before waiting */ > + blk_wait_while_drained(blk); > + bs = blk_bs(blk); > + > + ret = blk_check_byte_request(blk, offset, len); > + if (ret < 0) { > + return ret; > + } > + > + bdrv_inc_in_flight(bs); The bdrv_inc/dec_in_flight() call should be inside bdrv_co_zone_report(). See bdrv_co_ioctl() for an example. > + ret = bdrv_co_zone_report(blk->root->bs, offset, len, > + nr_zones, zones); > + bdrv_dec_in_flight(bs); > + blk_dec_in_flight(blk); > + return ret; > +} > + > +/* > + * Return zone_mgmt from BlockDriver. Maybe this should be: Send a zone management command. > @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData { > PreallocMode prealloc; > Error **errp; > } truncate; > + struct { > + int64_t *nr_zones; > + BlockZoneDescriptor *zones; > + } zone_report; > + zone_op op; It's cleaner to put op inside a struct zone_mgmt so its purpose is self-explanatory: struct { zone_op op; } zone_mgmt; > +static int handle_aiocb_zone_report(void *opaque) { > + RawPosixAIOData *aiocb = opaque; > + int fd = aiocb->aio_fildes; > + int64_t *nr_zones = aiocb->zone_report.nr_zones; > + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > + int64_t offset = aiocb->aio_offset; > + int64_t len = aiocb->aio_nbytes; > + > + struct blk_zone *blkz; > + int64_t rep_size, nrz; > + int ret, n = 0, i = 0; > + > + nrz = *nr_zones; > + if (len == -1) { > + return -errno; Where is errno set? Should this be an errno constant instead like -EINVAL? > + } > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > + g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz); g_new() looks incorrect. There should be 1 struct blk_zone_report followed by nrz struct blk_zone structs. Please use g_malloc(rep_size) instead. > + offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */ > + printf("start to report zone with offset: 0x%lx\n", offset); > + > + blkz = (struct blk_zone *)(rep + 1); > + while (n < nrz) { > + memset(rep, 0, rep_size); > + rep->sector = offset; > + rep->nr_zones = nrz; What prevents zones[] overflowing? nrz isn't being reduced in the loop, so maybe the rep->nr_zones return value will eventually exceed the number of elements still available in zones[n..]? > +static int handle_aiocb_zone_mgmt(void *opaque) { > + RawPosixAIOData *aiocb = opaque; > + int fd = aiocb->aio_fildes; > + int64_t offset = aiocb->aio_offset; > + int64_t len = aiocb->aio_nbytes; > + zone_op op = aiocb->op; > + > + struct blk_zone_range range; > + const char *ioctl_name; > + unsigned long ioctl_op; > + int64_t zone_size; > + int64_t zone_size_mask; > + int ret; > + > + ret = ioctl(fd, BLKGETZONESZ, &zone_size); Can this value be stored in bs (maybe in BlockLimits) to avoid calling ioctl(BLKGETZONESZ) each time? > + if (ret) { > + return -1; The return value should be a negative errno. -1 is -EPERM but it's probably not that error code you wanted. This should be: return -errno; > + } > + > + zone_size_mask = zone_size - 1; > + if (offset & zone_size_mask) { > + error_report("offset is not the start of a zone"); > + return -1; return -EINVAL; > + } > + > + if (len & zone_size_mask) { > + error_report("len is not aligned to zones"); > + return -1; return -EINVAL; > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, > + int64_t len, int64_t *nr_zones, > + BlockZoneDescriptor *zones) { > + BDRVRawState *s = bs->opaque; > + RawPosixAIOData acb; > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > + .aio_type = QEMU_AIO_IOCTL, > + .aio_offset = offset, > + .aio_nbytes = len, > + .zone_report = { > + .nr_zones = nr_zones, > + .zones = zones, > + }, Indentation is off here. Please use 4-space indentation.
Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道: > > On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote: > > diff --git a/block/block-backend.c b/block/block-backend.c > > index e0e1aff4b1..786f964d02 100644 > > --- a/block/block-backend.c > > +++ b/block/block-backend.c > > @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk) > > return ret; > > } > > > > +/* > > + * Return zone_report from BlockDriver. Offset can be any number within > > + * the zone size. No alignment for offset and len. > > What is the purpose of len? Is it the maximum number of zones to return > in nr_zones[]? len is actually not used in bdrv_co_zone_report. It is needed by blk_check_byte_request. > How does the caller know how many zones were returned? nr_zones represents IN maximum and OUT actual. The caller will know by nr_zones which is changed in bdrv_co_zone_report. I'll add it in the comments. > > > + */ > > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > > + int64_t len, int64_t *nr_zones, > > + BlockZoneDescriptor *zones) > > +{ > > + int ret; > > + BlockDriverState *bs; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); /* increase before waiting */ > > + blk_wait_while_drained(blk); > > + bs = blk_bs(blk); > > + > > + ret = blk_check_byte_request(blk, offset, len); > > + if (ret < 0) { > > + return ret; > > + } > > + > > + bdrv_inc_in_flight(bs); > > The bdrv_inc/dec_in_flight() call should be inside > bdrv_co_zone_report(). See bdrv_co_ioctl() for an example. > > > + ret = bdrv_co_zone_report(blk->root->bs, offset, len, > > + nr_zones, zones); > > + bdrv_dec_in_flight(bs); > > + blk_dec_in_flight(blk); > > + return ret; > > +} > > + > > +/* > > + * Return zone_mgmt from BlockDriver. > > Maybe this should be: > > Send a zone management command. Yes, it's more accurate. > > > @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData { > > PreallocMode prealloc; > > Error **errp; > > } truncate; > > + struct { > > + int64_t *nr_zones; > > + BlockZoneDescriptor *zones; > > + } zone_report; > > + zone_op op; > > It's cleaner to put op inside a struct zone_mgmt so its purpose is > self-explanatory: > > struct { > zone_op op; > } zone_mgmt; > > > +static int handle_aiocb_zone_report(void *opaque) { > > + RawPosixAIOData *aiocb = opaque; > > + int fd = aiocb->aio_fildes; > > + int64_t *nr_zones = aiocb->zone_report.nr_zones; > > + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > > + int64_t offset = aiocb->aio_offset; > > + int64_t len = aiocb->aio_nbytes; > > + > > + struct blk_zone *blkz; > > + int64_t rep_size, nrz; > > + int ret, n = 0, i = 0; > > + > > + nrz = *nr_zones; > > + if (len == -1) { > > + return -errno; > > Where is errno set? Should this be an errno constant instead like > -EINVAL? That's right! Noted. > > > + } > > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > > + g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz); > > g_new() looks incorrect. There should be 1 struct blk_zone_report > followed by nrz struct blk_zone structs. Please use g_malloc(rep_size) > instead. Yes! However, it still has a memory leak error when using g_autofree && g_malloc. > > > + offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */ > > + printf("start to report zone with offset: 0x%lx\n", offset); > > + > > + blkz = (struct blk_zone *)(rep + 1); > > + while (n < nrz) { > > + memset(rep, 0, rep_size); > > + rep->sector = offset; > > + rep->nr_zones = nrz; > > What prevents zones[] overflowing? nrz isn't being reduced in the loop, > so maybe the rep->nr_zones return value will eventually exceed the > number of elements still available in zones[n..]? I suppose the number of zones[] is restricted in the subsequent for loop where zones[] copy one zone at a time as n increases. Even if rep->zones exceeds the available room in zones[], the extra zone will not be copied. > > > +static int handle_aiocb_zone_mgmt(void *opaque) { > > + RawPosixAIOData *aiocb = opaque; > > + int fd = aiocb->aio_fildes; > > + int64_t offset = aiocb->aio_offset; > > + int64_t len = aiocb->aio_nbytes; > > + zone_op op = aiocb->op; > > + > > + struct blk_zone_range range; > > + const char *ioctl_name; > > + unsigned long ioctl_op; > > + int64_t zone_size; > > + int64_t zone_size_mask; > > + int ret; > > + > > + ret = ioctl(fd, BLKGETZONESZ, &zone_size); > > Can this value be stored in bs (maybe in BlockLimits) to avoid calling > ioctl(BLKGETZONESZ) each time? Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after I think through the configurations. In addition, it's a temporary approach. It is substituted by get_sysfs_long_val now. > > > + if (ret) { > > + return -1; > > The return value should be a negative errno. -1 is -EPERM but it's > probably not that error code you wanted. This should be: > > return -errno; > > > + } > > + > > + zone_size_mask = zone_size - 1; > > + if (offset & zone_size_mask) { > > + error_report("offset is not the start of a zone"); > > + return -1; > > return -EINVAL; > > > + } > > + > > + if (len & zone_size_mask) { > > + error_report("len is not aligned to zones"); > > + return -1; > > return -EINVAL; > > > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, > > + int64_t len, int64_t *nr_zones, > > + BlockZoneDescriptor *zones) { > > + BDRVRawState *s = bs->opaque; > > + RawPosixAIOData acb; > > + > > + acb = (RawPosixAIOData) { > > + .bs = bs, > > + .aio_fildes = s->fd, > > + .aio_type = QEMU_AIO_IOCTL, > > + .aio_offset = offset, > > + .aio_nbytes = len, > > + .zone_report = { > > + .nr_zones = nr_zones, > > + .zones = zones, > > + }, > > Indentation is off here. Please use 4-space indentation. Noted! Thanks for reviewing! Sam
On 6/28/22 17:05, Sam Li wrote: > Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道: >> >> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote: >>> diff --git a/block/block-backend.c b/block/block-backend.c >>> index e0e1aff4b1..786f964d02 100644 >>> --- a/block/block-backend.c >>> +++ b/block/block-backend.c >>> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk) >>> return ret; >>> } >>> >>> +/* >>> + * Return zone_report from BlockDriver. Offset can be any number within >>> + * the zone size. No alignment for offset and len. >> >> What is the purpose of len? Is it the maximum number of zones to return >> in nr_zones[]? > > len is actually not used in bdrv_co_zone_report. It is needed by > blk_check_byte_request. There is no IO buffer being passed. Only an array of zone descriptors and an in-out number of zones. len is definitely not needed. Not sure what blk_check_byte_request() is intended to check though. > >> How does the caller know how many zones were returned? > > nr_zones represents IN maximum and OUT actual. The caller will know by > nr_zones which is changed in bdrv_co_zone_report. I'll add it in the > comments. > >> >>> + */ >>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, >>> + int64_t len, int64_t *nr_zones, >>> + BlockZoneDescriptor *zones) >>> +{ >>> + int ret; >>> + BlockDriverState *bs; >>> + IO_CODE(); >>> + >>> + blk_inc_in_flight(blk); /* increase before waiting */ >>> + blk_wait_while_drained(blk); >>> + bs = blk_bs(blk); >>> + >>> + ret = blk_check_byte_request(blk, offset, len); >>> + if (ret < 0) { >>> + return ret; >>> + } >>> + >>> + bdrv_inc_in_flight(bs); >> >> The bdrv_inc/dec_in_flight() call should be inside >> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example. >> >>> + ret = bdrv_co_zone_report(blk->root->bs, offset, len, >>> + nr_zones, zones); >>> + bdrv_dec_in_flight(bs); >>> + blk_dec_in_flight(blk); >>> + return ret; >>> +} >>> + >>> +/* >>> + * Return zone_mgmt from BlockDriver. >> >> Maybe this should be: >> >> Send a zone management command. > > Yes, it's more accurate. > >> >>> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData { >>> PreallocMode prealloc; >>> Error **errp; >>> } truncate; >>> + struct { >>> + int64_t *nr_zones; >>> + BlockZoneDescriptor *zones; >>> + } zone_report; >>> + zone_op op; >> >> It's cleaner to put op inside a struct zone_mgmt so its purpose is >> self-explanatory: >> >> struct { >> zone_op op; >> } zone_mgmt; >> >>> +static int handle_aiocb_zone_report(void *opaque) { >>> + RawPosixAIOData *aiocb = opaque; >>> + int fd = aiocb->aio_fildes; >>> + int64_t *nr_zones = aiocb->zone_report.nr_zones; >>> + BlockZoneDescriptor *zones = aiocb->zone_report.zones; >>> + int64_t offset = aiocb->aio_offset; >>> + int64_t len = aiocb->aio_nbytes; Since you have the zone array and number of zones (size of that array) I really do not see why you need len. >>> + >>> + struct blk_zone *blkz; >>> + int64_t rep_size, nrz; >>> + int ret, n = 0, i = 0; >>> + >>> + nrz = *nr_zones; >>> + if (len == -1) { >>> + return -errno; >> >> Where is errno set? Should this be an errno constant instead like >> -EINVAL? > > That's right! Noted. > >> >>> + } >>> + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); >>> + g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz); >> >> g_new() looks incorrect. There should be 1 struct blk_zone_report >> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size) >> instead. > > Yes! However, it still has a memory leak error when using g_autofree > && g_malloc. That may be because you are changing the value of the rep pointer while parsing the report ? > >> >>> + offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */ >>> + printf("start to report zone with offset: 0x%lx\n", offset); >>> + >>> + blkz = (struct blk_zone *)(rep + 1); >>> + while (n < nrz) { >>> + memset(rep, 0, rep_size); >>> + rep->sector = offset; >>> + rep->nr_zones = nrz; >> >> What prevents zones[] overflowing? nrz isn't being reduced in the loop, >> so maybe the rep->nr_zones return value will eventually exceed the >> number of elements still available in zones[n..]? > > I suppose the number of zones[] is restricted in the subsequent for > loop where zones[] copy one zone at a time as n increases. Even if > rep->zones exceeds the available room in zones[], the extra zone will > not be copied. > >> >>> +static int handle_aiocb_zone_mgmt(void *opaque) { >>> + RawPosixAIOData *aiocb = opaque; >>> + int fd = aiocb->aio_fildes; >>> + int64_t offset = aiocb->aio_offset; >>> + int64_t len = aiocb->aio_nbytes; >>> + zone_op op = aiocb->op; >>> + >>> + struct blk_zone_range range; >>> + const char *ioctl_name; >>> + unsigned long ioctl_op; >>> + int64_t zone_size; >>> + int64_t zone_size_mask; >>> + int ret; >>> + >>> + ret = ioctl(fd, BLKGETZONESZ, &zone_size); >> >> Can this value be stored in bs (maybe in BlockLimits) to avoid calling >> ioctl(BLKGETZONESZ) each time? > > Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after > I think through the configurations. In addition, it's a temporary > approach. It is substituted by get_sysfs_long_val now. > >> >>> + if (ret) { >>> + return -1; >> >> The return value should be a negative errno. -1 is -EPERM but it's >> probably not that error code you wanted. This should be: >> >> return -errno; >> >>> + } >>> + >>> + zone_size_mask = zone_size - 1; >>> + if (offset & zone_size_mask) { >>> + error_report("offset is not the start of a zone"); >>> + return -1; >> >> return -EINVAL; >> >>> + } >>> + >>> + if (len & zone_size_mask) { >>> + error_report("len is not aligned to zones"); >>> + return -1; >> >> return -EINVAL; >> >>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, >>> + int64_t len, int64_t *nr_zones, >>> + BlockZoneDescriptor *zones) { >>> + BDRVRawState *s = bs->opaque; >>> + RawPosixAIOData acb; >>> + >>> + acb = (RawPosixAIOData) { >>> + .bs = bs, >>> + .aio_fildes = s->fd, >>> + .aio_type = QEMU_AIO_IOCTL, >>> + .aio_offset = offset, >>> + .aio_nbytes = len, >>> + .zone_report = { >>> + .nr_zones = nr_zones, >>> + .zones = zones, >>> + }, >> >> Indentation is off here. Please use 4-space indentation. > Noted! > > Thanks for reviewing! > > Sam
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月28日周二 17:05写道: > > On 6/28/22 17:05, Sam Li wrote: > > Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道: > >> > >> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote: > >>> diff --git a/block/block-backend.c b/block/block-backend.c > >>> index e0e1aff4b1..786f964d02 100644 > >>> --- a/block/block-backend.c > >>> +++ b/block/block-backend.c > >>> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk) > >>> return ret; > >>> } > >>> > >>> +/* > >>> + * Return zone_report from BlockDriver. Offset can be any number within > >>> + * the zone size. No alignment for offset and len. > >> > >> What is the purpose of len? Is it the maximum number of zones to return > >> in nr_zones[]? > > > > len is actually not used in bdrv_co_zone_report. It is needed by > > blk_check_byte_request. > > There is no IO buffer being passed. Only an array of zone descriptors and > an in-out number of zones. len is definitely not needed. Not sure what > blk_check_byte_request() is intended to check though. Can I just remove len argument and blk_check_byte_request() if it's not used? > > > > >> How does the caller know how many zones were returned? > > > > nr_zones represents IN maximum and OUT actual. The caller will know by > > nr_zones which is changed in bdrv_co_zone_report. I'll add it in the > > comments. > > > >> > >>> + */ > >>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > >>> + int64_t len, int64_t *nr_zones, > >>> + BlockZoneDescriptor *zones) > >>> +{ > >>> + int ret; > >>> + BlockDriverState *bs; > >>> + IO_CODE(); > >>> + > >>> + blk_inc_in_flight(blk); /* increase before waiting */ > >>> + blk_wait_while_drained(blk); > >>> + bs = blk_bs(blk); > >>> + > >>> + ret = blk_check_byte_request(blk, offset, len); > >>> + if (ret < 0) { > >>> + return ret; > >>> + } > >>> + > >>> + bdrv_inc_in_flight(bs); > >> > >> The bdrv_inc/dec_in_flight() call should be inside > >> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example. > >> > >>> + ret = bdrv_co_zone_report(blk->root->bs, offset, len, > >>> + nr_zones, zones); > >>> + bdrv_dec_in_flight(bs); > >>> + blk_dec_in_flight(blk); > >>> + return ret; > >>> +} > >>> + > >>> +/* > >>> + * Return zone_mgmt from BlockDriver. > >> > >> Maybe this should be: > >> > >> Send a zone management command. > > > > Yes, it's more accurate. > > > >> > >>> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData { > >>> PreallocMode prealloc; > >>> Error **errp; > >>> } truncate; > >>> + struct { > >>> + int64_t *nr_zones; > >>> + BlockZoneDescriptor *zones; > >>> + } zone_report; > >>> + zone_op op; > >> > >> It's cleaner to put op inside a struct zone_mgmt so its purpose is > >> self-explanatory: > >> > >> struct { > >> zone_op op; > >> } zone_mgmt; > >> > >>> +static int handle_aiocb_zone_report(void *opaque) { > >>> + RawPosixAIOData *aiocb = opaque; > >>> + int fd = aiocb->aio_fildes; > >>> + int64_t *nr_zones = aiocb->zone_report.nr_zones; > >>> + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > >>> + int64_t offset = aiocb->aio_offset; > >>> + int64_t len = aiocb->aio_nbytes; > > Since you have the zone array and number of zones (size of that array) I > really do not see why you need len. > > >>> + > >>> + struct blk_zone *blkz; > >>> + int64_t rep_size, nrz; > >>> + int ret, n = 0, i = 0; > >>> + > >>> + nrz = *nr_zones; > >>> + if (len == -1) { > >>> + return -errno; > >> > >> Where is errno set? Should this be an errno constant instead like > >> -EINVAL? > > > > That's right! Noted. > > > >> > >>> + } > >>> + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > >>> + g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz); > >> > >> g_new() looks incorrect. There should be 1 struct blk_zone_report > >> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size) > >> instead. > > > > Yes! However, it still has a memory leak error when using g_autofree > > && g_malloc. > > That may be because you are changing the value of the rep pointer while > parsing the report ? I am not sure it is the case. Can you show me some way to find the problem? Thanks for reviewing! > > > > >> > >>> + offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */ > >>> + printf("start to report zone with offset: 0x%lx\n", offset); > >>> + > >>> + blkz = (struct blk_zone *)(rep + 1); > >>> + while (n < nrz) { > >>> + memset(rep, 0, rep_size); > >>> + rep->sector = offset; > >>> + rep->nr_zones = nrz; > >> > >> What prevents zones[] overflowing? nrz isn't being reduced in the loop, > >> so maybe the rep->nr_zones return value will eventually exceed the > >> number of elements still available in zones[n..]? > > > > I suppose the number of zones[] is restricted in the subsequent for > > loop where zones[] copy one zone at a time as n increases. Even if > > rep->zones exceeds the available room in zones[], the extra zone will > > not be copied. > > > >> > >>> +static int handle_aiocb_zone_mgmt(void *opaque) { > >>> + RawPosixAIOData *aiocb = opaque; > >>> + int fd = aiocb->aio_fildes; > >>> + int64_t offset = aiocb->aio_offset; > >>> + int64_t len = aiocb->aio_nbytes; > >>> + zone_op op = aiocb->op; > >>> + > >>> + struct blk_zone_range range; > >>> + const char *ioctl_name; > >>> + unsigned long ioctl_op; > >>> + int64_t zone_size; > >>> + int64_t zone_size_mask; > >>> + int ret; > >>> + > >>> + ret = ioctl(fd, BLKGETZONESZ, &zone_size); > >> > >> Can this value be stored in bs (maybe in BlockLimits) to avoid calling > >> ioctl(BLKGETZONESZ) each time? > > > > Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after > > I think through the configurations. In addition, it's a temporary > > approach. It is substituted by get_sysfs_long_val now. > > > >> > >>> + if (ret) { > >>> + return -1; > >> > >> The return value should be a negative errno. -1 is -EPERM but it's > >> probably not that error code you wanted. This should be: > >> > >> return -errno; > >> > >>> + } > >>> + > >>> + zone_size_mask = zone_size - 1; > >>> + if (offset & zone_size_mask) { > >>> + error_report("offset is not the start of a zone"); > >>> + return -1; > >> > >> return -EINVAL; > >> > >>> + } > >>> + > >>> + if (len & zone_size_mask) { > >>> + error_report("len is not aligned to zones"); > >>> + return -1; > >> > >> return -EINVAL; > >> > >>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, > >>> + int64_t len, int64_t *nr_zones, > >>> + BlockZoneDescriptor *zones) { > >>> + BDRVRawState *s = bs->opaque; > >>> + RawPosixAIOData acb; > >>> + > >>> + acb = (RawPosixAIOData) { > >>> + .bs = bs, > >>> + .aio_fildes = s->fd, > >>> + .aio_type = QEMU_AIO_IOCTL, > >>> + .aio_offset = offset, > >>> + .aio_nbytes = len, > >>> + .zone_report = { > >>> + .nr_zones = nr_zones, > >>> + .zones = zones, > >>> + }, > >> > >> Indentation is off here. Please use 4-space indentation. > > Noted! > > > > Thanks for reviewing! > > > > Sam > > > -- > Damien Le Moal > Western Digital Research
On 6/28/22 19:23, Sam Li wrote: > Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月28日周二 17:05写道: >> >> On 6/28/22 17:05, Sam Li wrote: >>> Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道: >>>> >>>> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote: >>>>> diff --git a/block/block-backend.c b/block/block-backend.c >>>>> index e0e1aff4b1..786f964d02 100644 >>>>> --- a/block/block-backend.c >>>>> +++ b/block/block-backend.c >>>>> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk) >>>>> return ret; >>>>> } >>>>> >>>>> +/* >>>>> + * Return zone_report from BlockDriver. Offset can be any number within >>>>> + * the zone size. No alignment for offset and len. >>>> >>>> What is the purpose of len? Is it the maximum number of zones to return >>>> in nr_zones[]? >>> >>> len is actually not used in bdrv_co_zone_report. It is needed by >>> blk_check_byte_request. >> >> There is no IO buffer being passed. Only an array of zone descriptors and >> an in-out number of zones. len is definitely not needed. Not sure what >> blk_check_byte_request() is intended to check though. > > Can I just remove len argument and blk_check_byte_request() if it's not used? If it is unused, yes, you must remove it. > >> >>> >>>> How does the caller know how many zones were returned? >>> >>> nr_zones represents IN maximum and OUT actual. The caller will know by >>> nr_zones which is changed in bdrv_co_zone_report. I'll add it in the >>> comments. >>> >>>> >>>>> + */ >>>>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, >>>>> + int64_t len, int64_t *nr_zones, >>>>> + BlockZoneDescriptor *zones) >>>>> +{ >>>>> + int ret; >>>>> + BlockDriverState *bs; >>>>> + IO_CODE(); >>>>> + >>>>> + blk_inc_in_flight(blk); /* increase before waiting */ >>>>> + blk_wait_while_drained(blk); >>>>> + bs = blk_bs(blk); >>>>> + >>>>> + ret = blk_check_byte_request(blk, offset, len); >>>>> + if (ret < 0) { >>>>> + return ret; >>>>> + } >>>>> + >>>>> + bdrv_inc_in_flight(bs); >>>> >>>> The bdrv_inc/dec_in_flight() call should be inside >>>> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example. >>>> >>>>> + ret = bdrv_co_zone_report(blk->root->bs, offset, len, >>>>> + nr_zones, zones); >>>>> + bdrv_dec_in_flight(bs); >>>>> + blk_dec_in_flight(blk); >>>>> + return ret; >>>>> +} >>>>> + >>>>> +/* >>>>> + * Return zone_mgmt from BlockDriver. >>>> >>>> Maybe this should be: >>>> >>>> Send a zone management command. >>> >>> Yes, it's more accurate. >>> >>>> >>>>> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData { >>>>> PreallocMode prealloc; >>>>> Error **errp; >>>>> } truncate; >>>>> + struct { >>>>> + int64_t *nr_zones; >>>>> + BlockZoneDescriptor *zones; >>>>> + } zone_report; >>>>> + zone_op op; >>>> >>>> It's cleaner to put op inside a struct zone_mgmt so its purpose is >>>> self-explanatory: >>>> >>>> struct { >>>> zone_op op; >>>> } zone_mgmt; >>>> >>>>> +static int handle_aiocb_zone_report(void *opaque) { >>>>> + RawPosixAIOData *aiocb = opaque; >>>>> + int fd = aiocb->aio_fildes; >>>>> + int64_t *nr_zones = aiocb->zone_report.nr_zones; >>>>> + BlockZoneDescriptor *zones = aiocb->zone_report.zones; >>>>> + int64_t offset = aiocb->aio_offset; >>>>> + int64_t len = aiocb->aio_nbytes; >> >> Since you have the zone array and number of zones (size of that array) I >> really do not see why you need len. >> >>>>> + >>>>> + struct blk_zone *blkz; >>>>> + int64_t rep_size, nrz; >>>>> + int ret, n = 0, i = 0; >>>>> + >>>>> + nrz = *nr_zones; >>>>> + if (len == -1) { >>>>> + return -errno; >>>> >>>> Where is errno set? Should this be an errno constant instead like >>>> -EINVAL? >>> >>> That's right! Noted. >>> >>>> >>>>> + } >>>>> + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); >>>>> + g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz); >>>> >>>> g_new() looks incorrect. There should be 1 struct blk_zone_report >>>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size) >>>> instead. >>> >>> Yes! However, it still has a memory leak error when using g_autofree >>> && g_malloc. >> >> That may be because you are changing the value of the rep pointer while >> parsing the report ? > > I am not sure it is the case. Can you show me some way to find the problem? Not sure. I never used this g_malloc()/g_autofree() before so not sure how it works. It may be that g_autofree() work only with g_new() ? Could you try separating the declaration and allocation ? e.g. Declare at the beginning of the function: g_autofree struct blk_zone_report *rep = NULL; And then when needed do: rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); rep = g_malloc(rep_size); > > Thanks for reviewing! > > >> >>> >>>> >>>>> + offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */ >>>>> + printf("start to report zone with offset: 0x%lx\n", offset); >>>>> + >>>>> + blkz = (struct blk_zone *)(rep + 1); >>>>> + while (n < nrz) { >>>>> + memset(rep, 0, rep_size); >>>>> + rep->sector = offset; >>>>> + rep->nr_zones = nrz; >>>> >>>> What prevents zones[] overflowing? nrz isn't being reduced in the loop, >>>> so maybe the rep->nr_zones return value will eventually exceed the >>>> number of elements still available in zones[n..]? >>> >>> I suppose the number of zones[] is restricted in the subsequent for >>> loop where zones[] copy one zone at a time as n increases. Even if >>> rep->zones exceeds the available room in zones[], the extra zone will >>> not be copied. >>> >>>> >>>>> +static int handle_aiocb_zone_mgmt(void *opaque) { >>>>> + RawPosixAIOData *aiocb = opaque; >>>>> + int fd = aiocb->aio_fildes; >>>>> + int64_t offset = aiocb->aio_offset; >>>>> + int64_t len = aiocb->aio_nbytes; >>>>> + zone_op op = aiocb->op; >>>>> + >>>>> + struct blk_zone_range range; >>>>> + const char *ioctl_name; >>>>> + unsigned long ioctl_op; >>>>> + int64_t zone_size; >>>>> + int64_t zone_size_mask; >>>>> + int ret; >>>>> + >>>>> + ret = ioctl(fd, BLKGETZONESZ, &zone_size); >>>> >>>> Can this value be stored in bs (maybe in BlockLimits) to avoid calling >>>> ioctl(BLKGETZONESZ) each time? >>> >>> Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after >>> I think through the configurations. In addition, it's a temporary >>> approach. It is substituted by get_sysfs_long_val now. >>> >>>> >>>>> + if (ret) { >>>>> + return -1; >>>> >>>> The return value should be a negative errno. -1 is -EPERM but it's >>>> probably not that error code you wanted. This should be: >>>> >>>> return -errno; >>>> >>>>> + } >>>>> + >>>>> + zone_size_mask = zone_size - 1; >>>>> + if (offset & zone_size_mask) { >>>>> + error_report("offset is not the start of a zone"); >>>>> + return -1; >>>> >>>> return -EINVAL; >>>> >>>>> + } >>>>> + >>>>> + if (len & zone_size_mask) { >>>>> + error_report("len is not aligned to zones"); >>>>> + return -1; >>>> >>>> return -EINVAL; >>>> >>>>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, >>>>> + int64_t len, int64_t *nr_zones, >>>>> + BlockZoneDescriptor *zones) { >>>>> + BDRVRawState *s = bs->opaque; >>>>> + RawPosixAIOData acb; >>>>> + >>>>> + acb = (RawPosixAIOData) { >>>>> + .bs = bs, >>>>> + .aio_fildes = s->fd, >>>>> + .aio_type = QEMU_AIO_IOCTL, >>>>> + .aio_offset = offset, >>>>> + .aio_nbytes = len, >>>>> + .zone_report = { >>>>> + .nr_zones = nr_zones, >>>>> + .zones = zones, >>>>> + }, >>>> >>>> Indentation is off here. Please use 4-space indentation. >>> Noted! >>> >>> Thanks for reviewing! >>> >>> Sam >> >> >> -- >> Damien Le Moal >> Western Digital Research
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月29日周三 09:43写道: > > On 6/28/22 19:23, Sam Li wrote: > > Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月28日周二 17:05写道: > >> > >> On 6/28/22 17:05, Sam Li wrote: > >>> Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道: > >>>> > >>>> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote: > >>>>> diff --git a/block/block-backend.c b/block/block-backend.c > >>>>> index e0e1aff4b1..786f964d02 100644 > >>>>> --- a/block/block-backend.c > >>>>> +++ b/block/block-backend.c > >>>>> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk) > >>>>> return ret; > >>>>> } > >>>>> > >>>>> +/* > >>>>> + * Return zone_report from BlockDriver. Offset can be any number within > >>>>> + * the zone size. No alignment for offset and len. > >>>> > >>>> What is the purpose of len? Is it the maximum number of zones to return > >>>> in nr_zones[]? > >>> > >>> len is actually not used in bdrv_co_zone_report. It is needed by > >>> blk_check_byte_request. > >> > >> There is no IO buffer being passed. Only an array of zone descriptors and > >> an in-out number of zones. len is definitely not needed. Not sure what > >> blk_check_byte_request() is intended to check though. > > > > Can I just remove len argument and blk_check_byte_request() if it's not used? > > If it is unused, yes, you must remove it. > > > > >> > >>> > >>>> How does the caller know how many zones were returned? > >>> > >>> nr_zones represents IN maximum and OUT actual. The caller will know by > >>> nr_zones which is changed in bdrv_co_zone_report. I'll add it in the > >>> comments. > >>> > >>>> > >>>>> + */ > >>>>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > >>>>> + int64_t len, int64_t *nr_zones, > >>>>> + BlockZoneDescriptor *zones) > >>>>> +{ > >>>>> + int ret; > >>>>> + BlockDriverState *bs; > >>>>> + IO_CODE(); > >>>>> + > >>>>> + blk_inc_in_flight(blk); /* increase before waiting */ > >>>>> + blk_wait_while_drained(blk); > >>>>> + bs = blk_bs(blk); > >>>>> + > >>>>> + ret = blk_check_byte_request(blk, offset, len); > >>>>> + if (ret < 0) { > >>>>> + return ret; > >>>>> + } > >>>>> + > >>>>> + bdrv_inc_in_flight(bs); > >>>> > >>>> The bdrv_inc/dec_in_flight() call should be inside > >>>> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example. > >>>> > >>>>> + ret = bdrv_co_zone_report(blk->root->bs, offset, len, > >>>>> + nr_zones, zones); > >>>>> + bdrv_dec_in_flight(bs); > >>>>> + blk_dec_in_flight(blk); > >>>>> + return ret; > >>>>> +} > >>>>> + > >>>>> +/* > >>>>> + * Return zone_mgmt from BlockDriver. > >>>> > >>>> Maybe this should be: > >>>> > >>>> Send a zone management command. > >>> > >>> Yes, it's more accurate. > >>> > >>>> > >>>>> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData { > >>>>> PreallocMode prealloc; > >>>>> Error **errp; > >>>>> } truncate; > >>>>> + struct { > >>>>> + int64_t *nr_zones; > >>>>> + BlockZoneDescriptor *zones; > >>>>> + } zone_report; > >>>>> + zone_op op; > >>>> > >>>> It's cleaner to put op inside a struct zone_mgmt so its purpose is > >>>> self-explanatory: > >>>> > >>>> struct { > >>>> zone_op op; > >>>> } zone_mgmt; > >>>> > >>>>> +static int handle_aiocb_zone_report(void *opaque) { > >>>>> + RawPosixAIOData *aiocb = opaque; > >>>>> + int fd = aiocb->aio_fildes; > >>>>> + int64_t *nr_zones = aiocb->zone_report.nr_zones; > >>>>> + BlockZoneDescriptor *zones = aiocb->zone_report.zones; > >>>>> + int64_t offset = aiocb->aio_offset; > >>>>> + int64_t len = aiocb->aio_nbytes; > >> > >> Since you have the zone array and number of zones (size of that array) I > >> really do not see why you need len. > >> > >>>>> + > >>>>> + struct blk_zone *blkz; > >>>>> + int64_t rep_size, nrz; > >>>>> + int ret, n = 0, i = 0; > >>>>> + > >>>>> + nrz = *nr_zones; > >>>>> + if (len == -1) { > >>>>> + return -errno; > >>>> > >>>> Where is errno set? Should this be an errno constant instead like > >>>> -EINVAL? > >>> > >>> That's right! Noted. > >>> > >>>> > >>>>> + } > >>>>> + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > >>>>> + g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz); > >>>> > >>>> g_new() looks incorrect. There should be 1 struct blk_zone_report > >>>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size) > >>>> instead. > >>> > >>> Yes! However, it still has a memory leak error when using g_autofree > >>> && g_malloc. > >> > >> That may be because you are changing the value of the rep pointer while > >> parsing the report ? > > > > I am not sure it is the case. Can you show me some way to find the problem? > > Not sure. I never used this g_malloc()/g_autofree() before so not sure how > it works. It may be that g_autofree() work only with g_new() ? > Could you try separating the declaration and allocation ? e.g. > > Declare at the beginning of the function: > g_autofree struct blk_zone_report *rep = NULL; > > And then when needed do: > > rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > rep = g_malloc(rep_size); Actually, the memory leak occurs in that way. When using zone_mgmt, memory leak still occurs. Asan gives the error information not much so I haven't tracked down the problem yet. > > > > > Thanks for reviewing! > > > > > >> > >>> > >>>> > >>>>> + offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */ > >>>>> + printf("start to report zone with offset: 0x%lx\n", offset); > >>>>> + > >>>>> + blkz = (struct blk_zone *)(rep + 1); > >>>>> + while (n < nrz) { > >>>>> + memset(rep, 0, rep_size); > >>>>> + rep->sector = offset; > >>>>> + rep->nr_zones = nrz; > >>>> > >>>> What prevents zones[] overflowing? nrz isn't being reduced in the loop, > >>>> so maybe the rep->nr_zones return value will eventually exceed the > >>>> number of elements still available in zones[n..]? > >>> > >>> I suppose the number of zones[] is restricted in the subsequent for > >>> loop where zones[] copy one zone at a time as n increases. Even if > >>> rep->zones exceeds the available room in zones[], the extra zone will > >>> not be copied. > >>> > >>>> > >>>>> +static int handle_aiocb_zone_mgmt(void *opaque) { > >>>>> + RawPosixAIOData *aiocb = opaque; > >>>>> + int fd = aiocb->aio_fildes; > >>>>> + int64_t offset = aiocb->aio_offset; > >>>>> + int64_t len = aiocb->aio_nbytes; > >>>>> + zone_op op = aiocb->op; > >>>>> + > >>>>> + struct blk_zone_range range; > >>>>> + const char *ioctl_name; > >>>>> + unsigned long ioctl_op; > >>>>> + int64_t zone_size; > >>>>> + int64_t zone_size_mask; > >>>>> + int ret; > >>>>> + > >>>>> + ret = ioctl(fd, BLKGETZONESZ, &zone_size); > >>>> > >>>> Can this value be stored in bs (maybe in BlockLimits) to avoid calling > >>>> ioctl(BLKGETZONESZ) each time? > >>> > >>> Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after > >>> I think through the configurations. In addition, it's a temporary > >>> approach. It is substituted by get_sysfs_long_val now. > >>> > >>>> > >>>>> + if (ret) { > >>>>> + return -1; > >>>> > >>>> The return value should be a negative errno. -1 is -EPERM but it's > >>>> probably not that error code you wanted. This should be: > >>>> > >>>> return -errno; > >>>> > >>>>> + } > >>>>> + > >>>>> + zone_size_mask = zone_size - 1; > >>>>> + if (offset & zone_size_mask) { > >>>>> + error_report("offset is not the start of a zone"); > >>>>> + return -1; > >>>> > >>>> return -EINVAL; > >>>> > >>>>> + } > >>>>> + > >>>>> + if (len & zone_size_mask) { > >>>>> + error_report("len is not aligned to zones"); > >>>>> + return -1; > >>>> > >>>> return -EINVAL; > >>>> > >>>>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, > >>>>> + int64_t len, int64_t *nr_zones, > >>>>> + BlockZoneDescriptor *zones) { > >>>>> + BDRVRawState *s = bs->opaque; > >>>>> + RawPosixAIOData acb; > >>>>> + > >>>>> + acb = (RawPosixAIOData) { > >>>>> + .bs = bs, > >>>>> + .aio_fildes = s->fd, > >>>>> + .aio_type = QEMU_AIO_IOCTL, > >>>>> + .aio_offset = offset, > >>>>> + .aio_nbytes = len, > >>>>> + .zone_report = { > >>>>> + .nr_zones = nr_zones, > >>>>> + .zones = zones, > >>>>> + }, > >>>> > >>>> Indentation is off here. Please use 4-space indentation. > >>> Noted! > >>> > >>> Thanks for reviewing! > >>> > >>> Sam > >> > >> > >> -- > >> Damien Le Moal > >> Western Digital Research > > > -- > Damien Le Moal > Western Digital Research
On 6/29/22 10:50, Sam Li wrote: >>>>>>> + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); >>>>>>> + g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz); >>>>>> >>>>>> g_new() looks incorrect. There should be 1 struct blk_zone_report >>>>>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size) >>>>>> instead. >>>>> >>>>> Yes! However, it still has a memory leak error when using g_autofree >>>>> && g_malloc. >>>> >>>> That may be because you are changing the value of the rep pointer while >>>> parsing the report ? >>> >>> I am not sure it is the case. Can you show me some way to find the problem? >> >> Not sure. I never used this g_malloc()/g_autofree() before so not sure how >> it works. It may be that g_autofree() work only with g_new() ? >> Could you try separating the declaration and allocation ? e.g. >> >> Declare at the beginning of the function: >> g_autofree struct blk_zone_report *rep = NULL; >> >> And then when needed do: >> >> rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); >> rep = g_malloc(rep_size); > > Actually, the memory leak occurs in that way. When using zone_mgmt, > memory leak still occurs. Asan gives the error information not much so > I haven't tracked down the problem yet. See this: https://blog.fishsoup.net/2015/11/05/attributecleanup-mixed-declarations-and-code-and-goto/ Maybe you can find some hints.
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月29日周三 10:32写道: > > On 6/29/22 10:50, Sam Li wrote: > >>>>>>> + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > >>>>>>> + g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz); > >>>>>> > >>>>>> g_new() looks incorrect. There should be 1 struct blk_zone_report > >>>>>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size) > >>>>>> instead. > >>>>> > >>>>> Yes! However, it still has a memory leak error when using g_autofree > >>>>> && g_malloc. > >>>> > >>>> That may be because you are changing the value of the rep pointer while > >>>> parsing the report ? > >>> > >>> I am not sure it is the case. Can you show me some way to find the problem? > >> > >> Not sure. I never used this g_malloc()/g_autofree() before so not sure how > >> it works. It may be that g_autofree() work only with g_new() ? > >> Could you try separating the declaration and allocation ? e.g. > >> > >> Declare at the beginning of the function: > >> g_autofree struct blk_zone_report *rep = NULL; > >> > >> And then when needed do: > >> > >> rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > >> rep = g_malloc(rep_size); > > > > Actually, the memory leak occurs in that way. When using zone_mgmt, > > memory leak still occurs. Asan gives the error information not much so > > I haven't tracked down the problem yet. > > See this: > > https://blog.fishsoup.net/2015/11/05/attributecleanup-mixed-declarations-and-code-and-goto/ > > Maybe you can find some hints. Thanks! > > -- > Damien Le Moal > Western Digital Research
diff --git a/block/block-backend.c b/block/block-backend.c index e0e1aff4b1..786f964d02 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk) return ret; } +/* + * Return zone_report from BlockDriver. Offset can be any number within + * the zone size. No alignment for offset and len. + */ +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, + int64_t len, int64_t *nr_zones, + BlockZoneDescriptor *zones) +{ + int ret; + BlockDriverState *bs; + IO_CODE(); + + blk_inc_in_flight(blk); /* increase before waiting */ + blk_wait_while_drained(blk); + bs = blk_bs(blk); + + ret = blk_check_byte_request(blk, offset, len); + if (ret < 0) { + return ret; + } + + bdrv_inc_in_flight(bs); + ret = bdrv_co_zone_report(blk->root->bs, offset, len, + nr_zones, zones); + bdrv_dec_in_flight(bs); + blk_dec_in_flight(blk); + return ret; +} + +/* + * Return zone_mgmt from BlockDriver. + * Offset is the start of a zone and len is aligned to zones. + */ +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op, + int64_t offset, int64_t len) +{ + int ret; + BlockDriverState *bs; + IO_CODE(); + + blk_inc_in_flight(blk); + blk_wait_while_drained(blk); + bs = blk_bs(blk); + + ret = blk_check_byte_request(blk, offset, len); + if (ret < 0) { + return ret; + } + + bdrv_inc_in_flight(bs); + ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len); + bdrv_dec_in_flight(bs); + blk_dec_in_flight(blk); + return ret; +} + void blk_drain(BlockBackend *blk) { BlockDriverState *bs = blk_bs(blk); diff --git a/block/coroutines.h b/block/coroutines.h index 830ecaa733..a114d7bc30 100644 --- a/block/coroutines.h +++ b/block/coroutines.h @@ -80,6 +80,11 @@ int coroutine_fn blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes); int coroutine_fn blk_co_do_flush(BlockBackend *blk); +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, + int64_t len, int64_t *nr_zones, + BlockZoneDescriptor *zones); +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op, + int64_t offset, int64_t len); /* diff --git a/block/file-posix.c b/block/file-posix.c index 48cd096624..1b8b0d351f 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -67,6 +67,7 @@ #include <sys/param.h> #include <sys/syscall.h> #include <sys/vfs.h> +#include <linux/blkzoned.h> #include <linux/cdrom.h> #include <linux/fd.h> #include <linux/fs.h> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData { PreallocMode prealloc; Error **errp; } truncate; + struct { + int64_t *nr_zones; + BlockZoneDescriptor *zones; + } zone_report; + zone_op op; }; } RawPosixAIOData; @@ -1801,6 +1807,135 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, } #endif +/* + * parse_zone - Fill a zone descriptor + */ +static inline void parse_zone(struct BlockZoneDescriptor *zone, + struct blk_zone *blkz) { + zone->start = blkz->start; + zone->length = blkz->len; + zone->cap = blkz->capacity; + zone->wp = blkz->wp - blkz->start; + zone->type = blkz->type; + zone->cond = blkz->cond; +} + +static int handle_aiocb_zone_report(void *opaque) { + RawPosixAIOData *aiocb = opaque; + int fd = aiocb->aio_fildes; + int64_t *nr_zones = aiocb->zone_report.nr_zones; + BlockZoneDescriptor *zones = aiocb->zone_report.zones; + int64_t offset = aiocb->aio_offset; + int64_t len = aiocb->aio_nbytes; + + struct blk_zone *blkz; + int64_t rep_size, nrz; + int ret, n = 0, i = 0; + + nrz = *nr_zones; + if (len == -1) { + return -errno; + } + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); + g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz); + offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */ + printf("start to report zone with offset: 0x%lx\n", offset); + + blkz = (struct blk_zone *)(rep + 1); + while (n < nrz) { + memset(rep, 0, rep_size); + rep->sector = offset; + rep->nr_zones = nrz; + + ret = ioctl(fd, BLKREPORTZONE, rep); + if (ret != 0) { + ret = -errno; + error_report("%d: ioctl BLKREPORTZONE at %ld failed %d", + fd, offset, errno); + return ret; + } + + if (!rep->nr_zones) { + break; + } + + for (i = 0; i < rep->nr_zones; i++, n++) { + parse_zone(&zones[n], &blkz[i]); + /* The next report should start after the last zone reported */ + offset = blkz[i].start + blkz[i].len; + } + } + + *nr_zones = n; + return 0; +} + +static int handle_aiocb_zone_mgmt(void *opaque) { + RawPosixAIOData *aiocb = opaque; + int fd = aiocb->aio_fildes; + int64_t offset = aiocb->aio_offset; + int64_t len = aiocb->aio_nbytes; + zone_op op = aiocb->op; + + struct blk_zone_range range; + const char *ioctl_name; + unsigned long ioctl_op; + int64_t zone_size; + int64_t zone_size_mask; + int ret; + + ret = ioctl(fd, BLKGETZONESZ, &zone_size); + if (ret) { + return -1; + } + + zone_size_mask = zone_size - 1; + if (offset & zone_size_mask) { + error_report("offset is not the start of a zone"); + return -1; + } + + if (len & zone_size_mask) { + error_report("len is not aligned to zones"); + return -1; + } + + switch (op) { + case zone_open: + ioctl_name = "BLKOPENZONE"; + ioctl_op = BLKOPENZONE; + break; + case zone_close: + ioctl_name = "BLKCLOSEZONE"; + ioctl_op = BLKCLOSEZONE; + break; + case zone_finish: + ioctl_name = "BLKFINISHZONE"; + ioctl_op = BLKFINISHZONE; + break; + case zone_reset: + ioctl_name = "BLKRESETZONE"; + ioctl_op = BLKRESETZONE; + break; + default: + error_report("Invalid zone operation 0x%x", op); + errno = -EINVAL; + return -1; + } + + /* Execute the operation */ + range.sector = offset; + range.nr_sectors = len; + ret = ioctl(fd, ioctl_op, &range); + if (ret != 0) { + error_report("ioctl %s failed %d", + ioctl_name, errno); + return -1; + } + + return 0; +} + static int handle_aiocb_copy_range(void *opaque) { RawPosixAIOData *aiocb = opaque; @@ -2973,6 +3108,58 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) } } +/* + * zone report - Get a zone block device's information in the form + * of an array of zone descriptors. + * + * @param bs: passing zone block device file descriptor + * @param zones: an array of zone descriptors to hold zone + * information on reply + * @param offset: offset can be any byte within the zone size. + * @param len: (not sure yet. + * @return 0 on success, -1 on failure + */ +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, + int64_t len, int64_t *nr_zones, + BlockZoneDescriptor *zones) { + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_IOCTL, + .aio_offset = offset, + .aio_nbytes = len, + .zone_report = { + .nr_zones = nr_zones, + .zones = zones, + }, + }; + + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); +} + +/* + * zone management operations - Execute an operation on a zone + */ +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op, + int64_t offset, int64_t len) { + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_IOCTL, + .aio_offset = offset, + .aio_nbytes = len, + .op = op, + }; + + return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); +} + static coroutine_fn int raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, bool blkdev) @@ -3324,6 +3511,9 @@ BlockDriver bdrv_file = { .bdrv_abort_perm_update = raw_abort_perm_update, .create_opts = &raw_create_opts, .mutable_opts = mutable_opts, + + .bdrv_co_zone_report = raw_co_zone_report, + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, }; /***********************************************/ @@ -3703,6 +3893,53 @@ static BlockDriver bdrv_host_device = { #endif }; +static BlockDriver bdrv_zoned_host_device = { + .format_name = "zoned_host_device", + .protocol_name = "zoned_host_device", + .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_probe_device = hdev_probe_device, + .bdrv_parse_filename = hdev_parse_filename, + .bdrv_file_open = hdev_open, + .bdrv_close = raw_close, + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, + .bdrv_co_create_opts = bdrv_co_create_opts_simple, + .create_opts = &bdrv_create_opts_simple, + .mutable_opts = mutable_opts, + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, + + .bdrv_co_preadv = raw_co_preadv, + .bdrv_co_pwritev = raw_co_pwritev, + .bdrv_co_flush_to_disk = raw_co_flush_to_disk, + .bdrv_co_pdiscard = hdev_co_pdiscard, + .bdrv_co_copy_range_from = raw_co_copy_range_from, + .bdrv_co_copy_range_to = raw_co_copy_range_to, + .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_attach_aio_context = raw_aio_attach_aio_context, + + .bdrv_co_truncate = raw_co_truncate, + .bdrv_getlength = raw_getlength, + .bdrv_get_info = raw_get_info, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, + .bdrv_get_specific_stats = hdev_get_specific_stats, + .bdrv_check_perm = raw_check_perm, + .bdrv_set_perm = raw_set_perm, + .bdrv_abort_perm_update = raw_abort_perm_update, + .bdrv_probe_blocksizes = hdev_probe_blocksizes, + .bdrv_probe_geometry = hdev_probe_geometry, + .bdrv_co_ioctl = hdev_co_ioctl, + + /* zone management operations */ + .bdrv_co_zone_report = raw_co_zone_report, + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, +}; + #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) static void cdrom_parse_filename(const char *filename, QDict *options, Error **errp) @@ -3964,6 +4201,7 @@ static void bdrv_file_init(void) #if defined(HAVE_HOST_BLOCK_DEVICE) bdrv_register(&bdrv_host_device); #ifdef __linux__ + bdrv_register(&bdrv_zoned_host_device); bdrv_register(&bdrv_host_cdrom); #endif #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) diff --git a/include/block/block-common.h b/include/block/block-common.h index fdb7306e78..78cddeeda5 100644 --- a/include/block/block-common.h +++ b/include/block/block-common.h @@ -23,7 +23,6 @@ */ #ifndef BLOCK_COMMON_H #define BLOCK_COMMON_H - #include "block/aio.h" #include "block/aio-wait.h" #include "qemu/iov.h" @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver; typedef struct BdrvChild BdrvChild; typedef struct BdrvChildClass BdrvChildClass; +typedef enum zone_op { + zone_open, + zone_close, + zone_finish, + zone_reset, +} zone_op; + +typedef enum zone_model { + BLK_Z_HM, + BLK_Z_HA, +} zone_model; + +typedef enum BlkZoneCondition { + BLK_ZS_NOT_WP = 0x0, + BLK_ZS_EMPTY = 0x1, + BLK_ZS_IOPEN = 0x2, + BLK_ZS_EOPEN = 0x3, + BLK_ZS_CLOSED = 0x4, + BLK_ZS_RDONLY = 0xD, + BLK_ZS_FULL = 0xE, + BLK_ZS_OFFLINE = 0xF, +} BlkZoneCondition; + +typedef enum BlkZoneType { + BLK_ZT_CONV = 0x1, + BLK_ZT_SWR = 0x2, + BLK_ZT_SWP = 0x3, +} BlkZoneType; + +/* + * Zone descriptor data structure. + * Provide information on a zone with all position and size values in bytes. + */ +typedef struct BlockZoneDescriptor { + uint64_t start; + uint64_t length; + uint64_t cap; + uint64_t wp; + BlkZoneType type; + BlkZoneCondition cond; +} BlockZoneDescriptor; + typedef struct BlockDriverInfo { /* in bytes, 0 if irrelevant */ int cluster_size; diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 8947abab76..b9ea9db6dc 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest { struct BdrvTrackedRequest *waiting_for; } BdrvTrackedRequest; +/** + * Zone device information data structure. + * Provide information on a device. + */ +typedef struct zbd_dev { + uint32_t zone_size; + zone_model model; + uint32_t block_size; + uint32_t write_granularity; + uint32_t nr_zones; + struct BlockZoneDescriptor *zones; /* array of zones */ + uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */ + uint32_t max_nr_active_zones; +} zbd_dev; struct BlockDriver { /* @@ -691,6 +705,12 @@ struct BlockDriver { QEMUIOVector *qiov, int64_t pos); + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, + int64_t offset, int64_t len, int64_t *nr_zones, + BlockZoneDescriptor *zones); + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op, + int64_t offset, int64_t len); + /* removable device specific */ bool (*bdrv_is_inserted)(BlockDriverState *bs); void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);