Message ID | 20220929093134.73974-2-faithilikerun@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | add zone append write for zoned device | expand |
On 9/29/22 18:31, Sam Li wrote: > Since Linux doesn't have a user API to issue zone append operations to > zoned devices from user space, the file-posix driver is modified to add > zone append emulation using regular writes. To do this, the file-posix > driver tracks the wp location of all zones of the device. It uses an > array of uint64_t. The most significant bit of each wp location indicates > if the zone type is sequential write required. > > The zones wp can be changed due to the following operations issued: > - zone reset: change the wp to the start offset of that zone > - zone finish: change to the end location of that zone > - write to a zone > - zone append > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > --- > block/file-posix.c | 138 ++++++++++++++++++++++++++++++- > include/block/block-common.h | 16 ++++ > include/block/block_int-common.h | 5 ++ > include/block/raw-aio.h | 4 +- > 4 files changed, 159 insertions(+), 4 deletions(-) > > diff --git a/block/file-posix.c b/block/file-posix.c > index 73656d87f2..33e81ac112 100755 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData { > struct { > struct iovec *iov; > int niov; > + int64_t *append_sector; > + BlockZoneWps *wps; > } io; > struct { > uint64_t cmd; > @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat *st) { > #endif > } > > +#if defined(CONFIG_BLKZONED) > +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps, > + unsigned int nrz) { Maybe rename this to get_zones_wp() ? > + struct blk_zone *blkz; > + int64_t rep_size; > + int64_t sector = offset >> BDRV_SECTOR_BITS; > + int ret, n = 0, i = 0; > + > + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > + g_autofree struct blk_zone_report *rep = NULL; To be cleaner, move this declaration above with the others ? > + rep = g_malloc(rep_size); > + > + blkz = (struct blk_zone *)(rep + 1); > + while (n < nrz) { > + memset(rep, 0, rep_size); > + rep->sector = sector; > + rep->nr_zones = nrz - n; > + > + do { > + ret = ioctl(fd, BLKREPORTZONE, rep); > + } while (ret != 0 && errno == EINTR); > + if (ret != 0) { > + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", > + fd, offset, errno); > + return -errno; > + } > + > + if (!rep->nr_zones) { > + break; > + } > + > + for (i = 0; i < rep->nr_zones; i++, n++) { > + wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; > + sector = blkz[i].start + blkz[i].len; > + > + /* > + * In the wp tracking, it only cares if the zone type is sequential > + * writes required so that the wp can advance to the right location. Or sequential write preferred (host aware case) > + * Instead of the type of zone_type which is an 8-bit unsigned > + * integer, use the first most significant bits of the wp location > + * to indicate the zone type: 0 for SWR zones and 1 for the > + * others. > + */ > + if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) { This should be: if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) { Note that the type field is not a bit-field. So you must compare values instead of doing bit operations. > + wps->wp[i] += (uint64_t)1 << 63; You can simplify this: wps->wp[i] |= 1ULL << 63; Overall, I would rewrite this like this: for (i = 0; i < rep->nr_zones; i++, n++) { /* * The wp tracking cares only about sequential write required * and sequential write preferred zones so that the wp can * advance to the right location. * Use the most significant bit of the wp location * to indicate the zone type: 0 for SWR zones and 1 for * conventional zones. */ if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) { wps->wp[i] = 1ULL << 63; else wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; } sector = blkz[i - 1].start + blkz[i - 1].len; Which I think is a lot simpler. > + } > + } > + } > + > + return 0; > +} > +#endif > + > static void raw_refresh_limits(BlockDriverState *bs, Error **errp) > { > BDRVRawState *s = bs->opaque; > @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) > error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity); > return; > } > + > + ret = get_sysfs_long_val(&st, "physical_block_size"); > + if (ret >= 0) { > + bs->bl.write_granularity = ret; > + } This change seems unrelated to the wp tracking. Should this be moved to a different patch ? > + > + bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); > + qemu_mutex_init(&bs->bl.wps->lock); Move this initialization after the if block. Doing so, you do not need to call mutex destroy in case of error. > + if (report_zone_wp(0, s->fd, bs->bl.wps, ret) < 0 ) { > + error_report("report wps failed"); > + qemu_mutex_destroy(&bs->bl.wps->lock); > + g_free(bs->bl.wps); > + return; > + } > } > } > > @@ -1582,7 +1651,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) > ssize_t len; > > do { > - if (aiocb->aio_type & QEMU_AIO_WRITE) > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) > len = qemu_pwritev(aiocb->aio_fildes, > aiocb->io.iov, > aiocb->io.niov, > @@ -1612,7 +1681,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) > ssize_t len; > > while (offset < aiocb->aio_nbytes) { > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > len = pwrite(aiocb->aio_fildes, > (const char *)buf + offset, > aiocb->aio_nbytes - offset, > @@ -1705,7 +1774,7 @@ static int handle_aiocb_rw(void *opaque) > } > > nbytes = handle_aiocb_rw_linear(aiocb, buf); > - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { > + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { > char *p = buf; > size_t count = aiocb->aio_nbytes, copy; > int i; > @@ -1726,6 +1795,23 @@ static int handle_aiocb_rw(void *opaque) > > out: > if (nbytes == aiocb->aio_nbytes) { > +#if defined(CONFIG_BLKZONED) > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > + BlockZoneWps *wps = aiocb->io.wps; Why adding this pointer to the aiocb struct ? You can get the array from aiocb->bs->bl.wps, no ? > + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; > + if (wps) { > + if (BDRV_ZT_IS_SWR(wps->wp[index])) { > + qemu_mutex_lock(&wps->lock); > + wps->wp[index] += aiocb->aio_nbytes; > + qemu_mutex_unlock(&wps->lock); > + } > + > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > + *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS; Bug: the append sector must be the first sector written, not the wp (sector following the last written sector). So this must be done *before* advancing the wp. You need to have wps->lock held here too since you are reading wps->wp[index]. So the mutex lock/unlock needs to be around the 2 hunks under "if (wps) {". Also, given that there cannot be any zone append issued to conventional zones (they will fail), you can simplify: if (wps) { qemu_mutex_lock(&wps->lock); if (BDRV_ZT_IS_SWR(wps->wp[index])) { if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS; } wps->wp[index] += aiocb->aio_nbytes; } qemu_mutex_unlock(&wps->lock); } Now the last problem with this code is sequential write preferred zones. For these, the write may actually be overwriting sectors that have already been written, meaning that the wp may not necessarilly need to be advanced. You can handle that case together with SWR case simply like this: if (wps) { qemu_mutex_lock(&wps->lock); if (BDRV_ZT_IS_SWR(wps->wp[index])) { uint64_t wend_offset = aiocb->aio_offset + aiocb->aio_nbytes; if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS; } /* Advance the wp if needed */ if (wend_offset > wps->wp[index]) wps->wp[index] = wend_offset; } qemu_mutex_unlock(&wps->lock); } > + } > + } > + } > +#endif > return 0; > } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) { > if (aiocb->aio_type & QEMU_AIO_WRITE) { > @@ -1737,6 +1823,19 @@ out: > } > } else { > assert(nbytes < 0); > +#if defined(CONFIG_BLKZONED) > + if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) { Why do this only for zone append ? Regular write error also need a refresh of the zone wp. > + qemu_mutex_lock(&aiocb->bs->bl.wps->lock); > + if (report_zone_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps, > + aiocb->bs->bl.nr_zones) < 0) { > + error_report("report zone wp failed"); > + qemu_mutex_destroy(&aiocb->bs->bl.wps->lock); > + g_free(aiocb->bs->bl.wps); > + return -EINVAL; > + } > + qemu_mutex_unlock(&aiocb->bs->bl.wps->lock); This really needs to be a helper function, e.g. update_zone_wp() or something like this. Aslo, why nuke the entire zone array if the refresh for this zone fails ? You could simply leave it as is. The next write may again fail and another attempt at refreshing it done. > + } > +#endif > return nbytes; > } > } > @@ -2027,12 +2126,16 @@ static int handle_aiocb_zone_report(void *opaque) { > static int handle_aiocb_zone_mgmt(void *opaque) { > #if defined(CONFIG_BLKZONED) > RawPosixAIOData *aiocb = opaque; > + BlockDriverState *bs = aiocb->bs; > int fd = aiocb->aio_fildes; > int64_t sector = aiocb->aio_offset / 512; > int64_t nr_sectors = aiocb->aio_nbytes / 512; > struct blk_zone_range range; > int ret; > > + BlockZoneWps *wps = bs->bl.wps; > + int index = aiocb->aio_offset / bs->bl.zone_size; > + > /* Execute the operation */ > range.sector = sector; > range.nr_sectors = nr_sectors; > @@ -2045,6 +2148,22 @@ static int handle_aiocb_zone_mgmt(void *opaque) { > errno); > return -errno; > } > + > + if (aiocb->zone_mgmt.all) { > + for (int i = 0; i < bs->bl.nr_zones; ++i) { > + qemu_mutex_lock(&wps->lock); > + wps->wp[i] = i * bs->bl.zone_size; > + qemu_mutex_unlock(&wps->lock); > + } > + } else if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) { > + qemu_mutex_lock(&wps->lock); > + wps->wp[index] = aiocb->aio_offset; > + qemu_mutex_unlock(&wps->lock); > + } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) { > + qemu_mutex_lock(&wps->lock); > + wps->wp[index] = aiocb->aio_offset + bs->bl.zone_size; This may be the last zone of the device, which may be smaller. So you need to check that here. Same for the first case for reset all: you need to handle the smaller last zone if there is one. > + qemu_mutex_unlock(&wps->lock); > + } Instead of the lock/unlock for each case here, take the mutex lock before the if () and unlock it after it. Less lines :) Also, if the zone management command fails, you need to do a report zones and refresh the wps array. > return ret; > #else > return -ENOTSUP; > @@ -2355,6 +2474,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, > }, > }; > > + BlockZoneWps *wps = bs->bl.wps; > + acb.io.wps = wps; You do not need the pws variable. Simply do: acb.io.wps = bs->bl.wps; > assert(qiov->size == bytes); > return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); > } > @@ -2465,6 +2586,12 @@ static void raw_close(BlockDriverState *bs) > BDRVRawState *s = bs->opaque; > > if (s->fd >= 0) { > +#if defined(CONFIG_BLKZONED) > + if (bs->bl.wps) { > + qemu_mutex_destroy(&bs->bl.wps->lock); > + g_free(bs->bl.wps); > + } > +#endif > qemu_close(s->fd); > s->fd = -1; > } > @@ -3299,6 +3426,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > zone_op_name = "BLKRESETZONE"; > zone_op = BLKRESETZONE; > break; > + case BLK_ZO_RESET_ALL: > + zone_op_name = "BLKRESETZONE"; > + zone_op = BLKRESETZONE; > + is_all = true; > + break; This change seems unrelated to the wp tracking. Different patch ? > default: > g_assert_not_reached(); > } > diff --git a/include/block/block-common.h b/include/block/block-common.h > index 8efb6b0c43..43bfc484eb 100644 > --- a/include/block/block-common.h > +++ b/include/block/block-common.h > @@ -58,6 +58,7 @@ typedef enum BlockZoneOp { > BLK_ZO_CLOSE, > BLK_ZO_FINISH, > BLK_ZO_RESET, > + BLK_ZO_RESET_ALL, same here. Adding reset all support should be a different patch. > } BlockZoneOp; > > typedef enum BlockZoneModel { > @@ -96,6 +97,14 @@ typedef struct BlockZoneDescriptor { > BlockZoneCondition cond; > } BlockZoneDescriptor; > > +/* > + * Track write pointers of a zone in bytes. > + */ > +typedef struct BlockZoneWps { > + QemuMutex lock; > + uint64_t wp[]; > +} BlockZoneWps; > + > typedef struct BlockDriverInfo { > /* in bytes, 0 if irrelevant */ > int cluster_size; > @@ -209,6 +218,13 @@ typedef enum { > #define BDRV_SECTOR_BITS 9 > #define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS) > > +/* > + * Get the first most significant bit of WP. If it is zero, then > + * the zone type is SWR. > + */ > +#define BDRV_ZT_IS_SWR(WP) ((WP & 0x8000000000000000) == 0) ? (true) : \ > + (false) Simplify: #define BDRV_ZT_IS_SWR(wp) (!((wp) & (1ULL << 63)) But since this must be used for both SWR and SWP zones, I would reverse this into: #define BDRV_ZONE_IS_CONV(wp) ((wp) & (1ULL << 63)) Which is a lot simpler. > + > #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \ > INT_MAX >> BDRV_SECTOR_BITS) > #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h > index 37dddc603c..59c2d1316d 100644 > --- a/include/block/block_int-common.h > +++ b/include/block/block_int-common.h > @@ -857,6 +857,11 @@ typedef struct BlockLimits { > > /* device capacity expressed in bytes */ > int64_t capacity; > + > + /* array of write pointers' location of each zone in the zoned device. */ > + BlockZoneWps *wps; > + > + int64_t write_granularity; > } BlockLimits; > > typedef struct BdrvOpBlocker BdrvOpBlocker; > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > index 3d26929cdd..f13cc1887b 100644 > --- a/include/block/raw-aio.h > +++ b/include/block/raw-aio.h > @@ -31,6 +31,7 @@ > #define QEMU_AIO_TRUNCATE 0x0080 > #define QEMU_AIO_ZONE_REPORT 0x0100 > #define QEMU_AIO_ZONE_MGMT 0x0200 > +#define QEMU_AIO_ZONE_APPEND 0x0400 > #define QEMU_AIO_TYPE_MASK \ > (QEMU_AIO_READ | \ > QEMU_AIO_WRITE | \ > @@ -41,7 +42,8 @@ > QEMU_AIO_COPY_RANGE | \ > QEMU_AIO_TRUNCATE | \ > QEMU_AIO_ZONE_REPORT | \ > - QEMU_AIO_ZONE_MGMT) > + QEMU_AIO_ZONE_MGMT | \ > + QEMU_AIO_ZONE_APPEND) This should be introduced in patch 2. This patch should be only about zone wp tracking with regular writes and zone management ops. The second patch can implement zone append emulation thanks to this patch. So separate. > > /* AIO flags */ > #define QEMU_AIO_MISALIGNED 0x1000
On 10/5/22 10:44, Damien Le Moal wrote: > On 9/29/22 18:31, Sam Li wrote: >> Since Linux doesn't have a user API to issue zone append operations to >> zoned devices from user space, the file-posix driver is modified to add >> zone append emulation using regular writes. To do this, the file-posix >> driver tracks the wp location of all zones of the device. It uses an >> array of uint64_t. The most significant bit of each wp location indicates >> if the zone type is sequential write required. >> >> The zones wp can be changed due to the following operations issued: >> - zone reset: change the wp to the start offset of that zone >> - zone finish: change to the end location of that zone >> - write to a zone >> - zone append >> >> Signed-off-by: Sam Li <faithilikerun@gmail.com> >> --- >> block/file-posix.c | 138 ++++++++++++++++++++++++++++++- >> include/block/block-common.h | 16 ++++ >> include/block/block_int-common.h | 5 ++ >> include/block/raw-aio.h | 4 +- >> 4 files changed, 159 insertions(+), 4 deletions(-) >> >> diff --git a/block/file-posix.c b/block/file-posix.c >> index 73656d87f2..33e81ac112 100755 >> --- a/block/file-posix.c >> +++ b/block/file-posix.c >> @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData { >> struct { >> struct iovec *iov; >> int niov; >> + int64_t *append_sector; >> + BlockZoneWps *wps; >> } io; >> struct { >> uint64_t cmd; >> @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat *st) { >> #endif >> } >> >> +#if defined(CONFIG_BLKZONED) >> +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps, >> + unsigned int nrz) { > > Maybe rename this to get_zones_wp() ? > >> + struct blk_zone *blkz; >> + int64_t rep_size; >> + int64_t sector = offset >> BDRV_SECTOR_BITS; >> + int ret, n = 0, i = 0; >> + >> + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); >> + g_autofree struct blk_zone_report *rep = NULL; > > To be cleaner, move this declaration above with the others ? > >> + rep = g_malloc(rep_size); >> + >> + blkz = (struct blk_zone *)(rep + 1); >> + while (n < nrz) { >> + memset(rep, 0, rep_size); >> + rep->sector = sector; >> + rep->nr_zones = nrz - n; >> + >> + do { >> + ret = ioctl(fd, BLKREPORTZONE, rep); >> + } while (ret != 0 && errno == EINTR); >> + if (ret != 0) { >> + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", >> + fd, offset, errno); >> + return -errno; >> + } >> + >> + if (!rep->nr_zones) { >> + break; >> + } >> + >> + for (i = 0; i < rep->nr_zones; i++, n++) { >> + wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; >> + sector = blkz[i].start + blkz[i].len; >> + >> + /* >> + * In the wp tracking, it only cares if the zone type is sequential >> + * writes required so that the wp can advance to the right location. > > Or sequential write preferred (host aware case) > >> + * Instead of the type of zone_type which is an 8-bit unsigned >> + * integer, use the first most significant bits of the wp location >> + * to indicate the zone type: 0 for SWR zones and 1 for the >> + * others. >> + */ >> + if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) { > > This should be: > > if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) { > > Note that the type field is not a bit-field. So you must compare values > instead of doing bit operations. > >> + wps->wp[i] += (uint64_t)1 << 63; > > You can simplify this: > > wps->wp[i] |= 1ULL << 63; > > Overall, I would rewrite this like this: > > for (i = 0; i < rep->nr_zones; i++, n++) { > /* > * The wp tracking cares only about sequential write required > * and sequential write preferred zones so that the wp can > * advance to the right location. > * Use the most significant bit of the wp location > * to indicate the zone type: 0 for SWR zones and 1 for > * conventional zones. > */ > if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) { > wps->wp[i] = 1ULL << 63; > else > wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; > } > sector = blkz[i - 1].start + blkz[i - 1].len; > > Which I think is a lot simpler. > >> + } >> + } >> + } >> + >> + return 0; >> +} >> +#endif >> + >> static void raw_refresh_limits(BlockDriverState *bs, Error **errp) >> { >> BDRVRawState *s = bs->opaque; >> @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) >> error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity); >> return; >> } >> + >> + ret = get_sysfs_long_val(&st, "physical_block_size"); >> + if (ret >= 0) { >> + bs->bl.write_granularity = ret; >> + } > > This change seems unrelated to the wp tracking. Should this be moved to a > different patch ? > >> + >> + bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); >> + qemu_mutex_init(&bs->bl.wps->lock); > > Move this initialization after the if block. Doing so, you do not need to > call mutex destroy in case of error. > >> + if (report_zone_wp(0, s->fd, bs->bl.wps, ret) < 0 ) { >> + error_report("report wps failed"); >> + qemu_mutex_destroy(&bs->bl.wps->lock); >> + g_free(bs->bl.wps); >> + return; >> + } >> } >> } >> >> @@ -1582,7 +1651,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) >> ssize_t len; >> >> do { >> - if (aiocb->aio_type & QEMU_AIO_WRITE) >> + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) >> len = qemu_pwritev(aiocb->aio_fildes, >> aiocb->io.iov, >> aiocb->io.niov, >> @@ -1612,7 +1681,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) >> ssize_t len; >> >> while (offset < aiocb->aio_nbytes) { >> - if (aiocb->aio_type & QEMU_AIO_WRITE) { >> + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { >> len = pwrite(aiocb->aio_fildes, >> (const char *)buf + offset, >> aiocb->aio_nbytes - offset, >> @@ -1705,7 +1774,7 @@ static int handle_aiocb_rw(void *opaque) >> } >> >> nbytes = handle_aiocb_rw_linear(aiocb, buf); >> - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { >> + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { >> char *p = buf; >> size_t count = aiocb->aio_nbytes, copy; >> int i; >> @@ -1726,6 +1795,23 @@ static int handle_aiocb_rw(void *opaque) >> >> out: >> if (nbytes == aiocb->aio_nbytes) { >> +#if defined(CONFIG_BLKZONED) >> + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { >> + BlockZoneWps *wps = aiocb->io.wps; > > Why adding this pointer to the aiocb struct ? You can get the array from > aiocb->bs->bl.wps, no ? > >> + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; >> + if (wps) { >> + if (BDRV_ZT_IS_SWR(wps->wp[index])) { >> + qemu_mutex_lock(&wps->lock); >> + wps->wp[index] += aiocb->aio_nbytes; >> + qemu_mutex_unlock(&wps->lock); >> + } >> + >> + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { >> + *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS; > > Bug: the append sector must be the first sector written, not the wp > (sector following the last written sector). So this must be done *before* > advancing the wp. > > You need to have wps->lock held here too since you are reading > wps->wp[index]. So the mutex lock/unlock needs to be around the 2 hunks > under "if (wps) {". Also, given that there cannot be any zone append > issued to conventional zones (they will fail), you can simplify: > > if (wps) { > qemu_mutex_lock(&wps->lock); > if (BDRV_ZT_IS_SWR(wps->wp[index])) { > if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > *aiocb->io.append_sector = > wps->wp[index] >> BDRV_SECTOR_BITS; > } > wps->wp[index] += aiocb->aio_nbytes; > } > qemu_mutex_unlock(&wps->lock); > } > > Now the last problem with this code is sequential write preferred zones. > For these, the write may actually be overwriting sectors that have already > been written, meaning that the wp may not necessarilly need to be > advanced. You can handle that case together with SWR case simply like this: > > if (wps) { > qemu_mutex_lock(&wps->lock); > if (BDRV_ZT_IS_SWR(wps->wp[index])) { > uint64_t wend_offset = > aiocb->aio_offset + aiocb->aio_nbytes; > > if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > *aiocb->io.append_sector = > wps->wp[index] >> BDRV_SECTOR_BITS; > } > > /* Advance the wp if needed */ > if (wend_offset > wps->wp[index]) > wps->wp[index] = wend_offset; > } > qemu_mutex_unlock(&wps->lock); > } > >> + } >> + } >> + } >> +#endif >> return 0; >> } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) { >> if (aiocb->aio_type & QEMU_AIO_WRITE) { >> @@ -1737,6 +1823,19 @@ out: >> } >> } else { >> assert(nbytes < 0); >> +#if defined(CONFIG_BLKZONED) >> + if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) { > > Why do this only for zone append ? Regular write error also need a refresh > of the zone wp. > >> + qemu_mutex_lock(&aiocb->bs->bl.wps->lock); >> + if (report_zone_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps, >> + aiocb->bs->bl.nr_zones) < 0) { >> + error_report("report zone wp failed"); >> + qemu_mutex_destroy(&aiocb->bs->bl.wps->lock); >> + g_free(aiocb->bs->bl.wps); >> + return -EINVAL; >> + } >> + qemu_mutex_unlock(&aiocb->bs->bl.wps->lock); > > This really needs to be a helper function, e.g. update_zone_wp() or > something like this. Aslo, why nuke the entire zone array if the refresh > for this zone fails ? You could simply leave it as is. The next write may > again fail and another attempt at refreshing it done. Another note on this: you must update the wp ONLY for the zone that you attempted to write here, and only that zone. If you update the wp for all zones, you will introduce problems for the zones that are being written (write in flight) since for these, wps->wp[i] is larger than the device current wp (they become eual only after all writes are completed). > >> + } >> +#endif >> return nbytes; >> } >> } >> @@ -2027,12 +2126,16 @@ static int handle_aiocb_zone_report(void *opaque) { >> static int handle_aiocb_zone_mgmt(void *opaque) { >> #if defined(CONFIG_BLKZONED) >> RawPosixAIOData *aiocb = opaque; >> + BlockDriverState *bs = aiocb->bs; >> int fd = aiocb->aio_fildes; >> int64_t sector = aiocb->aio_offset / 512; >> int64_t nr_sectors = aiocb->aio_nbytes / 512; >> struct blk_zone_range range; >> int ret; >> >> + BlockZoneWps *wps = bs->bl.wps; >> + int index = aiocb->aio_offset / bs->bl.zone_size; >> + >> /* Execute the operation */ >> range.sector = sector; >> range.nr_sectors = nr_sectors; >> @@ -2045,6 +2148,22 @@ static int handle_aiocb_zone_mgmt(void *opaque) { >> errno); >> return -errno; >> } >> + >> + if (aiocb->zone_mgmt.all) { >> + for (int i = 0; i < bs->bl.nr_zones; ++i) { >> + qemu_mutex_lock(&wps->lock); >> + wps->wp[i] = i * bs->bl.zone_size; >> + qemu_mutex_unlock(&wps->lock); >> + } >> + } else if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) { >> + qemu_mutex_lock(&wps->lock); >> + wps->wp[index] = aiocb->aio_offset; >> + qemu_mutex_unlock(&wps->lock); >> + } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) { >> + qemu_mutex_lock(&wps->lock); >> + wps->wp[index] = aiocb->aio_offset + bs->bl.zone_size; > > This may be the last zone of the device, which may be smaller. So you need > to check that here. Same for the first case for reset all: you need to > handle the smaller last zone if there is one. > >> + qemu_mutex_unlock(&wps->lock); >> + } > > Instead of the lock/unlock for each case here, take the mutex lock before > the if () and unlock it after it. Less lines :) > > Also, if the zone management command fails, you need to do a report zones > and refresh the wps array. > >> return ret; >> #else >> return -ENOTSUP; >> @@ -2355,6 +2474,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, >> }, >> }; >> >> + BlockZoneWps *wps = bs->bl.wps; >> + acb.io.wps = wps; > > You do not need the pws variable. Simply do: > > acb.io.wps = bs->bl.wps; > >> assert(qiov->size == bytes); >> return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); >> } >> @@ -2465,6 +2586,12 @@ static void raw_close(BlockDriverState *bs) >> BDRVRawState *s = bs->opaque; >> >> if (s->fd >= 0) { >> +#if defined(CONFIG_BLKZONED) >> + if (bs->bl.wps) { >> + qemu_mutex_destroy(&bs->bl.wps->lock); >> + g_free(bs->bl.wps); >> + } >> +#endif >> qemu_close(s->fd); >> s->fd = -1; >> } >> @@ -3299,6 +3426,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, >> zone_op_name = "BLKRESETZONE"; >> zone_op = BLKRESETZONE; >> break; >> + case BLK_ZO_RESET_ALL: >> + zone_op_name = "BLKRESETZONE"; >> + zone_op = BLKRESETZONE; >> + is_all = true; >> + break; > > This change seems unrelated to the wp tracking. Different patch ? > >> default: >> g_assert_not_reached(); >> } >> diff --git a/include/block/block-common.h b/include/block/block-common.h >> index 8efb6b0c43..43bfc484eb 100644 >> --- a/include/block/block-common.h >> +++ b/include/block/block-common.h >> @@ -58,6 +58,7 @@ typedef enum BlockZoneOp { >> BLK_ZO_CLOSE, >> BLK_ZO_FINISH, >> BLK_ZO_RESET, >> + BLK_ZO_RESET_ALL, > > same here. Adding reset all support should be a different patch. >> } BlockZoneOp; >> >> typedef enum BlockZoneModel { >> @@ -96,6 +97,14 @@ typedef struct BlockZoneDescriptor { >> BlockZoneCondition cond; >> } BlockZoneDescriptor; >> >> +/* >> + * Track write pointers of a zone in bytes. >> + */ >> +typedef struct BlockZoneWps { >> + QemuMutex lock; >> + uint64_t wp[]; >> +} BlockZoneWps; >> + >> typedef struct BlockDriverInfo { >> /* in bytes, 0 if irrelevant */ >> int cluster_size; >> @@ -209,6 +218,13 @@ typedef enum { >> #define BDRV_SECTOR_BITS 9 >> #define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS) >> >> +/* >> + * Get the first most significant bit of WP. If it is zero, then >> + * the zone type is SWR. >> + */ >> +#define BDRV_ZT_IS_SWR(WP) ((WP & 0x8000000000000000) == 0) ? (true) : \ >> + (false) > > Simplify: > > #define BDRV_ZT_IS_SWR(wp) (!((wp) & (1ULL << 63)) > > But since this must be used for both SWR and SWP zones, I would reverse > this into: > > #define BDRV_ZONE_IS_CONV(wp) ((wp) & (1ULL << 63)) > > Which is a lot simpler. > >> + >> #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \ >> INT_MAX >> BDRV_SECTOR_BITS) >> #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) >> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h >> index 37dddc603c..59c2d1316d 100644 >> --- a/include/block/block_int-common.h >> +++ b/include/block/block_int-common.h >> @@ -857,6 +857,11 @@ typedef struct BlockLimits { >> >> /* device capacity expressed in bytes */ >> int64_t capacity; >> + >> + /* array of write pointers' location of each zone in the zoned device. */ >> + BlockZoneWps *wps; >> + >> + int64_t write_granularity; >> } BlockLimits; >> >> typedef struct BdrvOpBlocker BdrvOpBlocker; >> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h >> index 3d26929cdd..f13cc1887b 100644 >> --- a/include/block/raw-aio.h >> +++ b/include/block/raw-aio.h >> @@ -31,6 +31,7 @@ >> #define QEMU_AIO_TRUNCATE 0x0080 >> #define QEMU_AIO_ZONE_REPORT 0x0100 >> #define QEMU_AIO_ZONE_MGMT 0x0200 >> +#define QEMU_AIO_ZONE_APPEND 0x0400 >> #define QEMU_AIO_TYPE_MASK \ >> (QEMU_AIO_READ | \ >> QEMU_AIO_WRITE | \ >> @@ -41,7 +42,8 @@ >> QEMU_AIO_COPY_RANGE | \ >> QEMU_AIO_TRUNCATE | \ >> QEMU_AIO_ZONE_REPORT | \ >> - QEMU_AIO_ZONE_MGMT) >> + QEMU_AIO_ZONE_MGMT | \ >> + QEMU_AIO_ZONE_APPEND) > > This should be introduced in patch 2. This patch should be only about zone > wp tracking with regular writes and zone management ops. The second patch > can implement zone append emulation thanks to this patch. So separate. > >> >> /* AIO flags */ >> #define QEMU_AIO_MISALIGNED 0x1000 >
On 10/5/22 10:44, Damien Le Moal wrote: > On 9/29/22 18:31, Sam Li wrote: >> Since Linux doesn't have a user API to issue zone append operations to >> zoned devices from user space, the file-posix driver is modified to add >> zone append emulation using regular writes. To do this, the file-posix >> driver tracks the wp location of all zones of the device. It uses an >> array of uint64_t. The most significant bit of each wp location indicates >> if the zone type is sequential write required. >> >> The zones wp can be changed due to the following operations issued: >> - zone reset: change the wp to the start offset of that zone >> - zone finish: change to the end location of that zone >> - write to a zone >> - zone append >> >> Signed-off-by: Sam Li <faithilikerun@gmail.com> Replying to myself to add some comments that I forgot. >> --- >> block/file-posix.c | 138 ++++++++++++++++++++++++++++++- >> include/block/block-common.h | 16 ++++ >> include/block/block_int-common.h | 5 ++ >> include/block/raw-aio.h | 4 +- >> 4 files changed, 159 insertions(+), 4 deletions(-) >> >> diff --git a/block/file-posix.c b/block/file-posix.c >> index 73656d87f2..33e81ac112 100755 >> --- a/block/file-posix.c >> +++ b/block/file-posix.c >> @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData { >> struct { >> struct iovec *iov; >> int niov; >> + int64_t *append_sector; >> + BlockZoneWps *wps; >> } io; >> struct { >> uint64_t cmd; >> @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat *st) { >> #endif >> } >> >> +#if defined(CONFIG_BLKZONED) >> +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps, >> + unsigned int nrz) { > > Maybe rename this to get_zones_wp() ? > >> + struct blk_zone *blkz; >> + int64_t rep_size; >> + int64_t sector = offset >> BDRV_SECTOR_BITS; >> + int ret, n = 0, i = 0; >> + >> + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); >> + g_autofree struct blk_zone_report *rep = NULL; > > To be cleaner, move this declaration above with the others ? > >> + rep = g_malloc(rep_size); >> + >> + blkz = (struct blk_zone *)(rep + 1); >> + while (n < nrz) { >> + memset(rep, 0, rep_size); >> + rep->sector = sector; >> + rep->nr_zones = nrz - n; >> + >> + do { >> + ret = ioctl(fd, BLKREPORTZONE, rep); >> + } while (ret != 0 && errno == EINTR); >> + if (ret != 0) { >> + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", >> + fd, offset, errno); >> + return -errno; >> + } >> + >> + if (!rep->nr_zones) { >> + break; >> + } >> + >> + for (i = 0; i < rep->nr_zones; i++, n++) { >> + wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; >> + sector = blkz[i].start + blkz[i].len; >> + >> + /* >> + * In the wp tracking, it only cares if the zone type is sequential >> + * writes required so that the wp can advance to the right location. > > Or sequential write preferred (host aware case) > >> + * Instead of the type of zone_type which is an 8-bit unsigned >> + * integer, use the first most significant bits of the wp location >> + * to indicate the zone type: 0 for SWR zones and 1 for the >> + * others. >> + */ >> + if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) { > > This should be: > > if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) { > > Note that the type field is not a bit-field. So you must compare values > instead of doing bit operations. > >> + wps->wp[i] += (uint64_t)1 << 63; > > You can simplify this: > > wps->wp[i] |= 1ULL << 63; > > Overall, I would rewrite this like this: > > for (i = 0; i < rep->nr_zones; i++, n++) { > /* > * The wp tracking cares only about sequential write required > * and sequential write preferred zones so that the wp can > * advance to the right location. > * Use the most significant bit of the wp location > * to indicate the zone type: 0 for SWR zones and 1 for > * conventional zones. > */ > if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) { > wps->wp[i] = 1ULL << 63; > else > wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; > } > sector = blkz[i - 1].start + blkz[i - 1].len; > > Which I think is a lot simpler. > >> + } >> + } >> + } >> + >> + return 0; >> +} >> +#endif >> + >> static void raw_refresh_limits(BlockDriverState *bs, Error **errp) >> { >> BDRVRawState *s = bs->opaque; >> @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) >> error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity); >> return; >> } >> + >> + ret = get_sysfs_long_val(&st, "physical_block_size"); >> + if (ret >= 0) { >> + bs->bl.write_granularity = ret; >> + } > > This change seems unrelated to the wp tracking. Should this be moved to a > different patch ? > >> + >> + bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); >> + qemu_mutex_init(&bs->bl.wps->lock); > > Move this initialization after the if block. Doing so, you do not need to > call mutex destroy in case of error. > >> + if (report_zone_wp(0, s->fd, bs->bl.wps, ret) < 0 ) { >> + error_report("report wps failed"); >> + qemu_mutex_destroy(&bs->bl.wps->lock); >> + g_free(bs->bl.wps); >> + return; >> + } >> } >> } >> >> @@ -1582,7 +1651,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) >> ssize_t len; >> >> do { >> - if (aiocb->aio_type & QEMU_AIO_WRITE) >> + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) >> len = qemu_pwritev(aiocb->aio_fildes, >> aiocb->io.iov, >> aiocb->io.niov, >> @@ -1612,7 +1681,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) >> ssize_t len; >> >> while (offset < aiocb->aio_nbytes) { >> - if (aiocb->aio_type & QEMU_AIO_WRITE) { >> + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { >> len = pwrite(aiocb->aio_fildes, >> (const char *)buf + offset, >> aiocb->aio_nbytes - offset, >> @@ -1705,7 +1774,7 @@ static int handle_aiocb_rw(void *opaque) >> } >> >> nbytes = handle_aiocb_rw_linear(aiocb, buf); >> - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { >> + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { >> char *p = buf; >> size_t count = aiocb->aio_nbytes, copy; >> int i; >> @@ -1726,6 +1795,23 @@ static int handle_aiocb_rw(void *opaque) >> >> out: >> if (nbytes == aiocb->aio_nbytes) { >> +#if defined(CONFIG_BLKZONED) >> + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { >> + BlockZoneWps *wps = aiocb->io.wps; > > Why adding this pointer to the aiocb struct ? You can get the array from > aiocb->bs->bl.wps, no ? > >> + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; >> + if (wps) { >> + if (BDRV_ZT_IS_SWR(wps->wp[index])) { >> + qemu_mutex_lock(&wps->lock); >> + wps->wp[index] += aiocb->aio_nbytes; >> + qemu_mutex_unlock(&wps->lock); >> + } >> + >> + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { >> + *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS; > > Bug: the append sector must be the first sector written, not the wp > (sector following the last written sector). So this must be done *before* > advancing the wp. > > You need to have wps->lock held here too since you are reading > wps->wp[index]. So the mutex lock/unlock needs to be around the 2 hunks > under "if (wps) {". Also, given that there cannot be any zone append > issued to conventional zones (they will fail), you can simplify: > > if (wps) { > qemu_mutex_lock(&wps->lock); > if (BDRV_ZT_IS_SWR(wps->wp[index])) { > if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > *aiocb->io.append_sector = > wps->wp[index] >> BDRV_SECTOR_BITS; > } > wps->wp[index] += aiocb->aio_nbytes; > } > qemu_mutex_unlock(&wps->lock); > } > > Now the last problem with this code is sequential write preferred zones. > For these, the write may actually be overwriting sectors that have already > been written, meaning that the wp may not necessarilly need to be > advanced. You can handle that case together with SWR case simply like this: > > if (wps) { > qemu_mutex_lock(&wps->lock); > if (BDRV_ZT_IS_SWR(wps->wp[index])) { > uint64_t wend_offset = > aiocb->aio_offset + aiocb->aio_nbytes; > > if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > *aiocb->io.append_sector = > wps->wp[index] >> BDRV_SECTOR_BITS; > } > > /* Advance the wp if needed */ > if (wend_offset > wps->wp[index]) > wps->wp[index] = wend_offset; > } > qemu_mutex_unlock(&wps->lock); > } > Note that you should not increment the wp if the zone is already full. But for such case, since write and zone append commands will fail, you can fail them immediately. >> + } >> + } >> + } >> +#endif >> return 0; >> } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) { >> if (aiocb->aio_type & QEMU_AIO_WRITE) { >> @@ -1737,6 +1823,19 @@ out: >> } >> } else { >> assert(nbytes < 0); >> +#if defined(CONFIG_BLKZONED) >> + if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) { > > Why do this only for zone append ? Regular write error also need a refresh > of the zone wp. > >> + qemu_mutex_lock(&aiocb->bs->bl.wps->lock); >> + if (report_zone_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps, >> + aiocb->bs->bl.nr_zones) < 0) { >> + error_report("report zone wp failed"); >> + qemu_mutex_destroy(&aiocb->bs->bl.wps->lock); >> + g_free(aiocb->bs->bl.wps); >> + return -EINVAL; >> + } >> + qemu_mutex_unlock(&aiocb->bs->bl.wps->lock); > > This really needs to be a helper function, e.g. update_zone_wp() or > something like this. Aslo, why nuke the entire zone array if the refresh > for this zone fails ? You could simply leave it as is. The next write may > again fail and another attempt at refreshing it done. > >> + } >> +#endif >> return nbytes; >> } >> } >> @@ -2027,12 +2126,16 @@ static int handle_aiocb_zone_report(void *opaque) { >> static int handle_aiocb_zone_mgmt(void *opaque) { >> #if defined(CONFIG_BLKZONED) >> RawPosixAIOData *aiocb = opaque; >> + BlockDriverState *bs = aiocb->bs; >> int fd = aiocb->aio_fildes; >> int64_t sector = aiocb->aio_offset / 512; >> int64_t nr_sectors = aiocb->aio_nbytes / 512; >> struct blk_zone_range range; >> int ret; >> >> + BlockZoneWps *wps = bs->bl.wps; >> + int index = aiocb->aio_offset / bs->bl.zone_size; >> + >> /* Execute the operation */ >> range.sector = sector; >> range.nr_sectors = nr_sectors; >> @@ -2045,6 +2148,22 @@ static int handle_aiocb_zone_mgmt(void *opaque) { >> errno); >> return -errno; >> } >> + >> + if (aiocb->zone_mgmt.all) { This case should be integrated into the if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) >> + for (int i = 0; i < bs->bl.nr_zones; ++i) { >> + qemu_mutex_lock(&wps->lock); >> + wps->wp[i] = i * bs->bl.zone_size; You need to test the zone type bit and only change SWR or SWP zones. >> + qemu_mutex_unlock(&wps->lock); >> + } >> + } else if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) { >> + qemu_mutex_lock(&wps->lock); >> + wps->wp[index] = aiocb->aio_offset; >> + qemu_mutex_unlock(&wps->lock); >> + } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) { >> + qemu_mutex_lock(&wps->lock); >> + wps->wp[index] = aiocb->aio_offset + bs->bl.zone_size; > > This may be the last zone of the device, which may be smaller. So you need > to check that here. Same for the first case for reset all: you need to > handle the smaller last zone if there is one. You should fail any zone management commnand targetting a conventional zone. You are not checking that. Also, you can avoid issuing commands by testing if a zone is already full (no need to issue finish command) or empty (no need to do a reset). > >> + qemu_mutex_unlock(&wps->lock); >> + } > > Instead of the lock/unlock for each case here, take the mutex lock before > the if () and unlock it after it. Less lines :) > > Also, if the zone management command fails, you need to do a report zones > and refresh the wps array. > >> return ret; >> #else >> return -ENOTSUP; >> @@ -2355,6 +2474,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, >> }, >> }; >> >> + BlockZoneWps *wps = bs->bl.wps; >> + acb.io.wps = wps; > > You do not need the pws variable. Simply do: > > acb.io.wps = bs->bl.wps; > >> assert(qiov->size == bytes); >> return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); >> } >> @@ -2465,6 +2586,12 @@ static void raw_close(BlockDriverState *bs) >> BDRVRawState *s = bs->opaque; >> >> if (s->fd >= 0) { >> +#if defined(CONFIG_BLKZONED) >> + if (bs->bl.wps) { >> + qemu_mutex_destroy(&bs->bl.wps->lock); >> + g_free(bs->bl.wps); >> + } >> +#endif >> qemu_close(s->fd); >> s->fd = -1; >> } >> @@ -3299,6 +3426,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, >> zone_op_name = "BLKRESETZONE"; >> zone_op = BLKRESETZONE; >> break; >> + case BLK_ZO_RESET_ALL: >> + zone_op_name = "BLKRESETZONE"; >> + zone_op = BLKRESETZONE; >> + is_all = true; >> + break; > > This change seems unrelated to the wp tracking. Different patch ? > >> default: >> g_assert_not_reached(); >> } >> diff --git a/include/block/block-common.h b/include/block/block-common.h >> index 8efb6b0c43..43bfc484eb 100644 >> --- a/include/block/block-common.h >> +++ b/include/block/block-common.h >> @@ -58,6 +58,7 @@ typedef enum BlockZoneOp { >> BLK_ZO_CLOSE, >> BLK_ZO_FINISH, >> BLK_ZO_RESET, >> + BLK_ZO_RESET_ALL, > > same here. Adding reset all support should be a different patch. >> } BlockZoneOp; >> >> typedef enum BlockZoneModel { >> @@ -96,6 +97,14 @@ typedef struct BlockZoneDescriptor { >> BlockZoneCondition cond; >> } BlockZoneDescriptor; >> >> +/* >> + * Track write pointers of a zone in bytes. >> + */ >> +typedef struct BlockZoneWps { >> + QemuMutex lock; >> + uint64_t wp[]; >> +} BlockZoneWps; >> + >> typedef struct BlockDriverInfo { >> /* in bytes, 0 if irrelevant */ >> int cluster_size; >> @@ -209,6 +218,13 @@ typedef enum { >> #define BDRV_SECTOR_BITS 9 >> #define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS) >> >> +/* >> + * Get the first most significant bit of WP. If it is zero, then >> + * the zone type is SWR. >> + */ >> +#define BDRV_ZT_IS_SWR(WP) ((WP & 0x8000000000000000) == 0) ? (true) : \ >> + (false) > > Simplify: > > #define BDRV_ZT_IS_SWR(wp) (!((wp) & (1ULL << 63)) > > But since this must be used for both SWR and SWP zones, I would reverse > this into: > > #define BDRV_ZONE_IS_CONV(wp) ((wp) & (1ULL << 63)) > > Which is a lot simpler. > >> + >> #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \ >> INT_MAX >> BDRV_SECTOR_BITS) >> #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) >> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h >> index 37dddc603c..59c2d1316d 100644 >> --- a/include/block/block_int-common.h >> +++ b/include/block/block_int-common.h >> @@ -857,6 +857,11 @@ typedef struct BlockLimits { >> >> /* device capacity expressed in bytes */ >> int64_t capacity; >> + >> + /* array of write pointers' location of each zone in the zoned device. */ >> + BlockZoneWps *wps; >> + >> + int64_t write_granularity; >> } BlockLimits; >> >> typedef struct BdrvOpBlocker BdrvOpBlocker; >> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h >> index 3d26929cdd..f13cc1887b 100644 >> --- a/include/block/raw-aio.h >> +++ b/include/block/raw-aio.h >> @@ -31,6 +31,7 @@ >> #define QEMU_AIO_TRUNCATE 0x0080 >> #define QEMU_AIO_ZONE_REPORT 0x0100 >> #define QEMU_AIO_ZONE_MGMT 0x0200 >> +#define QEMU_AIO_ZONE_APPEND 0x0400 >> #define QEMU_AIO_TYPE_MASK \ >> (QEMU_AIO_READ | \ >> QEMU_AIO_WRITE | \ >> @@ -41,7 +42,8 @@ >> QEMU_AIO_COPY_RANGE | \ >> QEMU_AIO_TRUNCATE | \ >> QEMU_AIO_ZONE_REPORT | \ >> - QEMU_AIO_ZONE_MGMT) >> + QEMU_AIO_ZONE_MGMT | \ >> + QEMU_AIO_ZONE_APPEND) > > This should be introduced in patch 2. This patch should be only about zone > wp tracking with regular writes and zone management ops. The second patch > can implement zone append emulation thanks to this patch. So separate. > >> >> /* AIO flags */ >> #define QEMU_AIO_MISALIGNED 0x1000 >
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月5日周三 15:34写道: > > On 10/5/22 10:44, Damien Le Moal wrote: > > On 9/29/22 18:31, Sam Li wrote: > >> Since Linux doesn't have a user API to issue zone append operations to > >> zoned devices from user space, the file-posix driver is modified to add > >> zone append emulation using regular writes. To do this, the file-posix > >> driver tracks the wp location of all zones of the device. It uses an > >> array of uint64_t. The most significant bit of each wp location indicates > >> if the zone type is sequential write required. > >> > >> The zones wp can be changed due to the following operations issued: > >> - zone reset: change the wp to the start offset of that zone > >> - zone finish: change to the end location of that zone > >> - write to a zone > >> - zone append > >> > >> Signed-off-by: Sam Li <faithilikerun@gmail.com> > > Replying to myself to add some comments that I forgot. > > >> --- > >> block/file-posix.c | 138 ++++++++++++++++++++++++++++++- > >> include/block/block-common.h | 16 ++++ > >> include/block/block_int-common.h | 5 ++ > >> include/block/raw-aio.h | 4 +- > >> 4 files changed, 159 insertions(+), 4 deletions(-) > >> > >> diff --git a/block/file-posix.c b/block/file-posix.c > >> index 73656d87f2..33e81ac112 100755 > >> --- a/block/file-posix.c > >> +++ b/block/file-posix.c > >> @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData { > >> struct { > >> struct iovec *iov; > >> int niov; > >> + int64_t *append_sector; > >> + BlockZoneWps *wps; > >> } io; > >> struct { > >> uint64_t cmd; > >> @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat *st) { > >> #endif > >> } > >> > >> +#if defined(CONFIG_BLKZONED) > >> +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps, > >> + unsigned int nrz) { > > > > Maybe rename this to get_zones_wp() ? > > > >> + struct blk_zone *blkz; > >> + int64_t rep_size; > >> + int64_t sector = offset >> BDRV_SECTOR_BITS; > >> + int ret, n = 0, i = 0; > >> + > >> + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); > >> + g_autofree struct blk_zone_report *rep = NULL; > > > > To be cleaner, move this declaration above with the others ? > > > >> + rep = g_malloc(rep_size); > >> + > >> + blkz = (struct blk_zone *)(rep + 1); > >> + while (n < nrz) { > >> + memset(rep, 0, rep_size); > >> + rep->sector = sector; > >> + rep->nr_zones = nrz - n; > >> + > >> + do { > >> + ret = ioctl(fd, BLKREPORTZONE, rep); > >> + } while (ret != 0 && errno == EINTR); > >> + if (ret != 0) { > >> + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", > >> + fd, offset, errno); > >> + return -errno; > >> + } > >> + > >> + if (!rep->nr_zones) { > >> + break; > >> + } > >> + > >> + for (i = 0; i < rep->nr_zones; i++, n++) { > >> + wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; > >> + sector = blkz[i].start + blkz[i].len; > >> + > >> + /* > >> + * In the wp tracking, it only cares if the zone type is sequential > >> + * writes required so that the wp can advance to the right location. > > > > Or sequential write preferred (host aware case) > > > >> + * Instead of the type of zone_type which is an 8-bit unsigned > >> + * integer, use the first most significant bits of the wp location > >> + * to indicate the zone type: 0 for SWR zones and 1 for the > >> + * others. > >> + */ > >> + if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) { > > > > This should be: > > > > if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) { > > > > Note that the type field is not a bit-field. So you must compare values > > instead of doing bit operations. > > > >> + wps->wp[i] += (uint64_t)1 << 63; > > > > You can simplify this: > > > > wps->wp[i] |= 1ULL << 63; > > > > Overall, I would rewrite this like this: > > > > for (i = 0; i < rep->nr_zones; i++, n++) { > > /* > > * The wp tracking cares only about sequential write required > > * and sequential write preferred zones so that the wp can > > * advance to the right location. > > * Use the most significant bit of the wp location > > * to indicate the zone type: 0 for SWR zones and 1 for > > * conventional zones. > > */ > > if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) { > > wps->wp[i] = 1ULL << 63; > > else > > wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; > > } > > sector = blkz[i - 1].start + blkz[i - 1].len; > > > > Which I think is a lot simpler. > > > >> + } > >> + } > >> + } > >> + > >> + return 0; > >> +} > >> +#endif > >> + > >> static void raw_refresh_limits(BlockDriverState *bs, Error **errp) > >> { > >> BDRVRawState *s = bs->opaque; > >> @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) > >> error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity); > >> return; > >> } > >> + > >> + ret = get_sysfs_long_val(&st, "physical_block_size"); > >> + if (ret >= 0) { > >> + bs->bl.write_granularity = ret; > >> + } > > > > This change seems unrelated to the wp tracking. Should this be moved to a > > different patch ? > > > >> + > >> + bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); > >> + qemu_mutex_init(&bs->bl.wps->lock); > > > > Move this initialization after the if block. Doing so, you do not need to > > call mutex destroy in case of error. > > > >> + if (report_zone_wp(0, s->fd, bs->bl.wps, ret) < 0 ) { > >> + error_report("report wps failed"); > >> + qemu_mutex_destroy(&bs->bl.wps->lock); > >> + g_free(bs->bl.wps); > >> + return; > >> + } > >> } > >> } > >> > >> @@ -1582,7 +1651,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) > >> ssize_t len; > >> > >> do { > >> - if (aiocb->aio_type & QEMU_AIO_WRITE) > >> + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) > >> len = qemu_pwritev(aiocb->aio_fildes, > >> aiocb->io.iov, > >> aiocb->io.niov, > >> @@ -1612,7 +1681,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) > >> ssize_t len; > >> > >> while (offset < aiocb->aio_nbytes) { > >> - if (aiocb->aio_type & QEMU_AIO_WRITE) { > >> + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > >> len = pwrite(aiocb->aio_fildes, > >> (const char *)buf + offset, > >> aiocb->aio_nbytes - offset, > >> @@ -1705,7 +1774,7 @@ static int handle_aiocb_rw(void *opaque) > >> } > >> > >> nbytes = handle_aiocb_rw_linear(aiocb, buf); > >> - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { > >> + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { > >> char *p = buf; > >> size_t count = aiocb->aio_nbytes, copy; > >> int i; > >> @@ -1726,6 +1795,23 @@ static int handle_aiocb_rw(void *opaque) > >> > >> out: > >> if (nbytes == aiocb->aio_nbytes) { > >> +#if defined(CONFIG_BLKZONED) > >> + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > >> + BlockZoneWps *wps = aiocb->io.wps; > > > > Why adding this pointer to the aiocb struct ? You can get the array from > > aiocb->bs->bl.wps, no ? > > > >> + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; > >> + if (wps) { > >> + if (BDRV_ZT_IS_SWR(wps->wp[index])) { > >> + qemu_mutex_lock(&wps->lock); > >> + wps->wp[index] += aiocb->aio_nbytes; > >> + qemu_mutex_unlock(&wps->lock); > >> + } > >> + > >> + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > >> + *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS; > > > > Bug: the append sector must be the first sector written, not the wp > > (sector following the last written sector). So this must be done *before* > > advancing the wp. > > > > You need to have wps->lock held here too since you are reading > > wps->wp[index]. So the mutex lock/unlock needs to be around the 2 hunks > > under "if (wps) {". Also, given that there cannot be any zone append > > issued to conventional zones (they will fail), you can simplify: > > > > if (wps) { > > qemu_mutex_lock(&wps->lock); > > if (BDRV_ZT_IS_SWR(wps->wp[index])) { > > if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > > *aiocb->io.append_sector = > > wps->wp[index] >> BDRV_SECTOR_BITS; > > } > > wps->wp[index] += aiocb->aio_nbytes; > > } > > qemu_mutex_unlock(&wps->lock); > > } > > > > Now the last problem with this code is sequential write preferred zones. > > For these, the write may actually be overwriting sectors that have already > > been written, meaning that the wp may not necessarilly need to be > > advanced. You can handle that case together with SWR case simply like this: > > > > if (wps) { > > qemu_mutex_lock(&wps->lock); > > if (BDRV_ZT_IS_SWR(wps->wp[index])) { > > uint64_t wend_offset = > > aiocb->aio_offset + aiocb->aio_nbytes; > > > > if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > > *aiocb->io.append_sector = > > wps->wp[index] >> BDRV_SECTOR_BITS; > > } > > > > /* Advance the wp if needed */ > > if (wend_offset > wps->wp[index]) > > wps->wp[index] = wend_offset; > > } > > qemu_mutex_unlock(&wps->lock); > > } > > > > Note that you should not increment the wp if the zone is already full. But > for such case, since write and zone append commands will fail, you can > fail them immediately. > > >> + } > >> + } > >> + } > >> +#endif > >> return 0; > >> } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) { > >> if (aiocb->aio_type & QEMU_AIO_WRITE) { > >> @@ -1737,6 +1823,19 @@ out: > >> } > >> } else { > >> assert(nbytes < 0); > >> +#if defined(CONFIG_BLKZONED) > >> + if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) { > > > > Why do this only for zone append ? Regular write error also need a refresh > > of the zone wp. > > > >> + qemu_mutex_lock(&aiocb->bs->bl.wps->lock); > >> + if (report_zone_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps, > >> + aiocb->bs->bl.nr_zones) < 0) { > >> + error_report("report zone wp failed"); > >> + qemu_mutex_destroy(&aiocb->bs->bl.wps->lock); > >> + g_free(aiocb->bs->bl.wps); > >> + return -EINVAL; > >> + } > >> + qemu_mutex_unlock(&aiocb->bs->bl.wps->lock); > > > > This really needs to be a helper function, e.g. update_zone_wp() or > > something like this. Aslo, why nuke the entire zone array if the refresh > > for this zone fails ? You could simply leave it as is. The next write may > > again fail and another attempt at refreshing it done. > > > >> + } > >> +#endif > >> return nbytes; > >> } > >> } > >> @@ -2027,12 +2126,16 @@ static int handle_aiocb_zone_report(void *opaque) { > >> static int handle_aiocb_zone_mgmt(void *opaque) { > >> #if defined(CONFIG_BLKZONED) > >> RawPosixAIOData *aiocb = opaque; > >> + BlockDriverState *bs = aiocb->bs; > >> int fd = aiocb->aio_fildes; > >> int64_t sector = aiocb->aio_offset / 512; > >> int64_t nr_sectors = aiocb->aio_nbytes / 512; > >> struct blk_zone_range range; > >> int ret; > >> > >> + BlockZoneWps *wps = bs->bl.wps; > >> + int index = aiocb->aio_offset / bs->bl.zone_size; > >> + > >> /* Execute the operation */ > >> range.sector = sector; > >> range.nr_sectors = nr_sectors; > >> @@ -2045,6 +2148,22 @@ static int handle_aiocb_zone_mgmt(void *opaque) { > >> errno); > >> return -errno; > >> } > >> + > >> + if (aiocb->zone_mgmt.all) { > > This case should be integrated into the > if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) The purpose of this all flag is to pass down the RESET_ALL request to the file-posix so that it can reset t write pointers of all zones. > > >> + for (int i = 0; i < bs->bl.nr_zones; ++i) { > >> + qemu_mutex_lock(&wps->lock); > >> + wps->wp[i] = i * bs->bl.zone_size; > > You need to test the zone type bit and only change SWR or SWP zones. > > >> + qemu_mutex_unlock(&wps->lock); > >> + } > >> + } else if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) { > >> + qemu_mutex_lock(&wps->lock); > >> + wps->wp[index] = aiocb->aio_offset; > >> + qemu_mutex_unlock(&wps->lock); > >> + } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) { > >> + qemu_mutex_lock(&wps->lock); > >> + wps->wp[index] = aiocb->aio_offset + bs->bl.zone_size; > > > > This may be the last zone of the device, which may be smaller. So you need > > to check that here. Same for the first case for reset all: you need to > > handle the smaller last zone if there is one. > > You should fail any zone management commnand targetting a conventional > zone. You are not checking that. > > Also, you can avoid issuing commands by testing if a zone is already full > (no need to issue finish command) or empty (no need to do a reset). Maybe the type check can be done before issuing the request and the device can return VIRTIO_BLK_S_ZONE_INVALID_CMD to finish it. The simplest way to do the state check I think is to compare the wp to the start/end sector which can tell us if the zone is empty or full. Therefore it should be done in the raw_co_zone_mgmt(). Thanks for reviewing! I can send the revision very soon. > > > > >> + qemu_mutex_unlock(&wps->lock); > >> + } > > > > Instead of the lock/unlock for each case here, take the mutex lock before > > the if () and unlock it after it. Less lines :) > > > > Also, if the zone management command fails, you need to do a report zones > > and refresh the wps array. > > > >> return ret; > >> #else > >> return -ENOTSUP; > >> @@ -2355,6 +2474,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, > >> }, > >> }; > >> > >> + BlockZoneWps *wps = bs->bl.wps; > >> + acb.io.wps = wps; > > > > You do not need the pws variable. Simply do: > > > > acb.io.wps = bs->bl.wps; > > > >> assert(qiov->size == bytes); > >> return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); > >> } > >> @@ -2465,6 +2586,12 @@ static void raw_close(BlockDriverState *bs) > >> BDRVRawState *s = bs->opaque; > >> > >> if (s->fd >= 0) { > >> +#if defined(CONFIG_BLKZONED) > >> + if (bs->bl.wps) { > >> + qemu_mutex_destroy(&bs->bl.wps->lock); > >> + g_free(bs->bl.wps); > >> + } > >> +#endif > >> qemu_close(s->fd); > >> s->fd = -1; > >> } > >> @@ -3299,6 +3426,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > >> zone_op_name = "BLKRESETZONE"; > >> zone_op = BLKRESETZONE; > >> break; > >> + case BLK_ZO_RESET_ALL: > >> + zone_op_name = "BLKRESETZONE"; > >> + zone_op = BLKRESETZONE; > >> + is_all = true; > >> + break; > > > > This change seems unrelated to the wp tracking. Different patch ? > > > >> default: > >> g_assert_not_reached(); > >> } > >> diff --git a/include/block/block-common.h b/include/block/block-common.h > >> index 8efb6b0c43..43bfc484eb 100644 > >> --- a/include/block/block-common.h > >> +++ b/include/block/block-common.h > >> @@ -58,6 +58,7 @@ typedef enum BlockZoneOp { > >> BLK_ZO_CLOSE, > >> BLK_ZO_FINISH, > >> BLK_ZO_RESET, > >> + BLK_ZO_RESET_ALL, > > > > same here. Adding reset all support should be a different patch. > >> } BlockZoneOp; > >> > >> typedef enum BlockZoneModel { > >> @@ -96,6 +97,14 @@ typedef struct BlockZoneDescriptor { > >> BlockZoneCondition cond; > >> } BlockZoneDescriptor; > >> > >> +/* > >> + * Track write pointers of a zone in bytes. > >> + */ > >> +typedef struct BlockZoneWps { > >> + QemuMutex lock; > >> + uint64_t wp[]; > >> +} BlockZoneWps; > >> + > >> typedef struct BlockDriverInfo { > >> /* in bytes, 0 if irrelevant */ > >> int cluster_size; > >> @@ -209,6 +218,13 @@ typedef enum { > >> #define BDRV_SECTOR_BITS 9 > >> #define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS) > >> > >> +/* > >> + * Get the first most significant bit of WP. If it is zero, then > >> + * the zone type is SWR. > >> + */ > >> +#define BDRV_ZT_IS_SWR(WP) ((WP & 0x8000000000000000) == 0) ? (true) : \ > >> + (false) > > > > Simplify: > > > > #define BDRV_ZT_IS_SWR(wp) (!((wp) & (1ULL << 63)) > > > > But since this must be used for both SWR and SWP zones, I would reverse > > this into: > > > > #define BDRV_ZONE_IS_CONV(wp) ((wp) & (1ULL << 63)) > > > > Which is a lot simpler. > > > >> + > >> #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \ > >> INT_MAX >> BDRV_SECTOR_BITS) > >> #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) > >> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h > >> index 37dddc603c..59c2d1316d 100644 > >> --- a/include/block/block_int-common.h > >> +++ b/include/block/block_int-common.h > >> @@ -857,6 +857,11 @@ typedef struct BlockLimits { > >> > >> /* device capacity expressed in bytes */ > >> int64_t capacity; > >> + > >> + /* array of write pointers' location of each zone in the zoned device. */ > >> + BlockZoneWps *wps; > >> + > >> + int64_t write_granularity; > >> } BlockLimits; > >> > >> typedef struct BdrvOpBlocker BdrvOpBlocker; > >> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > >> index 3d26929cdd..f13cc1887b 100644 > >> --- a/include/block/raw-aio.h > >> +++ b/include/block/raw-aio.h > >> @@ -31,6 +31,7 @@ > >> #define QEMU_AIO_TRUNCATE 0x0080 > >> #define QEMU_AIO_ZONE_REPORT 0x0100 > >> #define QEMU_AIO_ZONE_MGMT 0x0200 > >> +#define QEMU_AIO_ZONE_APPEND 0x0400 > >> #define QEMU_AIO_TYPE_MASK \ > >> (QEMU_AIO_READ | \ > >> QEMU_AIO_WRITE | \ > >> @@ -41,7 +42,8 @@ > >> QEMU_AIO_COPY_RANGE | \ > >> QEMU_AIO_TRUNCATE | \ > >> QEMU_AIO_ZONE_REPORT | \ > >> - QEMU_AIO_ZONE_MGMT) > >> + QEMU_AIO_ZONE_MGMT | \ > >> + QEMU_AIO_ZONE_APPEND) > > > > This should be introduced in patch 2. This patch should be only about zone > > wp tracking with regular writes and zone management ops. The second patch > > can implement zone append emulation thanks to this patch. So separate. > > > >> > >> /* AIO flags */ > >> #define QEMU_AIO_MISALIGNED 0x1000 > > > > -- > Damien Le Moal > Western Digital Research >
On 10/5/22 17:30, Sam Li wrote: > Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月5日周三 15:34写道: >> >> On 10/5/22 10:44, Damien Le Moal wrote: >>> On 9/29/22 18:31, Sam Li wrote: >>>> Since Linux doesn't have a user API to issue zone append operations to >>>> zoned devices from user space, the file-posix driver is modified to add >>>> zone append emulation using regular writes. To do this, the file-posix >>>> driver tracks the wp location of all zones of the device. It uses an >>>> array of uint64_t. The most significant bit of each wp location indicates >>>> if the zone type is sequential write required. >>>> >>>> The zones wp can be changed due to the following operations issued: >>>> - zone reset: change the wp to the start offset of that zone >>>> - zone finish: change to the end location of that zone >>>> - write to a zone >>>> - zone append >>>> >>>> Signed-off-by: Sam Li <faithilikerun@gmail.com> >> >> Replying to myself to add some comments that I forgot. >> >>>> --- >>>> block/file-posix.c | 138 ++++++++++++++++++++++++++++++- >>>> include/block/block-common.h | 16 ++++ >>>> include/block/block_int-common.h | 5 ++ >>>> include/block/raw-aio.h | 4 +- >>>> 4 files changed, 159 insertions(+), 4 deletions(-) >>>> >>>> diff --git a/block/file-posix.c b/block/file-posix.c >>>> index 73656d87f2..33e81ac112 100755 >>>> --- a/block/file-posix.c >>>> +++ b/block/file-posix.c >>>> @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData { >>>> struct { >>>> struct iovec *iov; >>>> int niov; >>>> + int64_t *append_sector; >>>> + BlockZoneWps *wps; >>>> } io; >>>> struct { >>>> uint64_t cmd; >>>> @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat *st) { >>>> #endif >>>> } >>>> >>>> +#if defined(CONFIG_BLKZONED) >>>> +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps, >>>> + unsigned int nrz) { >>> >>> Maybe rename this to get_zones_wp() ? >>> >>>> + struct blk_zone *blkz; >>>> + int64_t rep_size; >>>> + int64_t sector = offset >> BDRV_SECTOR_BITS; >>>> + int ret, n = 0, i = 0; >>>> + >>>> + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); >>>> + g_autofree struct blk_zone_report *rep = NULL; >>> >>> To be cleaner, move this declaration above with the others ? >>> >>>> + rep = g_malloc(rep_size); >>>> + >>>> + blkz = (struct blk_zone *)(rep + 1); >>>> + while (n < nrz) { >>>> + memset(rep, 0, rep_size); >>>> + rep->sector = sector; >>>> + rep->nr_zones = nrz - n; >>>> + >>>> + do { >>>> + ret = ioctl(fd, BLKREPORTZONE, rep); >>>> + } while (ret != 0 && errno == EINTR); >>>> + if (ret != 0) { >>>> + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", >>>> + fd, offset, errno); >>>> + return -errno; >>>> + } >>>> + >>>> + if (!rep->nr_zones) { >>>> + break; >>>> + } >>>> + >>>> + for (i = 0; i < rep->nr_zones; i++, n++) { >>>> + wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; >>>> + sector = blkz[i].start + blkz[i].len; >>>> + >>>> + /* >>>> + * In the wp tracking, it only cares if the zone type is sequential >>>> + * writes required so that the wp can advance to the right location. >>> >>> Or sequential write preferred (host aware case) >>> >>>> + * Instead of the type of zone_type which is an 8-bit unsigned >>>> + * integer, use the first most significant bits of the wp location >>>> + * to indicate the zone type: 0 for SWR zones and 1 for the >>>> + * others. >>>> + */ >>>> + if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) { >>> >>> This should be: >>> >>> if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) { >>> >>> Note that the type field is not a bit-field. So you must compare values >>> instead of doing bit operations. >>> >>>> + wps->wp[i] += (uint64_t)1 << 63; >>> >>> You can simplify this: >>> >>> wps->wp[i] |= 1ULL << 63; >>> >>> Overall, I would rewrite this like this: >>> >>> for (i = 0; i < rep->nr_zones; i++, n++) { >>> /* >>> * The wp tracking cares only about sequential write required >>> * and sequential write preferred zones so that the wp can >>> * advance to the right location. >>> * Use the most significant bit of the wp location >>> * to indicate the zone type: 0 for SWR zones and 1 for >>> * conventional zones. >>> */ >>> if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) { >>> wps->wp[i] = 1ULL << 63; >>> else >>> wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; >>> } >>> sector = blkz[i - 1].start + blkz[i - 1].len; >>> >>> Which I think is a lot simpler. >>> >>>> + } >>>> + } >>>> + } >>>> + >>>> + return 0; >>>> +} >>>> +#endif >>>> + >>>> static void raw_refresh_limits(BlockDriverState *bs, Error **errp) >>>> { >>>> BDRVRawState *s = bs->opaque; >>>> @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) >>>> error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity); >>>> return; >>>> } >>>> + >>>> + ret = get_sysfs_long_val(&st, "physical_block_size"); >>>> + if (ret >= 0) { >>>> + bs->bl.write_granularity = ret; >>>> + } >>> >>> This change seems unrelated to the wp tracking. Should this be moved to a >>> different patch ? >>> >>>> + >>>> + bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); >>>> + qemu_mutex_init(&bs->bl.wps->lock); >>> >>> Move this initialization after the if block. Doing so, you do not need to >>> call mutex destroy in case of error. >>> >>>> + if (report_zone_wp(0, s->fd, bs->bl.wps, ret) < 0 ) { >>>> + error_report("report wps failed"); >>>> + qemu_mutex_destroy(&bs->bl.wps->lock); >>>> + g_free(bs->bl.wps); >>>> + return; >>>> + } >>>> } >>>> } >>>> >>>> @@ -1582,7 +1651,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) >>>> ssize_t len; >>>> >>>> do { >>>> - if (aiocb->aio_type & QEMU_AIO_WRITE) >>>> + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) >>>> len = qemu_pwritev(aiocb->aio_fildes, >>>> aiocb->io.iov, >>>> aiocb->io.niov, >>>> @@ -1612,7 +1681,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) >>>> ssize_t len; >>>> >>>> while (offset < aiocb->aio_nbytes) { >>>> - if (aiocb->aio_type & QEMU_AIO_WRITE) { >>>> + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { >>>> len = pwrite(aiocb->aio_fildes, >>>> (const char *)buf + offset, >>>> aiocb->aio_nbytes - offset, >>>> @@ -1705,7 +1774,7 @@ static int handle_aiocb_rw(void *opaque) >>>> } >>>> >>>> nbytes = handle_aiocb_rw_linear(aiocb, buf); >>>> - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { >>>> + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { >>>> char *p = buf; >>>> size_t count = aiocb->aio_nbytes, copy; >>>> int i; >>>> @@ -1726,6 +1795,23 @@ static int handle_aiocb_rw(void *opaque) >>>> >>>> out: >>>> if (nbytes == aiocb->aio_nbytes) { >>>> +#if defined(CONFIG_BLKZONED) >>>> + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { >>>> + BlockZoneWps *wps = aiocb->io.wps; >>> >>> Why adding this pointer to the aiocb struct ? You can get the array from >>> aiocb->bs->bl.wps, no ? >>> >>>> + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; >>>> + if (wps) { >>>> + if (BDRV_ZT_IS_SWR(wps->wp[index])) { >>>> + qemu_mutex_lock(&wps->lock); >>>> + wps->wp[index] += aiocb->aio_nbytes; >>>> + qemu_mutex_unlock(&wps->lock); >>>> + } >>>> + >>>> + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { >>>> + *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS; >>> >>> Bug: the append sector must be the first sector written, not the wp >>> (sector following the last written sector). So this must be done *before* >>> advancing the wp. >>> >>> You need to have wps->lock held here too since you are reading >>> wps->wp[index]. So the mutex lock/unlock needs to be around the 2 hunks >>> under "if (wps) {". Also, given that there cannot be any zone append >>> issued to conventional zones (they will fail), you can simplify: >>> >>> if (wps) { >>> qemu_mutex_lock(&wps->lock); >>> if (BDRV_ZT_IS_SWR(wps->wp[index])) { >>> if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { >>> *aiocb->io.append_sector = >>> wps->wp[index] >> BDRV_SECTOR_BITS; >>> } >>> wps->wp[index] += aiocb->aio_nbytes; >>> } >>> qemu_mutex_unlock(&wps->lock); >>> } >>> >>> Now the last problem with this code is sequential write preferred zones. >>> For these, the write may actually be overwriting sectors that have already >>> been written, meaning that the wp may not necessarilly need to be >>> advanced. You can handle that case together with SWR case simply like this: >>> >>> if (wps) { >>> qemu_mutex_lock(&wps->lock); >>> if (BDRV_ZT_IS_SWR(wps->wp[index])) { >>> uint64_t wend_offset = >>> aiocb->aio_offset + aiocb->aio_nbytes; >>> >>> if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { >>> *aiocb->io.append_sector = >>> wps->wp[index] >> BDRV_SECTOR_BITS; >>> } >>> >>> /* Advance the wp if needed */ >>> if (wend_offset > wps->wp[index]) >>> wps->wp[index] = wend_offset; >>> } >>> qemu_mutex_unlock(&wps->lock); >>> } >>> >> >> Note that you should not increment the wp if the zone is already full. But >> for such case, since write and zone append commands will fail, you can >> fail them immediately. >> >>>> + } >>>> + } >>>> + } >>>> +#endif >>>> return 0; >>>> } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) { >>>> if (aiocb->aio_type & QEMU_AIO_WRITE) { >>>> @@ -1737,6 +1823,19 @@ out: >>>> } >>>> } else { >>>> assert(nbytes < 0); >>>> +#if defined(CONFIG_BLKZONED) >>>> + if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) { >>> >>> Why do this only for zone append ? Regular write error also need a refresh >>> of the zone wp. >>> >>>> + qemu_mutex_lock(&aiocb->bs->bl.wps->lock); >>>> + if (report_zone_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps, >>>> + aiocb->bs->bl.nr_zones) < 0) { >>>> + error_report("report zone wp failed"); >>>> + qemu_mutex_destroy(&aiocb->bs->bl.wps->lock); >>>> + g_free(aiocb->bs->bl.wps); >>>> + return -EINVAL; >>>> + } >>>> + qemu_mutex_unlock(&aiocb->bs->bl.wps->lock); >>> >>> This really needs to be a helper function, e.g. update_zone_wp() or >>> something like this. Aslo, why nuke the entire zone array if the refresh >>> for this zone fails ? You could simply leave it as is. The next write may >>> again fail and another attempt at refreshing it done. >>> >>>> + } >>>> +#endif >>>> return nbytes; >>>> } >>>> } >>>> @@ -2027,12 +2126,16 @@ static int handle_aiocb_zone_report(void *opaque) { >>>> static int handle_aiocb_zone_mgmt(void *opaque) { >>>> #if defined(CONFIG_BLKZONED) >>>> RawPosixAIOData *aiocb = opaque; >>>> + BlockDriverState *bs = aiocb->bs; >>>> int fd = aiocb->aio_fildes; >>>> int64_t sector = aiocb->aio_offset / 512; >>>> int64_t nr_sectors = aiocb->aio_nbytes / 512; >>>> struct blk_zone_range range; >>>> int ret; >>>> >>>> + BlockZoneWps *wps = bs->bl.wps; >>>> + int index = aiocb->aio_offset / bs->bl.zone_size; >>>> + >>>> /* Execute the operation */ >>>> range.sector = sector; >>>> range.nr_sectors = nr_sectors; >>>> @@ -2045,6 +2148,22 @@ static int handle_aiocb_zone_mgmt(void *opaque) { >>>> errno); >>>> return -errno; >>>> } >>>> + >>>> + if (aiocb->zone_mgmt.all) { >> >> This case should be integrated into the >> if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) > > The purpose of this all flag is to pass down the RESET_ALL request to > the file-posix so that it can reset t write pointers of all zones. I understand. My point was that you check this flag without checking that the op is BLKRESETZONE. You need to check that to ensure that the caller is issuing a valid request. And since you need to check that the operation is reset and you already have that check below, merging the on-zone-reset and all-zones-reset cases under the same "if (aiocb->zone_mgmt.zone_op == BLKRESETZONE)" condition will make things clear. > >> >>>> + for (int i = 0; i < bs->bl.nr_zones; ++i) { >>>> + qemu_mutex_lock(&wps->lock); >>>> + wps->wp[i] = i * bs->bl.zone_size; >> >> You need to test the zone type bit and only change SWR or SWP zones. >> >>>> + qemu_mutex_unlock(&wps->lock); >>>> + } >>>> + } else if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) { >>>> + qemu_mutex_lock(&wps->lock); >>>> + wps->wp[index] = aiocb->aio_offset; >>>> + qemu_mutex_unlock(&wps->lock); >>>> + } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) { >>>> + qemu_mutex_lock(&wps->lock); >>>> + wps->wp[index] = aiocb->aio_offset + bs->bl.zone_size; >>> >>> This may be the last zone of the device, which may be smaller. So you need >>> to check that here. Same for the first case for reset all: you need to >>> handle the smaller last zone if there is one. >> >> You should fail any zone management commnand targetting a conventional >> zone. You are not checking that. >> >> Also, you can avoid issuing commands by testing if a zone is already full >> (no need to issue finish command) or empty (no need to do a reset). > > Maybe the type check can be done before issuing the request and the > device can return VIRTIO_BLK_S_ZONE_INVALID_CMD to finish it. > > The simplest way to do the state check I think is to compare the wp to > the start/end sector which can tell us if the zone is empty or full. > Therefore it should be done in the raw_co_zone_mgmt(). Nope. You cannot do that there since you can only read the wp array holding the lock. And you must not release the lock until you issue the command to the drive. So you should check the zone condition here. > > Thanks for reviewing! I can send the revision very soon. > >> >>> >>>> + qemu_mutex_unlock(&wps->lock); >>>> + } >>> >>> Instead of the lock/unlock for each case here, take the mutex lock before >>> the if () and unlock it after it. Less lines :) >>> >>> Also, if the zone management command fails, you need to do a report zones >>> and refresh the wps array. >>> >>>> return ret; >>>> #else >>>> return -ENOTSUP; >>>> @@ -2355,6 +2474,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, >>>> }, >>>> }; >>>> >>>> + BlockZoneWps *wps = bs->bl.wps; >>>> + acb.io.wps = wps; >>> >>> You do not need the pws variable. Simply do: >>> >>> acb.io.wps = bs->bl.wps; >>> >>>> assert(qiov->size == bytes); >>>> return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); >>>> } >>>> @@ -2465,6 +2586,12 @@ static void raw_close(BlockDriverState *bs) >>>> BDRVRawState *s = bs->opaque; >>>> >>>> if (s->fd >= 0) { >>>> +#if defined(CONFIG_BLKZONED) >>>> + if (bs->bl.wps) { >>>> + qemu_mutex_destroy(&bs->bl.wps->lock); >>>> + g_free(bs->bl.wps); >>>> + } >>>> +#endif >>>> qemu_close(s->fd); >>>> s->fd = -1; >>>> } >>>> @@ -3299,6 +3426,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, >>>> zone_op_name = "BLKRESETZONE"; >>>> zone_op = BLKRESETZONE; >>>> break; >>>> + case BLK_ZO_RESET_ALL: >>>> + zone_op_name = "BLKRESETZONE"; >>>> + zone_op = BLKRESETZONE; >>>> + is_all = true; >>>> + break; >>> >>> This change seems unrelated to the wp tracking. Different patch ? >>> >>>> default: >>>> g_assert_not_reached(); >>>> } >>>> diff --git a/include/block/block-common.h b/include/block/block-common.h >>>> index 8efb6b0c43..43bfc484eb 100644 >>>> --- a/include/block/block-common.h >>>> +++ b/include/block/block-common.h >>>> @@ -58,6 +58,7 @@ typedef enum BlockZoneOp { >>>> BLK_ZO_CLOSE, >>>> BLK_ZO_FINISH, >>>> BLK_ZO_RESET, >>>> + BLK_ZO_RESET_ALL, >>> >>> same here. Adding reset all support should be a different patch. >>>> } BlockZoneOp; >>>> >>>> typedef enum BlockZoneModel { >>>> @@ -96,6 +97,14 @@ typedef struct BlockZoneDescriptor { >>>> BlockZoneCondition cond; >>>> } BlockZoneDescriptor; >>>> >>>> +/* >>>> + * Track write pointers of a zone in bytes. >>>> + */ >>>> +typedef struct BlockZoneWps { >>>> + QemuMutex lock; >>>> + uint64_t wp[]; >>>> +} BlockZoneWps; >>>> + >>>> typedef struct BlockDriverInfo { >>>> /* in bytes, 0 if irrelevant */ >>>> int cluster_size; >>>> @@ -209,6 +218,13 @@ typedef enum { >>>> #define BDRV_SECTOR_BITS 9 >>>> #define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS) >>>> >>>> +/* >>>> + * Get the first most significant bit of WP. If it is zero, then >>>> + * the zone type is SWR. >>>> + */ >>>> +#define BDRV_ZT_IS_SWR(WP) ((WP & 0x8000000000000000) == 0) ? (true) : \ >>>> + (false) >>> >>> Simplify: >>> >>> #define BDRV_ZT_IS_SWR(wp) (!((wp) & (1ULL << 63)) >>> >>> But since this must be used for both SWR and SWP zones, I would reverse >>> this into: >>> >>> #define BDRV_ZONE_IS_CONV(wp) ((wp) & (1ULL << 63)) >>> >>> Which is a lot simpler. >>> >>>> + >>>> #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \ >>>> INT_MAX >> BDRV_SECTOR_BITS) >>>> #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) >>>> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h >>>> index 37dddc603c..59c2d1316d 100644 >>>> --- a/include/block/block_int-common.h >>>> +++ b/include/block/block_int-common.h >>>> @@ -857,6 +857,11 @@ typedef struct BlockLimits { >>>> >>>> /* device capacity expressed in bytes */ >>>> int64_t capacity; >>>> + >>>> + /* array of write pointers' location of each zone in the zoned device. */ >>>> + BlockZoneWps *wps; >>>> + >>>> + int64_t write_granularity; >>>> } BlockLimits; >>>> >>>> typedef struct BdrvOpBlocker BdrvOpBlocker; >>>> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h >>>> index 3d26929cdd..f13cc1887b 100644 >>>> --- a/include/block/raw-aio.h >>>> +++ b/include/block/raw-aio.h >>>> @@ -31,6 +31,7 @@ >>>> #define QEMU_AIO_TRUNCATE 0x0080 >>>> #define QEMU_AIO_ZONE_REPORT 0x0100 >>>> #define QEMU_AIO_ZONE_MGMT 0x0200 >>>> +#define QEMU_AIO_ZONE_APPEND 0x0400 >>>> #define QEMU_AIO_TYPE_MASK \ >>>> (QEMU_AIO_READ | \ >>>> QEMU_AIO_WRITE | \ >>>> @@ -41,7 +42,8 @@ >>>> QEMU_AIO_COPY_RANGE | \ >>>> QEMU_AIO_TRUNCATE | \ >>>> QEMU_AIO_ZONE_REPORT | \ >>>> - QEMU_AIO_ZONE_MGMT) >>>> + QEMU_AIO_ZONE_MGMT | \ >>>> + QEMU_AIO_ZONE_APPEND) >>> >>> This should be introduced in patch 2. This patch should be only about zone >>> wp tracking with regular writes and zone management ops. The second patch >>> can implement zone append emulation thanks to this patch. So separate. >>> >>>> >>>> /* AIO flags */ >>>> #define QEMU_AIO_MISALIGNED 0x1000 >>> >> >> -- >> Damien Le Moal >> Western Digital Research >>
diff --git a/block/file-posix.c b/block/file-posix.c index 73656d87f2..33e81ac112 100755 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData { struct { struct iovec *iov; int niov; + int64_t *append_sector; + BlockZoneWps *wps; } io; struct { uint64_t cmd; @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat *st) { #endif } +#if defined(CONFIG_BLKZONED) +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps, + unsigned int nrz) { + struct blk_zone *blkz; + int64_t rep_size; + int64_t sector = offset >> BDRV_SECTOR_BITS; + int ret, n = 0, i = 0; + + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); + g_autofree struct blk_zone_report *rep = NULL; + rep = g_malloc(rep_size); + + blkz = (struct blk_zone *)(rep + 1); + while (n < nrz) { + memset(rep, 0, rep_size); + rep->sector = sector; + rep->nr_zones = nrz - n; + + do { + ret = ioctl(fd, BLKREPORTZONE, rep); + } while (ret != 0 && errno == EINTR); + if (ret != 0) { + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", + fd, offset, errno); + return -errno; + } + + if (!rep->nr_zones) { + break; + } + + for (i = 0; i < rep->nr_zones; i++, n++) { + wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; + sector = blkz[i].start + blkz[i].len; + + /* + * In the wp tracking, it only cares if the zone type is sequential + * writes required so that the wp can advance to the right location. + * Instead of the type of zone_type which is an 8-bit unsigned + * integer, use the first most significant bits of the wp location + * to indicate the zone type: 0 for SWR zones and 1 for the + * others. + */ + if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) { + wps->wp[i] += (uint64_t)1 << 63; + } + } + } + + return 0; +} +#endif + static void raw_refresh_limits(BlockDriverState *bs, Error **errp) { BDRVRawState *s = bs->opaque; @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity); return; } + + ret = get_sysfs_long_val(&st, "physical_block_size"); + if (ret >= 0) { + bs->bl.write_granularity = ret; + } + + bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); + qemu_mutex_init(&bs->bl.wps->lock); + if (report_zone_wp(0, s->fd, bs->bl.wps, ret) < 0 ) { + error_report("report wps failed"); + qemu_mutex_destroy(&bs->bl.wps->lock); + g_free(bs->bl.wps); + return; + } } } @@ -1582,7 +1651,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) ssize_t len; do { - if (aiocb->aio_type & QEMU_AIO_WRITE) + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) len = qemu_pwritev(aiocb->aio_fildes, aiocb->io.iov, aiocb->io.niov, @@ -1612,7 +1681,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) ssize_t len; while (offset < aiocb->aio_nbytes) { - if (aiocb->aio_type & QEMU_AIO_WRITE) { + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { len = pwrite(aiocb->aio_fildes, (const char *)buf + offset, aiocb->aio_nbytes - offset, @@ -1705,7 +1774,7 @@ static int handle_aiocb_rw(void *opaque) } nbytes = handle_aiocb_rw_linear(aiocb, buf); - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { char *p = buf; size_t count = aiocb->aio_nbytes, copy; int i; @@ -1726,6 +1795,23 @@ static int handle_aiocb_rw(void *opaque) out: if (nbytes == aiocb->aio_nbytes) { +#if defined(CONFIG_BLKZONED) + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { + BlockZoneWps *wps = aiocb->io.wps; + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; + if (wps) { + if (BDRV_ZT_IS_SWR(wps->wp[index])) { + qemu_mutex_lock(&wps->lock); + wps->wp[index] += aiocb->aio_nbytes; + qemu_mutex_unlock(&wps->lock); + } + + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { + *aiocb->io.append_sector = wps->wp[index] >> BDRV_SECTOR_BITS; + } + } + } +#endif return 0; } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) { if (aiocb->aio_type & QEMU_AIO_WRITE) { @@ -1737,6 +1823,19 @@ out: } } else { assert(nbytes < 0); +#if defined(CONFIG_BLKZONED) + if (aiocb->aio_type == QEMU_AIO_ZONE_APPEND) { + qemu_mutex_lock(&aiocb->bs->bl.wps->lock); + if (report_zone_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps, + aiocb->bs->bl.nr_zones) < 0) { + error_report("report zone wp failed"); + qemu_mutex_destroy(&aiocb->bs->bl.wps->lock); + g_free(aiocb->bs->bl.wps); + return -EINVAL; + } + qemu_mutex_unlock(&aiocb->bs->bl.wps->lock); + } +#endif return nbytes; } } @@ -2027,12 +2126,16 @@ static int handle_aiocb_zone_report(void *opaque) { static int handle_aiocb_zone_mgmt(void *opaque) { #if defined(CONFIG_BLKZONED) RawPosixAIOData *aiocb = opaque; + BlockDriverState *bs = aiocb->bs; int fd = aiocb->aio_fildes; int64_t sector = aiocb->aio_offset / 512; int64_t nr_sectors = aiocb->aio_nbytes / 512; struct blk_zone_range range; int ret; + BlockZoneWps *wps = bs->bl.wps; + int index = aiocb->aio_offset / bs->bl.zone_size; + /* Execute the operation */ range.sector = sector; range.nr_sectors = nr_sectors; @@ -2045,6 +2148,22 @@ static int handle_aiocb_zone_mgmt(void *opaque) { errno); return -errno; } + + if (aiocb->zone_mgmt.all) { + for (int i = 0; i < bs->bl.nr_zones; ++i) { + qemu_mutex_lock(&wps->lock); + wps->wp[i] = i * bs->bl.zone_size; + qemu_mutex_unlock(&wps->lock); + } + } else if (aiocb->zone_mgmt.zone_op == BLKRESETZONE) { + qemu_mutex_lock(&wps->lock); + wps->wp[index] = aiocb->aio_offset; + qemu_mutex_unlock(&wps->lock); + } else if (aiocb->zone_mgmt.zone_op == BLKFINISHZONE) { + qemu_mutex_lock(&wps->lock); + wps->wp[index] = aiocb->aio_offset + bs->bl.zone_size; + qemu_mutex_unlock(&wps->lock); + } return ret; #else return -ENOTSUP; @@ -2355,6 +2474,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, }, }; + BlockZoneWps *wps = bs->bl.wps; + acb.io.wps = wps; assert(qiov->size == bytes); return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); } @@ -2465,6 +2586,12 @@ static void raw_close(BlockDriverState *bs) BDRVRawState *s = bs->opaque; if (s->fd >= 0) { +#if defined(CONFIG_BLKZONED) + if (bs->bl.wps) { + qemu_mutex_destroy(&bs->bl.wps->lock); + g_free(bs->bl.wps); + } +#endif qemu_close(s->fd); s->fd = -1; } @@ -3299,6 +3426,11 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, zone_op_name = "BLKRESETZONE"; zone_op = BLKRESETZONE; break; + case BLK_ZO_RESET_ALL: + zone_op_name = "BLKRESETZONE"; + zone_op = BLKRESETZONE; + is_all = true; + break; default: g_assert_not_reached(); } diff --git a/include/block/block-common.h b/include/block/block-common.h index 8efb6b0c43..43bfc484eb 100644 --- a/include/block/block-common.h +++ b/include/block/block-common.h @@ -58,6 +58,7 @@ typedef enum BlockZoneOp { BLK_ZO_CLOSE, BLK_ZO_FINISH, BLK_ZO_RESET, + BLK_ZO_RESET_ALL, } BlockZoneOp; typedef enum BlockZoneModel { @@ -96,6 +97,14 @@ typedef struct BlockZoneDescriptor { BlockZoneCondition cond; } BlockZoneDescriptor; +/* + * Track write pointers of a zone in bytes. + */ +typedef struct BlockZoneWps { + QemuMutex lock; + uint64_t wp[]; +} BlockZoneWps; + typedef struct BlockDriverInfo { /* in bytes, 0 if irrelevant */ int cluster_size; @@ -209,6 +218,13 @@ typedef enum { #define BDRV_SECTOR_BITS 9 #define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS) +/* + * Get the first most significant bit of WP. If it is zero, then + * the zone type is SWR. + */ +#define BDRV_ZT_IS_SWR(WP) ((WP & 0x8000000000000000) == 0) ? (true) : \ + (false) + #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \ INT_MAX >> BDRV_SECTOR_BITS) #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 37dddc603c..59c2d1316d 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -857,6 +857,11 @@ typedef struct BlockLimits { /* device capacity expressed in bytes */ int64_t capacity; + + /* array of write pointers' location of each zone in the zoned device. */ + BlockZoneWps *wps; + + int64_t write_granularity; } BlockLimits; typedef struct BdrvOpBlocker BdrvOpBlocker; diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index 3d26929cdd..f13cc1887b 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -31,6 +31,7 @@ #define QEMU_AIO_TRUNCATE 0x0080 #define QEMU_AIO_ZONE_REPORT 0x0100 #define QEMU_AIO_ZONE_MGMT 0x0200 +#define QEMU_AIO_ZONE_APPEND 0x0400 #define QEMU_AIO_TYPE_MASK \ (QEMU_AIO_READ | \ QEMU_AIO_WRITE | \ @@ -41,7 +42,8 @@ QEMU_AIO_COPY_RANGE | \ QEMU_AIO_TRUNCATE | \ QEMU_AIO_ZONE_REPORT | \ - QEMU_AIO_ZONE_MGMT) + QEMU_AIO_ZONE_MGMT | \ + QEMU_AIO_ZONE_APPEND) /* AIO flags */ #define QEMU_AIO_MISALIGNED 0x1000
Since Linux doesn't have a user API to issue zone append operations to zoned devices from user space, the file-posix driver is modified to add zone append emulation using regular writes. To do this, the file-posix driver tracks the wp location of all zones of the device. It uses an array of uint64_t. The most significant bit of each wp location indicates if the zone type is sequential write required. The zones wp can be changed due to the following operations issued: - zone reset: change the wp to the start offset of that zone - zone finish: change to the end location of that zone - write to a zone - zone append Signed-off-by: Sam Li <faithilikerun@gmail.com> --- block/file-posix.c | 138 ++++++++++++++++++++++++++++++- include/block/block-common.h | 16 ++++ include/block/block_int-common.h | 5 ++ include/block/raw-aio.h | 4 +- 4 files changed, 159 insertions(+), 4 deletions(-)