Message ID | 20221016145626.171838-3-faithilikerun@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add zone append write for zoned device | expand |
On Sun, 2022-10-16 at 22:56 +0800, Sam Li wrote: > A zone append command is a write operation that specifies the first > logical block of a zone as the write position. When writing to a zoned > block device using zone append, the byte offset of writes is pointing > to the write pointer of that zone. Upon completion the device will > respond with the position the data has been written in the zone. > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > --- > block/block-backend.c | 65 ++++++++++++++++++++++ > block/file-posix.c | 89 +++++++++++++++++++++++++++++-- > block/io.c | 21 ++++++++ > block/raw-format.c | 8 +++ > include/block/block-io.h | 3 ++ > include/block/block_int-common.h | 5 ++ > include/block/raw-aio.h | 4 +- > include/sysemu/block-backend-io.h | 9 ++++ > 8 files changed, 198 insertions(+), 6 deletions(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index 1c618e9c68..06931ddd24 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { > struct { > unsigned long op; > } zone_mgmt; > + struct { > + int64_t *append_sector; > + } zone_append; > }; > } BlkRwCo; > > @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return &acb->common; > } > > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) > +{ > + BlkAioEmAIOCB *acb = opaque; > + BlkRwCo *rwco = &acb->rwco; > + > + rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector, > + rwco->iobuf, rwco->flags); > + blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags, > + BlockCompletionFunc *cb, void *opaque) { > + BlkAioEmAIOCB *acb; > + Coroutine *co; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > + acb->rwco = (BlkRwCo) { > + .blk = blk, > + .ret = NOT_DONE, > + .flags = flags, > + .iobuf = qiov, > + .zone_append = { > + .append_sector = offset, > + }, > + }; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); > + bdrv_coroutine_enter(blk_bs(blk), co); > + acb->has_returned = true; > + if (acb->rwco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > /* > * Send a zone_report command. > * offset is a byte offset from the start of the device. No alignment > @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return ret; > } > > +/* > + * Send a zone_append command. > + */ > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags) > +{ > + int ret; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + blk_wait_while_drained(blk); > + if (!blk_is_available(blk)) { > + blk_dec_in_flight(blk); > + return -ENOMEDIUM; > + } > + > + ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); > + blk_dec_in_flight(blk); > + return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/file-posix.c b/block/file-posix.c > index 5ff5500301..3d0cc33d02 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData { > struct { > struct iovec *iov; > int niov; > + int64_t *offset; > } io; > struct { > uint64_t cmd; > @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, > Error **errp) > bs->bl.max_active_zones = ret; > } > > + ret = get_sysfs_long_val(&st, "physical_block_size"); > + if (ret >= 0) { > + bs->bl.write_granularity = ret; > + } > + > bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); > if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) { > error_report("report wps failed"); > @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int > nr_iov, off_t offset) > static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) > { > ssize_t len; > + BlockZoneWps *wps = aiocb->bs->bl.wps; > + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; Can this code ever be called for a non-zoned device with 0 zone size? If yes, you need to avoid division by zero here... > + > + if (wps) { > + qemu_mutex_lock(&wps->lock); > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > + aiocb->aio_offset = wps->wp[index]; > + } > + } > > do { > - if (aiocb->aio_type & QEMU_AIO_WRITE) > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) > len = qemu_pwritev(aiocb->aio_fildes, > aiocb->io.iov, > aiocb->io.niov, > @@ -1660,6 +1675,9 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData > *aiocb) > aiocb->io.niov, > aiocb->aio_offset); > } while (len == -1 && errno == EINTR); > + if (wps) { > + qemu_mutex_unlock(&wps->lock); > + } > > if (len == -1) { > return -errno; > @@ -1677,9 +1695,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData > *aiocb, char *buf) > { > ssize_t offset = 0; > ssize_t len; > + BlockZoneWps *wps = aiocb->bs->bl.wps; > + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; same here > > + if (wps) { > + qemu_mutex_lock(&wps->lock); > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > + aiocb->aio_offset = wps->wp[index]; > + } > + } > while (offset < aiocb->aio_nbytes) { > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > len = pwrite(aiocb->aio_fildes, > (const char *)buf + offset, > aiocb->aio_nbytes - offset, > @@ -1709,6 +1735,9 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData > *aiocb, char *buf) > } > offset += len; > } > + if (wps) { > + qemu_mutex_unlock(&wps->lock); > + } > > return offset; > } > @@ -1772,7 +1801,7 @@ static int handle_aiocb_rw(void *opaque) > } > > nbytes = handle_aiocb_rw_linear(aiocb, buf); > - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { > + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { > char *p = buf; > size_t count = aiocb->aio_nbytes, copy; > int i; > @@ -1794,7 +1823,7 @@ static int handle_aiocb_rw(void *opaque) > out: > if (nbytes == aiocb->aio_nbytes) { > #if defined(CONFIG_BLKZONED) > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > BlockZoneWps *wps = aiocb->bs->bl.wps; > int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; and here > if (wps) { > @@ -1802,6 +1831,9 @@ out: > if (!BDRV_ZT_IS_CONV(wps->wp[index])) { > uint64_t wend_offset = > aiocb->aio_offset + aiocb->aio_nbytes; > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > + *aiocb->io.offset = wps->wp[index]; > + } > > /* Advance the wp if needed */ > if (wend_offset > wps->wp[index]) { > @@ -1824,7 +1856,7 @@ out: > } else { > assert(nbytes < 0); > #if defined(CONFIG_BLKZONED) > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1); > } > #endif > @@ -3478,6 +3510,52 @@ static int coroutine_fn > raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > } > #endif > > +#if defined(CONFIG_BLKZONED) > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, > + int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags) { > + BDRVRawState *s = bs->opaque; > + int64_t zone_size_mask = bs->bl.zone_size - 1; > + int64_t iov_len = 0; > + int64_t len = 0; > + RawPosixAIOData acb; > + > + if (*offset & zone_size_mask) { > + error_report("sector offset %" PRId64 " is not aligned to zone size " > + "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512); > + return -EINVAL; > + } > + > + int64_t wg = bs->bl.write_granularity; > + int64_t wg_mask = wg - 1; > + for (int i = 0; i < qiov->niov; i++) { > + iov_len = qiov->iov[i].iov_len; > + if (iov_len & wg_mask) { > + error_report("len of IOVector[%d] %" PRId64 " is not aligned to " > + "block size %" PRId64 "", i, iov_len, wg); > + return -EINVAL; > + } > + len += iov_len; > + } > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > + .aio_type = QEMU_AIO_ZONE_APPEND, > + .aio_offset = *offset, > + .aio_nbytes = len, > + .io = { > + .iov = qiov->iov, > + .niov = qiov->niov, > + .offset = offset, > + }, > + }; > + > + return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); > +} > +#endif > + > static coroutine_fn int > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > bool blkdev) > @@ -4253,6 +4331,7 @@ static BlockDriver bdrv_zoned_host_device = { > /* zone management operations */ > .bdrv_co_zone_report = raw_co_zone_report, > .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > + .bdrv_co_zone_append = raw_co_zone_append, > }; > #endif > > diff --git a/block/io.c b/block/io.c > index 88f707ea4d..03e1109056 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -3230,6 +3230,27 @@ out: > return co.ret; > } > > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + IO_CODE(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_zone_append) { > + co.ret = -ENOTSUP; > + goto out; > + } > + co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > void *qemu_blockalign(BlockDriverState *bs, size_t size) > { > IO_CODE(); > diff --git a/block/raw-format.c b/block/raw-format.c > index 18dc52a150..33bff8516e 100644 > --- a/block/raw-format.c > +++ b/block/raw-format.c > @@ -325,6 +325,13 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState > *bs, BlockZoneOp op, > return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len); > } > > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, > + int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags) { > + return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags); > +} > + > static int64_t raw_getlength(BlockDriverState *bs) > { > int64_t len; > @@ -629,6 +636,7 @@ BlockDriver bdrv_raw = { > .bdrv_co_pdiscard = &raw_co_pdiscard, > .bdrv_co_zone_report = &raw_co_zone_report, > .bdrv_co_zone_mgmt = &raw_co_zone_mgmt, > + .bdrv_co_zone_append = &raw_co_zone_append, > .bdrv_co_block_status = &raw_co_block_status, > .bdrv_co_copy_range_from = &raw_co_copy_range_from, > .bdrv_co_copy_range_to = &raw_co_copy_range_to, > diff --git a/include/block/block-io.h b/include/block/block-io.h > index f0cdf67d33..6a54453578 100644 > --- a/include/block/block-io.h > +++ b/include/block/block-io.h > @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, > int64_t offset, > BlockZoneDescriptor *zones); > int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > int64_t offset, int64_t len); > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags); > > int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > diff --git a/include/block/block_int-common.h b/include/block/block_int- > common.h > index e3136146aa..a7e7db5646 100644 > --- a/include/block/block_int-common.h > +++ b/include/block/block_int-common.h > @@ -701,6 +701,9 @@ struct BlockDriver { > BlockZoneDescriptor *zones); > int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp > op, > int64_t offset, int64_t len); > + int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs, > + int64_t *offset, QEMUIOVector *qiov, > + BdrvRequestFlags flags); > > /* removable device specific */ > bool (*bdrv_is_inserted)(BlockDriverState *bs); > @@ -860,6 +863,8 @@ typedef struct BlockLimits { > > /* array of write pointers' location of each zone in the zoned device. */ > BlockZoneWps *wps; > + > + int64_t write_granularity; > } BlockLimits; > > typedef struct BdrvOpBlocker BdrvOpBlocker; > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > index 877b2240b3..53033a5ca7 100644 > --- a/include/block/raw-aio.h > +++ b/include/block/raw-aio.h > @@ -31,6 +31,7 @@ > #define QEMU_AIO_TRUNCATE 0x0080 > #define QEMU_AIO_ZONE_REPORT 0x0100 > #define QEMU_AIO_ZONE_MGMT 0x0200 > +#define QEMU_AIO_ZONE_APPEND 0x0400 > #define QEMU_AIO_TYPE_MASK \ > (QEMU_AIO_READ | \ > QEMU_AIO_WRITE | \ > @@ -41,7 +42,8 @@ > QEMU_AIO_COPY_RANGE | \ > QEMU_AIO_TRUNCATE | \ > QEMU_AIO_ZONE_REPORT | \ > - QEMU_AIO_ZONE_MGMT) > + QEMU_AIO_ZONE_MGMT | \ > + QEMU_AIO_ZONE_APPEND) > > /* AIO flags */ > #define QEMU_AIO_MISALIGNED 0x1000 > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend- > io.h > index 1b5fc7db6b..ff9f777f52 100644 > --- a/include/sysemu/block-backend-io.h > +++ b/include/sysemu/block-backend-io.h > @@ -52,6 +52,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t > offset, > BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > int64_t offset, int64_t len, > BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags, > + BlockCompletionFunc *cb, void *opaque); > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, > BlockCompletionFunc *cb, void *opaque); > void blk_aio_cancel_async(BlockAIOCB *acb); > @@ -173,6 +176,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > int64_t offset, int64_t len); > int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > int64_t offset, int64_t len); > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags); > +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags); > > int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, > int64_t bytes);
On 10/16/22 23:56, Sam Li wrote: > A zone append command is a write operation that specifies the first > logical block of a zone as the write position. When writing to a zoned > block device using zone append, the byte offset of writes is pointing > to the write pointer of that zone. Upon completion the device will > respond with the position the data has been written in the zone. > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > --- > block/block-backend.c | 65 ++++++++++++++++++++++ > block/file-posix.c | 89 +++++++++++++++++++++++++++++-- > block/io.c | 21 ++++++++ > block/raw-format.c | 8 +++ > include/block/block-io.h | 3 ++ > include/block/block_int-common.h | 5 ++ > include/block/raw-aio.h | 4 +- > include/sysemu/block-backend-io.h | 9 ++++ > 8 files changed, 198 insertions(+), 6 deletions(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index 1c618e9c68..06931ddd24 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { > struct { > unsigned long op; > } zone_mgmt; > + struct { > + int64_t *append_sector; As mentioned previosuly, call this sector. "append" is already in the zone_append struct member name.... > + } zone_append; > }; > } BlkRwCo; > > @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > return &acb->common; > } > > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) > +{ > + BlkAioEmAIOCB *acb = opaque; > + BlkRwCo *rwco = &acb->rwco; > + > + rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector, ...so you avoid awkward repetitions of "append" like here. You'll have: rwco->zone_append.sector, which is shorter and more natural. > + rwco->iobuf, rwco->flags); > + blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags, > + BlockCompletionFunc *cb, void *opaque) { > + BlkAioEmAIOCB *acb; > + Coroutine *co; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > + acb->rwco = (BlkRwCo) { > + .blk = blk, > + .ret = NOT_DONE, > + .flags = flags, > + .iobuf = qiov, > + .zone_append = { > + .append_sector = offset, > + }, > + }; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); > + bdrv_coroutine_enter(blk_bs(blk), co); > + acb->has_returned = true; > + if (acb->rwco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > /* > * Send a zone_report command. > * offset is a byte offset from the start of the device. No alignment > @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > return ret; > } > > +/* > + * Send a zone_append command. > + */ > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags) > +{ > + int ret; > + IO_CODE(); > + > + blk_inc_in_flight(blk); > + blk_wait_while_drained(blk); > + if (!blk_is_available(blk)) { > + blk_dec_in_flight(blk); > + return -ENOMEDIUM; > + } > + > + ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); > + blk_dec_in_flight(blk); > + return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/file-posix.c b/block/file-posix.c > index 5ff5500301..3d0cc33d02 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData { > struct { > struct iovec *iov; > int niov; > + int64_t *offset; > } io; > struct { > uint64_t cmd; > @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) > bs->bl.max_active_zones = ret; > } > > + ret = get_sysfs_long_val(&st, "physical_block_size"); > + if (ret >= 0) { > + bs->bl.write_granularity = ret; > + } > + > bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); > if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) { > error_report("report wps failed"); > @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) > static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) > { > ssize_t len; > + BlockZoneWps *wps = aiocb->bs->bl.wps; > + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; > + > + if (wps) { > + qemu_mutex_lock(&wps->lock); > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > + aiocb->aio_offset = wps->wp[index]; > + } > + } > > do { > - if (aiocb->aio_type & QEMU_AIO_WRITE) > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) > len = qemu_pwritev(aiocb->aio_fildes, > aiocb->io.iov, > aiocb->io.niov, > @@ -1660,6 +1675,9 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) > aiocb->io.niov, > aiocb->aio_offset); > } while (len == -1 && errno == EINTR); > + if (wps) { > + qemu_mutex_unlock(&wps->lock); As commented in the previous patch, you cannot release the lock until you update the wp array entry. So this means that you should be taking the wp lock at the beginning of handle_aiocb_rw() and release it only after the wp array is updated. That will also simplify the code and avoid spreading lock/unlock of that array to many places. > + } > > if (len == -1) { > return -errno; > @@ -1677,9 +1695,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) > { > ssize_t offset = 0; > ssize_t len; > + BlockZoneWps *wps = aiocb->bs->bl.wps; > + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; > > + if (wps) { > + qemu_mutex_lock(&wps->lock); > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > + aiocb->aio_offset = wps->wp[index]; > + } > + } > while (offset < aiocb->aio_nbytes) { > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > len = pwrite(aiocb->aio_fildes, > (const char *)buf + offset, > aiocb->aio_nbytes - offset, > @@ -1709,6 +1735,9 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) > } > offset += len; > } > + if (wps) { > + qemu_mutex_unlock(&wps->lock); Same comment as above. > + } > > return offset; > } > @@ -1772,7 +1801,7 @@ static int handle_aiocb_rw(void *opaque) > } > > nbytes = handle_aiocb_rw_linear(aiocb, buf); > - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { > + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { > char *p = buf; > size_t count = aiocb->aio_nbytes, copy; > int i; > @@ -1794,7 +1823,7 @@ static int handle_aiocb_rw(void *opaque) > out: > if (nbytes == aiocb->aio_nbytes) { > #if defined(CONFIG_BLKZONED) > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > BlockZoneWps *wps = aiocb->bs->bl.wps; > int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; > if (wps) { > @@ -1802,6 +1831,9 @@ out: > if (!BDRV_ZT_IS_CONV(wps->wp[index])) { > uint64_t wend_offset = > aiocb->aio_offset + aiocb->aio_nbytes; > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > + *aiocb->io.offset = wps->wp[index]; > + } The comment above will address the problem here, which is that you are accessing the array without it being locked. You cannot do that. > > /* Advance the wp if needed */ > if (wend_offset > wps->wp[index]) { > @@ -1824,7 +1856,7 @@ out: > } else { > assert(nbytes < 0); > #if defined(CONFIG_BLKZONED) > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1); > } > #endif > @@ -3478,6 +3510,52 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > } > #endif > > +#if defined(CONFIG_BLKZONED) > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, > + int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags) { > + BDRVRawState *s = bs->opaque; > + int64_t zone_size_mask = bs->bl.zone_size - 1; > + int64_t iov_len = 0; > + int64_t len = 0; > + RawPosixAIOData acb; > + > + if (*offset & zone_size_mask) { > + error_report("sector offset %" PRId64 " is not aligned to zone size " > + "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512); > + return -EINVAL; > + } > + > + int64_t wg = bs->bl.write_granularity; > + int64_t wg_mask = wg - 1; > + for (int i = 0; i < qiov->niov; i++) { > + iov_len = qiov->iov[i].iov_len; > + if (iov_len & wg_mask) { > + error_report("len of IOVector[%d] %" PRId64 " is not aligned to " > + "block size %" PRId64 "", i, iov_len, wg); > + return -EINVAL; > + } > + len += iov_len; > + } > + > + acb = (RawPosixAIOData) { > + .bs = bs, > + .aio_fildes = s->fd, > + .aio_type = QEMU_AIO_ZONE_APPEND, > + .aio_offset = *offset, > + .aio_nbytes = len, > + .io = { > + .iov = qiov->iov, > + .niov = qiov->niov, > + .offset = offset, > + }, > + }; > + > + return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); > +} > +#endif > + > static coroutine_fn int > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > bool blkdev) > @@ -4253,6 +4331,7 @@ static BlockDriver bdrv_zoned_host_device = { > /* zone management operations */ > .bdrv_co_zone_report = raw_co_zone_report, > .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > + .bdrv_co_zone_append = raw_co_zone_append, > }; > #endif > > diff --git a/block/io.c b/block/io.c > index 88f707ea4d..03e1109056 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -3230,6 +3230,27 @@ out: > return co.ret; > } > > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + IO_CODE(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_zone_append) { > + co.ret = -ENOTSUP; > + goto out; > + } > + co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > void *qemu_blockalign(BlockDriverState *bs, size_t size) > { > IO_CODE(); > diff --git a/block/raw-format.c b/block/raw-format.c > index 18dc52a150..33bff8516e 100644 > --- a/block/raw-format.c > +++ b/block/raw-format.c > @@ -325,6 +325,13 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len); > } > > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, > + int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags) { > + return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags); > +} > + > static int64_t raw_getlength(BlockDriverState *bs) > { > int64_t len; > @@ -629,6 +636,7 @@ BlockDriver bdrv_raw = { > .bdrv_co_pdiscard = &raw_co_pdiscard, > .bdrv_co_zone_report = &raw_co_zone_report, > .bdrv_co_zone_mgmt = &raw_co_zone_mgmt, > + .bdrv_co_zone_append = &raw_co_zone_append, > .bdrv_co_block_status = &raw_co_block_status, > .bdrv_co_copy_range_from = &raw_co_copy_range_from, > .bdrv_co_copy_range_to = &raw_co_copy_range_to, > diff --git a/include/block/block-io.h b/include/block/block-io.h > index f0cdf67d33..6a54453578 100644 > --- a/include/block/block-io.h > +++ b/include/block/block-io.h > @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > BlockZoneDescriptor *zones); > int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > int64_t offset, int64_t len); > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags); > > int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h > index e3136146aa..a7e7db5646 100644 > --- a/include/block/block_int-common.h > +++ b/include/block/block_int-common.h > @@ -701,6 +701,9 @@ struct BlockDriver { > BlockZoneDescriptor *zones); > int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op, > int64_t offset, int64_t len); > + int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs, > + int64_t *offset, QEMUIOVector *qiov, > + BdrvRequestFlags flags); > > /* removable device specific */ > bool (*bdrv_is_inserted)(BlockDriverState *bs); > @@ -860,6 +863,8 @@ typedef struct BlockLimits { > > /* array of write pointers' location of each zone in the zoned device. */ > BlockZoneWps *wps; > + > + int64_t write_granularity; > } BlockLimits; > > typedef struct BdrvOpBlocker BdrvOpBlocker; > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > index 877b2240b3..53033a5ca7 100644 > --- a/include/block/raw-aio.h > +++ b/include/block/raw-aio.h > @@ -31,6 +31,7 @@ > #define QEMU_AIO_TRUNCATE 0x0080 > #define QEMU_AIO_ZONE_REPORT 0x0100 > #define QEMU_AIO_ZONE_MGMT 0x0200 > +#define QEMU_AIO_ZONE_APPEND 0x0400 > #define QEMU_AIO_TYPE_MASK \ > (QEMU_AIO_READ | \ > QEMU_AIO_WRITE | \ > @@ -41,7 +42,8 @@ > QEMU_AIO_COPY_RANGE | \ > QEMU_AIO_TRUNCATE | \ > QEMU_AIO_ZONE_REPORT | \ > - QEMU_AIO_ZONE_MGMT) > + QEMU_AIO_ZONE_MGMT | \ > + QEMU_AIO_ZONE_APPEND) > > /* AIO flags */ > #define QEMU_AIO_MISALIGNED 0x1000 > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h > index 1b5fc7db6b..ff9f777f52 100644 > --- a/include/sysemu/block-backend-io.h > +++ b/include/sysemu/block-backend-io.h > @@ -52,6 +52,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > int64_t offset, int64_t len, > BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, BdrvRequestFlags flags, > + BlockCompletionFunc *cb, void *opaque); > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, > BlockCompletionFunc *cb, void *opaque); > void blk_aio_cancel_async(BlockAIOCB *acb); > @@ -173,6 +176,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > int64_t offset, int64_t len); > int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > int64_t offset, int64_t len); > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags); > +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags); > > int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, > int64_t bytes);
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月17日周一 13:22写道: > > On 10/16/22 23:56, Sam Li wrote: > > A zone append command is a write operation that specifies the first > > logical block of a zone as the write position. When writing to a zoned > > block device using zone append, the byte offset of writes is pointing > > to the write pointer of that zone. Upon completion the device will > > respond with the position the data has been written in the zone. > > > > Signed-off-by: Sam Li <faithilikerun@gmail.com> > > --- > > block/block-backend.c | 65 ++++++++++++++++++++++ > > block/file-posix.c | 89 +++++++++++++++++++++++++++++-- > > block/io.c | 21 ++++++++ > > block/raw-format.c | 8 +++ > > include/block/block-io.h | 3 ++ > > include/block/block_int-common.h | 5 ++ > > include/block/raw-aio.h | 4 +- > > include/sysemu/block-backend-io.h | 9 ++++ > > 8 files changed, 198 insertions(+), 6 deletions(-) > > > > diff --git a/block/block-backend.c b/block/block-backend.c > > index 1c618e9c68..06931ddd24 100644 > > --- a/block/block-backend.c > > +++ b/block/block-backend.c > > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { > > struct { > > unsigned long op; > > } zone_mgmt; > > + struct { > > + int64_t *append_sector; > > As mentioned previosuly, call this sector. "append" is already in the > zone_append struct member name.... Right, missed it here. I'll use "offset" then since it's in block layer. > > > + } zone_append; > > }; > > } BlkRwCo; > > > > @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > return &acb->common; > > } > > > > +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) > > +{ > > + BlkAioEmAIOCB *acb = opaque; > > + BlkRwCo *rwco = &acb->rwco; > > + > > + rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector, > > ...so you avoid awkward repetitions of "append" like here. You'll have: > rwco->zone_append.sector, which is shorter and more natural. > > > + rwco->iobuf, rwco->flags); > > + blk_aio_complete(acb); > > +} > > + > > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > > + QEMUIOVector *qiov, BdrvRequestFlags flags, > > + BlockCompletionFunc *cb, void *opaque) { > > + BlkAioEmAIOCB *acb; > > + Coroutine *co; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > > + acb->rwco = (BlkRwCo) { > > + .blk = blk, > > + .ret = NOT_DONE, > > + .flags = flags, > > + .iobuf = qiov, > > + .zone_append = { > > + .append_sector = offset, > > + }, > > + }; > > + acb->has_returned = false; > > + > > + co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); > > + bdrv_coroutine_enter(blk_bs(blk), co); > > + acb->has_returned = true; > > + if (acb->rwco.ret != NOT_DONE) { > > + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > > + blk_aio_complete_bh, acb); > > + } > > + > > + return &acb->common; > > +} > > + > > /* > > * Send a zone_report command. > > * offset is a byte offset from the start of the device. No alignment > > @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > return ret; > > } > > > > +/* > > + * Send a zone_append command. > > + */ > > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > > + QEMUIOVector *qiov, BdrvRequestFlags flags) > > +{ > > + int ret; > > + IO_CODE(); > > + > > + blk_inc_in_flight(blk); > > + blk_wait_while_drained(blk); > > + if (!blk_is_available(blk)) { > > + blk_dec_in_flight(blk); > > + return -ENOMEDIUM; > > + } > > + > > + ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); > > + blk_dec_in_flight(blk); > > + return ret; > > +} > > + > > void blk_drain(BlockBackend *blk) > > { > > BlockDriverState *bs = blk_bs(blk); > > diff --git a/block/file-posix.c b/block/file-posix.c > > index 5ff5500301..3d0cc33d02 100644 > > --- a/block/file-posix.c > > +++ b/block/file-posix.c > > @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData { > > struct { > > struct iovec *iov; > > int niov; > > + int64_t *offset; > > } io; > > struct { > > uint64_t cmd; > > @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) > > bs->bl.max_active_zones = ret; > > } > > > > + ret = get_sysfs_long_val(&st, "physical_block_size"); > > + if (ret >= 0) { > > + bs->bl.write_granularity = ret; > > + } > > + > > bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); > > if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) { > > error_report("report wps failed"); > > @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) > > static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) > > { > > ssize_t len; > > + BlockZoneWps *wps = aiocb->bs->bl.wps; > > + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; > > + > > + if (wps) { > > + qemu_mutex_lock(&wps->lock); > > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > > + aiocb->aio_offset = wps->wp[index]; > > + } > > + } > > > > do { > > - if (aiocb->aio_type & QEMU_AIO_WRITE) > > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) > > len = qemu_pwritev(aiocb->aio_fildes, > > aiocb->io.iov, > > aiocb->io.niov, > > @@ -1660,6 +1675,9 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) > > aiocb->io.niov, > > aiocb->aio_offset); > > } while (len == -1 && errno == EINTR); > > + if (wps) { > > + qemu_mutex_unlock(&wps->lock); > > As commented in the previous patch, you cannot release the lock until you > update the wp array entry. So this means that you should be taking the wp > lock at the beginning of handle_aiocb_rw() and release it only after the > wp array is updated. That will also simplify the code and avoid spreading > lock/unlock of that array to many places. > > > + } > > > > if (len == -1) { > > return -errno; > > @@ -1677,9 +1695,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) > > { > > ssize_t offset = 0; > > ssize_t len; > > + BlockZoneWps *wps = aiocb->bs->bl.wps; > > + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; > > > > + if (wps) { > > + qemu_mutex_lock(&wps->lock); > > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > > + aiocb->aio_offset = wps->wp[index]; > > + } > > + } > > while (offset < aiocb->aio_nbytes) { > > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > > len = pwrite(aiocb->aio_fildes, > > (const char *)buf + offset, > > aiocb->aio_nbytes - offset, > > @@ -1709,6 +1735,9 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) > > } > > offset += len; > > } > > + if (wps) { > > + qemu_mutex_unlock(&wps->lock); > > Same comment as above. > > > + } > > > > return offset; > > } > > @@ -1772,7 +1801,7 @@ static int handle_aiocb_rw(void *opaque) > > } > > > > nbytes = handle_aiocb_rw_linear(aiocb, buf); > > - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { > > + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { > > char *p = buf; > > size_t count = aiocb->aio_nbytes, copy; > > int i; > > @@ -1794,7 +1823,7 @@ static int handle_aiocb_rw(void *opaque) > > out: > > if (nbytes == aiocb->aio_nbytes) { > > #if defined(CONFIG_BLKZONED) > > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > > BlockZoneWps *wps = aiocb->bs->bl.wps; > > int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; > > if (wps) { > > @@ -1802,6 +1831,9 @@ out: > > if (!BDRV_ZT_IS_CONV(wps->wp[index])) { > > uint64_t wend_offset = > > aiocb->aio_offset + aiocb->aio_nbytes; > > + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { > > + *aiocb->io.offset = wps->wp[index]; > > + } > > The comment above will address the problem here, which is that you are > accessing the array without it being locked. You cannot do that. > > > > > /* Advance the wp if needed */ > > if (wend_offset > wps->wp[index]) { > > @@ -1824,7 +1856,7 @@ out: > > } else { > > assert(nbytes < 0); > > #if defined(CONFIG_BLKZONED) > > - if (aiocb->aio_type & QEMU_AIO_WRITE) { > > + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { > > update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1); > > } > > #endif > > @@ -3478,6 +3510,52 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > } > > #endif > > > > +#if defined(CONFIG_BLKZONED) > > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, > > + int64_t *offset, > > + QEMUIOVector *qiov, > > + BdrvRequestFlags flags) { > > + BDRVRawState *s = bs->opaque; > > + int64_t zone_size_mask = bs->bl.zone_size - 1; > > + int64_t iov_len = 0; > > + int64_t len = 0; > > + RawPosixAIOData acb; > > + > > + if (*offset & zone_size_mask) { > > + error_report("sector offset %" PRId64 " is not aligned to zone size " > > + "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512); > > + return -EINVAL; > > + } > > + > > + int64_t wg = bs->bl.write_granularity; > > + int64_t wg_mask = wg - 1; > > + for (int i = 0; i < qiov->niov; i++) { > > + iov_len = qiov->iov[i].iov_len; > > + if (iov_len & wg_mask) { > > + error_report("len of IOVector[%d] %" PRId64 " is not aligned to " > > + "block size %" PRId64 "", i, iov_len, wg); > > + return -EINVAL; > > + } > > + len += iov_len; > > + } > > + > > + acb = (RawPosixAIOData) { > > + .bs = bs, > > + .aio_fildes = s->fd, > > + .aio_type = QEMU_AIO_ZONE_APPEND, > > + .aio_offset = *offset, > > + .aio_nbytes = len, > > + .io = { > > + .iov = qiov->iov, > > + .niov = qiov->niov, > > + .offset = offset, > > + }, > > + }; > > + > > + return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); > > +} > > +#endif > > + > > static coroutine_fn int > > raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, > > bool blkdev) > > @@ -4253,6 +4331,7 @@ static BlockDriver bdrv_zoned_host_device = { > > /* zone management operations */ > > .bdrv_co_zone_report = raw_co_zone_report, > > .bdrv_co_zone_mgmt = raw_co_zone_mgmt, > > + .bdrv_co_zone_append = raw_co_zone_append, > > }; > > #endif > > > > diff --git a/block/io.c b/block/io.c > > index 88f707ea4d..03e1109056 100644 > > --- a/block/io.c > > +++ b/block/io.c > > @@ -3230,6 +3230,27 @@ out: > > return co.ret; > > } > > > > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, > > + QEMUIOVector *qiov, > > + BdrvRequestFlags flags) > > +{ > > + BlockDriver *drv = bs->drv; > > + CoroutineIOCompletion co = { > > + .coroutine = qemu_coroutine_self(), > > + }; > > + IO_CODE(); > > + > > + bdrv_inc_in_flight(bs); > > + if (!drv || !drv->bdrv_co_zone_append) { > > + co.ret = -ENOTSUP; > > + goto out; > > + } > > + co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags); > > +out: > > + bdrv_dec_in_flight(bs); > > + return co.ret; > > +} > > + > > void *qemu_blockalign(BlockDriverState *bs, size_t size) > > { > > IO_CODE(); > > diff --git a/block/raw-format.c b/block/raw-format.c > > index 18dc52a150..33bff8516e 100644 > > --- a/block/raw-format.c > > +++ b/block/raw-format.c > > @@ -325,6 +325,13 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len); > > } > > > > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, > > + int64_t *offset, > > + QEMUIOVector *qiov, > > + BdrvRequestFlags flags) { > > + return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags); > > +} > > + > > static int64_t raw_getlength(BlockDriverState *bs) > > { > > int64_t len; > > @@ -629,6 +636,7 @@ BlockDriver bdrv_raw = { > > .bdrv_co_pdiscard = &raw_co_pdiscard, > > .bdrv_co_zone_report = &raw_co_zone_report, > > .bdrv_co_zone_mgmt = &raw_co_zone_mgmt, > > + .bdrv_co_zone_append = &raw_co_zone_append, > > .bdrv_co_block_status = &raw_co_block_status, > > .bdrv_co_copy_range_from = &raw_co_copy_range_from, > > .bdrv_co_copy_range_to = &raw_co_copy_range_to, > > diff --git a/include/block/block-io.h b/include/block/block-io.h > > index f0cdf67d33..6a54453578 100644 > > --- a/include/block/block-io.h > > +++ b/include/block/block-io.h > > @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > > BlockZoneDescriptor *zones); > > int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > > int64_t offset, int64_t len); > > +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, > > + QEMUIOVector *qiov, > > + BdrvRequestFlags flags); > > > > int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); > > bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); > > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h > > index e3136146aa..a7e7db5646 100644 > > --- a/include/block/block_int-common.h > > +++ b/include/block/block_int-common.h > > @@ -701,6 +701,9 @@ struct BlockDriver { > > BlockZoneDescriptor *zones); > > int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op, > > int64_t offset, int64_t len); > > + int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs, > > + int64_t *offset, QEMUIOVector *qiov, > > + BdrvRequestFlags flags); > > > > /* removable device specific */ > > bool (*bdrv_is_inserted)(BlockDriverState *bs); > > @@ -860,6 +863,8 @@ typedef struct BlockLimits { > > > > /* array of write pointers' location of each zone in the zoned device. */ > > BlockZoneWps *wps; > > + > > + int64_t write_granularity; > > } BlockLimits; > > > > typedef struct BdrvOpBlocker BdrvOpBlocker; > > diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h > > index 877b2240b3..53033a5ca7 100644 > > --- a/include/block/raw-aio.h > > +++ b/include/block/raw-aio.h > > @@ -31,6 +31,7 @@ > > #define QEMU_AIO_TRUNCATE 0x0080 > > #define QEMU_AIO_ZONE_REPORT 0x0100 > > #define QEMU_AIO_ZONE_MGMT 0x0200 > > +#define QEMU_AIO_ZONE_APPEND 0x0400 > > #define QEMU_AIO_TYPE_MASK \ > > (QEMU_AIO_READ | \ > > QEMU_AIO_WRITE | \ > > @@ -41,7 +42,8 @@ > > QEMU_AIO_COPY_RANGE | \ > > QEMU_AIO_TRUNCATE | \ > > QEMU_AIO_ZONE_REPORT | \ > > - QEMU_AIO_ZONE_MGMT) > > + QEMU_AIO_ZONE_MGMT | \ > > + QEMU_AIO_ZONE_APPEND) > > > > /* AIO flags */ > > #define QEMU_AIO_MISALIGNED 0x1000 > > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h > > index 1b5fc7db6b..ff9f777f52 100644 > > --- a/include/sysemu/block-backend-io.h > > +++ b/include/sysemu/block-backend-io.h > > @@ -52,6 +52,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, > > BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > int64_t offset, int64_t len, > > BlockCompletionFunc *cb, void *opaque); > > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > > + QEMUIOVector *qiov, BdrvRequestFlags flags, > > + BlockCompletionFunc *cb, void *opaque); > > BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, > > BlockCompletionFunc *cb, void *opaque); > > void blk_aio_cancel_async(BlockAIOCB *acb); > > @@ -173,6 +176,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > int64_t offset, int64_t len); > > int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > > int64_t offset, int64_t len); > > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > > + QEMUIOVector *qiov, > > + BdrvRequestFlags flags); > > +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset, > > + QEMUIOVector *qiov, > > + BdrvRequestFlags flags); > > > > int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, > > int64_t bytes); > > -- > Damien Le Moal > Western Digital Research >
diff --git a/block/block-backend.c b/block/block-backend.c index 1c618e9c68..06931ddd24 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { struct { unsigned long op; } zone_mgmt; + struct { + int64_t *append_sector; + } zone_append; }; } BlkRwCo; @@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, return &acb->common; } +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) +{ + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + + rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector, + rwco->iobuf, rwco->flags); + blk_aio_complete(acb); +} + +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, + QEMUIOVector *qiov, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) { + BlkAioEmAIOCB *acb; + Coroutine *co; + IO_CODE(); + + blk_inc_in_flight(blk); + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); + acb->rwco = (BlkRwCo) { + .blk = blk, + .ret = NOT_DONE, + .flags = flags, + .iobuf = qiov, + .zone_append = { + .append_sector = offset, + }, + }; + acb->has_returned = false; + + co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); + bdrv_coroutine_enter(blk_bs(blk), co); + acb->has_returned = true; + if (acb->rwco.ret != NOT_DONE) { + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), + blk_aio_complete_bh, acb); + } + + return &acb->common; +} + /* * Send a zone_report command. * offset is a byte offset from the start of the device. No alignment @@ -1923,6 +1967,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, return ret; } +/* + * Send a zone_append command. + */ +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, + QEMUIOVector *qiov, BdrvRequestFlags flags) +{ + int ret; + IO_CODE(); + + blk_inc_in_flight(blk); + blk_wait_while_drained(blk); + if (!blk_is_available(blk)) { + blk_dec_in_flight(blk); + return -ENOMEDIUM; + } + + ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); + blk_dec_in_flight(blk); + return ret; +} + void blk_drain(BlockBackend *blk) { BlockDriverState *bs = blk_bs(blk); diff --git a/block/file-posix.c b/block/file-posix.c index 5ff5500301..3d0cc33d02 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -205,6 +205,7 @@ typedef struct RawPosixAIOData { struct { struct iovec *iov; int niov; + int64_t *offset; } io; struct { uint64_t cmd; @@ -1475,6 +1476,11 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.max_active_zones = ret; } + ret = get_sysfs_long_val(&st, "physical_block_size"); + if (ret >= 0) { + bs->bl.write_granularity = ret; + } + bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret); if (get_zones_wp(s->fd, bs->bl.wps, 0, ret) < 0) { error_report("report wps failed"); @@ -1647,9 +1653,18 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) { ssize_t len; + BlockZoneWps *wps = aiocb->bs->bl.wps; + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; + + if (wps) { + qemu_mutex_lock(&wps->lock); + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { + aiocb->aio_offset = wps->wp[index]; + } + } do { - if (aiocb->aio_type & QEMU_AIO_WRITE) + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) len = qemu_pwritev(aiocb->aio_fildes, aiocb->io.iov, aiocb->io.niov, @@ -1660,6 +1675,9 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) aiocb->io.niov, aiocb->aio_offset); } while (len == -1 && errno == EINTR); + if (wps) { + qemu_mutex_unlock(&wps->lock); + } if (len == -1) { return -errno; @@ -1677,9 +1695,17 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) { ssize_t offset = 0; ssize_t len; + BlockZoneWps *wps = aiocb->bs->bl.wps; + int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; + if (wps) { + qemu_mutex_lock(&wps->lock); + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { + aiocb->aio_offset = wps->wp[index]; + } + } while (offset < aiocb->aio_nbytes) { - if (aiocb->aio_type & QEMU_AIO_WRITE) { + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { len = pwrite(aiocb->aio_fildes, (const char *)buf + offset, aiocb->aio_nbytes - offset, @@ -1709,6 +1735,9 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) } offset += len; } + if (wps) { + qemu_mutex_unlock(&wps->lock); + } return offset; } @@ -1772,7 +1801,7 @@ static int handle_aiocb_rw(void *opaque) } nbytes = handle_aiocb_rw_linear(aiocb, buf); - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { char *p = buf; size_t count = aiocb->aio_nbytes, copy; int i; @@ -1794,7 +1823,7 @@ static int handle_aiocb_rw(void *opaque) out: if (nbytes == aiocb->aio_nbytes) { #if defined(CONFIG_BLKZONED) - if (aiocb->aio_type & QEMU_AIO_WRITE) { + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { BlockZoneWps *wps = aiocb->bs->bl.wps; int index = aiocb->aio_offset / aiocb->bs->bl.zone_size; if (wps) { @@ -1802,6 +1831,9 @@ out: if (!BDRV_ZT_IS_CONV(wps->wp[index])) { uint64_t wend_offset = aiocb->aio_offset + aiocb->aio_nbytes; + if (aiocb->aio_type & QEMU_AIO_ZONE_APPEND) { + *aiocb->io.offset = wps->wp[index]; + } /* Advance the wp if needed */ if (wend_offset > wps->wp[index]) { @@ -1824,7 +1856,7 @@ out: } else { assert(nbytes < 0); #if defined(CONFIG_BLKZONED) - if (aiocb->aio_type & QEMU_AIO_WRITE) { + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { update_zones_wp(aiocb->aio_fildes, aiocb->bs->bl.wps, 0, 1); } #endif @@ -3478,6 +3510,52 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, } #endif +#if defined(CONFIG_BLKZONED) +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, + int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { + BDRVRawState *s = bs->opaque; + int64_t zone_size_mask = bs->bl.zone_size - 1; + int64_t iov_len = 0; + int64_t len = 0; + RawPosixAIOData acb; + + if (*offset & zone_size_mask) { + error_report("sector offset %" PRId64 " is not aligned to zone size " + "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512); + return -EINVAL; + } + + int64_t wg = bs->bl.write_granularity; + int64_t wg_mask = wg - 1; + for (int i = 0; i < qiov->niov; i++) { + iov_len = qiov->iov[i].iov_len; + if (iov_len & wg_mask) { + error_report("len of IOVector[%d] %" PRId64 " is not aligned to " + "block size %" PRId64 "", i, iov_len, wg); + return -EINVAL; + } + len += iov_len; + } + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_ZONE_APPEND, + .aio_offset = *offset, + .aio_nbytes = len, + .io = { + .iov = qiov->iov, + .niov = qiov->niov, + .offset = offset, + }, + }; + + return raw_thread_pool_submit(bs, handle_aiocb_rw, &acb); +} +#endif + static coroutine_fn int raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, bool blkdev) @@ -4253,6 +4331,7 @@ static BlockDriver bdrv_zoned_host_device = { /* zone management operations */ .bdrv_co_zone_report = raw_co_zone_report, .bdrv_co_zone_mgmt = raw_co_zone_mgmt, + .bdrv_co_zone_append = raw_co_zone_append, }; #endif diff --git a/block/io.c b/block/io.c index 88f707ea4d..03e1109056 100644 --- a/block/io.c +++ b/block/io.c @@ -3230,6 +3230,27 @@ out: return co.ret; } +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + IO_CODE(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_zone_append) { + co.ret = -ENOTSUP; + goto out; + } + co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + void *qemu_blockalign(BlockDriverState *bs, size_t size) { IO_CODE(); diff --git a/block/raw-format.c b/block/raw-format.c index 18dc52a150..33bff8516e 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -325,6 +325,13 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len); } +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, + int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { + return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags); +} + static int64_t raw_getlength(BlockDriverState *bs) { int64_t len; @@ -629,6 +636,7 @@ BlockDriver bdrv_raw = { .bdrv_co_pdiscard = &raw_co_pdiscard, .bdrv_co_zone_report = &raw_co_zone_report, .bdrv_co_zone_mgmt = &raw_co_zone_mgmt, + .bdrv_co_zone_append = &raw_co_zone_append, .bdrv_co_block_status = &raw_co_block_status, .bdrv_co_copy_range_from = &raw_co_copy_range_from, .bdrv_co_copy_range_to = &raw_co_copy_range_to, diff --git a/include/block/block-io.h b/include/block/block-io.h index f0cdf67d33..6a54453578 100644 --- a/include/block/block-io.h +++ b/include/block/block-io.h @@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, BlockZoneDescriptor *zones); int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, int64_t offset, int64_t len); +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags); int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index e3136146aa..a7e7db5646 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -701,6 +701,9 @@ struct BlockDriver { BlockZoneDescriptor *zones); int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op, int64_t offset, int64_t len); + int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs, + int64_t *offset, QEMUIOVector *qiov, + BdrvRequestFlags flags); /* removable device specific */ bool (*bdrv_is_inserted)(BlockDriverState *bs); @@ -860,6 +863,8 @@ typedef struct BlockLimits { /* array of write pointers' location of each zone in the zoned device. */ BlockZoneWps *wps; + + int64_t write_granularity; } BlockLimits; typedef struct BdrvOpBlocker BdrvOpBlocker; diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index 877b2240b3..53033a5ca7 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -31,6 +31,7 @@ #define QEMU_AIO_TRUNCATE 0x0080 #define QEMU_AIO_ZONE_REPORT 0x0100 #define QEMU_AIO_ZONE_MGMT 0x0200 +#define QEMU_AIO_ZONE_APPEND 0x0400 #define QEMU_AIO_TYPE_MASK \ (QEMU_AIO_READ | \ QEMU_AIO_WRITE | \ @@ -41,7 +42,8 @@ QEMU_AIO_COPY_RANGE | \ QEMU_AIO_TRUNCATE | \ QEMU_AIO_ZONE_REPORT | \ - QEMU_AIO_ZONE_MGMT) + QEMU_AIO_ZONE_MGMT | \ + QEMU_AIO_ZONE_APPEND) /* AIO flags */ #define QEMU_AIO_MISALIGNED 0x1000 diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h index 1b5fc7db6b..ff9f777f52 100644 --- a/include/sysemu/block-backend-io.h +++ b/include/sysemu/block-backend-io.h @@ -52,6 +52,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, int64_t offset, int64_t len, BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, + QEMUIOVector *qiov, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque); BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, BlockCompletionFunc *cb, void *opaque); void blk_aio_cancel_async(BlockAIOCB *acb); @@ -173,6 +176,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, int64_t offset, int64_t len); int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, int64_t offset, int64_t len); +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags); +int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags); int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
A zone append command is a write operation that specifies the first logical block of a zone as the write position. When writing to a zoned block device using zone append, the byte offset of writes is pointing to the write pointer of that zone. Upon completion the device will respond with the position the data has been written in the zone. Signed-off-by: Sam Li <faithilikerun@gmail.com> --- block/block-backend.c | 65 ++++++++++++++++++++++ block/file-posix.c | 89 +++++++++++++++++++++++++++++-- block/io.c | 21 ++++++++ block/raw-format.c | 8 +++ include/block/block-io.h | 3 ++ include/block/block_int-common.h | 5 ++ include/block/raw-aio.h | 4 +- include/sysemu/block-backend-io.h | 9 ++++ 8 files changed, 198 insertions(+), 6 deletions(-)