@@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
struct {
unsigned long op;
} zone_mgmt;
+ struct {
+ int64_t *offset;
+ } zone_append;
};
} BlkRwCo;
@@ -1871,6 +1874,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
return &acb->common;
}
+/*
+ * Coroutine entry point used by blk_aio_zone_append().
+ *
+ * @opaque is the BlkAioEmAIOCB of the request.  The zone append described by
+ * its embedded BlkRwCo is issued through blk_co_zone_append(), the result is
+ * stored in rwco.ret, and blk_aio_complete() is then called on the AIOCB.
+ */
+static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
+{
+    BlkAioEmAIOCB *acb = opaque;
+    BlkRwCo *req = &acb->rwco;
+    int64_t *offset = req->zone_append.offset;
+    QEMUIOVector *qiov = req->iobuf;
+
+    req->ret = blk_co_zone_append(req->blk, offset, qiov, req->flags);
+    blk_aio_complete(acb);
+}
+
+/*
+ * Start an asynchronous zone append on @blk.
+ *
+ * @offset, @qiov and @flags are recorded in the request and later handed to
+ * blk_co_zone_append() by blk_aio_zone_append_entry().  @cb and @opaque are
+ * passed to blk_aio_get() when the AIOCB is allocated.
+ *
+ * The in-flight counter of @blk is raised before the coroutine is entered.
+ * If the coroutine has already produced a result by the time it hands
+ * control back (rwco.ret is no longer NOT_DONE), blk_aio_complete_bh is
+ * scheduled as a one-shot bottom half in the AioContext of @blk.
+ *
+ * Returns the common part of the newly created AIOCB.
+ */
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+                                QEMUIOVector *qiov, BdrvRequestFlags flags,
+                                BlockCompletionFunc *cb, void *opaque)
+{
+    BlkAioEmAIOCB *acb;
+    Coroutine *entry_co;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+    acb->rwco = (BlkRwCo) {
+        .blk   = blk,
+        .ret   = NOT_DONE,
+        .flags = flags,
+        .iobuf = qiov,
+        .zone_append.offset = offset,
+    };
+    acb->has_returned = false;
+
+    entry_co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
+    bdrv_coroutine_enter(blk_bs(blk), entry_co);
+    acb->has_returned = true;
+
+    /* The request finished before we got control back: complete it in a BH */
+    if (acb->rwco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+                                         blk_aio_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
/*
* Send a zone_report command.
* offset is a byte offset from the start of the device. No alignment
@@ -1922,6 +1966,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
return ret;
}
+/*
+ * Send a zone_append command.
+ *
+ * The request is counted as in flight on @blk for its whole duration and
+ * waits in blk_wait_while_drained() before being issued.  If @blk is not
+ * available, -ENOMEDIUM is returned; otherwise @offset, @qiov and @flags are
+ * forwarded to bdrv_co_zone_append() on the root node and its result is
+ * returned.
+ */
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+                                    QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+    int ret = -ENOMEDIUM;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+    if (blk_is_available(blk)) {
+        ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
+    }
+    blk_dec_in_flight(blk);
+
+    return ret;
+}
+
void blk_drain(BlockBackend *blk)
{
BlockDriverState *bs = blk_bs(blk);
@@ -159,6 +159,7 @@ typedef struct BDRVRawState {
bool has_write_zeroes:1;
bool use_linux_aio:1;
bool use_linux_io_uring:1;
+ int64_t *offset; /* offset of zone append operation */
int page_cache_inconsistent; /* errno from fdatasync failure */
bool has_fallocate;
bool needs_alignment;
@@ -1484,6 +1485,11 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
bs->bl.max_active_zones = ret;
}
+ ret = get_sysfs_long_val(&st, "physical_block_size");
+ if (ret >= 0) {
+ bs->bl.write_granularity = ret;
+ }
+
bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
ret = get_zones_wp(s->fd, bs->bl.wps, 0, bs->bl.nr_zones);
if (ret < 0) {
@@ -1664,7 +1670,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
ssize_t len;
do {
- if (aiocb->aio_type & QEMU_AIO_WRITE)
+ if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
len = qemu_pwritev(aiocb->aio_fildes,
aiocb->io.iov,
aiocb->io.niov,
@@ -1694,7 +1700,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
ssize_t len;
while (offset < aiocb->aio_nbytes) {
- if (aiocb->aio_type & QEMU_AIO_WRITE) {
+ if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
len = pwrite(aiocb->aio_fildes,
(const char *)buf + offset,
aiocb->aio_nbytes - offset,
@@ -1787,7 +1793,7 @@ static int handle_aiocb_rw(void *opaque)
}
nbytes = handle_aiocb_rw_linear(aiocb, buf);
- if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
+ if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
char *p = buf;
size_t count = aiocb->aio_nbytes, copy;
int i;
@@ -2426,6 +2432,10 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
#if defined(CONFIG_BLKZONED)
if (bs->bl.wps) {
qemu_co_mutex_lock(&bs->bl.wps->colock);
+ if (type & QEMU_AIO_ZONE_APPEND && bs->bl.zone_size) {
+ int index = offset / bs->bl.zone_size;
+ offset = bs->bl.wps->wp[index];
+ }
}
#endif
@@ -2473,9 +2483,13 @@ out:
#if defined(CONFIG_BLKZONED)
BlockZoneWps *wps = bs->bl.wps;
if (ret == 0) {
- if (type & QEMU_AIO_WRITE && wps && bs->bl.zone_size) {
+ if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
+ && wps && bs->bl.zone_size) {
int index = offset / bs->bl.zone_size;
if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
+ if (type & QEMU_AIO_ZONE_APPEND) {
+ *s->offset = wps->wp[index];
+ }
/* Advance the wp if needed */
if (offset + bytes > wps->wp[index]) {
wps->wp[index] = offset + bytes;
@@ -2483,7 +2497,7 @@ out:
}
}
} else {
- if (type & QEMU_AIO_WRITE) {
+ if (type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
update_zones_wp(s->fd, bs->bl.wps, 0, 1);
}
}
@@ -3495,6 +3509,41 @@ out:
}
#endif
+#if defined(CONFIG_BLKZONED)
+/*
+ * Zone append implementation for the zoned host device driver.
+ *
+ * @offset: in/out.  On entry *offset must be a multiple of the zone size
+ *          (checked with a "zone_size - 1" mask, so this presumably relies
+ *          on zone sizes being a power of two -- TODO confirm).  The pointer
+ *          is recorded in the driver state for raw_co_prw().
+ * @qiov:   data to append; the length of every element must be a multiple
+ *          of bs->bl.write_granularity.
+ * @flags:  must be 0.
+ *
+ * Returns -EINVAL (after reporting the error) if one of the alignment checks
+ * fails, otherwise the result of raw_co_prw() with QEMU_AIO_ZONE_APPEND.
+ *
+ * NOTE(review): the result pointer lives in the per-device BDRVRawState, not
+ * in the request.  Overlapping zone appends would presumably overwrite each
+ * other's pointer before raw_co_prw() stores the result -- verify, and
+ * consider passing the pointer along with the request instead.
+ */
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
+                                           int64_t *offset,
+                                           QEMUIOVector *qiov,
+                                           BdrvRequestFlags flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int64_t zone_mask = bs->bl.zone_size - 1;
+    int64_t granularity = bs->bl.write_granularity;
+    int64_t granularity_mask = granularity - 1;
+    int64_t total = 0;
+
+    assert(flags == 0);
+
+    /* Recorded before validation, exactly as raw_co_prw() expects to find it */
+    s->offset = offset;
+
+    if ((*offset & zone_mask) != 0) {
+        error_report("sector offset %" PRId64 " is not aligned to zone size "
+                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
+        return -EINVAL;
+    }
+
+    /* Validate each vector element and add up the total request length */
+    for (int i = 0; i < qiov->niov; i++) {
+        int64_t seg_len = qiov->iov[i].iov_len;
+
+        if ((seg_len & granularity_mask) != 0) {
+            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
+                         "block size %" PRId64 "", i, seg_len, granularity);
+            return -EINVAL;
+        }
+        total += seg_len;
+    }
+
+    return raw_co_prw(bs, *offset, total, qiov, QEMU_AIO_ZONE_APPEND);
+}
+#endif
+
static coroutine_fn int
raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
bool blkdev)
@@ -4270,6 +4319,7 @@ static BlockDriver bdrv_zoned_host_device = {
/* zone management operations */
.bdrv_co_zone_report = raw_co_zone_report,
.bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+ .bdrv_co_zone_append = raw_co_zone_append,
};
#endif
@@ -3230,6 +3230,27 @@ out:
return co.ret;
}
+/*
+ * Dispatch a zone append request to the block driver of @bs.
+ *
+ * The request is counted as in flight on @bs while it runs.  Returns
+ * -ENOTSUP if @bs has no driver or the driver does not implement
+ * .bdrv_co_zone_append; otherwise returns whatever the driver callback
+ * returns for @offset, @qiov and @flags.
+ */
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
+                                     QEMUIOVector *qiov,
+                                     BdrvRequestFlags flags)
+{
+    BlockDriver *drv = bs->drv;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+    IO_CODE();
+
+    bdrv_inc_in_flight(bs);
+    if (drv && drv->bdrv_co_zone_append) {
+        co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
+    } else {
+        co.ret = -ENOTSUP;
+    }
+    bdrv_dec_in_flight(bs);
+
+    return co.ret;
+}
+
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
IO_CODE();
@@ -345,6 +345,10 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
break;
+ case QEMU_AIO_ZONE_APPEND:
+ io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
+ luringcb->qiov->niov, offset);
+ break;
case QEMU_AIO_READ:
io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
@@ -389,6 +389,9 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
case QEMU_AIO_WRITE:
io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
break;
+ case QEMU_AIO_ZONE_APPEND:
+ io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+ break;
case QEMU_AIO_READ:
io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
break;
@@ -325,6 +325,13 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
}
+/*
+ * Zone append for the raw format driver: the request is forwarded unchanged
+ * to the node attached as bs->file, and that node's result is returned.
+ */
+static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
+                                           int64_t *offset,
+                                           QEMUIOVector *qiov,
+                                           BdrvRequestFlags flags)
+{
+    BlockDriverState *file_bs = bs->file->bs;
+
+    return bdrv_co_zone_append(file_bs, offset, qiov, flags);
+}
+
static int64_t raw_getlength(BlockDriverState *bs)
{
int64_t len;
@@ -629,6 +636,7 @@ BlockDriver bdrv_raw = {
.bdrv_co_pdiscard = &raw_co_pdiscard,
.bdrv_co_zone_report = &raw_co_zone_report,
.bdrv_co_zone_mgmt = &raw_co_zone_mgmt,
+ .bdrv_co_zone_append = &raw_co_zone_append,
.bdrv_co_block_status = &raw_co_block_status,
.bdrv_co_copy_range_from = &raw_co_copy_range_from,
.bdrv_co_copy_range_to = &raw_co_copy_range_to,
@@ -94,6 +94,9 @@ int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
BlockZoneDescriptor *zones);
int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
int64_t offset, int64_t len);
+int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
+ QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
@@ -701,6 +701,9 @@ struct BlockDriver {
BlockZoneDescriptor *zones);
int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
int64_t offset, int64_t len);
+ int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
+ int64_t *offset, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
/* removable device specific */
bool (*bdrv_is_inserted)(BlockDriverState *bs);
@@ -857,6 +860,8 @@ typedef struct BlockLimits {
/* array of write pointers' location of each zone in the zoned device. */
BlockZoneWps *wps;
+
+ int64_t write_granularity;
} BlockLimits;
typedef struct BdrvOpBlocker BdrvOpBlocker;
@@ -31,6 +31,7 @@
#define QEMU_AIO_TRUNCATE 0x0080
#define QEMU_AIO_ZONE_REPORT 0x0100
#define QEMU_AIO_ZONE_MGMT 0x0200
+#define QEMU_AIO_ZONE_APPEND 0x0400
#define QEMU_AIO_TYPE_MASK \
(QEMU_AIO_READ | \
QEMU_AIO_WRITE | \
@@ -41,7 +42,8 @@
QEMU_AIO_COPY_RANGE | \
QEMU_AIO_TRUNCATE | \
QEMU_AIO_ZONE_REPORT | \
- QEMU_AIO_ZONE_MGMT)
+ QEMU_AIO_ZONE_MGMT | \
+ QEMU_AIO_ZONE_APPEND)
/* AIO flags */
#define QEMU_AIO_MISALIGNED 0x1000
@@ -52,6 +52,9 @@ BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
int64_t offset, int64_t len,
BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+ QEMUIOVector *qiov, BdrvRequestFlags flags,
+ BlockCompletionFunc *cb, void *opaque);
BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
BlockCompletionFunc *cb, void *opaque);
void blk_aio_cancel_async(BlockAIOCB *acb);
@@ -173,6 +176,12 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
int64_t offset, int64_t len);
int generated_co_wrapper blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
int64_t offset, int64_t len);
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+ QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+int generated_co_wrapper blk_zone_append(BlockBackend *blk, int64_t *offset,
+ QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset,
int64_t bytes);
A zone append command is a write operation that specifies the first logical block of a zone as the write position. When writing to a zoned block device using zone append, the byte offset of the write points to the write pointer of that zone. Upon completion, the device responds with the position at which the data was written in the zone. Signed-off-by: Sam Li <faithilikerun@gmail.com> --- block/block-backend.c | 65 +++++++++++++++++++++++++++++++ block/file-posix.c | 60 +++++++++++++++++++++++++--- block/io.c | 21 ++++++++++ block/io_uring.c | 4 ++ block/linux-aio.c | 3 ++ block/raw-format.c | 8 ++++ include/block/block-io.h | 3 ++ include/block/block_int-common.h | 5 +++ include/block/raw-aio.h | 4 +- include/sysemu/block-backend-io.h | 9 +++++ 10 files changed, 176 insertions(+), 6 deletions(-)