diff mbox series

[v4] block/file-posix.c: Use pwritev2() with RWF_DSYNC for FUA

Message ID 20250404074233.133365-1-prantoran@gmail.com (mailing list archive)
State New
Headers show
Series [v4] block/file-posix.c: Use pwritev2() with RWF_DSYNC for FUA | expand

Commit Message

Pinku Deb Nath April 4, 2025, 7:42 a.m. UTC
Full Unit Access (FUA) is an optimization where a disk write with the
flag set will be persisted to disk immediately instead of potentially
remaining in the disk's write cache. This commit address the todo task
for using pwritev2() with RWF_DSYNC in the thread pool section of
raw_co_prw(), if pwritev2 with RWF_DSYNC is available in the host,
which is alway for Linux kernel >= 4.7. The intent for FUA is indicated
with the BDRV_REQ_FUA flag. The old code paths are preserved in case
BDRV_REQ_FUA is off or pwritev2() with RWF_DSYNC is not available.

During testing, I observed that the BDRV_REQ_FUA is always turned on
when blk->enable_write_cache is not set in block/block-backend.c, so
I commented this section off during testing:
https://gitlab.com/qemu-project/qemu/-/blob/master/block/block-backend.c?ref_type=heads#L1432-1434

Signed-off-by: Pinku Deb Nath <prantoran@gmail.com>

Update 1:

The testing with "-t writeback" works for turning on enable_write_cache.
I renamed the function to qemu_pwritev_fua() and fixed any typos.

I moved the handle_aiocb_flush() into the qemu_pwritev_fua() and
removed from the previously todo seciont. Initially I thought
of only passing aiocb, but then I was not sure whethe I could
derive buf from aiocb, so I added arguments for iovec and iovcnt
into qemu_pwritev_fua().

For handling buf in handle_aiocb_rw_linear(), I created iovec
and passed its reference. I assumed that there will be only one
buffer/iovec, so I passed 1 for iovcnt.

Signed-off-by: Pinku Deb Nath <prantoran@gmail.com>

Update 2:

For non-Linux hosts, I am returning -ENOSYS in the same way for
pwritev.

I changed the signature to add fd, iov, nr_iov, offset separately,
with the aiocb pointer at the end. I thought of reconstructing
aiocb inside the function but the handle_aiocb_flush() uses
(BDRVRawState)aiocb->bs->opaque which I was not sure whether it
was used to pass return values upstream.
So I decided on passing aiocb pointer as an input argument.

I was not sure about how git rebase worked with patches, I hope
it worked, finger-crossed 

Comments

Stefan Hajnoczi April 4, 2025, 3:05 p.m. UTC | #1
On Fri, Apr 04, 2025 at 12:42:33AM -0700, Pinku Deb Nath wrote:
> Full Unit Access (FUA) is an optimization where a disk write with the
> flag set will be persisted to disk immediately instead of potentially
> remaining in the disk's write cache. This commit address the todo task
> for using pwritev2() with RWF_DSYNC in the thread pool section of
> raw_co_prw(), if pwritev2 with RWF_DSYNC is available in the host,
> which is alway for Linux kernel >= 4.7. The intent for FUA is indicated
> with the BDRV_REQ_FUA flag. The old code paths are preserved in case
> BDRV_REQ_FUA is off or pwritev2() with RWF_DSYNC is not available.
> 
> During testing, I observed that the BDRV_REQ_FUA is always turned on
> when blk->enable_write_cache is not set in block/block-backend.c, so
> I commented this section off during testing:
> https://gitlab.com/qemu-project/qemu/-/blob/master/block/block-backend.c?ref_type=heads#L1432-1434
> 
> Signed-off-by: Pinku Deb Nath <prantoran@gmail.com>
> 
> Update 1:
> 
> The testing with "-t writeback" works for turning on enable_write_cache.
> I renamed the function to qemu_pwritev_fua() and fixed any typos.
> 
> I moved the handle_aiocb_flush() into the qemu_pwritev_fua() and
> removed from the previously todo seciont. Initially I thought
> of only passing aiocb, but then I was not sure whethe I could
> derive buf from aiocb, so I added arguments for iovec and iovcnt
> into qemu_pwritev_fua().
> 
> For handling buf in handle_aiocb_rw_linear(), I created iovec
> and passed its reference. I assumed that there will be only one
> buffer/iovec, so I passed 1 for iovcnt.
> 
> Signed-off-by: Pinku Deb Nath <prantoran@gmail.com>
> 
> Update 2:
> 
> For non-Linux hosts, I am returning -ENOSYS in the same way for
> pwritev.
> 
> I changed the signature to add fd, iov, nr_iov, offset separately,
> with the aiocb pointer at the end. I thought of reconstructing
> aiocb inside the function but the handle_aiocb_flush() uses
> (BDRVRawState)aiocb->bs->opaque which I was not sure whether it
> was used to pass return values upstream.
> So I decided on passing aiocb pointer as an input argument.
> 
> I was not sure about how git rebase worked with patches, I hope
> it worked, finger-crossed 
diff mbox series

Patch

diff --git a/block/file-posix.c b/block/file-posix.c
index 56d1972d15..16e32acedd 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -229,6 +229,7 @@  typedef struct RawPosixAIOData {
             unsigned long op;
         } zone_mgmt;
     };
+    BdrvRequestFlags flags;
 } RawPosixAIOData;
 
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -1674,6 +1675,20 @@  qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
     return pwritev(fd, iov, nr_iov, offset);
 }
 
+static ssize_t
+qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, const RawPosixAIOData *aiocb)
+{
+#ifdef RWF_DSYNC
+    return pwritev2(fd, iov, nr_iov, offset, RWF_DSYNC);
+#else
+    ssize_t len = pwritev2(fd, iov, nr_iov, offset, 0);
+    if (len == 0) {
+        len = handle_aiocb_flush(aiocb);
+    }
+    return len;
+#endif
+}
+
 #else
 
 static bool preadv_present = false;
@@ -1690,6 +1705,11 @@  qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
     return -ENOSYS;
 }
 
+static ssize_t
+qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, const RawPosixAIOData *aiocb)
+{
+    return -ENOSYS;
+}
 #endif
 
 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
@@ -1698,10 +1718,16 @@  static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
 
     len = RETRY_ON_EINTR(
         (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
-            qemu_pwritev(aiocb->aio_fildes,
-                           aiocb->io.iov,
-                           aiocb->io.niov,
-                           aiocb->aio_offset) :
+            (aiocb->flags &  BDRV_REQ_FUA) ?
+                qemu_pwritev_fua(aiocb->aio_fildes,
+                                aiocb->io.iov,
+                                aiocb->io.niov,
+                                aiocb->aio_offset,
+                                aiocb) :
+                qemu_pwritev(aiocb->aio_fildes,
+                            aiocb->io.iov,
+                            aiocb->io.niov,
+                            aiocb->aio_offset) :
             qemu_preadv(aiocb->aio_fildes,
                           aiocb->io.iov,
                           aiocb->io.niov,
@@ -1727,10 +1753,22 @@  static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
 
     while (offset < aiocb->aio_nbytes) {
         if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
-            len = pwrite(aiocb->aio_fildes,
-                         (const char *)buf + offset,
-                         aiocb->aio_nbytes - offset,
-                         aiocb->aio_offset + offset);
+            if (aiocb->flags & BDRV_REQ_FUA) {
+                struct iovec iov = {
+                    .iov_base = buf,
+                    .iov_len = aiocb->aio_nbytes - offset,
+                };
+                len = qemu_pwritev_fua(aiocb->aio_fildes,
+                                    &iov,
+                                    1,
+                                    aiocb->aio_offset + offset,
+                                    aiocb);
+            } else {
+                len = pwrite(aiocb->aio_fildes,
+                            (const char *)buf + offset,
+                            aiocb->aio_nbytes - offset,
+                            aiocb->aio_offset + offset);
+            }
         } else {
             len = pread(aiocb->aio_fildes,
                         buf + offset,
@@ -2539,14 +2577,11 @@  static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
             .iov            = qiov->iov,
             .niov           = qiov->niov,
         },
+        .flags          = flags,
     };
 
     assert(qiov->size == bytes);
     ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
-    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
-        /* TODO Use pwritev2() instead if it's available */
-        ret = raw_co_flush_to_disk(bs);
-    }
     goto out; /* Avoid the compiler err of unused label */
 
 out: