diff mbox

[v2,3/6] block: Fragment writes to max transfer length

Message ID 1468017364-25980-4-git-send-email-eblake@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Eric Blake July 8, 2016, 10:36 p.m. UTC
Drivers should be able to rely on the block layer honoring the
max transfer length, rather than needing to return -EINVAL
(iscsi) or manually fragment things (nbd).  We already fragment
write zeroes at the block layer; this patch adds the fragmentation
for normal writes, after requests have been aligned (fragmenting
before alignment would lead to multiple unaligned requests, rather
than just the head and tail).

When fragmenting a large request where FUA was requested, but
where we know that FUA is implemented by flushing all requests
rather than the given request, then we can still get by with
only one flush.  Note, however, that we need a followup patch
to the raw format driver to avoid a regression in the number of
flushes actually issued.

The return value was previously nebulous on success (sometimes
zero, sometimes the length written); since we never have a short
write, and since fragmenting may store yet another positive
value in 'ret', change the function to always return the
incoming 'bytes' value on success.

Signed-off-by: Eric Blake <eblake@redhat.com>

---
v2: Tweak success return semantics to match read
---
 block/io.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)
diff mbox

Patch

diff --git a/block/io.c b/block/io.c
index 5628267..46076a7 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1269,7 +1269,8 @@  fail:
 }

 /*
- * Forwards an already correctly aligned write request to the BlockDriver.
+ * Forwards an already correctly aligned write request to the BlockDriver,
+ * after possibly fragmenting it.
  */
 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
@@ -1281,6 +1282,8 @@  static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,

     int64_t start_sector = offset >> BDRV_SECTOR_BITS;
     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
+    uint64_t bytes_remaining = bytes;
+    int max_transfer;

     assert(is_power_of_2(align));
     assert((offset & (align - 1)) == 0);
@@ -1288,6 +1291,8 @@  static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     assert(!qiov || bytes == qiov->size);
     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
     assert(!(flags & ~BDRV_REQ_MASK));
+    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
+                                   align);

     waited = wait_serialising_requests(req);
     assert(!waited || !req->serialising);
@@ -1310,9 +1315,34 @@  static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     } else if (flags & BDRV_REQ_ZERO_WRITE) {
         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
         ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
-    } else {
+    } else if (bytes <= max_transfer) {
         bdrv_debug_event(bs, BLKDBG_PWRITEV);
         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
+    } else {
+        bdrv_debug_event(bs, BLKDBG_PWRITEV);
+        while (bytes_remaining) {
+            int num = MIN(bytes_remaining, max_transfer);
+            QEMUIOVector local_qiov;
+            int local_flags = flags;
+
+            assert(num);
+            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
+                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
+                /* If FUA is going to be emulated by flush, we only
+                 * need to flush on the last iteration */
+                local_flags &= ~BDRV_REQ_FUA;
+            }
+            qemu_iovec_init(&local_qiov, qiov->niov);
+            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
+
+            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
+                                      num, &local_qiov, local_flags);
+            qemu_iovec_destroy(&local_qiov);
+            if (ret < 0) {
+                break;
+            }
+            bytes_remaining -= num;
+        }
     }
     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

@@ -1324,6 +1354,7 @@  static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,

     if (ret >= 0) {
         bs->total_sectors = MAX(bs->total_sectors, end_sector);
+        ret = bytes;
     }

     return ret;