diff mbox

[v2,1/6] block: Fragment reads to max transfer length

Message ID 1468017364-25980-2-git-send-email-eblake@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Eric Blake July 8, 2016, 10:35 p.m. UTC
Drivers should be able to rely on the block layer honoring the
max transfer length, rather than needing to return -EINVAL
(iscsi) or manually fragment things (nbd).  This patch adds
the fragmentation in the block layer, after requests have been
aligned (fragmenting before alignment would lead to multiple
unaligned requests, rather than just the head and tail).

The return value was previously nebulous on success (sometimes
zero, sometimes the length read); since we never have a short
read, and since fragmenting may store yet another positive
value in 'ret', change the function to always return the
incoming 'bytes' value on success.

Signed-off-by: Eric Blake <eblake@redhat.com>

---
v2: Fix uninitialized use of 'ret' for an all-zero read beyond eof
---
 block/io.c | 55 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 21 deletions(-)

Comments

Eric Blake July 15, 2016, 4:08 a.m. UTC | #1
On 07/08/2016 04:35 PM, Eric Blake wrote:
> Drivers should be able to rely on the block layer honoring the
> max transfer length, rather than needing to return -EINVAL
> (iscsi) or manually fragment things (nbd).  This patch adds
> the fragmentation in the block layer, after requests have been
> aligned (fragmenting before alignment would lead to multiple
> unaligned requests, rather than just the head and tail).
> 
> The return value was previously nebulous on success (sometimes
> zero, sometimes the length read); since we never have a short
> read, and since fragmenting may store yet another positive
> value in 'ret', change the function to always return the
> incoming 'bytes' value on success.
> 
> Signed-off-by: Eric Blake <eblake@redhat.com>
> 
> ---
> v2: Fix uninitialized use of 'ret' for an all-zero read beyond eof

Uggh. Something I did here and not in v1 is now causing 'make
check-qtest' failures. Please don't merge until I've posted v3.
Eric Blake July 15, 2016, 2:28 p.m. UTC | #2
On 07/14/2016 10:08 PM, Eric Blake wrote:
> On 07/08/2016 04:35 PM, Eric Blake wrote:
>> Drivers should be able to rely on the block layer honoring the
>> max transfer length, rather than needing to return -EINVAL
>> (iscsi) or manually fragment things (nbd).  This patch adds
>> the fragmentation in the block layer, after requests have been
>> aligned (fragmenting before alignment would lead to multiple
>> unaligned requests, rather than just the head and tail).
>>
>> The return value was previously nebulous on success (sometimes
>> zero, sometimes the length read); since we never have a short
>> read, and since fragmenting may store yet another positive
>> value in 'ret', change the function to always return the
>> incoming 'bytes' value on success.
>>
>> Signed-off-by: Eric Blake <eblake@redhat.com>
>>
>> ---
>> v2: Fix uninitialized use of 'ret' for an all-zero read beyond eof
> 
> Uggh. Something I did here and not in v1 is now causing 'make
> check-qtest' failures. Please don't merge until I've posted v3.

It looks like at least one caller expects bdrv_aligned_preadv() to
return 0 (not a positive value) on success. I'm not sure which one(s),
as tracking them down turned into a lot of code to chase, but a simple
tweak to guarantee ret = 0 on success fixes the failures in
'make check'.  v3 coming up soon.
diff mbox

Patch

diff --git a/block/io.c b/block/io.c
index 2887394..5628267 100644
--- a/block/io.c
+++ b/block/io.c
@@ -971,8 +971,8 @@  err:

 /*
  * Forwards an already correctly aligned request to the BlockDriver. This
- * handles copy on read and zeroing after EOF; any other features must be
- * implemented by the caller.
+ * handles copy on read, zeroing after EOF, and fragmentation of large
+ * reads; any other features must be implemented by the caller.
  */
 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
@@ -980,12 +980,16 @@  static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
 {
     int64_t total_bytes, max_bytes;
     int ret;
+    uint64_t bytes_remaining = bytes;
+    int max_transfer;

     assert(is_power_of_2(align));
     assert((offset & (align - 1)) == 0);
     assert((bytes & (align - 1)) == 0);
     assert(!qiov || bytes == qiov->size);
     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
+    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
+                                   align);

     /* TODO: We would need a per-BDS .supported_read_flags and
      * potential fallback support, if we ever implement any read flags
@@ -1024,7 +1028,7 @@  static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
         }
     }

-    /* Forward the request to the BlockDriver */
+    /* Forward the request to the BlockDriver, possibly fragmenting it */
     total_bytes = bdrv_getlength(bs);
     if (total_bytes < 0) {
         ret = total_bytes;
@@ -1032,30 +1036,39 @@  static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     }

     max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
-    if (bytes <= max_bytes) {
+    if (bytes <= max_bytes && bytes <= max_transfer) {
         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
-    } else if (max_bytes > 0) {
-        QEMUIOVector local_qiov;
-
-        qemu_iovec_init(&local_qiov, qiov->niov);
-        qemu_iovec_concat(&local_qiov, qiov, 0, max_bytes);
-
-        ret = bdrv_driver_preadv(bs, offset, max_bytes, &local_qiov, 0);
-
-        qemu_iovec_destroy(&local_qiov);
-    } else {
-        ret = 0;
+        goto out;
     }

-    /* Reading beyond end of file is supposed to produce zeroes */
-    if (ret == 0 && total_bytes < offset + bytes) {
-        uint64_t zero_offset = MAX(0, total_bytes - offset);
-        uint64_t zero_bytes = offset + bytes - zero_offset;
-        qemu_iovec_memset(qiov, zero_offset, 0, zero_bytes);
+    while (bytes_remaining) {
+        int num;
+
+        if (max_bytes) {
+            QEMUIOVector local_qiov;
+
+            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
+            assert(num);
+            qemu_iovec_init(&local_qiov, qiov->niov);
+            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
+
+            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
+                                     num, &local_qiov, 0);
+            max_bytes -= num;
+            qemu_iovec_destroy(&local_qiov);
+            if (ret < 0) {
+                break;
+            }
+        } else {
+            num = bytes_remaining;
+            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
+                                    bytes_remaining);
+        }
+        bytes_remaining -= num;
     }

 out:
-    return ret;
+    return ret < 0 ? ret : bytes;
 }

 /*