diff mbox

[v3,44/44] nbd: Implement NBD_OPT_BLOCK_SIZE on client

Message ID 1461368452-10389-45-git-send-email-eblake@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Eric Blake April 22, 2016, 11:40 p.m. UTC
The upstream NBD Protocol has defined a new extension to allow
the server to advertise block sizes to the client, as well as
a way for the client to inform the server that it intends to
obey block sizes.

Pass any received sizes on to the block layer.

Use the minimum block size as the sector size we pass to the
kernel - which also has the nice effect of cooperating with
(non-qemu) servers that don't do read-modify-write when exposing
a block device with 4k sectors; it can also allow us to visit a
file larger than 2T on a 32-bit kernel.

Signed-off-by: Eric Blake <eblake@redhat.com>
---
 include/block/nbd.h |  3 +++
 block/nbd-client.c  |  3 +++
 block/nbd.c         | 17 +++++++++---
 nbd/client.c        | 74 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 87 insertions(+), 10 deletions(-)

Comments

Alex Bligh April 25, 2016, 12:19 p.m. UTC | #1
On 23 Apr 2016, at 00:40, Eric Blake <eblake@redhat.com> wrote:

> The upstream NBD Protocol has defined a new extension to allow
> the server to advertise block sizes to the client, as well as
> a way for the client to inform the server that it intends to
> obey block sizes.
> 
> Pass any received sizes on to the block layer.
> 
> Use the minimum block size as the sector size we pass to the
> kernel - which also has the nice effect of cooperating with
> (non-qemu) servers that don't do read-modify-write when exposing
> a block device with 4k sectors; it can also allow us to visit a
> file larger than 2T on a 32-bit kernel.
> 
> Signed-off-by: Eric Blake <eblake@redhat.com>
> ---
> include/block/nbd.h |  3 +++
> block/nbd-client.c  |  3 +++
> block/nbd.c         | 17 +++++++++---
> nbd/client.c        | 74 ++++++++++++++++++++++++++++++++++++++++++++++++-----
> 4 files changed, 87 insertions(+), 10 deletions(-)
> 
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index a5c68df..27a6854 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -133,6 +133,9 @@ enum {
> struct NbdExportInfo {
>     uint64_t size;
>     uint16_t flags;
> +    uint32_t min_block;
> +    uint32_t opt_block;
> +    uint32_t max_block;
> };
> typedef struct NbdExportInfo NbdExportInfo;
> 
> diff --git a/block/nbd-client.c b/block/nbd-client.c
> index 2b6ac27..602a8ab 100644
> --- a/block/nbd-client.c
> +++ b/block/nbd-client.c
> @@ -443,6 +443,9 @@ int nbd_client_init(BlockDriverState *bs,
>         logout("Failed to negotiate with the NBD server\n");
>         return ret;
>     }
> +    if (client->info.min_block > bs->request_alignment) {
> +        bs->request_alignment = client->info.min_block;
> +    }
> 
>     qemu_co_mutex_init(&client->send_mutex);
>     qemu_co_mutex_init(&client->free_sema);
> diff --git a/block/nbd.c b/block/nbd.c
> index 5172039..bb7df55 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
> @@ -407,9 +407,20 @@ static int nbd_co_flush(BlockDriverState *bs)
> 
> static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
> {
> -    bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS;
> -    bs->bl.max_write_zeroes = UINT32_MAX >> BDRV_SECTOR_BITS;
> -    bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS;
> +    NbdClientSession *s = nbd_get_client_session(bs);
> +    int max = UINT32_MAX >> BDRV_SECTOR_BITS;
> +
> +    if (s->info.max_block) {
> +        max = s->info.max_block >> BDRV_SECTOR_BITS;
> +    }
> +    bs->bl.max_discard = max;
> +    bs->bl.max_write_zeroes = max;
> +    bs->bl.max_transfer_length = max;
> +
> +    if (s->info.opt_block &&
> +        s->info.opt_block >> BDRV_SECTOR_BITS > bs->bl.opt_transfer_length) {
> +        bs->bl.opt_transfer_length = s->info.opt_block >> BDRV_SECTOR_BITS;
> +    }
> }
> 
> static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
> diff --git a/nbd/client.c b/nbd/client.c
> index dac4f29..24f6b0b 100644
> --- a/nbd/client.c
> +++ b/nbd/client.c
> @@ -232,6 +232,11 @@ static int nbd_handle_reply_err(QIOChannel *ioc, nbd_opt_reply *reply,
>                    reply->option);
>         break;
> 
> +    case NBD_REP_ERR_BLOCK_SIZE_REQD:
> +        error_setg(errp, "Server wants OPT_BLOCK_SIZE before option %" PRIx32,
> +                   reply->option);
> +        break;
> +
>     default:
>         error_setg(errp, "Unknown error code when asking for option %" PRIx32,
>                    reply->option);
> @@ -333,6 +338,21 @@ static int nbd_opt_go(QIOChannel *ioc, const char *wantname,
>      * flags still 0 is a witness of a broken server. */
>     info->flags = 0;
> 
> +    /* Some servers use NBD_OPT_GO to advertise non-default block
> +     * sizes, and require that we first use NBD_OPT_BLOCK_SIZE to
> +     * agree to that. */
> +    TRACE("Attempting NBD_OPT_BLOCK_SIZE");
> +    if (nbd_send_option_request(ioc, NBD_OPT_BLOCK_SIZE, 0, NULL, errp) < 0) {
> +        return -1;
> +    }
> +    if (nbd_receive_option_reply(ioc, NBD_OPT_BLOCK_SIZE, &reply, errp) < 0) {
> +        return -1;
> +    }
> +    error = nbd_handle_reply_err(ioc, &reply, errp);
> +    if (error < 0) {
> +        return error;
> +    }
> +
>     TRACE("Attempting NBD_OPT_GO for export '%s'", wantname);
>     if (nbd_send_option_request(ioc, NBD_OPT_GO, -1, wantname, errp) < 0) {
>         return -1;
> @@ -402,6 +422,45 @@ static int nbd_opt_go(QIOChannel *ioc, const char *wantname,
>                   info->size, info->flags);
>             break;
> 
> +        case NBD_INFO_BLOCK_SIZE:
> +            if (len != sizeof(info->min_block) * 3) {
> +                error_setg(errp, "remaining export info len %" PRIu32
> +                           " is unexpected size", len);
> +                return -1;
> +            }
> +            if (read_sync(ioc, &info->min_block, sizeof(info->min_block)) !=
> +                sizeof(info->min_block)) {
> +                error_setg(errp, "failed to read info minimum block size");
> +                return -1;
> +            }
> +            be32_to_cpus(&info->min_block);
> +            if (!is_power_of_2(info->min_block)) {
> +                error_setg(errp, "server minimum block size %" PRId32
> +                           "is not a power of two", info->min_block);
> +                return -1;
> +            }
> +            if (read_sync(ioc, &info->opt_block, sizeof(info->opt_block)) !=
> +                sizeof(info->opt_block)) {
> +                error_setg(errp, "failed to read info preferred block size");
> +                return -1;
> +            }
> +            be32_to_cpus(&info->opt_block);
> +            if (!is_power_of_2(info->opt_block) ||
> +                info->opt_block < info->min_block) {
> +                error_setg(errp, "server preferred block size %" PRId32
> +                           "is not valid", info->opt_block);
> +                return -1;
> +            }
> +            if (read_sync(ioc, &info->max_block, sizeof(info->max_block)) !=
> +                sizeof(info->max_block)) {
> +                error_setg(errp, "failed to read info maximum block size");
> +                return -1;
> +            }
> +            be32_to_cpus(&info->max_block);
> +            TRACE("Block sizes are 0x%" PRIx32 ", 0x%" PRIx32 ", 0x%" PRIx32,
> +                  info->min_block, info->opt_block, info->max_block);
> +            break;
> +

You should probably check min_block <= opt_block <= max_block here

Also should there be a check that BDRV_SECTOR_SIZE >= min_block?


>         default:
>             TRACE("ignoring unknown export info %" PRIu16, type);
>             if (drop_sync(ioc, len) != len) {
> @@ -710,8 +769,9 @@ fail:
> #ifdef __linux__
> int nbd_init(int fd, QIOChannelSocket *sioc, NbdExportInfo *info)
> {
> -    unsigned long sectors = info->size / BDRV_SECTOR_SIZE;
> -    if (info->size / BDRV_SECTOR_SIZE != sectors) {
> +    unsigned long sector_size = MAX(BDRV_SECTOR_SIZE, info->min_block);
> +    unsigned long sectors = info->size / sector_size;
> +    if (info->size / sector_size != sectors) {
>         LOG("Export size %" PRId64 " too large for 32-bit kernel", info->size);
>         return -E2BIG;
>     }
> @@ -724,18 +784,18 @@ int nbd_init(int fd, QIOChannelSocket *sioc, NbdExportInfo *info)
>         return -serrno;
>     }
> 
> -    TRACE("Setting block size to %lu", (unsigned long)BDRV_SECTOR_SIZE);
> +    TRACE("Setting block size to %lu", sector_size);
> 
> -    if (ioctl(fd, NBD_SET_BLKSIZE, (unsigned long)BDRV_SECTOR_SIZE) < 0) {
> +    if (ioctl(fd, NBD_SET_BLKSIZE, sector_size) < 0) {
>         int serrno = errno;
>         LOG("Failed setting NBD block size");
>         return -serrno;
>     }
> 
>     TRACE("Setting size to %lu block(s)", sectors);
> -    if (size % BDRV_SECTOR_SIZE) {
> -        TRACE("Ignoring trailing %d bytes of export",
> -              (int) (size % BDRV_SECTOR_SIZE));
> +    if (info->size % sector_size) {
> +        TRACE("Ignoring trailing %" PRId64 " bytes of export",
> +              info->size % sector_size);
>     }
> 
>     if (ioctl(fd, NBD_SET_SIZE_BLOCKS, sectors) < 0) {
> -- 
> 2.5.5
> 
>
Eric Blake April 25, 2016, 7:16 p.m. UTC | #2
On 04/25/2016 06:19 AM, Alex Bligh wrote:
> 
> On 23 Apr 2016, at 00:40, Eric Blake <eblake@redhat.com> wrote:
> 
>> The upstream NBD Protocol has defined a new extension to allow
>> the server to advertise block sizes to the client, as well as
>> a way for the client to inform the server that it intends to
>> obey block sizes.
>>
>> Pass any received sizes on to the block layer.
>>
>> Use the minimum block size as the sector size we pass to the
>> kernel - which also has the nice effect of cooperating with
>> (non-qemu) servers that don't do read-modify-write when exposing
>> a block device with 4k sectors; it can also allow us to visit a
>> file larger than 2T on a 32-bit kernel.
>>

>> +            be32_to_cpus(&info->max_block);
>> +            TRACE("Block sizes are 0x%" PRIx32 ", 0x%" PRIx32 ", 0x%" PRIx32,
>> +                  info->min_block, info->opt_block, info->max_block);
>> +            break;
>> +
> 
> You should probably check min_block <= opt_block <= max_block here

opt_block > max_block is possible if max_block is clamped to export size
(in the degenerate case where you have a small export that is too small
for the granularity of a hole or efficient I/O).  But yes, some sanity
checks that the server isn't horribly broken might be worthwhile.

> 
> Also should there be a check that BDRV_SECTOR_SIZE >= min_block?

No, because we take the server's min_block and feed it into
bs->request_align (which forces the block layer to comply with a minimum
alignment larger than 512, using code already tested on physical block
drives with 4k sectors), see the hunk in nbd-client.c.

>> int nbd_init(int fd, QIOChannelSocket *sioc, NbdExportInfo *info)
>> {
>> -    unsigned long sectors = info->size / BDRV_SECTOR_SIZE;
>> -    if (info->size / BDRV_SECTOR_SIZE != sectors) {
>> +    unsigned long sector_size = MAX(BDRV_SECTOR_SIZE, info->min_block);
>> +    unsigned long sectors = info->size / sector_size;
>> +    if (info->size / sector_size != sectors) {
>>         LOG("Export size %" PRId64 " too large for 32-bit kernel", info->size);
>>         return -E2BIG;
>>     }
>> @@ -724,18 +784,18 @@ int nbd_init(int fd, QIOChannelSocket *sioc, NbdExportInfo *info)
>>         return -serrno;
>>     }
>>
>> -    TRACE("Setting block size to %lu", (unsigned long)BDRV_SECTOR_SIZE);
>> +    TRACE("Setting block size to %lu", sector_size);
>>
>> -    if (ioctl(fd, NBD_SET_BLKSIZE, (unsigned long)BDRV_SECTOR_SIZE) < 0) {
>> +    if (ioctl(fd, NBD_SET_BLKSIZE, sector_size) < 0) {

We also feed the maximum of 512 or the advertised minimum block size
into the kernel when using ioctl() for the kernel to take over
transmission phase; although I'm not certain whether the kernel obeys
NBD_SET_BLKSIZE as a hint rather than a hard rule - but if that needs
patching, it needs patching in the kernel implementation, not in qemu.
diff mbox

Patch

diff --git a/include/block/nbd.h b/include/block/nbd.h
index a5c68df..27a6854 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -133,6 +133,9 @@  enum {
 struct NbdExportInfo {
     uint64_t size;
     uint16_t flags;
+    uint32_t min_block;
+    uint32_t opt_block;
+    uint32_t max_block;
 };
 typedef struct NbdExportInfo NbdExportInfo;

diff --git a/block/nbd-client.c b/block/nbd-client.c
index 2b6ac27..602a8ab 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -443,6 +443,9 @@  int nbd_client_init(BlockDriverState *bs,
         logout("Failed to negotiate with the NBD server\n");
         return ret;
     }
+    if (client->info.min_block > bs->request_alignment) {
+        bs->request_alignment = client->info.min_block;
+    }

     qemu_co_mutex_init(&client->send_mutex);
     qemu_co_mutex_init(&client->free_sema);
diff --git a/block/nbd.c b/block/nbd.c
index 5172039..bb7df55 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -407,9 +407,20 @@  static int nbd_co_flush(BlockDriverState *bs)

 static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
 {
-    bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS;
-    bs->bl.max_write_zeroes = UINT32_MAX >> BDRV_SECTOR_BITS;
-    bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS;
+    NbdClientSession *s = nbd_get_client_session(bs);
+    int max = UINT32_MAX >> BDRV_SECTOR_BITS;
+
+    if (s->info.max_block) {
+        max = s->info.max_block >> BDRV_SECTOR_BITS;
+    }
+    bs->bl.max_discard = max;
+    bs->bl.max_write_zeroes = max;
+    bs->bl.max_transfer_length = max;
+
+    if (s->info.opt_block &&
+        s->info.opt_block >> BDRV_SECTOR_BITS > bs->bl.opt_transfer_length) {
+        bs->bl.opt_transfer_length = s->info.opt_block >> BDRV_SECTOR_BITS;
+    }
 }

 static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
diff --git a/nbd/client.c b/nbd/client.c
index dac4f29..24f6b0b 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -232,6 +232,11 @@  static int nbd_handle_reply_err(QIOChannel *ioc, nbd_opt_reply *reply,
                    reply->option);
         break;

+    case NBD_REP_ERR_BLOCK_SIZE_REQD:
+        error_setg(errp, "Server wants OPT_BLOCK_SIZE before option %" PRIx32,
+                   reply->option);
+        break;
+
     default:
         error_setg(errp, "Unknown error code when asking for option %" PRIx32,
                    reply->option);
@@ -333,6 +338,21 @@  static int nbd_opt_go(QIOChannel *ioc, const char *wantname,
      * flags still 0 is a witness of a broken server. */
     info->flags = 0;

+    /* Some servers use NBD_OPT_GO to advertise non-default block
+     * sizes, and require that we first use NBD_OPT_BLOCK_SIZE to
+     * agree to that. */
+    TRACE("Attempting NBD_OPT_BLOCK_SIZE");
+    if (nbd_send_option_request(ioc, NBD_OPT_BLOCK_SIZE, 0, NULL, errp) < 0) {
+        return -1;
+    }
+    if (nbd_receive_option_reply(ioc, NBD_OPT_BLOCK_SIZE, &reply, errp) < 0) {
+        return -1;
+    }
+    error = nbd_handle_reply_err(ioc, &reply, errp);
+    if (error < 0) {
+        return error;
+    }
+
     TRACE("Attempting NBD_OPT_GO for export '%s'", wantname);
     if (nbd_send_option_request(ioc, NBD_OPT_GO, -1, wantname, errp) < 0) {
         return -1;
@@ -402,6 +422,45 @@  static int nbd_opt_go(QIOChannel *ioc, const char *wantname,
                   info->size, info->flags);
             break;

+        case NBD_INFO_BLOCK_SIZE:
+            if (len != sizeof(info->min_block) * 3) {
+                error_setg(errp, "remaining export info len %" PRIu32
+                           " is unexpected size", len);
+                return -1;
+            }
+            if (read_sync(ioc, &info->min_block, sizeof(info->min_block)) !=
+                sizeof(info->min_block)) {
+                error_setg(errp, "failed to read info minimum block size");
+                return -1;
+            }
+            be32_to_cpus(&info->min_block);
+            if (!is_power_of_2(info->min_block)) {
+                error_setg(errp, "server minimum block size %" PRId32
+                           "is not a power of two", info->min_block);
+                return -1;
+            }
+            if (read_sync(ioc, &info->opt_block, sizeof(info->opt_block)) !=
+                sizeof(info->opt_block)) {
+                error_setg(errp, "failed to read info preferred block size");
+                return -1;
+            }
+            be32_to_cpus(&info->opt_block);
+            if (!is_power_of_2(info->opt_block) ||
+                info->opt_block < info->min_block) {
+                error_setg(errp, "server preferred block size %" PRId32
+                           "is not valid", info->opt_block);
+                return -1;
+            }
+            if (read_sync(ioc, &info->max_block, sizeof(info->max_block)) !=
+                sizeof(info->max_block)) {
+                error_setg(errp, "failed to read info maximum block size");
+                return -1;
+            }
+            be32_to_cpus(&info->max_block);
+            TRACE("Block sizes are 0x%" PRIx32 ", 0x%" PRIx32 ", 0x%" PRIx32,
+                  info->min_block, info->opt_block, info->max_block);
+            break;
+
         default:
             TRACE("ignoring unknown export info %" PRIu16, type);
             if (drop_sync(ioc, len) != len) {
@@ -710,8 +769,9 @@  fail:
 #ifdef __linux__
 int nbd_init(int fd, QIOChannelSocket *sioc, NbdExportInfo *info)
 {
-    unsigned long sectors = info->size / BDRV_SECTOR_SIZE;
-    if (info->size / BDRV_SECTOR_SIZE != sectors) {
+    unsigned long sector_size = MAX(BDRV_SECTOR_SIZE, info->min_block);
+    unsigned long sectors = info->size / sector_size;
+    if (info->size / sector_size != sectors) {
         LOG("Export size %" PRId64 " too large for 32-bit kernel", info->size);
         return -E2BIG;
     }
@@ -724,18 +784,18 @@  int nbd_init(int fd, QIOChannelSocket *sioc, NbdExportInfo *info)
         return -serrno;
     }

-    TRACE("Setting block size to %lu", (unsigned long)BDRV_SECTOR_SIZE);
+    TRACE("Setting block size to %lu", sector_size);

-    if (ioctl(fd, NBD_SET_BLKSIZE, (unsigned long)BDRV_SECTOR_SIZE) < 0) {
+    if (ioctl(fd, NBD_SET_BLKSIZE, sector_size) < 0) {
         int serrno = errno;
         LOG("Failed setting NBD block size");
         return -serrno;
     }

     TRACE("Setting size to %lu block(s)", sectors);
-    if (size % BDRV_SECTOR_SIZE) {
-        TRACE("Ignoring trailing %d bytes of export",
-              (int) (size % BDRV_SECTOR_SIZE));
+    if (info->size % sector_size) {
+        TRACE("Ignoring trailing %" PRId64 " bytes of export",
+              info->size % sector_size);
     }

     if (ioctl(fd, NBD_SET_SIZE_BLOCKS, sectors) < 0) {