diff mbox series

[v2,2/6] file-posix: try BLKSECTGET on block devices too, do not round to power of 2

Message ID 20210524163645.382940-3-pbonzini@redhat.com (mailing list archive)
State New, archived
Headers show
Series block: file-posix queue | expand

Commit Message

Paolo Bonzini May 24, 2021, 4:36 p.m. UTC
bs->sg is only true for character devices, but block devices can also
be used with scsi-block and scsi-generic.  Unfortunately BLKSECTGET
returns bytes in an int for /dev/sgN devices, and sectors in a short
for block devices, so account for that in the code.

The maximum transfer also need not be a power of 2 (for example I have
seen disks with 1280 KiB maximum transfer) so there's no need to pass
the result through pow2floor.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 block/file-posix.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

Comments

Kevin Wolf May 27, 2021, 3:51 p.m. UTC | #1
Am 24.05.2021 um 18:36 hat Paolo Bonzini geschrieben:
> bs->sg is only true for character devices, but block devices can also
> be used with scsi-block and scsi-generic.  Unfortunately BLKSECTGET
> returns bytes in an int for /dev/sgN devices, and sectors in a short
> for block devices, so account for that in the code.
> 
> The maximum transfer also need not be a power of 2 (for example I have
> seen disks with 1280 KiB maximum transfer) so there's no need to pass
> the result through pow2floor.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Looks like this is more or less a revert of Maxim's commit 867eccfe. If
this is what we want, should this old commit be mentioned in one way or
another in the commit message?

Apparently the motivation for Maxim's patch was, if I'm reading the
description correctly, that it affected non-sg cases by imposing
unnecessary restrictions. I see that patch 1 changed the max_iov part so
that it won't affect non-sg cases any more, but max_transfer could still
be more restricted than necessary, no?

For convenience, the bug report fixed with that patch is here:
https://bugzilla.redhat.com/show_bug.cgi?id=1647104

Are we really trying to describe different things (limits for SG_IO and
for normal I/O) in one value with max_transfer, even though it could be
two different numbers for the same block device?

> diff --git a/block/file-posix.c b/block/file-posix.c
> index 59c889d5a7..e5ef006aee 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1149,22 +1149,27 @@ static void raw_reopen_abort(BDRVReopenState *state)
>      s->reopen_state = NULL;
>  }
>  
> -static int sg_get_max_transfer_length(int fd)
> +static int sg_get_max_transfer_length(int fd, struct stat *st)

This is now a misnomer. Should we revert to the pre-867eccfe name
hdev_get_max_transfer_length()?

>  {
>  #ifdef BLKSECTGET
> -    int max_bytes = 0;
> -
> -    if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
> -        return max_bytes;
> +    if (S_ISBLK(st->st_mode)) {
> +        unsigned short max_sectors = 0;
> +        if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
> +            return max_sectors * 512;
> +        }
>      } else {
> -        return -errno;
> +        int max_bytes = 0;
> +        if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
> +            return max_bytes;
> +        }
>      }
> +    return -errno;
>  #else
>      return -ENOSYS;
>  #endif
>  }
>  
> -static int sg_get_max_segments(int fd)
> +static int sg_get_max_segments(int fd, struct stat *st)

Same for this one.

>  {
>  #ifdef CONFIG_LINUX
>      char buf[32];
> @@ -1173,15 +1178,9 @@ static int sg_get_max_segments(int fd)
>      int ret;
>      int sysfd = -1;
>      long max_segments;
> -    struct stat st;
> -
> -    if (fstat(fd, &st)) {
> -        ret = -errno;
> -        goto out;
> -    }
>  
>      sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> -                                major(st.st_rdev), minor(st.st_rdev));
> +                                major(st->st_rdev), minor(st->st_rdev));
>      sysfd = open(sysfspath, O_RDONLY);
>      if (sysfd == -1) {
>          ret = -errno;
> @@ -1218,15 +1217,20 @@ out:
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>      BDRVRawState *s = bs->opaque;
> +    struct stat st;
> +
> +    if (fstat(s->fd, &st)) {
> +        return;

Don't we want to set errp? Or do you intentionally ignore the error?

> +    }
>  
> -    if (bs->sg) {
> -        int ret = sg_get_max_transfer_length(s->fd);
> +    if (bs->sg || S_ISBLK(st.st_mode)) {
> +        int ret = sg_get_max_transfer_length(s->fd, &st);
>  
>          if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
> -            bs->bl.max_transfer = pow2floor(ret);
> +            bs->bl.max_transfer = ret;
>          }
>  
> -        ret = sg_get_max_segments(s->fd);
> +        ret = sg_get_max_segments(s->fd, &st);
>          if (ret > 0) {
>              bs->bl.max_iov = ret;
>          }

Kevin
Paolo Bonzini May 27, 2021, 8:14 p.m. UTC | #2
On 27/05/21 17:51, Kevin Wolf wrote:
> Am 24.05.2021 um 18:36 hat Paolo Bonzini geschrieben:
>> bs->sg is only true for character devices, but block devices can also
>> be used with scsi-block and scsi-generic.  Unfortunately BLKSECTGET
>> returns bytes in an int for /dev/sgN devices, and sectors in a short
>> for block devices, so account for that in the code.
>>
>> The maximum transfer also need not be a power of 2 (for example I have
>> seen disks with 1280 KiB maximum transfer) so there's no need to pass
>> the result through pow2floor.
>>
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> 
> Looks like this is more or less a revert of Maxim's commit 867eccfe. If
> this is what we want, should this old commit be mentioned in one way or
> another in the commit message?

It is (but it is not intentional).

> Apparently the motivation for Maxim's patch was, if I'm reading the
> description correctly, that it affected non-sg cases by imposing
> unnecessary restrictions. I see that patch 1 changed the max_iov part so
> that it won't affect non-sg cases any more, but max_transfer could still
> be more restricted than necessary, no?

Indeed the kernel puts no limit at all, but especially with O_DIRECT we 
probably benefit from avoiding the moral equivalent of "bufferbloat".

> For convenience, the bug report fixed with that patch is here:
> https://bugzilla.redhat.com/show_bug.cgi?id=1647104
> 
> Are we really trying to describe different things (limits for SG_IO and
> for normal I/O) in one value with max_transfer, even though it could be
> two different numbers for the same block device?

>> -static int sg_get_max_transfer_length(int fd)
>> +static int sg_get_max_transfer_length(int fd, struct stat *st)
> 
> This is now a misnomer. Should we revert to the pre-867eccfe name
> hdev_get_max_transfer_length()?

Yes.

>>   static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>>   {
>>       BDRVRawState *s = bs->opaque;
>> +    struct stat st;
>> +
>> +    if (fstat(s->fd, &st)) {
>> +        return;
> 
> Don't we want to set errp? Or do you intentionally ignore the error?

Yes, since we ignore errors from the ioctl I figured it's the same for 
fstat (just do not do the ioctls).

However, skipping raw_probe_alignment is wrong.

Thanks for the review!  Should I wait for you to go through the other 
patches?

Paolo
Kevin Wolf May 31, 2021, 1:59 p.m. UTC | #3
Am 27.05.2021 um 22:14 hat Paolo Bonzini geschrieben:
> On 27/05/21 17:51, Kevin Wolf wrote:
> > Am 24.05.2021 um 18:36 hat Paolo Bonzini geschrieben:
> > > bs->sg is only true for character devices, but block devices can also
> > > be used with scsi-block and scsi-generic.  Unfortunately BLKSECTGET
> > > returns bytes in an int for /dev/sgN devices, and sectors in a short
> > > for block devices, so account for that in the code.
> > > 
> > > The maximum transfer also need not be a power of 2 (for example I have
> > > seen disks with 1280 KiB maximum transfer) so there's no need to pass
> > > the result through pow2floor.
> > > 
> > > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > 
> > Looks like this is more or less a revert of Maxim's commit 867eccfe. If
> > this is what we want, should this old commit be mentioned in one way or
> > another in the commit message?
> 
> It is (but it is not intentional).
> 
> > Apparently the motivation for Maxim's patch was, if I'm reading the
> > description correctly, that it affected non-sg cases by imposing
> > unnecessary restrictions. I see that patch 1 changed the max_iov part so
> > that it won't affect non-sg cases any more, but max_transfer could still
> > be more restricted than necessary, no?
> 
> Indeed the kernel puts no limit at all, but especially with O_DIRECT we
> probably benefit from avoiding the moral equivalent of "bufferbloat".

Yeah, that sounds plausible, but on the other hand the bug report Maxim
addressed was about performance issues related to buffer sizes being too
small. So even if we want to have some limit, max_transfer of the host
device is probably not the right one for the general case.

> > For convenience, the bug report fixed with that patch is here:
> > https://bugzilla.redhat.com/show_bug.cgi?id=1647104
> > 
> > Are we really trying to describe different things (limits for SG_IO and
> > for normal I/O) in one value with max_transfer, even though it could be
> > two different numbers for the same block device?
> 
> > > -static int sg_get_max_transfer_length(int fd)
> > > +static int sg_get_max_transfer_length(int fd, struct stat *st)
> > 
> > This is now a misnomer. Should we revert to the pre-867eccfe name
> > hdev_get_max_transfer_length()?
> 
> Yes.
> 
> > >   static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> > >   {
> > >       BDRVRawState *s = bs->opaque;
> > > +    struct stat st;
> > > +
> > > +    if (fstat(s->fd, &st)) {
> > > +        return;
> > 
> > Don't we want to set errp? Or do you intentionally ignore the error?
> 
> Yes, since we ignore errors from the ioctl I figured it's the same for fstat
> (just do not do the ioctls).
> 
> However, skipping raw_probe_alignment is wrong.
> 
> Thanks for the review!  Should I wait for you to go through the other
> patches?

I went through the whole series, but had no comments for the other
patches, so the rest should be good.

Kevin
Paolo Bonzini May 31, 2021, 4:36 p.m. UTC | #4
On 31/05/21 15:59, Kevin Wolf wrote:
>>> Apparently the motivation for Maxim's patch was, if I'm reading the
>>> description correctly, that it affected non-sg cases by imposing
>>> unnecessary restrictions. I see that patch 1 changed the max_iov part so
>>> that it won't affect non-sg cases any more, but max_transfer could still
>>> be more restricted than necessary, no?
>>
>> Indeed the kernel puts no limit at all, but especially with O_DIRECT we
>> probably benefit from avoiding the moral equivalent of "bufferbloat".
> 
> Yeah, that sounds plausible, but on the other hand the bug report Maxim
> addressed was about performance issues related to buffer sizes being too
> small. So even if we want to have some limit, max_transfer of the host
> device is probably not the right one for the general case.

Yeah, for a simple dd with O_DIRECT there is no real max_transfer, as long
as you are willing to allocate a big enough buffer.  Quick test on my
laptop, reading 12.5 GiB:

    163840       9.46777s
    327680       9.41480s
    520192       9.39520s (max_iov * 4K)
    614400       9.06289s
    655360       8.85762s
    1310720      8.75502s
    2621440      8.26522s
    5242880      7.88319s
    10485760     7.66751s
    20971520     7.42627s

In practice using blktrace shows that virtual address space is 
fragmented enough that the cap for I/O operations is not max_transfer 
but max_iov * 4096 (as was before the series)...  and yet the benefit 
effectively *begins* there because it's where the cost of the system 
calls is amortized over multiple kernel<->disk communications.

Things are probably more complicated if more than one I/O is in flight, 
and with async I/O instead of read/write, but still a huge part of 
performance is seemingly the cost of system calls (not just the context 
switch, also pinning the I/O buffer and all other ancillary costs).

So the solution is probably to add a max_hw_transfer limit in addition 
to max_transfer, and have max_hw_iov instead of max_iov to match.

Paolo
diff mbox series

Patch

diff --git a/block/file-posix.c b/block/file-posix.c
index 59c889d5a7..e5ef006aee 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1149,22 +1149,27 @@  static void raw_reopen_abort(BDRVReopenState *state)
     s->reopen_state = NULL;
 }
 
-static int sg_get_max_transfer_length(int fd)
+static int sg_get_max_transfer_length(int fd, struct stat *st)
 {
 #ifdef BLKSECTGET
-    int max_bytes = 0;
-
-    if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
-        return max_bytes;
+    if (S_ISBLK(st->st_mode)) {
+        unsigned short max_sectors = 0;
+        if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
+            return max_sectors * 512;
+        }
     } else {
-        return -errno;
+        int max_bytes = 0;
+        if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
+            return max_bytes;
+        }
     }
+    return -errno;
 #else
     return -ENOSYS;
 #endif
 }
 
-static int sg_get_max_segments(int fd)
+static int sg_get_max_segments(int fd, struct stat *st)
 {
 #ifdef CONFIG_LINUX
     char buf[32];
@@ -1173,15 +1178,9 @@  static int sg_get_max_segments(int fd)
     int ret;
     int sysfd = -1;
     long max_segments;
-    struct stat st;
-
-    if (fstat(fd, &st)) {
-        ret = -errno;
-        goto out;
-    }
 
     sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
-                                major(st.st_rdev), minor(st.st_rdev));
+                                major(st->st_rdev), minor(st->st_rdev));
     sysfd = open(sysfspath, O_RDONLY);
     if (sysfd == -1) {
         ret = -errno;
@@ -1218,15 +1217,20 @@  out:
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
+    struct stat st;
+
+    if (fstat(s->fd, &st)) {
+        return;
+    }
 
-    if (bs->sg) {
-        int ret = sg_get_max_transfer_length(s->fd);
+    if (bs->sg || S_ISBLK(st.st_mode)) {
+        int ret = sg_get_max_transfer_length(s->fd, &st);
 
         if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
-            bs->bl.max_transfer = pow2floor(ret);
+            bs->bl.max_transfer = ret;
         }
 
-        ret = sg_get_max_segments(s->fd);
+        ret = sg_get_max_segments(s->fd, &st);
         if (ret > 0) {
             bs->bl.max_iov = ret;
         }