diff mbox series

[1/2] file-posix: Correctly read max_segments of SG nodes

Message ID 20200811225122.17342-2-dmitry.fomichev@wdc.com (mailing list archive)
State New, archived
Headers show
Series [1/2] file-posix: Correctly read max_segments of SG nodes | expand

Commit Message

Dmitry Fomichev Aug. 11, 2020, 10:51 p.m. UTC
If scsi-generic driver is in use, an SG node can be specified in
the command line in place of a regular SCSI device. In this case,
sg_get_max_segments() fails to open max_segments entry in sysfs
because /dev/sgX is a character device. As the result, the maximum
transfer size for the device may be calculated incorrectly, causing
I/O errors if the maximum transfer size at the guest ends up to be
larger compared to the host.

Check system device type in sg_get_max_segments() and read the
max_segments value differently if it is a character device.

Reported-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Fixes: 9103f1ceb46614b150bcbc3c9a4fbc72b47fedcc
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
---
 block/file-posix.c | 55 +++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 23 deletions(-)

Comments

Max Reitz Sept. 17, 2020, 1:16 p.m. UTC | #1
On 12.08.20 00:51, Dmitry Fomichev wrote:
> If scsi-generic driver is in use, an SG node can be specified in
> the command line in place of a regular SCSI device. In this case,
> sg_get_max_segments() fails to open max_segments entry in sysfs
> because /dev/sgX is a character device. As the result, the maximum
> transfer size for the device may be calculated incorrectly, causing
> I/O errors if the maximum transfer size at the guest ends up to be
> larger compared to the host.
> 
> Check system device type in sg_get_max_segments() and read the
> max_segments value differently if it is a character device.
> 
> Reported-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> Fixes: 9103f1ceb46614b150bcbc3c9a4fbc72b47fedcc
> Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> ---
>  block/file-posix.c | 55 +++++++++++++++++++++++++++-------------------
>  1 file changed, 32 insertions(+), 23 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 094e3b0212..f9e2424e8f 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1108,6 +1108,7 @@ static int sg_get_max_segments(int fd)
>      int ret;
>      int sysfd = -1;
>      long max_segments;
> +    unsigned int max_segs;
>      struct stat st;
>  
>      if (fstat(fd, &st)) {
> @@ -1115,30 +1116,38 @@ static int sg_get_max_segments(int fd)
>          goto out;
>      }
>  
> -    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> -                                major(st.st_rdev), minor(st.st_rdev));
> -    sysfd = open(sysfspath, O_RDONLY);
> -    if (sysfd == -1) {
> -        ret = -errno;
> -        goto out;
> +    if (S_ISBLK(st.st_mode)) {
> +        sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> +                                    major(st.st_rdev), minor(st.st_rdev));

Sounds reasonable, but this function is (naturally) only called if
bs->sg is true, which is set by hdev_is_sg(), which returns true only if
the device file is a character device.

So is this path ever taken, or can we just replace it all with the ioctl?

(Before 867eccfed84, this function was used for all host devices, which
might explain why the code even exists.)

Max
Maxim Levitsky Sept. 17, 2020, 1:22 p.m. UTC | #2
On Thu, 2020-09-17 at 15:16 +0200, Max Reitz wrote:
> On 12.08.20 00:51, Dmitry Fomichev wrote:
> > If scsi-generic driver is in use, an SG node can be specified in
> > the command line in place of a regular SCSI device. In this case,
> > sg_get_max_segments() fails to open max_segments entry in sysfs
> > because /dev/sgX is a character device. As the result, the maximum
> > transfer size for the device may be calculated incorrectly, causing
> > I/O errors if the maximum transfer size at the guest ends up to be
> > larger compared to the host.
> > 
> > Check system device type in sg_get_max_segments() and read the
> > max_segments value differently if it is a character device.
> > 
> > Reported-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> > Fixes: 9103f1ceb46614b150bcbc3c9a4fbc72b47fedcc
> > Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> > ---
> >  block/file-posix.c | 55 +++++++++++++++++++++++++++-------------------
> >  1 file changed, 32 insertions(+), 23 deletions(-)
> > 
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 094e3b0212..f9e2424e8f 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -1108,6 +1108,7 @@ static int sg_get_max_segments(int fd)
> >      int ret;
> >      int sysfd = -1;
> >      long max_segments;
> > +    unsigned int max_segs;
> >      struct stat st;
> >  
> >      if (fstat(fd, &st)) {
> > @@ -1115,30 +1116,38 @@ static int sg_get_max_segments(int fd)
> >          goto out;
> >      }
> >  
> > -    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> > -                                major(st.st_rdev), minor(st.st_rdev));
> > -    sysfd = open(sysfspath, O_RDONLY);
> > -    if (sysfd == -1) {
> > -        ret = -errno;
> > -        goto out;
> > +    if (S_ISBLK(st.st_mode)) {
> > +        sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> > +                                    major(st.st_rdev), minor(st.st_rdev));
> 
> Sounds reasonable, but this function is (naturally) only called if
> bs->sg is true, which is set by hdev_is_sg(), which returns true only if
> the device file is a character device.
> 
> So is this path ever taken, or can we just replace it all with the ioctl?
> 
> (Before 867eccfed84, this function was used for all host devices, which
> might explain why the code even exists.)
> 
> Max

I have another proposal which I am currently evaluating.

How about we drop all the SG_IO limits code alltogher from the raw driver, and
instead just let the scsi drivers (scsi-block and scsi-generic) query
the device directly, since I don't think that the kernel (I will double check this)?


Best regards,
	Maxim Levitsky
Maxim Levitsky Sept. 17, 2020, 1:24 p.m. UTC | #3
On Thu, 2020-09-17 at 16:22 +0300, Maxim Levitsky wrote:
> On Thu, 2020-09-17 at 15:16 +0200, Max Reitz wrote:
> > On 12.08.20 00:51, Dmitry Fomichev wrote:
> > > If scsi-generic driver is in use, an SG node can be specified in
> > > the command line in place of a regular SCSI device. In this case,
> > > sg_get_max_segments() fails to open max_segments entry in sysfs
> > > because /dev/sgX is a character device. As the result, the maximum
> > > transfer size for the device may be calculated incorrectly, causing
> > > I/O errors if the maximum transfer size at the guest ends up to be
> > > larger compared to the host.
> > > 
> > > Check system device type in sg_get_max_segments() and read the
> > > max_segments value differently if it is a character device.
> > > 
> > > Reported-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> > > Fixes: 9103f1ceb46614b150bcbc3c9a4fbc72b47fedcc
> > > Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> > > ---
> > >  block/file-posix.c | 55 +++++++++++++++++++++++++++-------------------
> > >  1 file changed, 32 insertions(+), 23 deletions(-)
> > > 
> > > diff --git a/block/file-posix.c b/block/file-posix.c
> > > index 094e3b0212..f9e2424e8f 100644
> > > --- a/block/file-posix.c
> > > +++ b/block/file-posix.c
> > > @@ -1108,6 +1108,7 @@ static int sg_get_max_segments(int fd)
> > >      int ret;
> > >      int sysfd = -1;
> > >      long max_segments;
> > > +    unsigned int max_segs;
> > >      struct stat st;
> > >  
> > >      if (fstat(fd, &st)) {
> > > @@ -1115,30 +1116,38 @@ static int sg_get_max_segments(int fd)
> > >          goto out;
> > >      }
> > >  
> > > -    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> > > -                                major(st.st_rdev), minor(st.st_rdev));
> > > -    sysfd = open(sysfspath, O_RDONLY);
> > > -    if (sysfd == -1) {
> > > -        ret = -errno;
> > > -        goto out;
> > > +    if (S_ISBLK(st.st_mode)) {
> > > +        sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> > > +                                    major(st.st_rdev), minor(st.st_rdev));
> > 
> > Sounds reasonable, but this function is (naturally) only called if
> > bs->sg is true, which is set by hdev_is_sg(), which returns true only if
> > the device file is a character device.
> > 
> > So is this path ever taken, or can we just replace it all with the ioctl?
> > 
> > (Before 867eccfed84, this function was used for all host devices, which
> > might explain why the code even exists.)
> > 
> > Max
> 
> I have another proposal which I am currently evaluating.
> 
> How about we drop all the SG_IO limits code alltogher from the raw driver, and
> instead just let the scsi drivers (scsi-block and scsi-generic) query
> the device directly, since I don't think that the kernel (I will double check this)?

I hit send too soon. I mean I don't think that the kernel imposes its own limits on SG_IO.

Best regards,
	Maxim Levitsky
> 
> 
> Best regards,
> 	Maxim Levitsky
> 
>
Dmitry Fomichev Sept. 17, 2020, 4:44 p.m. UTC | #4
> -----Original Message-----
> From: Maxim Levitsky <mlevitsk@redhat.com>
> Sent: Thursday, September 17, 2020 9:24 AM
> To: Max Reitz <mreitz@redhat.com>; Dmitry Fomichev
> <Dmitry.Fomichev@wdc.com>; Kevin Wolf <kwolf@redhat.com>; Paolo
> Bonzini <pbonzini@redhat.com>; Fam Zheng <fam@euphon.net>; Philippe
> Mathieu-Daudé <philmd@redhat.com>
> Cc: Alistair Francis <Alistair.Francis@wdc.com>; Damien Le Moal
> <Damien.LeMoal@wdc.com>; qemu-block@nongnu.org; qemu-
> devel@nongnu.org
> Subject: Re: [PATCH 1/2] file-posix: Correctly read max_segments of SG
> nodes
> 
> On Thu, 2020-09-17 at 16:22 +0300, Maxim Levitsky wrote:
> > On Thu, 2020-09-17 at 15:16 +0200, Max Reitz wrote:
> > > On 12.08.20 00:51, Dmitry Fomichev wrote:
> > > > If scsi-generic driver is in use, an SG node can be specified in
> > > > the command line in place of a regular SCSI device. In this case,
> > > > sg_get_max_segments() fails to open max_segments entry in sysfs
> > > > because /dev/sgX is a character device. As the result, the maximum
> > > > transfer size for the device may be calculated incorrectly, causing
> > > > I/O errors if the maximum transfer size at the guest ends up to be
> > > > larger compared to the host.
> > > >
> > > > Check system device type in sg_get_max_segments() and read the
> > > > max_segments value differently if it is a character device.
> > > >
> > > > Reported-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> > > > Fixes: 9103f1ceb46614b150bcbc3c9a4fbc72b47fedcc
> > > > Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> > > > ---
> > > >  block/file-posix.c | 55 +++++++++++++++++++++++++++----------------
> ---
> > > >  1 file changed, 32 insertions(+), 23 deletions(-)
> > > >
> > > > diff --git a/block/file-posix.c b/block/file-posix.c
> > > > index 094e3b0212..f9e2424e8f 100644
> > > > --- a/block/file-posix.c
> > > > +++ b/block/file-posix.c
> > > > @@ -1108,6 +1108,7 @@ static int sg_get_max_segments(int fd)
> > > >      int ret;
> > > >      int sysfd = -1;
> > > >      long max_segments;
> > > > +    unsigned int max_segs;
> > > >      struct stat st;
> > > >
> > > >      if (fstat(fd, &st)) {
> > > > @@ -1115,30 +1116,38 @@ static int sg_get_max_segments(int fd)
> > > >          goto out;
> > > >      }
> > > >
> > > > -    sysfspath =
> g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> > > > -                                major(st.st_rdev), minor(st.st_rdev));
> > > > -    sysfd = open(sysfspath, O_RDONLY);
> > > > -    if (sysfd == -1) {
> > > > -        ret = -errno;
> > > > -        goto out;
> > > > +    if (S_ISBLK(st.st_mode)) {
> > > > +        sysfspath =
> g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> > > > +                                    major(st.st_rdev), minor(st.st_rdev));
> > >
> > > Sounds reasonable, but this function is (naturally) only called if
> > > bs->sg is true, which is set by hdev_is_sg(), which returns true only if
> > > the device file is a character device.
> > >
> > > So is this path ever taken, or can we just replace it all with the ioctl?
> > >
> > > (Before 867eccfed84, this function was used for all host devices, which
> > > might explain why the code even exists.)
> > >
> > > Max
> >
> > I have another proposal which I am currently evaluating.
> >
> > How about we drop all the SG_IO limits code alltogher from the raw driver,
> and
> > instead just let the scsi drivers (scsi-block and scsi-generic) query
> > the device directly, since I don't think that the kernel (I will double check
> this)?
> 
> I hit send too soon. I mean I don't think that the kernel imposes its own limits
> on SG_IO.
> 

When I was making the fix, I had a feeling that this code should be moved
someplace else where it could be called both for block devices and sg nodes,
but that would make the patch more intrusive.

Maxim, looks like you are on top of this problem and your approach sounds
sensible too me. Just FYI, it is also possible to avoid using SG_GET_SG_TABLESIZE
ioctl and rely entirely on sysfs, but the code gets a bit more complicated (see below)

Cheers,
Dmitry

--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1110,6 +1110,8 @@ static int sg_get_max_segments(int fd)
     char buf[32];
     const char *end;
     char *sysfspath = NULL;
+    struct dirent *blkdev = NULL;
+    DIR *dir = NULL;
     int ret;
     int sysfd = -1;
     long max_segments;
@@ -1120,8 +1122,30 @@ static int sg_get_max_segments(int fd)
         goto out;
     }
 
-    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
-                                major(st.st_rdev), minor(st.st_rdev));
+    if (S_ISBLK(st.st_mode)) {
+        sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
+                                    major(st.st_rdev), minor(st.st_rdev));
+    } else {
+        sysfspath = g_strdup_printf("/sys/dev/char/%u:%u/device/block",
+                                    major(st.st_rdev), minor(st.st_rdev));
+        dir = opendir(sysfspath);
+        if (!dir) {
+            ret = -errno;
+            goto out;
+        }
+        do {
+            blkdev = readdir(dir);
+            if (!blkdev) {
+                ret = -errno;
+                closedir(dir);
+                goto out;
+            }
+        } while (*blkdev->d_name == '.');
+        g_free(sysfspath);
+        sysfspath = g_strdup_printf("/sys/block/%s/queue/max_segments",
+                                    blkdev->d_name);
+        closedir(dir);
+    }
     sysfd = open(sysfspath, O_RDONLY);
     if (sysfd == -1) {
         ret = -errno;

> Best regards,
> 	Maxim Levitsky
> >
> >
> > Best regards,
> > 	Maxim Levitsky
> >
> >
>
Paolo Bonzini Sept. 19, 2020, 3:15 p.m. UTC | #5
On 17/09/20 15:16, Max Reitz wrote:
> So is this path ever taken, or can we just replace it all with the ioctl?
> 
> (Before 867eccfed84, this function was used for all host devices, which
> might explain why the code even exists.)

Because 867eccfed84 is wrong.  If you use /dev/sda* with SG_IO you do
need to take into account the hardware max segment size/max segment count.

Probably ->sg needs to be set by the front-end, not by the back-end.  An
even better way (but for which I'd leave the task to you and Kevin)
could be to have a new permission BLK_PERM_WRITE_BYPASS and to reduce
the limits to the hardware limits if anybody has requested that
permission.  I tried to implement that a couple years ago but I just
couldn't wrap my mind around the permission code.

Paolo
Paolo Bonzini Sept. 19, 2020, 3:18 p.m. UTC | #6
On 17/09/20 18:44, Dmitry Fomichev wrote:
> 
> Maxim, looks like you are on top of this problem and your approach sounds
> sensible too me. Just FYI, it is also possible to avoid using SG_GET_SG_TABLESIZE
> ioctl and rely entirely on sysfs, but the code gets a bit more complicated (see below)

I would prefer to have the code in block/ but I have no problem if the
hardware limits are placed in a new field of bs->bl.  Then scsi-block
and scsi-generic can consult it instead of bs->bl.max_transfer.

Paolo
diff mbox series

Patch

diff --git a/block/file-posix.c b/block/file-posix.c
index 094e3b0212..f9e2424e8f 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1108,6 +1108,7 @@  static int sg_get_max_segments(int fd)
     int ret;
     int sysfd = -1;
     long max_segments;
+    unsigned int max_segs;
     struct stat st;
 
     if (fstat(fd, &st)) {
@@ -1115,30 +1116,38 @@  static int sg_get_max_segments(int fd)
         goto out;
     }
 
-    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
-                                major(st.st_rdev), minor(st.st_rdev));
-    sysfd = open(sysfspath, O_RDONLY);
-    if (sysfd == -1) {
-        ret = -errno;
-        goto out;
+    if (S_ISBLK(st.st_mode)) {
+        sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
+                                    major(st.st_rdev), minor(st.st_rdev));
+        sysfd = open(sysfspath, O_RDONLY);
+        if (sysfd == -1) {
+            ret = -errno;
+            goto out;
+        }
+        do {
+            ret = read(sysfd, buf, sizeof(buf) - 1);
+        } while (ret == -1 && errno == EINTR);
+        if (ret < 0) {
+            ret = -errno;
+            goto out;
+        } else if (ret == 0) {
+            ret = -EIO;
+            goto out;
+        }
+        buf[ret] = 0;
+        /* The file is ended with '\n', pass 'end' to accept that. */
+        ret = qemu_strtol(buf, &end, 10, &max_segments);
+        if (ret == 0 && end && *end == '\n') {
+            ret = max_segments;
+        }
+    } else {
+	ret = ioctl(fd, SG_GET_SG_TABLESIZE, &max_segs);
+        if (ret != 0) {
+            ret = -errno;
+            goto out;
+        }
+        ret = max_segs;
     }
-    do {
-        ret = read(sysfd, buf, sizeof(buf) - 1);
-    } while (ret == -1 && errno == EINTR);
-    if (ret < 0) {
-        ret = -errno;
-        goto out;
-    } else if (ret == 0) {
-        ret = -EIO;
-        goto out;
-    }
-    buf[ret] = 0;
-    /* The file is ended with '\n', pass 'end' to accept that. */
-    ret = qemu_strtol(buf, &end, 10, &max_segments);
-    if (ret == 0 && end && *end == '\n') {
-        ret = max_segments;
-    }
-
 out:
     if (sysfd != -1) {
         close(sysfd);