diff mbox series

[v2,1/2] file-posix: probe discard alignment on Linux block devices

Message ID 20250410184103.23385-2-stefanha@redhat.com (mailing list archive)
State New
Headers show
Series block: discard alignment fixes | expand

Commit Message

Stefan Hajnoczi April 10, 2025, 6:41 p.m. UTC
Populate the pdiscard_alignment block limit so the block layer is able
align discard requests correctly.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/file-posix.c | 56 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

Comments

Hanna Czenczek April 11, 2025, 8:15 a.m. UTC | #1
On 10.04.25 20:41, Stefan Hajnoczi wrote:
> Populate the pdiscard_alignment block limit so the block layer is able
> align discard requests correctly.
>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>   block/file-posix.c | 56 +++++++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 55 insertions(+), 1 deletion(-)

Ah, I didn’t know sysfs is actually fair game.  Should we not also get 
the maximum discard length then, too?

> diff --git a/block/file-posix.c b/block/file-posix.c
> index 56d1972d15..2a1e1f48c0 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1276,10 +1276,10 @@ static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
>   }
>   #endif /* defined(CONFIG_BLKZONED) */
>   
> +#ifdef CONFIG_LINUX
>   /*
>    * Get a sysfs attribute value as a long integer.
>    */
> -#ifdef CONFIG_LINUX
>   static long get_sysfs_long_val(struct stat *st, const char *attribute)
>   {
>       g_autofree char *str = NULL;
> @@ -1299,6 +1299,30 @@ static long get_sysfs_long_val(struct stat *st, const char *attribute)
>       }
>       return ret;
>   }
> +
> +/*
> + * Get a sysfs attribute value as a uint32_t.
> + */
> +static int get_sysfs_u32_val(struct stat *st, const char *attribute,
> +                             uint32_t *u32)
> +{
> +    g_autofree char *str = NULL;
> +    const char *end;
> +    unsigned int val;
> +    int ret;
> +
> +    ret = get_sysfs_str_val(st, attribute, &str);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /* The file is ended with '\n', pass 'end' to accept that. */
> +    ret = qemu_strtoui(str, &end, 10, &val);
> +    if (ret == 0 && end && *end == '\0') {
> +        *u32 = val;
> +    }
> +    return ret;
> +}
>   #endif
>   
>   static int hdev_get_max_segments(int fd, struct stat *st)
> @@ -1318,6 +1342,23 @@ static int hdev_get_max_segments(int fd, struct stat *st)
>   #endif
>   }
>   
> +/*
> + * Fills in *dalign with the discard alignment and returns 0 on success,
> + * -errno otherwise.
> + */
> +static int hdev_get_pdiscard_alignment(struct stat *st, uint32_t *dalign)
> +{
> +#ifdef CONFIG_LINUX
> +    /*
> +     * Note that Linux "discard_granularity" is QEMU "discard_alignment". Linux
> +     * "discard_alignment" is something else.
> +     */
> +    return get_sysfs_u32_val(st, "discard_granularity", dalign);
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
>   #if defined(CONFIG_BLKZONED)
>   /*
>    * If the reset_all flag is true, then the wps of zone whose state is
> @@ -1527,6 +1568,19 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>           }
>       }
>   
> +    if (S_ISBLK(st.st_mode)) {
> +        uint32_t dalign = 0;
> +        int ret;
> +
> +        ret = hdev_get_pdiscard_alignment(&st, &dalign);
> +        if (ret == 0) {
> +            /* Must be a multiple of request_alignment */
> +            assert(dalign % bs->bl.request_alignment == 0);

Is it fair to crash qemu if the kernel reports a value that is not a 
multiple of request_alignment?  Wouldn’t it make more sense to take the 
maximum, and if that still isn’t a multiple, return an error here?

Hanna

> +
> +            bs->bl.pdiscard_alignment = dalign;
> +        }
> +    }
> +
>       raw_refresh_zoned_limits(bs, &st, errp);
>   }
>
Stefan Hajnoczi April 14, 2025, 3:34 p.m. UTC | #2
On Fri, Apr 11, 2025 at 10:15:13AM +0200, Hanna Czenczek wrote:
> On 10.04.25 20:41, Stefan Hajnoczi wrote:
> > Populate the pdiscard_alignment block limit so the block layer is able
> > align discard requests correctly.
> > 
> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > ---
> >   block/file-posix.c | 56 +++++++++++++++++++++++++++++++++++++++++++++-
> >   1 file changed, 55 insertions(+), 1 deletion(-)
> 
> Ah, I didn’t know sysfs is actually fair game.  Should we not also get the
> maximum discard length then, too?

The maximum discard length behaves differently: the Linux block layer
splits requests according to the maximum discard length. If the guest
submits a discard request that is too large for the host, the host block
layer will split it and the request succeeds. That is why I didn't make
any changes to the maximum discard length in this series.

> 
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 56d1972d15..2a1e1f48c0 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -1276,10 +1276,10 @@ static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
> >   }
> >   #endif /* defined(CONFIG_BLKZONED) */
> > +#ifdef CONFIG_LINUX
> >   /*
> >    * Get a sysfs attribute value as a long integer.
> >    */
> > -#ifdef CONFIG_LINUX
> >   static long get_sysfs_long_val(struct stat *st, const char *attribute)
> >   {
> >       g_autofree char *str = NULL;
> > @@ -1299,6 +1299,30 @@ static long get_sysfs_long_val(struct stat *st, const char *attribute)
> >       }
> >       return ret;
> >   }
> > +
> > +/*
> > + * Get a sysfs attribute value as a uint32_t.
> > + */
> > +static int get_sysfs_u32_val(struct stat *st, const char *attribute,
> > +                             uint32_t *u32)
> > +{
> > +    g_autofree char *str = NULL;
> > +    const char *end;
> > +    unsigned int val;
> > +    int ret;
> > +
> > +    ret = get_sysfs_str_val(st, attribute, &str);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    /* The file is ended with '\n', pass 'end' to accept that. */
> > +    ret = qemu_strtoui(str, &end, 10, &val);
> > +    if (ret == 0 && end && *end == '\0') {
> > +        *u32 = val;
> > +    }
> > +    return ret;
> > +}
> >   #endif
> >   static int hdev_get_max_segments(int fd, struct stat *st)
> > @@ -1318,6 +1342,23 @@ static int hdev_get_max_segments(int fd, struct stat *st)
> >   #endif
> >   }
> > +/*
> > + * Fills in *dalign with the discard alignment and returns 0 on success,
> > + * -errno otherwise.
> > + */
> > +static int hdev_get_pdiscard_alignment(struct stat *st, uint32_t *dalign)
> > +{
> > +#ifdef CONFIG_LINUX
> > +    /*
> > +     * Note that Linux "discard_granularity" is QEMU "discard_alignment". Linux
> > +     * "discard_alignment" is something else.
> > +     */
> > +    return get_sysfs_u32_val(st, "discard_granularity", dalign);
> > +#else
> > +    return -ENOTSUP;
> > +#endif
> > +}
> > +
> >   #if defined(CONFIG_BLKZONED)
> >   /*
> >    * If the reset_all flag is true, then the wps of zone whose state is
> > @@ -1527,6 +1568,19 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >           }
> >       }
> > +    if (S_ISBLK(st.st_mode)) {
> > +        uint32_t dalign = 0;
> > +        int ret;
> > +
> > +        ret = hdev_get_pdiscard_alignment(&st, &dalign);
> > +        if (ret == 0) {
> > +            /* Must be a multiple of request_alignment */
> > +            assert(dalign % bs->bl.request_alignment == 0);
> 
> Is it fair to crash qemu if the kernel reports a value that is not a
> multiple of request_alignment?  Wouldn’t it make more sense to take the
> maximum, and if that still isn’t a multiple, return an error here?

I'll replace the assertion with an error.

The Linux block layer sysfs documentation says:

  [RO] Devices that support discard functionality may internally
  allocate space using units that are bigger than the logical
  block size.

I don't expect dalign to be smaller than request_alignment, but it
doesn't hurt the check if request_alignment would work.

> 
> Hanna
> 
> > +
> > +            bs->bl.pdiscard_alignment = dalign;
> > +        }
> > +    }
> > +
> >       raw_refresh_zoned_limits(bs, &st, errp);
> >   }
>
diff mbox series

Patch

diff --git a/block/file-posix.c b/block/file-posix.c
index 56d1972d15..2a1e1f48c0 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1276,10 +1276,10 @@  static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
 }
 #endif /* defined(CONFIG_BLKZONED) */
 
+#ifdef CONFIG_LINUX
 /*
  * Get a sysfs attribute value as a long integer.
  */
-#ifdef CONFIG_LINUX
 static long get_sysfs_long_val(struct stat *st, const char *attribute)
 {
     g_autofree char *str = NULL;
@@ -1299,6 +1299,30 @@  static long get_sysfs_long_val(struct stat *st, const char *attribute)
     }
     return ret;
 }
+
+/*
+ * Get a sysfs attribute value as a uint32_t.
+ */
+static int get_sysfs_u32_val(struct stat *st, const char *attribute,
+                             uint32_t *u32)
+{
+    g_autofree char *str = NULL;
+    const char *end;
+    unsigned int val;
+    int ret;
+
+    ret = get_sysfs_str_val(st, attribute, &str);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* The file is ended with '\n', pass 'end' to accept that. */
+    ret = qemu_strtoui(str, &end, 10, &val);
+    if (ret == 0 && end && *end == '\0') {
+        *u32 = val;
+    }
+    return ret;
+}
 #endif
 
 static int hdev_get_max_segments(int fd, struct stat *st)
@@ -1318,6 +1342,23 @@  static int hdev_get_max_segments(int fd, struct stat *st)
 #endif
 }
 
+/*
+ * Fills in *dalign with the discard alignment and returns 0 on success,
+ * -errno otherwise.
+ */
+static int hdev_get_pdiscard_alignment(struct stat *st, uint32_t *dalign)
+{
+#ifdef CONFIG_LINUX
+    /*
+     * Note that Linux "discard_granularity" is QEMU "discard_alignment". Linux
+     * "discard_alignment" is something else.
+     */
+    return get_sysfs_u32_val(st, "discard_granularity", dalign);
+#else
+    return -ENOTSUP;
+#endif
+}
+
 #if defined(CONFIG_BLKZONED)
 /*
  * If the reset_all flag is true, then the wps of zone whose state is
@@ -1527,6 +1568,19 @@  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
         }
     }
 
+    if (S_ISBLK(st.st_mode)) {
+        uint32_t dalign = 0;
+        int ret;
+
+        ret = hdev_get_pdiscard_alignment(&st, &dalign);
+        if (ret == 0) {
+            /* Must be a multiple of request_alignment */
+            assert(dalign % bs->bl.request_alignment == 0);
+
+            bs->bl.pdiscard_alignment = dalign;
+        }
+    }
+
     raw_refresh_zoned_limits(bs, &st, errp);
 }