diff mbox series

[v2] btrfs: pass checksum type via BTRFS_IOC_FS_INFO ioctl

Message ID 20200626073019.24003-1-johannes.thumshirn@wdc.com (mailing list archive)
State New, archived
Headers show
Series [v2] btrfs: pass checksum type via BTRFS_IOC_FS_INFO ioctl | expand

Commit Message

Johannes Thumshirn June 26, 2020, 7:30 a.m. UTC
With the recent addition of filesystem checksum types other than CRC32c,
it is not anymore hard-coded which checksum type a btrfs filesystem uses.

Up to now there is no good way to read the filesystem checksum, apart from
reading the filesystem UUID and then query sysfs for the checksum type.

Add a new csum_type field to the BTRFS_IOC_FS_INFO ioctl command which
usually is used to query filesystem features. Also add a flags member
indicating that the kernel responded with a set csum_type field.

To simplify further additions to the ioctl, also switch the padding to a
u8 array. Pahole was used to verify the result of this switch:

pahole -C btrfs_ioctl_fs_info_args fs/btrfs/btrfs.ko
struct btrfs_ioctl_fs_info_args {
        __u64                      max_id;               /*     0     8 */
        __u64                      num_devices;          /*     8     8 */
        __u8                       fsid[16];             /*    16    16 */
        __u32                      nodesize;             /*    32     4 */
        __u32                      sectorsize;           /*    36     4 */
        __u32                      clone_alignment;      /*    40     4 */
        __u32                      flags;                /*    44     4 */
        __u16                      csum_type;            /*    48     2 */
        __u8                       reserved[974];        /*    50   974 */

        /* size: 1024, cachelines: 16, members: 9 */
};

Fixes: 3951e7f050ac ("btrfs: add xxhash64 to checksumming algorithms")
Fixes: 3831bf0094ab ("btrfs: add sha256 to checksumming algorithm")
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
Changes to v1:
* add 'out' comment to be consistent (Hans)
* remove le16_to_cpu() (kbuild robot)
* switch padding to be all u8 (David)
---
 fs/btrfs/ioctl.c           |  2 ++
 include/uapi/linux/btrfs.h | 13 +++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

Comments

David Sterba June 26, 2020, 10:38 a.m. UTC | #1
On Fri, Jun 26, 2020 at 04:30:19PM +0900, Johannes Thumshirn wrote:
> With the recent addition of filesystem checksum types other than CRC32c,
> it is not anymore hard-coded which checksum type a btrfs filesystem uses.
> 
> Up to now there is no good way to read the filesystem checksum, apart from
> reading the filesystem UUID and then query sysfs for the checksum type.
> 
> Add a new csum_type field to the BTRFS_IOC_FS_INFO ioctl command which
> usually is used to query filesystem features. Also add a flags member
> indicating that the kernel responded with a set csum_type field.
> 
> To simplify further additions to the ioctl, also switch the padding to a
> u8 array. Pahole was used to verify the result of this switch:
> 
> pahole -C btrfs_ioctl_fs_info_args fs/btrfs/btrfs.ko
> struct btrfs_ioctl_fs_info_args {
>         __u64                      max_id;               /*     0     8 */
>         __u64                      num_devices;          /*     8     8 */
>         __u8                       fsid[16];             /*    16    16 */
>         __u32                      nodesize;             /*    32     4 */
>         __u32                      sectorsize;           /*    36     4 */
>         __u32                      clone_alignment;      /*    40     4 */
>         __u32                      flags;                /*    44     4 */
>         __u16                      csum_type;            /*    48     2 */
>         __u8                       reserved[974];        /*    50   974 */
> 
>         /* size: 1024, cachelines: 16, members: 9 */
> };
> 
> Fixes: 3951e7f050ac ("btrfs: add xxhash64 to checksumming algorithms")
> Fixes: 3831bf0094ab ("btrfs: add sha256 to checksumming algorithm")
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
> Changes to v1:
> * add 'out' comment to be consistent (Hans)
> * remove le16_to_cpu() (kbuild robot)
> * switch padding to be all u8 (David)
> ---
>  fs/btrfs/ioctl.c           |  2 ++
>  include/uapi/linux/btrfs.h | 13 +++++++++++--
>  2 files changed, 13 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index b3e4c632d80c..6c9d0c3a5e7e 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -3217,6 +3217,8 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
>  	fi_args->nodesize = fs_info->nodesize;
>  	fi_args->sectorsize = fs_info->sectorsize;
>  	fi_args->clone_alignment = fs_info->sectorsize;
> +	fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
> +	fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_TYPE;
>  
>  	if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
>  		ret = -EFAULT;
> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
> index e6b6cb0f8bc6..ee15ece4f477 100644
> --- a/include/uapi/linux/btrfs.h
> +++ b/include/uapi/linux/btrfs.h
> @@ -250,10 +250,19 @@ struct btrfs_ioctl_fs_info_args {
>  	__u32 nodesize;				/* out */
>  	__u32 sectorsize;			/* out */
>  	__u32 clone_alignment;			/* out */
> -	__u32 reserved32;
> -	__u64 reserved[122];			/* pad to 1k */
> +	__u32 flags;				/* out */
> +	__u16 csum_type;			/* out */
> +	__u8 reserved[974];			/* pad to 1k */

I think 32 bits for out flags should be enough (or at least for a long
time). I'm not sure if we should make the flags also input. Right now I
think not and if we need to pass flags to request potentially expensive
data to retrieve, we'd add another member for that. I don't have a
concrete example and the whole FS_INFO ioctl is supposed to be
lightweight so as it is now looks good to me.

Also it's pretty small patch, it should be good to go to stable so we
get the fixed interface even in 5.7.
Hans van Kranenburg June 26, 2020, 12:06 p.m. UTC | #2
Hi!

On 6/26/20 12:38 PM, David Sterba wrote:
> On Fri, Jun 26, 2020 at 04:30:19PM +0900, Johannes Thumshirn wrote:
>> With the recent addition of filesystem checksum types other than CRC32c,
>> it is not anymore hard-coded which checksum type a btrfs filesystem uses.
>>
>> Up to now there is no good way to read the filesystem checksum, apart from
>> reading the filesystem UUID and then query sysfs for the checksum type.
>>
>> Add a new csum_type field to the BTRFS_IOC_FS_INFO ioctl command which
>> usually is used to query filesystem features. Also add a flags member
>> indicating that the kernel responded with a set csum_type field.
>>
>> To simplify further additions to the ioctl, also switch the padding to a
>> u8 array. Pahole was used to verify the result of this switch:
>>
>> pahole -C btrfs_ioctl_fs_info_args fs/btrfs/btrfs.ko
>> struct btrfs_ioctl_fs_info_args {
>>         __u64                      max_id;               /*     0     8 */
>>         __u64                      num_devices;          /*     8     8 */
>>         __u8                       fsid[16];             /*    16    16 */
>>         __u32                      nodesize;             /*    32     4 */
>>         __u32                      sectorsize;           /*    36     4 */
>>         __u32                      clone_alignment;      /*    40     4 */
>>         __u32                      flags;                /*    44     4 */
>>         __u16                      csum_type;            /*    48     2 */
>>         __u8                       reserved[974];        /*    50   974 */
>>
>>         /* size: 1024, cachelines: 16, members: 9 */
>> };
>>
>> Fixes: 3951e7f050ac ("btrfs: add xxhash64 to checksumming algorithms")
>> Fixes: 3831bf0094ab ("btrfs: add sha256 to checksumming algorithm")
>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>> ---
>> Changes to v1:
>> * add 'out' comment to be consistent (Hans)
>> * remove le16_to_cpu() (kbuild robot)
>> * switch padding to be all u8 (David)
>> ---
>>  fs/btrfs/ioctl.c           |  2 ++
>>  include/uapi/linux/btrfs.h | 13 +++++++++++--
>>  2 files changed, 13 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
>> index b3e4c632d80c..6c9d0c3a5e7e 100644
>> --- a/fs/btrfs/ioctl.c
>> +++ b/fs/btrfs/ioctl.c
>> @@ -3217,6 +3217,8 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
>>  	fi_args->nodesize = fs_info->nodesize;
>>  	fi_args->sectorsize = fs_info->sectorsize;
>>  	fi_args->clone_alignment = fs_info->sectorsize;
>> +	fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
>> +	fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_TYPE;
>>  
>>  	if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
>>  		ret = -EFAULT;
>> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
>> index e6b6cb0f8bc6..ee15ece4f477 100644
>> --- a/include/uapi/linux/btrfs.h
>> +++ b/include/uapi/linux/btrfs.h
>> @@ -250,10 +250,19 @@ struct btrfs_ioctl_fs_info_args {
>>  	__u32 nodesize;				/* out */
>>  	__u32 sectorsize;			/* out */
>>  	__u32 clone_alignment;			/* out */
>> -	__u32 reserved32;
>> -	__u64 reserved[122];			/* pad to 1k */
>> +	__u32 flags;				/* out */
>> +	__u16 csum_type;			/* out */
>> +	__u8 reserved[974];			/* pad to 1k */
> 
> I think 32 bits for out flags should be enough (or at least for a long
> time).

Otherwise just add a flags2 later somewhere further on, and continue in
there?. :) Yolo.

> I'm not sure if we should make the flags also input.

Remember that the only thing that the ioctl code now does is blindly
copying the output data over the provided user buf.

This is the famous "there's no check if user provided buf was properly
zeroed". Combine this with "what to do if a bit is set to ask for a
feature that's not implemented yet?" -> ENOENT? That would mean that any
user who's calling the ioctl with a buffer filled with garbage will
likely hit it.

So, there you go, BTRFS_IOC_FS_INFO_V2. I don't think it's worth it.

> Right now I
> think not and if we need to pass flags to request potentially expensive
> data to retrieve, we'd add another member for that. I don't have a
> concrete example and the whole FS_INFO ioctl is supposed to be
> lightweight so as it is now looks good to me.

As a user of this ioctl, my impression also always has been that there's
just a collection of simple interesting values returned, that can't be
easily read from other metadata, so mostly stuff from the super block.

E.g. a generation field would also be interesting, to see what the last
committed transaction is. One could e.g. create a monitoring graph that
shows how often they happen.

So, no big complicated stuff.

> Also it's pretty small patch, it should be good to go to stable so we
> get the fixed interface even in 5.7.

Yay.
Hans
David Sterba June 26, 2020, 12:49 p.m. UTC | #3
On Fri, Jun 26, 2020 at 02:06:01PM +0200, Hans van Kranenburg wrote:
> >> @@ -250,10 +250,19 @@ struct btrfs_ioctl_fs_info_args {
> >>  	__u32 nodesize;				/* out */
> >>  	__u32 sectorsize;			/* out */
> >>  	__u32 clone_alignment;			/* out */
> >> -	__u32 reserved32;
> >> -	__u64 reserved[122];			/* pad to 1k */
> >> +	__u32 flags;				/* out */
> >> +	__u16 csum_type;			/* out */
> >> +	__u8 reserved[974];			/* pad to 1k */
> > 
> > I think 32 bits for out flags should be enough (or at least for a long
> > time).
> 
> Otherwise just add a flags2 later somewhere further on, and continue in
> there?. :) Yolo.
> 
> > I'm not sure if we should make the flags also input.
> 
> Remember that the only thing that the ioctl code now does is blindly
> copying the output data over the provided user buf.
> 
> This is the famous "there's no check if user provided buf was properly
> zeroed".

Should there such check? Zeroing the ioctl buffers is expected as a
convention to make future extensions possible.

> Combine this with "what to do if a bit is set to ask for a
> feature that's not implemented yet?" -> ENOENT? That would mean that any
> user who's calling the ioctl with a buffer filled with garbage will
> likely hit it.

Yes, users filling garbage values are either fuzzing or not the using
the interface correctly, so any sane error is fine.

> So, there you go, BTRFS_IOC_FS_INFO_V2. I don't think it's worth it.

That I don't what to happen of course, but so far I'm not conviced that
it would be needed. If you have another counter example, other than
filling with garbage values, please post it.

> > Right now I
> > think not and if we need to pass flags to request potentially expensive
> > data to retrieve, we'd add another member for that. I don't have a
> > concrete example and the whole FS_INFO ioctl is supposed to be
> > lightweight so as it is now looks good to me.
> 
> As a user of this ioctl, my impression also always has been that there's
> just a collection of simple interesting values returned, that can't be
> easily read from other metadata, so mostly stuff from the super block.

Yes that is how I understand it too.

> E.g. a generation field would also be interesting, to see what the last
> committed transaction is. One could e.g. create a monitoring graph that
> shows how often they happen.

So the FS_INFO is clearly underused and can be extended with at least:

- generation
- metadata_uuid

and now that I'm reading the output of dump-super, we maybe should also
add the 'csum_size' so the user does not need to do the translation
type -> size.

The compat features or label have their own ioctls, the stats about
roots are IMHO not that useful and devices have own ioctl. Maybe
total_bytes and that's probably all.
David Sterba June 26, 2020, 12:54 p.m. UTC | #4
On Fri, Jun 26, 2020 at 04:30:19PM +0900, Johannes Thumshirn wrote:
> With the recent addition of filesystem checksum types other than CRC32c,
> it is not anymore hard-coded which checksum type a btrfs filesystem uses.
> 
> Up to now there is no good way to read the filesystem checksum, apart from
> reading the filesystem UUID and then query sysfs for the checksum type.
> 
> Add a new csum_type field to the BTRFS_IOC_FS_INFO ioctl command which
> usually is used to query filesystem features. Also add a flags member
> indicating that the kernel responded with a set csum_type field.
> 
> To simplify further additions to the ioctl, also switch the padding to a
> u8 array. Pahole was used to verify the result of this switch:
> 
> pahole -C btrfs_ioctl_fs_info_args fs/btrfs/btrfs.ko
> struct btrfs_ioctl_fs_info_args {
>         __u64                      max_id;               /*     0     8 */
>         __u64                      num_devices;          /*     8     8 */
>         __u8                       fsid[16];             /*    16    16 */
>         __u32                      nodesize;             /*    32     4 */
>         __u32                      sectorsize;           /*    36     4 */
>         __u32                      clone_alignment;      /*    40     4 */
>         __u32                      flags;                /*    44     4 */
>         __u16                      csum_type;            /*    48     2 */

This leaves 2 bytes (one u16) unaligned to the next 4 bytes, which
shouldn't be a problem, but I think having the csum_size would be a good
and also getting rid of the gap.

>         __u8                       reserved[974];        /*    50   974 */
> 
>         /* size: 1024, cachelines: 16, members: 9 */
> };
Hans van Kranenburg June 26, 2020, 1:03 p.m. UTC | #5
On 6/26/20 2:49 PM, David Sterba wrote:
> On Fri, Jun 26, 2020 at 02:06:01PM +0200, Hans van Kranenburg wrote:
>>>> @@ -250,10 +250,19 @@ struct btrfs_ioctl_fs_info_args {
>>>>  	__u32 nodesize;				/* out */
>>>>  	__u32 sectorsize;			/* out */
>>>>  	__u32 clone_alignment;			/* out */
>>>> -	__u32 reserved32;
>>>> -	__u64 reserved[122];			/* pad to 1k */
>>>> +	__u32 flags;				/* out */
>>>> +	__u16 csum_type;			/* out */
>>>> +	__u8 reserved[974];			/* pad to 1k */
>>>
>>> I think 32 bits for out flags should be enough (or at least for a long
>>> time).
>>
>> Otherwise just add a flags2 later somewhere further on, and continue in
>> there?. :) Yolo.
>>
>>> I'm not sure if we should make the flags also input.
>>
>> Remember that the only thing that the ioctl code now does is blindly
>> copying the output data over the provided user buf.
>>
>> This is the famous "there's no check if user provided buf was properly
>> zeroed".
> 
> Should there such check? Zeroing the ioctl buffers is expected as a
> convention to make future extensions possible.
> 
>> Combine this with "what to do if a bit is set to ask for a
>> feature that's not implemented yet?" -> ENOENT? That would mean that any
>> user who's calling the ioctl with a buffer filled with garbage will
>> likely hit it.
> 
> Yes, users filling garbage values are either fuzzing or not the using
> the interface correctly, so any sane error is fine.
> 
>> So, there you go, BTRFS_IOC_FS_INFO_V2. I don't think it's worth it.
> 
> That I don't what to happen of course, but so far I'm not conviced that
> it would be needed. If you have another counter example, other than
> filling with garbage values, please post it.

My reference is commit d24a67b2d997 in which LOGICAL_INO_V2 was created
for this reason. Just allocating a memory buffer without explicitly
zeroing it. Apparently that happens, and it would change the behavior
(getting errors, call getting rejected) without the user changing their
code. Does that fall in the category of 'not using the interface
correctly', or in the category 'you're not a pro coder, but we have
mercy with you'?

>>> Right now I
>>> think not and if we need to pass flags to request potentially expensive
>>> data to retrieve, we'd add another member for that. I don't have a
>>> concrete example and the whole FS_INFO ioctl is supposed to be
>>> lightweight so as it is now looks good to me.
>>
>> As a user of this ioctl, my impression also always has been that there's
>> just a collection of simple interesting values returned, that can't be
>> easily read from other metadata, so mostly stuff from the super block.
> 
> Yes that is how I understand it too.
> 
>> E.g. a generation field would also be interesting, to see what the last
>> committed transaction is. One could e.g. create a monitoring graph that
>> shows how often they happen.
> 
> So the FS_INFO is clearly underused and can be extended with at least:
> 
> - generation
> - metadata_uuid
> 
> and now that I'm reading the output of dump-super, we maybe should also
> add the 'csum_size' so the user does not need to do the translation
> type -> size.
> 
> The compat features or label have their own ioctls, the stats about
> roots are IMHO not that useful and devices have own ioctl. Maybe
> total_bytes and that's probably all.

Hans
Johannes Thumshirn June 26, 2020, 2:28 p.m. UTC | #6
On 26/06/2020 14:55, David Sterba wrote:
[...]
> This leaves 2 bytes (one u16) unaligned to the next 4 bytes, which
> shouldn't be a problem, but I think having the csum_size would be a good
> and also getting rid of the gap.
> 
>>         __u8                       reserved[974];        /*    50   974 */
>>
>>         /* size: 1024, cachelines: 16, members: 9 */
>> };

Good idea, will update and post a v3.
diff mbox series

Patch

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b3e4c632d80c..6c9d0c3a5e7e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3217,6 +3217,8 @@  static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
 	fi_args->nodesize = fs_info->nodesize;
 	fi_args->sectorsize = fs_info->sectorsize;
 	fi_args->clone_alignment = fs_info->sectorsize;
+	fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
+	fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_TYPE;
 
 	if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
 		ret = -EFAULT;
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index e6b6cb0f8bc6..ee15ece4f477 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -250,10 +250,19 @@  struct btrfs_ioctl_fs_info_args {
 	__u32 nodesize;				/* out */
 	__u32 sectorsize;			/* out */
 	__u32 clone_alignment;			/* out */
-	__u32 reserved32;
-	__u64 reserved[122];			/* pad to 1k */
+	__u32 flags;				/* out */
+	__u16 csum_type;			/* out */
+	__u8 reserved[974];			/* pad to 1k */
 };
 
+/*
+ * fs_info ioctl flags
+ *
+ * Used by:
+ * struct btrfs_ioctl_fs_info_args
+ */
+#define BTRFS_FS_INFO_FLAG_CSUM_TYPE			(1 << 0)
+
 /*
  * feature flags
  *