
[PoC,1/9] btrfs: introduce BTRFS_IOC_SCRUB_FS family of ioctls

Message ID e37ae2c85731ec307869e7c8f87c10d36d51846f.1662191784.git.wqu@suse.com (mailing list archive)
State New, archived
Series btrfs: scrub: introduce a new family of ioctl, scrub_fs

Commit Message

Qu Wenruo Sept. 3, 2022, 8:19 a.m. UTC
The new ioctls are to address the disadvantages of the existing
btrfs_scrub_dev():

a One thread per-device
  This can cause multiple block groups to be marked read-only for scrub,
  reducing available space temporarily.

  This also causes higher CPU/IO usage.
  For scrub, we should use the minimal amount of CPU and cause as
  little IO as possible.

b Extra IO for RAID56
  For data stripes, we will cause at least 2x IO if we run "btrfs scrub
  start <mnt>":
  1x from scrubbing the device holding the data stripe, and
  another 1x from scrubbing the parity stripe.

  This duplicated IO should definitely be avoided.

c Bad progress report for RAID56
  We cannot report any repaired P/Q bytes at all.

Points a and b will be addressed by the new one-thread-per-fs
btrfs_scrub_fs ioctl.

Point c will be addressed by the new btrfs_scrub_fs_progress structure,
which has better comments and classification for all errors.

This patch is only a skeleton for the new family of ioctls; they all
return -EOPNOTSUPP for now.
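
For illustration, a minimal user-space sketch of the intended call is
shown below. The helper and its error handling are hypothetical; it only
uses the ioctl number, flag and structures added by this patch, and
against this skeleton it will simply get -EOPNOTSUPP:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/* Hypothetical helper: read-only scrub of the whole logical address space. */
static int scrub_fs_readonly(const char *mnt)
{
	struct btrfs_ioctl_scrub_fs_args args;
	int fd = open(mnt, O_RDONLY);
	int ret;

	if (fd < 0)
		return -1;

	memset(&args, 0, sizeof(args));
	args.start = 0;
	args.end = (__u64)-1;	/* inclusive end, cover the full logical range */
	args.flags = BTRFS_SCRUB_FS_FLAG_READONLY;

	ret = ioctl(fd, BTRFS_IOC_SCRUB_FS, &args);
	if (ret < 0)
		perror("BTRFS_IOC_SCRUB_FS");
	else
		printf("data scrubbed: %llu bytes\n",
		       (unsigned long long)args.progress.data_scrubbed);
	close(fd);
	return ret;
}

A progress-polling caller would presumably issue BTRFS_IOC_SCRUB_FS_PROGRESS
with the same args structure from a second task, mirroring how the existing
scrub progress ioctl is used.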

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/ioctl.c           |   6 ++
 include/uapi/linux/btrfs.h | 173 +++++++++++++++++++++++++++++++++++++
 2 files changed, 179 insertions(+)

Comments

Wang Yugui Sept. 3, 2022, 9:25 a.m. UTC | #1
Hi,

> The new ioctls are to address the disadvantages of the existing
> btrfs_scrub_dev():
> 
> a One thread per-device
>   This can cause multiple block groups to be marked read-only for scrub,
>   reducing available space temporarily.
> 
>   This also causes higher CPU/IO usage.
>   For scrub, we should use the minimal amount of CPU and cause less
>   IO when possible.
> 
> b Extra IO for RAID56
>   For data stripes, we will cause at least 2x IO if we run "btrfs scrub
>   start <mnt>".
>   1x from scrubbing the device of data stripe.
>   The other 1x from scrubbing the parity stripe.
> 
>   This duplicated IO should definitely be avoided
> 
> c Bad progress report for RAID56
>   We can not report any repaired P/Q bytes at all.
> 
> The a and b will be addressed by the new one thread per-fs
> btrfs_scrub_fs ioctl.

The CRC check during scrub is CPU sensitive, so we may still need
multiple threads, such as one thread per fs plus an additional thread
pool working on chunks?

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2022/09/03



Qu Wenruo Sept. 3, 2022, 9:37 a.m. UTC | #2
On 2022/9/3 17:25, Wang Yugui wrote:
> Hi,
>
>> The new ioctls are to address the disadvantages of the existing
>> btrfs_scrub_dev():
>>
>> a One thread per-device
>>    This can cause multiple block groups to be marked read-only for scrub,
>>    reducing available space temporarily.
>>
>>    This also causes higher CPU/IO usage.
>>    For scrub, we should use the minimal amount of CPU and cause less
>>    IO when possible.
>>
>> b Extra IO for RAID56
>>    For data stripes, we will cause at least 2x IO if we run "btrfs scrub
>>    start <mnt>".
>>    1x from scrubbing the device of data stripe.
>>    The other 1x from scrubbing the parity stripe.
>>
>>    This duplicated IO should definitely be avoided
>>
>> c Bad progress report for RAID56
>>    We can not report any repaired P/Q bytes at all.
>>
>> The a and b will be addressed by the new one thread per-fs
>> btrfs_scrub_fs ioctl.
>
> CRC check of scrub is CPU sensitive, so we still need multiple threads,
> such as one thread per-fs but with additional threads pool based on
> chunks?

This depends.

Scrub should be background work, which is already subject to
scheduling, and I don't think users would mind a 5% or 10% longer
runtime on a several-TB fs.

Furthermore, if checksum verification in a single thread becomes the
bottleneck, then I'd say your storage is already so fast that scrub
duration is no longer your primary concern.

Yes, it would be possible to offload the csum verification to multiple
threads, e.g. one thread per mirror/device, but I don't want to
sacrifice readability for a very minor performance improvement.

Wang Yugui Sept. 3, 2022, 9:49 a.m. UTC | #3
Hi,

> On 2022/9/3 17:25, Wang Yugui wrote:
> > Hi,
> >
> >> The new ioctls are to address the disadvantages of the existing
> >> btrfs_scrub_dev():
> >>
> >> a One thread per-device
> >>    This can cause multiple block groups to be marked read-only for scrub,
> >>    reducing available space temporarily.
> >>
> >>    This also causes higher CPU/IO usage.
> >>    For scrub, we should use the minimal amount of CPU and cause less
> >>    IO when possible.
> >>
> >> b Extra IO for RAID56
> >>    For data stripes, we will cause at least 2x IO if we run "btrfs scrub
> >>    start <mnt>".
> >>    1x from scrubbing the device of data stripe.
> >>    The other 1x from scrubbing the parity stripe.
> >>
> >>    This duplicated IO should definitely be avoided
> >>
> >> c Bad progress report for RAID56
> >>    We can not report any repaired P/Q bytes at all.
> >>
> >> The a and b will be addressed by the new one thread per-fs
> >> btrfs_scrub_fs ioctl.
> >
> > CRC check of scrub is CPU sensitive, so we still need multiple threads,
> > such as one thread per-fs but with additional threads pool based on
> > chunks?
> 
> This depends.
> 
> Scrub should be a background work, which can already be affected by
> scheduling, and I don't think users would bother 5% or 10% longer
> runtime for a several TB fs.
> 
> Furthermore if checksum in a single thread is going to be a bottleneck,
> then I'd say your storage is already so fast that scrub duration is not
> your primary concern any more.

Scrub is sequential I/O, so HDDs are fast too.
Ten HDDs with HW RAID6 are very fast for scrub, about 2 GB/s or more.

> Yes, it can be possible to offload the csum verification into multiple
> threads, like one thread per mirror/device, but I don't want to
> sacrifice readability for very minor performance improvement.

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2022/09/03
kernel test robot Sept. 3, 2022, 11:47 a.m. UTC | #4
Hi Qu,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kdave/for-next]
[also build test ERROR on linus/master v6.0-rc3 next-20220901]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Qu-Wenruo/btrfs-scrub-introduce-a-new-family-of-ioctl-scrub_fs/20220903-162128
base:   https://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git for-next
config: x86_64-randconfig-a004 (https://download.01.org/0day-ci/archive/20220903/202209031916.ybFIwbdf-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-5) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/8f7e2c15c08dc87518d12529e5b5cba0a42b5eb1
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Qu-Wenruo/btrfs-scrub-introduce-a-new-family-of-ioctl-scrub_fs/20220903-162128
        git checkout 8f7e2c15c08dc87518d12529e5b5cba0a42b5eb1
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from <command-line>:
>> ./usr/include/linux/btrfs.h:329:15: error: expected declaration specifiers or '...' before 'sizeof'
     329 | static_assert(sizeof(struct btrfs_scrub_fs_progress) == 256);
         |               ^~~~~~
Qu Wenruo Sept. 3, 2022, 11:55 a.m. UTC | #5
On 2022/9/3 19:47, kernel test robot wrote:
> Hi Qu,
>
> Thank you for the patch! Yet something to improve:
>
> [auto build test ERROR on kdave/for-next]
> [also build test ERROR on linus/master v6.0-rc3 next-20220901]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/Qu-Wenruo/btrfs-scrub-introduce-a-new-family-of-ioctl-scrub_fs/20220903-162128
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git for-next
> config: x86_64-randconfig-a004 (https://download.01.org/0day-ci/archive/20220903/202209031916.ybFIwbdf-lkp@intel.com/config)
> compiler: gcc-11 (Debian 11.3.0-5) 11.3.0
> reproduce (this is a W=1 build):
>          # https://github.com/intel-lab-lkp/linux/commit/8f7e2c15c08dc87518d12529e5b5cba0a42b5eb1
>          git remote add linux-review https://github.com/intel-lab-lkp/linux
>          git fetch --no-tags linux-review Qu-Wenruo/btrfs-scrub-introduce-a-new-family-of-ioctl-scrub_fs/20220903-162128
>          git checkout 8f7e2c15c08dc87518d12529e5b5cba0a42b5eb1
>          # save the config file
>          mkdir build_dir && cp config build_dir/.config
>          make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash
>
> If you fix the issue, kindly add following tag where applicable
> Reported-by: kernel test robot <lkp@intel.com>
>
> All errors (new ones prefixed by >>):
>
>     In file included from <command-line>:
>>> ./usr/include/linux/btrfs.h:329:15: error: expected declaration specifiers or '...' before 'sizeof'
>       329 | static_assert(sizeof(struct btrfs_scrub_fs_progress) == 256);
>           |               ^~~~~~
>
Hi LKP guys,

Is this a false alert from your tools?

This seems solid to me, and my compiler doesn't give me any error about
this...

Thanks,
Qu
Chen, Rong A Sept. 5, 2022, 2:05 a.m. UTC | #6
On 9/3/2022 7:55 PM, Qu Wenruo wrote:
> 
> 
> On 2022/9/3 19:47, kernel test robot wrote:
>> Hi Qu,
>>
>> Thank you for the patch! Yet something to improve:
>>
>> [auto build test ERROR on kdave/for-next]
>> [also build test ERROR on linus/master v6.0-rc3 next-20220901]
>> [If your patch is applied to the wrong git tree, kindly drop us a note.
>> And when submitting patch, we suggest to use '--base' as documented in
>> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>>
>> url:    
>> https://github.com/intel-lab-lkp/linux/commits/Qu-Wenruo/btrfs-scrub-introduce-a-new-family-of-ioctl-scrub_fs/20220903-162128 
>>
>> base:   
>> https://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git for-next
>> config: x86_64-randconfig-a004 
>> (https://download.01.org/0day-ci/archive/20220903/202209031916.ybFIwbdf-lkp@intel.com/config) 
>>
>> compiler: gcc-11 (Debian 11.3.0-5) 11.3.0
>> reproduce (this is a W=1 build):
>>          # 
>> https://github.com/intel-lab-lkp/linux/commit/8f7e2c15c08dc87518d12529e5b5cba0a42b5eb1 
>>
>>          git remote add linux-review 
>> https://github.com/intel-lab-lkp/linux
>>          git fetch --no-tags linux-review 
>> Qu-Wenruo/btrfs-scrub-introduce-a-new-family-of-ioctl-scrub_fs/20220903-162128 
>>
>>          git checkout 8f7e2c15c08dc87518d12529e5b5cba0a42b5eb1
>>          # save the config file
>>          mkdir build_dir && cp config build_dir/.config
>>          make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash
>>
>> If you fix the issue, kindly add following tag where applicable
>> Reported-by: kernel test robot <lkp@intel.com>
>>
>> All errors (new ones prefixed by >>):
>>
>>     In file included from <command-line>:
>>>> ./usr/include/linux/btrfs.h:329:15: error: expected declaration 
>>>> specifiers or '...' before 'sizeof'
>>       329 | static_assert(sizeof(struct btrfs_scrub_fs_progress) == 256);
>>           |               ^~~~~~
>>
> Hi LKP guys,
> 
> Is this a false alert from your tools?
> 
> This seems solid to me, and my compiler doesn't give me any error about
> this...

Hi Qu,

Sorry for the noise. If you can't reproduce it with the linked config,
it's possible that the bot applied the patchset to a wrong base branch.

 
https://download.01.org/0day-ci/archive/20220903/202209031916.ybFIwbdf-lkp@intel.com/config 


Best Regards,
Rong Chen

> 
> Thanks,
> Qu
> _______________________________________________
> kbuild-all mailing list -- kbuild-all@lists.01.org
> To unsubscribe send an email to kbuild-all-leave@lists.01.org
Wang Yugui Sept. 9, 2022, 4:17 a.m. UTC | #7
Hi,

> The new ioctls are to address the disadvantages of the existing
> btrfs_scrub_dev():
> 
> a One thread per-device
>   This can cause multiple block groups to be marked read-only for scrub,
>   reducing available space temporarily.
> 
>   This also causes higher CPU/IO usage.
>   For scrub, we should use the minimal amount of CPU and cause less
>   IO when possible.
> 
> b Extra IO for RAID56
>   For data stripes, we will cause at least 2x IO if we run "btrfs scrub
>   start <mnt>".
>   1x from scrubbing the device of data stripe.
>   The other 1x from scrubbing the parity stripe.
> 
>   This duplicated IO should definitely be avoided
> 
> c Bad progress report for RAID56
>   We can not report any repaired P/Q bytes at all.
> 
> The a and b will be addressed by the new one thread per-fs
> btrfs_scrub_fs ioctl.
> 
> While c will be addressed by the new btrfs_scrub_fs_progress structure,
> which has better comments and classification for all errors.
> 
> This patch is only a skeleton for the new family of ioctls, will return
> -EOPNOTSUPP for now.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/ioctl.c           |   6 ++
>  include/uapi/linux/btrfs.h | 173 +++++++++++++++++++++++++++++++++++++
>  2 files changed, 179 insertions(+)
> 
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index fe0cc816b4eb..3df3bcdf06eb 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -5508,6 +5508,12 @@ long btrfs_ioctl(struct file *file, unsigned int
>  		return btrfs_ioctl_scrub_cancel(fs_info);
>  	case BTRFS_IOC_SCRUB_PROGRESS:
>  		return btrfs_ioctl_scrub_progress(fs_info, argp);
> +	case BTRFS_IOC_SCRUB_FS:
> +		return -EOPNOTSUPP;
> +	case BTRFS_IOC_SCRUB_FS_CANCEL:
> +		return -EOPNOTSUPP;
> +	case BTRFS_IOC_SCRUB_FS_PROGRESS:
> +		return -EOPNOTSUPP;

Could we add suspend/resume for scrub on huge filesystems?

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2022/09/09
Qu Wenruo Sept. 9, 2022, 6:57 a.m. UTC | #8
On 2022/9/9 12:17, Wang Yugui wrote:
> Hi,
> 
>> The new ioctls are to address the disadvantages of the existing
>> btrfs_scrub_dev():
>>
>> a One thread per-device
>>    This can cause multiple block groups to be marked read-only for scrub,
>>    reducing available space temporarily.
>>
>>    This also causes higher CPU/IO usage.
>>    For scrub, we should use the minimal amount of CPU and cause less
>>    IO when possible.
>>
>> b Extra IO for RAID56
>>    For data stripes, we will cause at least 2x IO if we run "btrfs scrub
>>    start <mnt>".
>>    1x from scrubbing the device of data stripe.
>>    The other 1x from scrubbing the parity stripe.
>>
>>    This duplicated IO should definitely be avoided
>>
>> c Bad progress report for RAID56
>>    We can not report any repaired P/Q bytes at all.
>>
>> The a and b will be addressed by the new one thread per-fs
>> btrfs_scrub_fs ioctl.
>>
>> While c will be addressed by the new btrfs_scrub_fs_progress structure,
>> which has better comments and classification for all errors.
>>
>> This patch is only a skeleton for the new family of ioctls, will return
>> -EOPNOTSUPP for now.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/ioctl.c           |   6 ++
>>   include/uapi/linux/btrfs.h | 173 +++++++++++++++++++++++++++++++++++++
>>   2 files changed, 179 insertions(+)
>>
>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
>> index fe0cc816b4eb..3df3bcdf06eb 100644
>> --- a/fs/btrfs/ioctl.c
>> +++ b/fs/btrfs/ioctl.c
>> @@ -5508,6 +5508,12 @@ long btrfs_ioctl(struct file *file, unsigned int
>>   		return btrfs_ioctl_scrub_cancel(fs_info);
>>   	case BTRFS_IOC_SCRUB_PROGRESS:
>>   		return btrfs_ioctl_scrub_progress(fs_info, argp);
>> +	case BTRFS_IOC_SCRUB_FS:
>> +		return -EOPNOTSUPP;
>> +	case BTRFS_IOC_SCRUB_FS_CANCEL:
>> +		return -EOPNOTSUPP;
>> +	case BTRFS_IOC_SCRUB_FS_PROGRESS:
>> +		return -EOPNOTSUPP;
> 
> Could we add suspend/resume for scrub when huge filesysem?

It's a PoC; don't expect those yet.

It's to prove the idea works, and for discussion around how to improve 
the RAID56 scrub situation.

It's not yet decided that we will go this route; we may instead add extra
flags without introducing a whole new ioctl number.

> 
> Best Regards
> Wang Yugui (wangyugui@e16-tech.com)
> 2022/09/09
> 
>

Patch

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fe0cc816b4eb..3df3bcdf06eb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -5508,6 +5508,12 @@  long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_scrub_cancel(fs_info);
 	case BTRFS_IOC_SCRUB_PROGRESS:
 		return btrfs_ioctl_scrub_progress(fs_info, argp);
+	case BTRFS_IOC_SCRUB_FS:
+		return -EOPNOTSUPP;
+	case BTRFS_IOC_SCRUB_FS_CANCEL:
+		return -EOPNOTSUPP;
+	case BTRFS_IOC_SCRUB_FS_PROGRESS:
+		return -EOPNOTSUPP;
 	case BTRFS_IOC_BALANCE_V2:
 		return btrfs_ioctl_balance(file, argp);
 	case BTRFS_IOC_BALANCE_CTL:
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 7ada84e4a3ed..795ed33843ce 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -191,6 +191,174 @@  struct btrfs_ioctl_scrub_args {
 	__u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
 };
 
+struct btrfs_scrub_fs_progress {
+	/*
+	 * Fatal errors, including -ENOMEM, or csum/extent tree search errors.
+	 *
+	 * Normally after hitting such fatal errors, we error out, thus later
+	 * accounting will no longer be reliable.
+	 */
+	__u16	nr_fatal_errors;
+
+	/*
+	 * All super block errors, from invalid members to IO errors, go into
+	 * nr_super_errors.
+	 */
+	__u16	nr_super_errors;
+
+	/* Super block accounting. */
+	__u16	nr_super_scrubbed;
+	__u16	nr_super_repaired;
+
+	/*
+	 * Data accounting in bytes.
+	 *
+	 * We only care about how many bytes we scrubbed, thus no
+	 * accounting for number of extents.
+	 *
+	 * This accounting includes the extra mirrors.
+	 * E.g. for RAID1, one 16KiB extent will cause 32KiB in @data_scrubbed.
+	 */
+	__u64	data_scrubbed;
+
+	/* How many bytes can be recovered. */
+	__u64	data_recoverable;
+
+	/*
+	 * How many bytes are uncertain; this can only happen for NODATASUM
+	 * cases.
+	 * This includes NODATASUM data with no extra mirror/parity to verify,
+	 * or with extra mirrors that mismatch each other.
+	 */
+	__u64	data_nocsum_uncertain;
+
+	/*
+	 * For data error bytes, these mean definite errors, including:
+	 *
+	 * - IO failure, including missing dev.
+	 * - Data csum mismatch
+	 *   (Csum tree search failures go into nr_fatal_errors instead.)
+	 */
+	__u64	data_io_fail;
+	__u64	data_csum_mismatch;
+
+	/*
+	 * All the unmentioned cases, including data matching its csum (which
+	 * of course implies the IO succeeded) and data without csum matching
+	 * all other copies/parities, are the expected cases; no need to record.
+	 */
+
+	/*
+	 * Metadata accounting in bytes, pretty much the same as data.
+	 *
+	 * And since metadata has mandatory csum, there is no uncertain case.
+	 */
+	__u64	meta_scrubbed;
+	__u64	meta_recoverable;
+
+	/*
+	 * For meta, the checks are mostly progressive:
+	 *
+	 * - Unable to read
+	 *   @meta_io_fail
+	 *
+	 * - Unable to pass basic sanity checks (e.g. bytenr check)
+	 *   @meta_invalid
+	 *
+	 * - Pass basic sanity checks, but bad csum
+	 *   @meta_bad_csum
+	 *
+	 * - Pass basic checks and csum, but bad transid
+	 *   @meta_bad_transid
+	 *
+	 * - Pass all checks
+	 *   The expected case, no special accounting needed.
+	 */
+	__u64 meta_io_fail;
+	__u64 meta_invalid;
+	__u64 meta_bad_csum;
+	__u64 meta_bad_transid;
+
+	/*
+	 * Parity accounting.
+	 *
+	 * NOTE: unused data sectors (which still contribute to the P/Q
+	 * calculation, like the following case) do not contribute
+	 * to any accounting.
+	 *
+	 * Data 1:	|<--- Unused ---->| <<<
+	 * Data 2:	|<- Data extent ->|
+	 * Parity:	|<--- Parity ---->|
+	 */
+	__u64 parity_scrubbed;
+	__u64 parity_recoverable;
+
+	/*
+	 * This happens when there is not enough info to determine if the
+	 * parity is correct; it mostly happens when the vertical stripes are
+	 * *all* NODATASUM sectors.
+	 *
+	 * If there is any sector with a checksum in the vertical stripe,
+	 * the parity itself will no longer be uncertain.
+	 */
+	__u64 parity_uncertain;
+
+	/*
+	 * For parity, the checks are progressive too:
+	 *
+	 * - Unable to read
+	 *   @parity_io_fail
+	 *
+	 * - Mismatch and any vertical data stripe has csum and
+	 *   the data stripe csum matches
+	 *   @parity_mismatch
+	 *   We want to repair the parity then.
+	 *
+	 * - Mismatch and a vertical data stripe has csum, but the data
+	 *   csum mismatches and the rebuilt data passes csum.
+	 *   This will go to @data_recoverable or @data_csum_mismatch instead.
+	 *
+	 * - Mismatch but no vertical data stripe has csum
+	 *   @parity_uncertain
+	 *
+	 */
+	__u64 parity_io_fail;
+	__u64 parity_mismatch;
+
+	/* Padding to 256 bytes, and for later expansion. */
+	__u64 __unused[15];
+};
+static_assert(sizeof(struct btrfs_scrub_fs_progress) == 256);
+
+/*
+ * A read-only scrub_fs will not try any repair (thus the *_repaired member
+ * in scrub_fs_progress should always be 0).
+ */
+#define BTRFS_SCRUB_FS_FLAG_READONLY	(1ULL << 0)
+
+/*
+ * All supported flags.
+ *
+ * From the very beginning, the scrub_fs ioctl will reject any unsupported
+ * flags, making later expansion much simpler.
+ */
+#define BTRFS_SCRUB_FS_FLAG_SUPP	(BTRFS_SCRUB_FS_FLAG_READONLY)
+
+struct btrfs_ioctl_scrub_fs_args {
+	/* Input, logical bytenr to start the scrub */
+	__u64 start;
+
+	/* Input, the logical bytenr end (inclusive) */
+	__u64 end;
+
+	__u64 flags;
+	__u64 reserved[8];
+	struct btrfs_scrub_fs_progress progress; /* out */
+
+	/* pad to 1K */
+	__u8 unused[1024 - 24 - 64 - sizeof(struct btrfs_scrub_fs_progress)];
+};
+
 #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS	0
 #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID	1
 struct btrfs_ioctl_dev_replace_start_params {
@@ -1137,5 +1305,10 @@  enum btrfs_err_code {
 				    struct btrfs_ioctl_encoded_io_args)
 #define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \
 				     struct btrfs_ioctl_encoded_io_args)
+#define BTRFS_IOC_SCRUB_FS	_IOWR(BTRFS_IOCTL_MAGIC, 65, \
+				      struct btrfs_ioctl_scrub_fs_args)
+#define BTRFS_IOC_SCRUB_FS_CANCEL _IO(BTRFS_IOCTL_MAGIC, 66)
+#define BTRFS_IOC_SCRUB_FS_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 67, \
+				       struct btrfs_ioctl_scrub_fs_args)
 
 #endif /* _UAPI_LINUX_BTRFS_H */
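
For illustration only, not part of the patch: a minimal sketch of how the
strict flag checking described in the BTRFS_SCRUB_FS_FLAG_SUPP comment
might look once the skeleton is filled in. The function name and the
surrounding plumbing are hypothetical.

/* Hypothetical handler sketch, not part of this PoC. */
static long btrfs_ioctl_scrub_fs(struct btrfs_fs_info *fs_info,
				 void __user *argp)
{
	struct btrfs_ioctl_scrub_fs_args args;

	if (copy_from_user(&args, argp, sizeof(args)))
		return -EFAULT;

	/* Reject any unknown flag, so new flags can be added safely later. */
	if (args.flags & ~BTRFS_SCRUB_FS_FLAG_SUPP)
		return -EOPNOTSUPP;

	/* The actual per-fs scrub work would go here. */

	/* Copy the progress counters back to user space. */
	if (copy_to_user(argp, &args, sizeof(args)))
		return -EFAULT;
	return 0;
}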