
[PoC,v2,04/10] btrfs: scrub: introduce placeholder helper scrub_fs_block_group()

Message ID ba1a99b5c44dd02a9ebb63cb95d8dc4080bd1949.1664353497.git.wqu@suse.com (mailing list archive)
State New, archived
Series btrfs: scrub: introduce a new family of ioctl, scrub_fs

Commit Message

Qu Wenruo Sept. 28, 2022, 8:35 a.m. UTC
The main placeholder helper scrub_fs_block_group() will:

- Initialize the needed members inside scrub_fs_ctx
  This includes:
  * Calculating nr_copies for non-RAID56 profiles, or grabbing
    nr_stripes for RAID56 profiles.
  * Allocating memory for the sectors/pages arrays, and for csum_buf if
    it's a data bg.
  * Initializing all sectors to type UNUSED.

  All the memory above is reused for every stripe we run, thus we only
  need to allocate it once per bg.

- Iterate the stripes containing any used sector
  This code is yet to be implemented.

- Clean up the above memory before we finish the block group.

The real work of scrubbing a stripe is not yet implemented.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/scrub.c | 234 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 232 insertions(+), 2 deletions(-)

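As a worked example of the sizing logic above (a sketch only: assuming a 4K
sectorsize, a 4K PAGE_SIZE, the fixed 64K BTRFS_STRIPE_LEN, and the default
4-byte CRC32C csum; the numbers are illustrative, not from the patch), a
RAID1 data block group would get:

    sectors_per_stripe = BTRFS_STRIPE_LEN / sectorsize              = 16
    total_sectors      = nr_copies * sectors_per_stripe             = 2 * 16 = 32
    nr_pages           = nr_copies * (BTRFS_STRIPE_LEN / PAGE_SIZE) = 2 * 16 = 32
    csum_buf size      = sectors_per_stripe * csum_size             = 16 * 4 = 64 bytes
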
Comments

Wang Yugui Sept. 28, 2022, 2:30 p.m. UTC | #1
Hi,

> The main placeholder helper scrub_fs_block_group() will:
> 
> - Initialize the needed members inside scrub_fs_ctx
>   This includes:
>   * Calculating nr_copies for non-RAID56 profiles, or grabbing
>     nr_stripes for RAID56 profiles.
>   * Allocating memory for the sectors/pages arrays, and for csum_buf if
>     it's a data bg.
>   * Initializing all sectors to type UNUSED.
> 
>   All the memory above is reused for every stripe we run, thus we only
>   need to allocate it once per bg.
> 
> - Iterate the stripes containing any used sector
>   This code is yet to be implemented.
> 
> - Clean up the above memory before we finish the block group.
> 
> The real work of scrubbing a stripe is not yet implemented.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/scrub.c | 234 ++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 232 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> index 97da8545c9ab..6e6c50962ace 100644
> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -198,6 +198,45 @@ struct scrub_ctx {
>  	refcount_t              refs;
>  };
>  
> +#define SCRUB_FS_SECTOR_FLAG_UNUSED		(1 << 0)
> +#define SCRUB_FS_SECTOR_FLAG_DATA		(1 << 1)
> +#define SCRUB_FS_SECTOR_FLAG_META		(1 << 2)
> +#define SCRUB_FS_SECTOR_FLAG_PARITY		(1 << 3)
> +

Is there any use case for SCRUB_FS_SECTOR_FLAG_PARITY yet?

And we may need a SCRUB_FS_SECTOR_FLAG_SYSTEM, so that 'btrfs scrub
start' matches 'btrfs balance start -d -m -s' well.

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2022/09/28
Qu Wenruo Sept. 28, 2022, 10:54 p.m. UTC | #2
On 2022/9/28 22:30, Wang Yugui wrote:
> Hi,
>
>> The main placeholder helper scrub_fs_block_group() will:
>>
>> - Initialize the needed members inside scrub_fs_ctx
>>    This includes:
>>    * Calculating nr_copies for non-RAID56 profiles, or grabbing
>>      nr_stripes for RAID56 profiles.
>>    * Allocating memory for the sectors/pages arrays, and for csum_buf if
>>      it's a data bg.
>>    * Initializing all sectors to type UNUSED.
>>
>>    All the memory above is reused for every stripe we run, thus we only
>>    need to allocate it once per bg.
>>
>> - Iterate the stripes containing any used sector
>>    This code is yet to be implemented.
>>
>> - Clean up the above memory before we finish the block group.
>>
>> The real work of scrubbing a stripe is not yet implemented.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/scrub.c | 234 ++++++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 232 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
>> index 97da8545c9ab..6e6c50962ace 100644
>> --- a/fs/btrfs/scrub.c
>> +++ b/fs/btrfs/scrub.c
>> @@ -198,6 +198,45 @@ struct scrub_ctx {
>>   	refcount_t              refs;
>>   };
>>
>> +#define SCRUB_FS_SECTOR_FLAG_UNUSED		(1 << 0)
>> +#define SCRUB_FS_SECTOR_FLAG_DATA		(1 << 1)
>> +#define SCRUB_FS_SECTOR_FLAG_META		(1 << 2)
>> +#define SCRUB_FS_SECTOR_FLAG_PARITY		(1 << 3)
>> +
>
> Is there any use case for SCRUB_FS_SECTOR_FLAG_PARITY yet?

For future RAID56 support.

>
> And we may need a SCRUB_FS_SECTOR_FLAG_SYSTEM, so that 'btrfs scrub
> start' matches 'btrfs balance start -d -m -s' well.

"System" is still metadata.

Please understand why we need system chunks first.

>
> Best Regards
> Wang Yugui (wangyugui@e16-tech.com)
> 2022/09/28
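To make the struct scrub_fs_sector comment in the patch below concrete, here
is a minimal sketch of the index-to-page math it describes (the two helpers
are hypothetical, not part of the patch, and assume the sectors are stored
back to back inside the pages[] array):

    /*
     * Hypothetical helpers: the sector at sfctx->sectors[index] has its
     * content at the returned page and in-page offset of sfctx->pages[].
     */
    static struct page *scrub_fs_sector_page(struct scrub_fs_ctx *sfctx,
    					     int index)
    {
    	/* Byte offset of the sector inside the whole pages[] area. */
    	u32 offset = index << sfctx->fs_info->sectorsize_bits;

    	return sfctx->pages[offset >> PAGE_SHIFT];
    }

    static unsigned int scrub_fs_sector_page_off(struct scrub_fs_ctx *sfctx,
    					         int index)
    {
    	u32 offset = index << sfctx->fs_info->sectorsize_bits;

    	return offset_in_page(offset);
    }
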

Patch

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 97da8545c9ab..6e6c50962ace 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -198,6 +198,45 @@ struct scrub_ctx {
 	refcount_t              refs;
 };
 
+#define SCRUB_FS_SECTOR_FLAG_UNUSED		(1 << 0)
+#define SCRUB_FS_SECTOR_FLAG_DATA		(1 << 1)
+#define SCRUB_FS_SECTOR_FLAG_META		(1 << 2)
+#define SCRUB_FS_SECTOR_FLAG_PARITY		(1 << 3)
+
+/*
+ * Represents a sector.
+ *
+ * To access the content of a sector, the caller should have its index inside
+ * the scrub_fs_ctx->sectors[] array, and use that index to calculate the page
+ * and page offset inside the scrub_fs_ctx->pages[] array.
+ *
+ * To get the logical/physical bytenr of a sector, the caller should use
+ * scrub_fs_ctx->bioc and the sector index to calculate the logical/physical
+ * bytenr.
+ */
+struct scrub_fs_sector {
+	unsigned int flags;
+	union {
+		/*
+		 * For SCRUB_FS_SECTOR_FLAG_DATA, either it points to some byte
+		 * inside scrub_fs_ctx->csum_buf, or it's NULL for the
+		 * NODATACSUM case.
+		 */
+		u8 *csum;
+
+		/*
+		 * For SCRUB_FS_SECTOR_FLAG_META, this records the generation
+		 * and the logical bytenr of the tree block.
+		 * (So we can grab the first sector to calculate its inline
+		 * csum.)
+		 */
+		struct {
+			u64 eb_logical;
+			u64 eb_generation;
+		};
+	};
+};
+
 /* This structure should only have a lifespan inside btrfs_scrub_fs(). */
 struct scrub_fs_ctx {
 	struct btrfs_fs_info		*fs_info;
@@ -214,6 +253,57 @@ struct scrub_fs_ctx {
 
 	/* There will and only be one thread touching @stat. */
 	struct btrfs_scrub_fs_progress	stat;
+
+	/*
+	 * How many sectors we read per stripe.
+	 *
+	 * For now, it's fixed to BTRFS_STRIPE_LEN / sectorsize.
+	 *
+	 * This can be enlarged to full stripe size / sectorsize
+	 * for later RAID0/10/5/6 code.
+	 */
+	int				sectors_per_stripe;
+	/*
+	 * For non-RAID56 profiles, we only care how many copies the block
+	 * group has.
+	 * For RAID56 profiles, we care how many stripes the block group
+	 * has (including data and parities).
+	 */
+	union {
+		int			nr_stripes;
+		int			nr_copies;
+	};
+
+	/*
+	 * The total number of sectors we scrub in one run (including
+	 * the extra mirrors/parities).
+	 *
+	 * For non-RAID56 profiles, it would be:
+	 *   nr_copies * (BTRFS_STRIPE_LEN / sectorsize).
+	 *
+	 * For RAID56 profiles, it would be:
+	 *   nr_stripes * (BTRFS_STRIPE_LEN / sectorsize).
+	 */
+	int				total_sectors;
+
+	/* Page array for above total_sectors. */
+	struct page			**pages;
+
+	/*
+	 * Sector array for the above total_sectors. The page content will be
+	 * inside the above pages array.
+	 *
+	 * Both arrays should be initialized when starting to scrub a block group.
+	 */
+	struct scrub_fs_sector		*sectors;
+
+	/*
+	 * Csum buffer allocated for the stripe.
+	 *
+	 * All sectors in different mirrors for the same logical bytenr
+	 * would point to the same location inside the buffer.
+	 */
+	u8				*csum_buf;
 };
 
 struct scrub_warning {
@@ -4466,6 +4556,147 @@ static struct scrub_fs_ctx *scrub_fs_alloc_ctx(struct btrfs_fs_info *fs_info,
 	return ERR_PTR(ret);
 }
 
+/*
+ * Clean up the memory allocations, mostly after finishing a bg, or for the
+ * error path.
+ */
+static void scrub_fs_cleanup_for_bg(struct scrub_fs_ctx *sfctx)
+{
+	int i;
+	const int nr_pages = sfctx->nr_copies * (BTRFS_STRIPE_LEN >> PAGE_SHIFT);
+
+	if (sfctx->pages) {
+		for (i = 0; i < nr_pages; i++) {
+			if (sfctx->pages[i]) {
+				__free_page(sfctx->pages[i]);
+				sfctx->pages[i] = NULL;
+			}
+		}
+	}
+	kfree(sfctx->pages);
+	sfctx->pages = NULL;
+
+	kfree(sfctx->sectors);
+	sfctx->sectors = NULL;
+
+	kfree(sfctx->csum_buf);
+	sfctx->csum_buf = NULL;
+
+	/* NOTE: block group will only be put inside scrub_fs_iterate_bgs(). */
+	sfctx->cur_bg = NULL;
+}
+
+/* Do the block group specific initialization. */
+static int scrub_fs_init_for_bg(struct scrub_fs_ctx *sfctx,
+				struct btrfs_block_group *bg)
+{
+	struct btrfs_fs_info *fs_info = sfctx->fs_info;
+	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
+	struct extent_map *em;
+	bool is_raid56 = !!(bg->flags & BTRFS_BLOCK_GROUP_RAID56_MASK);
+	int ret = 0;
+	int nr_pages;
+	int i;
+
+	/*
+	 * One stripe should be page aligned, i.e. PAGE_SIZE should not be
+	 * larger than 64K.
+	 */
+	ASSERT(IS_ALIGNED(BTRFS_STRIPE_LEN, PAGE_SIZE));
+
+	/* The last run should have cleaned up all the memory. */
+	ASSERT(!sfctx->cur_bg);
+	ASSERT(!sfctx->pages);
+	ASSERT(!sfctx->sectors);
+	ASSERT(!sfctx->csum_buf);
+
+	read_lock(&map_tree->lock);
+	em = lookup_extent_mapping(map_tree, bg->start, bg->length);
+	read_unlock(&map_tree->lock);
+
+	/*
+	 * Might have been an unused block group deleted by the cleaner
+	 * kthread or relocation.
+	 */
+	if (!em) {
+		spin_lock(&bg->lock);
+		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
+			ret = -EINVAL;
+		spin_unlock(&bg->lock);
+		return ret;
+	}
+	/*
+	 * Since we're guaranteed to run without any other dev-replace or
+	 * scrub in progress, num_stripes should be the total
+	 * number of stripes, without the replace target device.
+	 */
+	if (is_raid56)
+		sfctx->nr_stripes = em->map_lookup->num_stripes;
+	free_extent_map(em);
+
+	if (!is_raid56)
+		sfctx->nr_copies = btrfs_num_copies(fs_info, bg->start,
+						    fs_info->sectorsize);
+	sfctx->sectors_per_stripe = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
+	sfctx->total_sectors = sfctx->sectors_per_stripe * sfctx->nr_copies;
+
+	nr_pages = (BTRFS_STRIPE_LEN >> PAGE_SHIFT) * sfctx->nr_copies;
+
+	sfctx->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!sfctx->pages)
+		goto enomem;
+
+	for (i = 0; i < nr_pages; i++) {
+		sfctx->pages[i] = alloc_page(GFP_KERNEL);
+		if (!sfctx->pages[i])
+			goto enomem;
+	}
+
+	sfctx->sectors = kcalloc(sfctx->total_sectors,
+				 sizeof(struct scrub_fs_sector), GFP_KERNEL);
+	if (!sfctx->sectors)
+		goto enomem;
+
+	for (i = 0; i < sfctx->total_sectors; i++)
+		sfctx->sectors[i].flags = SCRUB_FS_SECTOR_FLAG_UNUSED;
+
+	if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
+		sfctx->csum_buf = kzalloc(fs_info->csum_size *
+					  sfctx->sectors_per_stripe, GFP_KERNEL);
+		if (!sfctx->csum_buf)
+			goto enomem;
+	}
+	sfctx->cur_bg = bg;
+	sfctx->cur_logical = bg->start;
+	return 0;
+
+enomem:
+	sfctx->stat.nr_fatal_errors++;
+	scrub_fs_cleanup_for_bg(sfctx);
+	return -ENOMEM;
+}
+
+
+static int scrub_fs_block_group(struct scrub_fs_ctx *sfctx,
+				struct btrfs_block_group *bg)
+{
+	int ret;
+
+	/* Not yet supported, just skip RAID56 bgs for now. */
+	if (bg->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)
+		return 0;
+
+	ret = scrub_fs_init_for_bg(sfctx, bg);
+	if (ret < 0)
+		return ret;
+
+	/* Placeholder for the loop iterating the sectors. */
+	ret = 0;
+
+	scrub_fs_cleanup_for_bg(sfctx);
+	return ret;
+}
+
 static int scrub_fs_iterate_bgs(struct scrub_fs_ctx *sfctx, u64 start, u64 end)
 {
 	struct btrfs_fs_info *fs_info = sfctx->fs_info;
@@ -4529,8 +4760,7 @@ static int scrub_fs_iterate_bgs(struct scrub_fs_ctx *sfctx, u64 start, u64 end)
 
 		scrub_pause_off(fs_info);
 
-		/* Place holder for the real chunk scrubbing code. */
-		ret = 0;
+		ret = scrub_fs_block_group(sfctx, bg);
 
 		if (ro_set)
 			btrfs_dec_block_group_ro(bg);
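
The csum_buf comment in scrub_fs_ctx says all sectors in different mirrors
for the same logical bytenr point at the same location inside the buffer. A
minimal sketch of that wiring (assuming a mirror-major sectors[] layout and
a hypothetical scrub_fs_assign_csums() helper, neither of which is spelled
out by this patch):

    /* Hypothetical: wire every mirror's data sector to its shared csum slot. */
    static void scrub_fs_assign_csums(struct scrub_fs_ctx *sfctx)
    {
    	int mirror;
    	int i;

    	for (mirror = 0; mirror < sfctx->nr_copies; mirror++) {
    		for (i = 0; i < sfctx->sectors_per_stripe; i++) {
    			int index = mirror * sfctx->sectors_per_stripe + i;
    			struct scrub_fs_sector *sector = &sfctx->sectors[index];

    			if (!(sector->flags & SCRUB_FS_SECTOR_FLAG_DATA))
    				continue;

    			/* Same slot for every mirror of the same bytenr. */
    			sector->csum = sfctx->csum_buf +
    				       i * sfctx->fs_info->csum_size;
    		}
    	}
    }

With that wiring, a csum looked up once per logical bytenr can verify any of
the mirrors, while NODATACSUM sectors simply keep csum == NULL.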