
[RFC] btrfs: raid56: extra debug for raid6 syndrome generation

Message ID ceaa8a9d4a19dbe017012d5cdbd78aafeac31cc9.1706239278.git.wqu@suse.com (mailing list archive)
State New, archived
Series [RFC] btrfs: raid56: extra debug for raid6 syndrome generation

Commit Message

Qu Wenruo Jan. 26, 2024, 3:21 a.m. UTC
[BUG]
I have received at least two crash reports for RAID6 syndrome generation;
no matter whether it's the AVX2 or SSE2 implementation, they all seem to
have a similar call trace with a corrupted RAX:

 BUG: kernel NULL pointer dereference, address: 0000000000000000
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: 0000 [#1] PREEMPT SMP PTI
 Workqueue: btrfs-rmw rmw_rbio_work [btrfs]
 RIP: 0010:raid6_sse21_gen_syndrome+0x9e/0x130 [raid6_pq]
 RAX: 0000000000000000 RBX: 0000000000001000 RCX: ffffa0ff4cfa3248
 RDX: 0000000000000000 RSI: ffffa0f74cfa3238 RDI: 0000000000000000
 Call Trace:
  <TASK>
  rmw_rbio+0x5c8/0xa80 [btrfs]
  process_one_work+0x1c7/0x3d0
  worker_thread+0x4d/0x380
  kthread+0xf3/0x120
  ret_from_fork+0x2c/0x50
  </TASK>

[CAUSE]
In fact I don't have any clue.

Recently I also hit this in the AVX512 path, and that was even on a v5.15
backport, which doesn't have any of my RAID56 rework.

Furthermore, according to the registers:

 RAX: 0000000000000000 RBX: 0000000000001000 RCX: ffffa0ff4cfa3248

The RAX register should be the number of stripes (including P/Q), which
is obviously not correct here (0), but the remaining two registers are
both sane:

- RBX is the sectorsize
  For x86_64 it should always be 4K, which matches the output (0x1000).

- RCX is the pointers array
  It comes from rbio->finish_pointers, and it looks like a sane
  kernel address.
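
For reference, the helper is reached through the raid6 library's function
table (struct raid6_calls in include/linux/raid/pq.h). A sketch of the
interface, with parameter names added here only for illustration:

 struct raid6_calls {
 	void (*gen_syndrome)(int disks, size_t bytes, void **ptrs);
 	/* xor_syndrome, valid, name and priority members omitted. */
 };

 /* The btrfs call site, see generate_pq_vertical() in the diff below: */
 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers);

So if the register reading above is right, rbio->real_stripes was already
0 by the time we made the call, which is exactly what the extra ASSERT()s
below try to catch.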

[WORKAROUND]
For now, I can only add extra debug ASSERT()s before we call the raid6
gen_syndrome() helper and hope to catch the problem.
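
These rely on btrfs' ASSERT() macro, which with CONFIG_BTRFS_ASSERT
enabled is roughly the following (paraphrased from fs/btrfs/messages.h,
so treat the exact form as an approximation):

 #define ASSERT(expr)						\
 	(likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__))

btrfs_assertfail() prints the failed expression plus file and line, then
BUG()s, so a bad rbio would crash right at the check instead of deep
inside the raid6 code.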

The debug checks require both CONFIG_BTRFS_DEBUG and CONFIG_BTRFS_ASSERT
to be enabled.
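
E.g. the config fragment for such a debug build:

 CONFIG_BTRFS_DEBUG=y
 CONFIG_BTRFS_ASSERT=y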

My current guess is some use-after-free, but the fact that every report
has only a corrupted RAX with seemingly valid pointers doesn't make much
sense.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/raid56.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

Comments

David Sterba Feb. 14, 2024, 7:38 a.m. UTC | #1
On Fri, Jan 26, 2024 at 01:51:32PM +1030, Qu Wenruo wrote:
> [...]
> Signed-off-by: Qu Wenruo <wqu@suse.com>

Reviewed-by: David Sterba <dsterba@suse.com>

I haven't seen the crash for some time, but with this patch I may be able
to add some info once it happens again.

> [...]
> +static void assert_rbio(struct btrfs_raid_bio *rbio)

                           const struct btrfs_raid_bio

> +{
> +	if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
> +	    !IS_ENABLED(CONFIG_BTRFS_ASSERT))
> +		return;
> +
> +	/*
> +	 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
> +	 * we won't go beyond 256 disks anyway.
> +	 */
> +	ASSERT(rbio->real_stripes >= 2);
> +	ASSERT(rbio->nr_data > 0);
> +
> +	/*
> +	 * This is another check to make sure nr data stripes is smaller
> +	 * than total stripes.
> +	 */
> +	ASSERT(rbio->nr_data < rbio->real_stripes);
> +}
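
I.e. assert_rbio() only reads the rbio, so the parameter can take a
const qualifier, something like:

  static void assert_rbio(const struct btrfs_raid_bio *rbio)
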
David Sterba Feb. 21, 2024, 3:04 p.m. UTC | #2
On Wed, Feb 14, 2024 at 08:38:55AM +0100, David Sterba wrote:
> On Fri, Jan 26, 2024 at 01:51:32PM +1030, Qu Wenruo wrote:
> > [...]
> 
> Reviewed-by: David Sterba <dsterba@suse.com>
> 
> I haven't seen the crash for some time, but with this patch I may be able
> to add some info once it happens again.

For the record, I added this patch to for-next.

Patch

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 5c4bf3f907c1..6f4a9cfeea44 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -917,6 +917,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
 	 */
 	ASSERT(stripe_nsectors <= BITS_PER_LONG);
 
+	/*
+	 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 255
+	 * (limited by u8).
+	 */
+	ASSERT(real_stripes >= 2);
+	ASSERT(real_stripes <= U8_MAX);
+
 	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
 	if (!rbio)
 		return ERR_PTR(-ENOMEM);
@@ -954,6 +961,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
 
 	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
 	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
+	ASSERT(rbio->nr_data > 0);
 
 	return rbio;
 }
@@ -1180,6 +1188,26 @@ static inline void bio_list_put(struct bio_list *bio_list)
 		bio_put(bio);
 }
 
+static void assert_rbio(struct btrfs_raid_bio *rbio)
+{
+	if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
+	    !IS_ENABLED(CONFIG_BTRFS_ASSERT))
+		return;
+
+	/*
+	 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
+	 * we won't go beyond 256 disks anyway.
+	 */
+	ASSERT(rbio->real_stripes >= 2);
+	ASSERT(rbio->nr_data > 0);
+
+	/*
+	 * This is another check to make sure nr data stripes is smaller
+	 * than total stripes.
+	 */
+	ASSERT(rbio->nr_data < rbio->real_stripes);
+}
+
 /* Generate PQ for one vertical stripe. */
 static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
 {
@@ -1211,6 +1239,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
 		pointers[stripe++] = kmap_local_page(sector->page) +
 				     sector->pgoff;
 
+		assert_rbio(rbio);
 		raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
 					pointers);
 	} else {
@@ -2472,6 +2501,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
 		}
 
 		if (has_qstripe) {
+			assert_rbio(rbio);
 			/* RAID6, call the library function to fill in our P/Q */
 			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
 						pointers);