diff mbox series

[V1,RFC,1/1] ocfs2: race between searching chunks and release journal_head from buffer_head

Message ID 1634707081-16450-1-git-send-email-gautham.ananthakrishna@oracle.com (mailing list archive)
State New, archived
Headers show
Series [V1,RFC,1/1] ocfs2: race between searching chunks and release journal_head from buffer_head | expand

Commit Message

Gautham Ananthakrishna Oct. 20, 2021, 5:18 a.m. UTC
Encountered a race between ocfs2_test_bg_bit_allocatable() and
jbd2_journal_put_journal_head() resulting in the below vmcore.

PID: 106879  TASK: ffff880244ba9c00  CPU: 2   COMMAND: "loop3"
 0 [ffff8802435ff1c0] panic at ffffffff816ed175
 1 [ffff8802435ff240] oops_end at ffffffff8101a7c9
 2 [ffff8802435ff270] no_context at ffffffff8106eccf
 3 [ffff8802435ff2e0] __bad_area_nosemaphore at ffffffff8106ef9d
 4 [ffff8802435ff330] bad_area_nosemaphore at ffffffff8106f143
 5 [ffff8802435ff340] __do_page_fault at ffffffff8106f80b
 6 [ffff8802435ff3a0] do_page_fault at ffffffff8106fc2f
 7 [ffff8802435ff3e0] page_fault at ffffffff816fd667
    [exception RIP: ocfs2_block_group_find_clear_bits+316]
    RIP: ffffffffc11ef6fc  RSP: ffff8802435ff498  RFLAGS: 00010206
    RAX: 0000000000003918  RBX: 0000000000000001  RCX: 0000000000000018
    RDX: 0000000000003918  RSI: 0000000000000000  RDI: ffff880060194040
    RBP: ffff8802435ff4f8   R8: ffffffffff000000   R9: ffffffffffffffff
    R10: ffff8802435ff730  R11: ffff8802a94e5800  R12: 0000000000000007
    R13: 0000000000007e00  R14: 0000000000003918  R15: ffff88017c973a28
    ORIG_RAX: ffffffffffffffff  CS: e030  SS: e02b
 8 [ffff8802435ff490] ocfs2_block_group_find_clear_bits at ffffffffc11ef680 [ocfs2]
 9 [ffff8802435ff500] ocfs2_cluster_group_search at ffffffffc11ef916 [ocfs2]
10 [ffff8802435ff580] ocfs2_search_chain at ffffffffc11f0fb6 [ocfs2]
11 [ffff8802435ff660] ocfs2_claim_suballoc_bits at ffffffffc11f1b1b [ocfs2]
12 [ffff8802435ff6f0] __ocfs2_claim_clusters at ffffffffc11f32cb [ocfs2]
13 [ffff8802435ff770] ocfs2_claim_clusters at ffffffffc11f5caf [ocfs2]
14 [ffff8802435ff780] ocfs2_local_alloc_slide_window at ffffffffc11cc0db [ocfs2]
15 [ffff8802435ff820] ocfs2_reserve_local_alloc_bits at ffffffffc11ce53f [ocfs2]
16 [ffff8802435ff890] ocfs2_reserve_clusters_with_limit at ffffffffc11f59b5 [ocfs2]
17 [ffff8802435ff8e0] ocfs2_reserve_clusters at ffffffffc11f5c88 [ocfs2]
18 [ffff8802435ff8f0] ocfs2_lock_refcount_allocators at ffffffffc11dc169 [ocfs2]
19 [ffff8802435ff960] ocfs2_make_clusters_writable at ffffffffc11e4274 [ocfs2]
20 [ffff8802435ffa50] ocfs2_replace_cow at ffffffffc11e4df1 [ocfs2]
21 [ffff8802435ffac0] ocfs2_refcount_cow at ffffffffc11e54b1 [ocfs2]
22 [ffff8802435ffb80] ocfs2_file_write_iter at ffffffffc11bf8f4 [ocfs2]
23 [ffff8802435ffcd0] lo_rw_aio at ffffffff814a1b5d
24 [ffff8802435ffd80] loop_queue_work at ffffffff814a2802
25 [ffff8802435ffe60] kthread_worker_fn at ffffffff810a80d2
26 [ffff8802435ffec0] kthread at ffffffff810a7afb
27 [ffff8802435fff50] ret_from_fork at ffffffff816f7da1

When ocfs2_test_bg_bit_allocatable() called bh2jh(bg_bh), the bg_bh->b_private
NULL as jbd2_journal_put_journal_head() raced and released the jounal head
from the buffer head. Needed to take bit lock for the bit 'BH_JournalHead'
to fix this race.

Signed-off-by: Gautham Ananthakrishna <gautham.ananthakrishna@oracle.com>
---
 fs/ocfs2/suballoc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

Comments

Joseph Qi Oct. 20, 2021, 8:26 a.m. UTC | #1
Hi,

How about make the change like following?

diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8521942..481017e 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
 {
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct journal_head *jh;
-	int ret;
+	int ret = 1;
 
 	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
 		return 0;
@@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
 	if (!buffer_jbd(bg_bh))
 		return 1;
 
-	jh = bh2jh(bg_bh);
-	spin_lock(&jh->b_state_lock);
-	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
-	if (bg)
-		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
-	else
-		ret = 1;
-	spin_unlock(&jh->b_state_lock);
+	jbd_lock_bh_journal_head(bg_bh);
+	if (buffer_jbd(bg_bh)) {
+		jh = bh2jh(bg_bh);
+		spin_lock(&jh->b_state_lock);
+		bg = (struct ocfs2_group_desc *) jh->b_committed_data;
+		if (bg)
+			ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+		else
+			ret = 1;
+		spin_unlock(&jh->b_state_lock);
+	}
+	jbd_unlock_bh_journal_head(bg_bh);
 
 	return ret;
 }


On 10/20/21 1:18 PM, Gautham Ananthakrishna wrote:
> Encountered a race between ocfs2_test_bg_bit_allocatable() and
> jbd2_journal_put_journal_head() resulting in the below vmcore.
> 
> PID: 106879  TASK: ffff880244ba9c00  CPU: 2   COMMAND: "loop3"
>  0 [ffff8802435ff1c0] panic at ffffffff816ed175
>  1 [ffff8802435ff240] oops_end at ffffffff8101a7c9
>  2 [ffff8802435ff270] no_context at ffffffff8106eccf
>  3 [ffff8802435ff2e0] __bad_area_nosemaphore at ffffffff8106ef9d
>  4 [ffff8802435ff330] bad_area_nosemaphore at ffffffff8106f143
>  5 [ffff8802435ff340] __do_page_fault at ffffffff8106f80b
>  6 [ffff8802435ff3a0] do_page_fault at ffffffff8106fc2f
>  7 [ffff8802435ff3e0] page_fault at ffffffff816fd667
>     [exception RIP: ocfs2_block_group_find_clear_bits+316]
>     RIP: ffffffffc11ef6fc  RSP: ffff8802435ff498  RFLAGS: 00010206
>     RAX: 0000000000003918  RBX: 0000000000000001  RCX: 0000000000000018
>     RDX: 0000000000003918  RSI: 0000000000000000  RDI: ffff880060194040
>     RBP: ffff8802435ff4f8   R8: ffffffffff000000   R9: ffffffffffffffff
>     R10: ffff8802435ff730  R11: ffff8802a94e5800  R12: 0000000000000007
>     R13: 0000000000007e00  R14: 0000000000003918  R15: ffff88017c973a28
>     ORIG_RAX: ffffffffffffffff  CS: e030  SS: e02b
>  8 [ffff8802435ff490] ocfs2_block_group_find_clear_bits at ffffffffc11ef680 [ocfs2]
>  9 [ffff8802435ff500] ocfs2_cluster_group_search at ffffffffc11ef916 [ocfs2]
> 10 [ffff8802435ff580] ocfs2_search_chain at ffffffffc11f0fb6 [ocfs2]
> 11 [ffff8802435ff660] ocfs2_claim_suballoc_bits at ffffffffc11f1b1b [ocfs2]
> 12 [ffff8802435ff6f0] __ocfs2_claim_clusters at ffffffffc11f32cb [ocfs2]
> 13 [ffff8802435ff770] ocfs2_claim_clusters at ffffffffc11f5caf [ocfs2]
> 14 [ffff8802435ff780] ocfs2_local_alloc_slide_window at ffffffffc11cc0db [ocfs2]
> 15 [ffff8802435ff820] ocfs2_reserve_local_alloc_bits at ffffffffc11ce53f [ocfs2]
> 16 [ffff8802435ff890] ocfs2_reserve_clusters_with_limit at ffffffffc11f59b5 [ocfs2]
> 17 [ffff8802435ff8e0] ocfs2_reserve_clusters at ffffffffc11f5c88 [ocfs2]
> 18 [ffff8802435ff8f0] ocfs2_lock_refcount_allocators at ffffffffc11dc169 [ocfs2]
> 19 [ffff8802435ff960] ocfs2_make_clusters_writable at ffffffffc11e4274 [ocfs2]
> 20 [ffff8802435ffa50] ocfs2_replace_cow at ffffffffc11e4df1 [ocfs2]
> 21 [ffff8802435ffac0] ocfs2_refcount_cow at ffffffffc11e54b1 [ocfs2]
> 22 [ffff8802435ffb80] ocfs2_file_write_iter at ffffffffc11bf8f4 [ocfs2]
> 23 [ffff8802435ffcd0] lo_rw_aio at ffffffff814a1b5d
> 24 [ffff8802435ffd80] loop_queue_work at ffffffff814a2802
> 25 [ffff8802435ffe60] kthread_worker_fn at ffffffff810a80d2
> 26 [ffff8802435ffec0] kthread at ffffffff810a7afb
> 27 [ffff8802435fff50] ret_from_fork at ffffffff816f7da1
> 
> When ocfs2_test_bg_bit_allocatable() called bh2jh(bg_bh), the bg_bh->b_private
> NULL as jbd2_journal_put_journal_head() raced and released the jounal head
> from the buffer head. Needed to take bit lock for the bit 'BH_JournalHead'
> to fix this race.
> 
> Signed-off-by: Gautham Ananthakrishna <gautham.ananthakrishna@oracle.com>
> ---
>  fs/ocfs2/suballoc.c | 9 +++++++++
>  1 file changed, 9 insertions(+)
> 
> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
> index 8521942..86f33f2 100644
> --- a/fs/ocfs2/suballoc.c
> +++ b/fs/ocfs2/suballoc.c
> @@ -1256,9 +1256,17 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>  	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>  		return 0;
>  
> +	/* Fast path */
>  	if (!buffer_jbd(bg_bh))
>  		return 1;
>  
> +	/* Slow path */
> +	jbd_lock_bh_journal_head(bg_bh);
> +	if (!buffer_jbd(bg_bh)){
> +		jbd_unlock_bh_journal_head(bg_bh);
> +		return 1;
> +	}
> +
>  	jh = bh2jh(bg_bh);
>  	spin_lock(&jh->b_state_lock);
>  	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
> @@ -1267,6 +1275,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>  	else
>  		ret = 1;
>  	spin_unlock(&jh->b_state_lock);
> +	jbd_unlock_bh_journal_head(bg_bh);
>  
>  	return ret;
>  }
>
Gautham Ananthakrishna Oct. 20, 2021, 1:46 p.m. UTC | #2
Hi Joseph

The following would retain the fast path,  as per your earlier comment:

diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8521942..048e532 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1251,22 +1251,25 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
 {
        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
        struct journal_head *jh;
-       int ret;
+       int ret = 1;
 
        if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
                return 0;
 
-       if (!buffer_jbd(bg_bh))
-               return 1;
-
-       jh = bh2jh(bg_bh);
-       spin_lock(&jh->b_state_lock);
-       bg = (struct ocfs2_group_desc *) jh->b_committed_data;
-       if (bg)
-               ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
-       else
-               ret = 1;
-       spin_unlock(&jh->b_state_lock);
+       if (buffer_jbd(bg_bh)) {
+               jbd_lock_bh_journal_head(bg_bh);
+               if (buffer_jbd(bg_bh)){
+                       jh = bh2jh(bg_bh);
+                       spin_lock(&jh->b_state_lock);
+                       bg = (struct ocfs2_group_desc *) jh->b_committed_data;
+                       if (bg)
+                               ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+                       else
+                               ret = 1;
+                       spin_unlock(&jh->b_state_lock);
+               }
+               jbd_unlock_bh_journal_head(bg_bh);
+       }
 
        return ret;
 }

We can also remove the re-initialization of ret = 1 in the 'else' part, as 'ret' is already initialized to 1 (However, personally I would like to keep this).

Could you please take a look and comment?

Thanks,
Gautham.

-----Original Message-----
From: Joseph Qi <joseph.qi@linux.alibaba.com> 
Sent: Wednesday, October 20, 2021 1:57 PM
To: Gautham Ananthakrishna <gautham.ananthakrishna@oracle.com>; ocfs2-devel@oss.oracle.com
Cc: Junxiao Bi <junxiao.bi@oracle.com>; Rajesh Sivaramasubramaniom <rajesh.sivaramasubramaniom@oracle.com>
Subject: Re: [PATCH V1 RFC 1/1] ocfs2: race between searching chunks and release journal_head from buffer_head

Hi,

How about make the change like following?

diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8521942..481017e 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,  {
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct journal_head *jh;
-	int ret;
+	int ret = 1;
 
 	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
 		return 0;
@@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
 	if (!buffer_jbd(bg_bh))
 		return 1;
 
-	jh = bh2jh(bg_bh);
-	spin_lock(&jh->b_state_lock);
-	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
-	if (bg)
-		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
-	else
-		ret = 1;
-	spin_unlock(&jh->b_state_lock);
+	jbd_lock_bh_journal_head(bg_bh);
+	if (buffer_jbd(bg_bh)) {
+		jh = bh2jh(bg_bh);
+		spin_lock(&jh->b_state_lock);
+		bg = (struct ocfs2_group_desc *) jh->b_committed_data;
+		if (bg)
+			ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+		else
+			ret = 1;
+		spin_unlock(&jh->b_state_lock);
+	}
+	jbd_unlock_bh_journal_head(bg_bh);
 
 	return ret;
 }


On 10/20/21 1:18 PM, Gautham Ananthakrishna wrote:
> Encountered a race between ocfs2_test_bg_bit_allocatable() and
> jbd2_journal_put_journal_head() resulting in the below vmcore.
> 
> PID: 106879  TASK: ffff880244ba9c00  CPU: 2   COMMAND: "loop3"
>  0 [ffff8802435ff1c0] panic at ffffffff816ed175
>  1 [ffff8802435ff240] oops_end at ffffffff8101a7c9
>  2 [ffff8802435ff270] no_context at ffffffff8106eccf
>  3 [ffff8802435ff2e0] __bad_area_nosemaphore at ffffffff8106ef9d
>  4 [ffff8802435ff330] bad_area_nosemaphore at ffffffff8106f143
>  5 [ffff8802435ff340] __do_page_fault at ffffffff8106f80b
>  6 [ffff8802435ff3a0] do_page_fault at ffffffff8106fc2f
>  7 [ffff8802435ff3e0] page_fault at ffffffff816fd667
>     [exception RIP: ocfs2_block_group_find_clear_bits+316]
>     RIP: ffffffffc11ef6fc  RSP: ffff8802435ff498  RFLAGS: 00010206
>     RAX: 0000000000003918  RBX: 0000000000000001  RCX: 0000000000000018
>     RDX: 0000000000003918  RSI: 0000000000000000  RDI: ffff880060194040
>     RBP: ffff8802435ff4f8   R8: ffffffffff000000   R9: ffffffffffffffff
>     R10: ffff8802435ff730  R11: ffff8802a94e5800  R12: 0000000000000007
>     R13: 0000000000007e00  R14: 0000000000003918  R15: ffff88017c973a28
>     ORIG_RAX: ffffffffffffffff  CS: e030  SS: e02b
>  8 [ffff8802435ff490] ocfs2_block_group_find_clear_bits at 
> ffffffffc11ef680 [ocfs2]
>  9 [ffff8802435ff500] ocfs2_cluster_group_search at ffffffffc11ef916 
> [ocfs2]
> 10 [ffff8802435ff580] ocfs2_search_chain at ffffffffc11f0fb6 [ocfs2]
> 11 [ffff8802435ff660] ocfs2_claim_suballoc_bits at ffffffffc11f1b1b 
> [ocfs2]
> 12 [ffff8802435ff6f0] __ocfs2_claim_clusters at ffffffffc11f32cb 
> [ocfs2]
> 13 [ffff8802435ff770] ocfs2_claim_clusters at ffffffffc11f5caf [ocfs2]
> 14 [ffff8802435ff780] ocfs2_local_alloc_slide_window at 
> ffffffffc11cc0db [ocfs2]
> 15 [ffff8802435ff820] ocfs2_reserve_local_alloc_bits at 
> ffffffffc11ce53f [ocfs2]
> 16 [ffff8802435ff890] ocfs2_reserve_clusters_with_limit at 
> ffffffffc11f59b5 [ocfs2]
> 17 [ffff8802435ff8e0] ocfs2_reserve_clusters at ffffffffc11f5c88 
> [ocfs2]
> 18 [ffff8802435ff8f0] ocfs2_lock_refcount_allocators at 
> ffffffffc11dc169 [ocfs2]
> 19 [ffff8802435ff960] ocfs2_make_clusters_writable at ffffffffc11e4274 
> [ocfs2]
> 20 [ffff8802435ffa50] ocfs2_replace_cow at ffffffffc11e4df1 [ocfs2]
> 21 [ffff8802435ffac0] ocfs2_refcount_cow at ffffffffc11e54b1 [ocfs2]
> 22 [ffff8802435ffb80] ocfs2_file_write_iter at ffffffffc11bf8f4 
> [ocfs2]
> 23 [ffff8802435ffcd0] lo_rw_aio at ffffffff814a1b5d
> 24 [ffff8802435ffd80] loop_queue_work at ffffffff814a2802
> 25 [ffff8802435ffe60] kthread_worker_fn at ffffffff810a80d2
> 26 [ffff8802435ffec0] kthread at ffffffff810a7afb
> 27 [ffff8802435fff50] ret_from_fork at ffffffff816f7da1
> 
> When ocfs2_test_bg_bit_allocatable() called bh2jh(bg_bh), the 
> bg_bh->b_private NULL as jbd2_journal_put_journal_head() raced and 
> released the jounal head from the buffer head. Needed to take bit lock for the bit 'BH_JournalHead'
> to fix this race.
> 
> Signed-off-by: Gautham Ananthakrishna 
> <gautham.ananthakrishna@oracle.com>
> ---
>  fs/ocfs2/suballoc.c | 9 +++++++++
>  1 file changed, 9 insertions(+)
> 
> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 
> 8521942..86f33f2 100644
> --- a/fs/ocfs2/suballoc.c
> +++ b/fs/ocfs2/suballoc.c
> @@ -1256,9 +1256,17 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>  	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>  		return 0;
>  
> +	/* Fast path */
>  	if (!buffer_jbd(bg_bh))
>  		return 1;
>  
> +	/* Slow path */
> +	jbd_lock_bh_journal_head(bg_bh);
> +	if (!buffer_jbd(bg_bh)){
> +		jbd_unlock_bh_journal_head(bg_bh);
> +		return 1;
> +	}
> +
>  	jh = bh2jh(bg_bh);
>  	spin_lock(&jh->b_state_lock);
>  	bg = (struct ocfs2_group_desc *) jh->b_committed_data; @@ -1267,6 
> +1275,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>  	else
>  		ret = 1;
>  	spin_unlock(&jh->b_state_lock);
> +	jbd_unlock_bh_journal_head(bg_bh);
>  
>  	return ret;
>  }
>
Joseph Qi Oct. 21, 2021, 7:26 a.m. UTC | #3
Seems it has one more code intent level.
Is there any issue on my suggested change?

Thanks,
Joseph

On 10/20/21 9:46 PM, Gautham Ananthakrishna wrote:
> Hi Joseph
> 
> The following would retain the fast path,  as per your earlier comment:
> 
> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
> index 8521942..048e532 100644
> --- a/fs/ocfs2/suballoc.c
> +++ b/fs/ocfs2/suballoc.c
> @@ -1251,22 +1251,25 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>  {
>         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
>         struct journal_head *jh;
> -       int ret;
> +       int ret = 1;
>  
>         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>                 return 0;
>  
> -       if (!buffer_jbd(bg_bh))
> -               return 1;
> -
> -       jh = bh2jh(bg_bh);
> -       spin_lock(&jh->b_state_lock);
> -       bg = (struct ocfs2_group_desc *) jh->b_committed_data;
> -       if (bg)
> -               ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
> -       else
> -               ret = 1;
> -       spin_unlock(&jh->b_state_lock);
> +       if (buffer_jbd(bg_bh)) {
> +               jbd_lock_bh_journal_head(bg_bh);
> +               if (buffer_jbd(bg_bh)){
> +                       jh = bh2jh(bg_bh);
> +                       spin_lock(&jh->b_state_lock);
> +                       bg = (struct ocfs2_group_desc *) jh->b_committed_data;
> +                       if (bg)
> +                               ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
> +                       else
> +                               ret = 1;
> +                       spin_unlock(&jh->b_state_lock);
> +               }
> +               jbd_unlock_bh_journal_head(bg_bh);
> +       }
>  
>         return ret;
>  }
> 
> We can also remove the re-initialization of ret = 1 in the 'else' part, as 'ret' is already initialized to 1 (However, personally I would like to keep this).
> 
> Could you please take a look and comment?
> 
> Thanks,
> Gautham.
> 
> -----Original Message-----
> From: Joseph Qi <joseph.qi@linux.alibaba.com> 
> Sent: Wednesday, October 20, 2021 1:57 PM
> To: Gautham Ananthakrishna <gautham.ananthakrishna@oracle.com>; ocfs2-devel@oss.oracle.com
> Cc: Junxiao Bi <junxiao.bi@oracle.com>; Rajesh Sivaramasubramaniom <rajesh.sivaramasubramaniom@oracle.com>
> Subject: Re: [PATCH V1 RFC 1/1] ocfs2: race between searching chunks and release journal_head from buffer_head
> 
> Hi,
> 
> How about make the change like following?
> 
> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8521942..481017e 100644
> --- a/fs/ocfs2/suballoc.c
> +++ b/fs/ocfs2/suballoc.c
> @@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,  {
>  	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
>  	struct journal_head *jh;
> -	int ret;
> +	int ret = 1;
>  
>  	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>  		return 0;
> @@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>  	if (!buffer_jbd(bg_bh))
>  		return 1;
>  
> -	jh = bh2jh(bg_bh);
> -	spin_lock(&jh->b_state_lock);
> -	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
> -	if (bg)
> -		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
> -	else
> -		ret = 1;
> -	spin_unlock(&jh->b_state_lock);
> +	jbd_lock_bh_journal_head(bg_bh);
> +	if (buffer_jbd(bg_bh)) {
> +		jh = bh2jh(bg_bh);
> +		spin_lock(&jh->b_state_lock);
> +		bg = (struct ocfs2_group_desc *) jh->b_committed_data;
> +		if (bg)
> +			ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
> +		else
> +			ret = 1;
> +		spin_unlock(&jh->b_state_lock);
> +	}
> +	jbd_unlock_bh_journal_head(bg_bh);
>  
>  	return ret;
>  }
> 
> 
> On 10/20/21 1:18 PM, Gautham Ananthakrishna wrote:
>> Encountered a race between ocfs2_test_bg_bit_allocatable() and
>> jbd2_journal_put_journal_head() resulting in the below vmcore.
>>
>> PID: 106879  TASK: ffff880244ba9c00  CPU: 2   COMMAND: "loop3"
>>  0 [ffff8802435ff1c0] panic at ffffffff816ed175
>>  1 [ffff8802435ff240] oops_end at ffffffff8101a7c9
>>  2 [ffff8802435ff270] no_context at ffffffff8106eccf
>>  3 [ffff8802435ff2e0] __bad_area_nosemaphore at ffffffff8106ef9d
>>  4 [ffff8802435ff330] bad_area_nosemaphore at ffffffff8106f143
>>  5 [ffff8802435ff340] __do_page_fault at ffffffff8106f80b
>>  6 [ffff8802435ff3a0] do_page_fault at ffffffff8106fc2f
>>  7 [ffff8802435ff3e0] page_fault at ffffffff816fd667
>>     [exception RIP: ocfs2_block_group_find_clear_bits+316]
>>     RIP: ffffffffc11ef6fc  RSP: ffff8802435ff498  RFLAGS: 00010206
>>     RAX: 0000000000003918  RBX: 0000000000000001  RCX: 0000000000000018
>>     RDX: 0000000000003918  RSI: 0000000000000000  RDI: ffff880060194040
>>     RBP: ffff8802435ff4f8   R8: ffffffffff000000   R9: ffffffffffffffff
>>     R10: ffff8802435ff730  R11: ffff8802a94e5800  R12: 0000000000000007
>>     R13: 0000000000007e00  R14: 0000000000003918  R15: ffff88017c973a28
>>     ORIG_RAX: ffffffffffffffff  CS: e030  SS: e02b
>>  8 [ffff8802435ff490] ocfs2_block_group_find_clear_bits at 
>> ffffffffc11ef680 [ocfs2]
>>  9 [ffff8802435ff500] ocfs2_cluster_group_search at ffffffffc11ef916 
>> [ocfs2]
>> 10 [ffff8802435ff580] ocfs2_search_chain at ffffffffc11f0fb6 [ocfs2]
>> 11 [ffff8802435ff660] ocfs2_claim_suballoc_bits at ffffffffc11f1b1b 
>> [ocfs2]
>> 12 [ffff8802435ff6f0] __ocfs2_claim_clusters at ffffffffc11f32cb 
>> [ocfs2]
>> 13 [ffff8802435ff770] ocfs2_claim_clusters at ffffffffc11f5caf [ocfs2]
>> 14 [ffff8802435ff780] ocfs2_local_alloc_slide_window at 
>> ffffffffc11cc0db [ocfs2]
>> 15 [ffff8802435ff820] ocfs2_reserve_local_alloc_bits at 
>> ffffffffc11ce53f [ocfs2]
>> 16 [ffff8802435ff890] ocfs2_reserve_clusters_with_limit at 
>> ffffffffc11f59b5 [ocfs2]
>> 17 [ffff8802435ff8e0] ocfs2_reserve_clusters at ffffffffc11f5c88 
>> [ocfs2]
>> 18 [ffff8802435ff8f0] ocfs2_lock_refcount_allocators at 
>> ffffffffc11dc169 [ocfs2]
>> 19 [ffff8802435ff960] ocfs2_make_clusters_writable at ffffffffc11e4274 
>> [ocfs2]
>> 20 [ffff8802435ffa50] ocfs2_replace_cow at ffffffffc11e4df1 [ocfs2]
>> 21 [ffff8802435ffac0] ocfs2_refcount_cow at ffffffffc11e54b1 [ocfs2]
>> 22 [ffff8802435ffb80] ocfs2_file_write_iter at ffffffffc11bf8f4 
>> [ocfs2]
>> 23 [ffff8802435ffcd0] lo_rw_aio at ffffffff814a1b5d
>> 24 [ffff8802435ffd80] loop_queue_work at ffffffff814a2802
>> 25 [ffff8802435ffe60] kthread_worker_fn at ffffffff810a80d2
>> 26 [ffff8802435ffec0] kthread at ffffffff810a7afb
>> 27 [ffff8802435fff50] ret_from_fork at ffffffff816f7da1
>>
>> When ocfs2_test_bg_bit_allocatable() called bh2jh(bg_bh), the 
>> bg_bh->b_private NULL as jbd2_journal_put_journal_head() raced and 
>> released the jounal head from the buffer head. Needed to take bit lock for the bit 'BH_JournalHead'
>> to fix this race.
>>
>> Signed-off-by: Gautham Ananthakrishna 
>> <gautham.ananthakrishna@oracle.com>
>> ---
>>  fs/ocfs2/suballoc.c | 9 +++++++++
>>  1 file changed, 9 insertions(+)
>>
>> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 
>> 8521942..86f33f2 100644
>> --- a/fs/ocfs2/suballoc.c
>> +++ b/fs/ocfs2/suballoc.c
>> @@ -1256,9 +1256,17 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>>  	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>>  		return 0;
>>  
>> +	/* Fast path */
>>  	if (!buffer_jbd(bg_bh))
>>  		return 1;
>>  
>> +	/* Slow path */
>> +	jbd_lock_bh_journal_head(bg_bh);
>> +	if (!buffer_jbd(bg_bh)){
>> +		jbd_unlock_bh_journal_head(bg_bh);
>> +		return 1;
>> +	}
>> +
>>  	jh = bh2jh(bg_bh);
>>  	spin_lock(&jh->b_state_lock);
>>  	bg = (struct ocfs2_group_desc *) jh->b_committed_data; @@ -1267,6 
>> +1275,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>>  	else
>>  		ret = 1;
>>  	spin_unlock(&jh->b_state_lock);
>> +	jbd_unlock_bh_journal_head(bg_bh);
>>  
>>  	return ret;
>>  }
>>
Gautham Ananthakrishna Oct. 21, 2021, 7:30 a.m. UTC | #4
Hi Joseph,

No, I don’t see any issue with your suggestion. I thought it didn’t have the fast path which you suggested earlier.

Thanks,
Gautham.

-----Original Message-----
From: Joseph Qi <joseph.qi@linux.alibaba.com> 
Sent: Thursday, October 21, 2021 12:57 PM
To: Gautham Ananthakrishna <gautham.ananthakrishna@oracle.com>; ocfs2-devel@oss.oracle.com
Cc: Junxiao Bi <junxiao.bi@oracle.com>; Rajesh Sivaramasubramaniom <rajesh.sivaramasubramaniom@oracle.com>
Subject: Re: [PATCH V1 RFC 1/1] ocfs2: race between searching chunks and release journal_head from buffer_head

Seems it has one more code intent level.
Is there any issue on my suggested change?

Thanks,
Joseph

On 10/20/21 9:46 PM, Gautham Ananthakrishna wrote:
> Hi Joseph
> 
> The following would retain the fast path,  as per your earlier comment:
> 
> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 
> 8521942..048e532 100644
> --- a/fs/ocfs2/suballoc.c
> +++ b/fs/ocfs2/suballoc.c
> @@ -1251,22 +1251,25 @@ static int 
> ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,  {
>         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
>         struct journal_head *jh;
> -       int ret;
> +       int ret = 1;
>  
>         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>                 return 0;
>  
> -       if (!buffer_jbd(bg_bh))
> -               return 1;
> -
> -       jh = bh2jh(bg_bh);
> -       spin_lock(&jh->b_state_lock);
> -       bg = (struct ocfs2_group_desc *) jh->b_committed_data;
> -       if (bg)
> -               ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
> -       else
> -               ret = 1;
> -       spin_unlock(&jh->b_state_lock);
> +       if (buffer_jbd(bg_bh)) {
> +               jbd_lock_bh_journal_head(bg_bh);
> +               if (buffer_jbd(bg_bh)){
> +                       jh = bh2jh(bg_bh);
> +                       spin_lock(&jh->b_state_lock);
> +                       bg = (struct ocfs2_group_desc *) jh->b_committed_data;
> +                       if (bg)
> +                               ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
> +                       else
> +                               ret = 1;
> +                       spin_unlock(&jh->b_state_lock);
> +               }
> +               jbd_unlock_bh_journal_head(bg_bh);
> +       }
>  
>         return ret;
>  }
> 
> We can also remove the re-initialization of ret = 1 in the 'else' part, as 'ret' is already initialized to 1 (However, personally I would like to keep this).
> 
> Could you please take a look and comment?
> 
> Thanks,
> Gautham.
> 
> -----Original Message-----
> From: Joseph Qi <joseph.qi@linux.alibaba.com>
> Sent: Wednesday, October 20, 2021 1:57 PM
> To: Gautham Ananthakrishna <gautham.ananthakrishna@oracle.com>; 
> ocfs2-devel@oss.oracle.com
> Cc: Junxiao Bi <junxiao.bi@oracle.com>; Rajesh Sivaramasubramaniom 
> <rajesh.sivaramasubramaniom@oracle.com>
> Subject: Re: [PATCH V1 RFC 1/1] ocfs2: race between searching chunks 
> and release journal_head from buffer_head
> 
> Hi,
> 
> How about make the change like following?
> 
> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 
> 8521942..481017e 100644
> --- a/fs/ocfs2/suballoc.c
> +++ b/fs/ocfs2/suballoc.c
> @@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,  {
>  	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
>  	struct journal_head *jh;
> -	int ret;
> +	int ret = 1;
>  
>  	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>  		return 0;
> @@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>  	if (!buffer_jbd(bg_bh))
>  		return 1;
>  
> -	jh = bh2jh(bg_bh);
> -	spin_lock(&jh->b_state_lock);
> -	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
> -	if (bg)
> -		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
> -	else
> -		ret = 1;
> -	spin_unlock(&jh->b_state_lock);
> +	jbd_lock_bh_journal_head(bg_bh);
> +	if (buffer_jbd(bg_bh)) {
> +		jh = bh2jh(bg_bh);
> +		spin_lock(&jh->b_state_lock);
> +		bg = (struct ocfs2_group_desc *) jh->b_committed_data;
> +		if (bg)
> +			ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
> +		else
> +			ret = 1;
> +		spin_unlock(&jh->b_state_lock);
> +	}
> +	jbd_unlock_bh_journal_head(bg_bh);
>  
>  	return ret;
>  }
> 
> 
> On 10/20/21 1:18 PM, Gautham Ananthakrishna wrote:
>> Encountered a race between ocfs2_test_bg_bit_allocatable() and
>> jbd2_journal_put_journal_head() resulting in the below vmcore.
>>
>> PID: 106879  TASK: ffff880244ba9c00  CPU: 2   COMMAND: "loop3"
>>  0 [ffff8802435ff1c0] panic at ffffffff816ed175
>>  1 [ffff8802435ff240] oops_end at ffffffff8101a7c9
>>  2 [ffff8802435ff270] no_context at ffffffff8106eccf
>>  3 [ffff8802435ff2e0] __bad_area_nosemaphore at ffffffff8106ef9d
>>  4 [ffff8802435ff330] bad_area_nosemaphore at ffffffff8106f143
>>  5 [ffff8802435ff340] __do_page_fault at ffffffff8106f80b
>>  6 [ffff8802435ff3a0] do_page_fault at ffffffff8106fc2f
>>  7 [ffff8802435ff3e0] page_fault at ffffffff816fd667
>>     [exception RIP: ocfs2_block_group_find_clear_bits+316]
>>     RIP: ffffffffc11ef6fc  RSP: ffff8802435ff498  RFLAGS: 00010206
>>     RAX: 0000000000003918  RBX: 0000000000000001  RCX: 0000000000000018
>>     RDX: 0000000000003918  RSI: 0000000000000000  RDI: ffff880060194040
>>     RBP: ffff8802435ff4f8   R8: ffffffffff000000   R9: ffffffffffffffff
>>     R10: ffff8802435ff730  R11: ffff8802a94e5800  R12: 0000000000000007
>>     R13: 0000000000007e00  R14: 0000000000003918  R15: ffff88017c973a28
>>     ORIG_RAX: ffffffffffffffff  CS: e030  SS: e02b
>>  8 [ffff8802435ff490] ocfs2_block_group_find_clear_bits at
>> ffffffffc11ef680 [ocfs2]
>>  9 [ffff8802435ff500] ocfs2_cluster_group_search at ffffffffc11ef916 
>> [ocfs2]
>> 10 [ffff8802435ff580] ocfs2_search_chain at ffffffffc11f0fb6 [ocfs2]
>> 11 [ffff8802435ff660] ocfs2_claim_suballoc_bits at ffffffffc11f1b1b 
>> [ocfs2]
>> 12 [ffff8802435ff6f0] __ocfs2_claim_clusters at ffffffffc11f32cb 
>> [ocfs2]
>> 13 [ffff8802435ff770] ocfs2_claim_clusters at ffffffffc11f5caf 
>> [ocfs2]
>> 14 [ffff8802435ff780] ocfs2_local_alloc_slide_window at 
>> ffffffffc11cc0db [ocfs2]
>> 15 [ffff8802435ff820] ocfs2_reserve_local_alloc_bits at 
>> ffffffffc11ce53f [ocfs2]
>> 16 [ffff8802435ff890] ocfs2_reserve_clusters_with_limit at
>> ffffffffc11f59b5 [ocfs2]
>> 17 [ffff8802435ff8e0] ocfs2_reserve_clusters at ffffffffc11f5c88 
>> [ocfs2]
>> 18 [ffff8802435ff8f0] ocfs2_lock_refcount_allocators at
>> ffffffffc11dc169 [ocfs2]
>> 19 [ffff8802435ff960] ocfs2_make_clusters_writable at 
>> ffffffffc11e4274 [ocfs2]
>> 20 [ffff8802435ffa50] ocfs2_replace_cow at ffffffffc11e4df1 [ocfs2]
>> 21 [ffff8802435ffac0] ocfs2_refcount_cow at ffffffffc11e54b1 [ocfs2]
>> 22 [ffff8802435ffb80] ocfs2_file_write_iter at ffffffffc11bf8f4 
>> [ocfs2]
>> 23 [ffff8802435ffcd0] lo_rw_aio at ffffffff814a1b5d
>> 24 [ffff8802435ffd80] loop_queue_work at ffffffff814a2802
>> 25 [ffff8802435ffe60] kthread_worker_fn at ffffffff810a80d2
>> 26 [ffff8802435ffec0] kthread at ffffffff810a7afb
>> 27 [ffff8802435fff50] ret_from_fork at ffffffff816f7da1
>>
>> When ocfs2_test_bg_bit_allocatable() called bh2jh(bg_bh), the 
>> bg_bh->b_private NULL as jbd2_journal_put_journal_head() raced and 
>> released the jounal head from the buffer head. Needed to take bit lock for the bit 'BH_JournalHead'
>> to fix this race.
>>
>> Signed-off-by: Gautham Ananthakrishna 
>> <gautham.ananthakrishna@oracle.com>
>> ---
>>  fs/ocfs2/suballoc.c | 9 +++++++++
>>  1 file changed, 9 insertions(+)
>>
>> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index
>> 8521942..86f33f2 100644
>> --- a/fs/ocfs2/suballoc.c
>> +++ b/fs/ocfs2/suballoc.c
>> @@ -1256,9 +1256,17 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>>  	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>>  		return 0;
>>  
>> +	/* Fast path */
>>  	if (!buffer_jbd(bg_bh))
>>  		return 1;
>>  
>> +	/* Slow path */
>> +	jbd_lock_bh_journal_head(bg_bh);
>> +	if (!buffer_jbd(bg_bh)){
>> +		jbd_unlock_bh_journal_head(bg_bh);
>> +		return 1;
>> +	}
>> +
>>  	jh = bh2jh(bg_bh);
>>  	spin_lock(&jh->b_state_lock);
>>  	bg = (struct ocfs2_group_desc *) jh->b_committed_data; @@ -1267,6
>> +1275,7 @@ static int ocfs2_test_bg_bit_allocatable(struct 
>> +buffer_head *bg_bh,
>>  	else
>>  		ret = 1;
>>  	spin_unlock(&jh->b_state_lock);
>> +	jbd_unlock_bh_journal_head(bg_bh);
>>  
>>  	return ret;
>>  }
>>
Joseph Qi Oct. 21, 2021, 7:33 a.m. UTC | #5
I've kept the 'if (!buffer_jbd(bg_bh))', that's the exactly 'fast path'.

Thanks,
Joseph

On 10/21/21 3:30 PM, Gautham Ananthakrishna wrote:
> Hi Joseph,
> 
> No, I don’t see any issue with your suggestion. I thought it didn’t have the fast path which you suggested earlier.
> 
> Thanks,
> Gautham.
> 
> -----Original Message-----
> From: Joseph Qi <joseph.qi@linux.alibaba.com> 
> Sent: Thursday, October 21, 2021 12:57 PM
> To: Gautham Ananthakrishna <gautham.ananthakrishna@oracle.com>; ocfs2-devel@oss.oracle.com
> Cc: Junxiao Bi <junxiao.bi@oracle.com>; Rajesh Sivaramasubramaniom <rajesh.sivaramasubramaniom@oracle.com>
> Subject: Re: [PATCH V1 RFC 1/1] ocfs2: race between searching chunks and release journal_head from buffer_head
> 
> Seems it has one more code intent level.
> Is there any issue on my suggested change?
> 
> Thanks,
> Joseph
> 
> On 10/20/21 9:46 PM, Gautham Ananthakrishna wrote:
>> Hi Joseph
>>
>> The following would retain the fast path,  as per your earlier comment:
>>
>> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 
>> 8521942..048e532 100644
>> --- a/fs/ocfs2/suballoc.c
>> +++ b/fs/ocfs2/suballoc.c
>> @@ -1251,22 +1251,25 @@ static int 
>> ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,  {
>>         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
>>         struct journal_head *jh;
>> -       int ret;
>> +       int ret = 1;
>>  
>>         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>>                 return 0;
>>  
>> -       if (!buffer_jbd(bg_bh))
>> -               return 1;
>> -
>> -       jh = bh2jh(bg_bh);
>> -       spin_lock(&jh->b_state_lock);
>> -       bg = (struct ocfs2_group_desc *) jh->b_committed_data;
>> -       if (bg)
>> -               ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
>> -       else
>> -               ret = 1;
>> -       spin_unlock(&jh->b_state_lock);
>> +       if (buffer_jbd(bg_bh)) {
>> +               jbd_lock_bh_journal_head(bg_bh);
>> +               if (buffer_jbd(bg_bh)){
>> +                       jh = bh2jh(bg_bh);
>> +                       spin_lock(&jh->b_state_lock);
>> +                       bg = (struct ocfs2_group_desc *) jh->b_committed_data;
>> +                       if (bg)
>> +                               ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
>> +                       else
>> +                               ret = 1;
>> +                       spin_unlock(&jh->b_state_lock);
>> +               }
>> +               jbd_unlock_bh_journal_head(bg_bh);
>> +       }
>>  
>>         return ret;
>>  }
>>
>> We can also remove the re-initialization of ret = 1 in the 'else' part, as 'ret' is already initialized to 1 (However, personally I would like to keep this).
>>
>> Could you please take a look and comment?
>>
>> Thanks,
>> Gautham.
>>
>> -----Original Message-----
>> From: Joseph Qi <joseph.qi@linux.alibaba.com>
>> Sent: Wednesday, October 20, 2021 1:57 PM
>> To: Gautham Ananthakrishna <gautham.ananthakrishna@oracle.com>; 
>> ocfs2-devel@oss.oracle.com
>> Cc: Junxiao Bi <junxiao.bi@oracle.com>; Rajesh Sivaramasubramaniom 
>> <rajesh.sivaramasubramaniom@oracle.com>
>> Subject: Re: [PATCH V1 RFC 1/1] ocfs2: race between searching chunks 
>> and release journal_head from buffer_head
>>
>> Hi,
>>
>> How about make the change like following?
>>
>> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 
>> 8521942..481017e 100644
>> --- a/fs/ocfs2/suballoc.c
>> +++ b/fs/ocfs2/suballoc.c
>> @@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,  {
>>  	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
>>  	struct journal_head *jh;
>> -	int ret;
>> +	int ret = 1;
>>  
>>  	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>>  		return 0;
>> @@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>>  	if (!buffer_jbd(bg_bh))
>>  		return 1;
>>  
>> -	jh = bh2jh(bg_bh);
>> -	spin_lock(&jh->b_state_lock);
>> -	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
>> -	if (bg)
>> -		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
>> -	else
>> -		ret = 1;
>> -	spin_unlock(&jh->b_state_lock);
>> +	jbd_lock_bh_journal_head(bg_bh);
>> +	if (buffer_jbd(bg_bh)) {
>> +		jh = bh2jh(bg_bh);
>> +		spin_lock(&jh->b_state_lock);
>> +		bg = (struct ocfs2_group_desc *) jh->b_committed_data;
>> +		if (bg)
>> +			ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
>> +		else
>> +			ret = 1;
>> +		spin_unlock(&jh->b_state_lock);
>> +	}
>> +	jbd_unlock_bh_journal_head(bg_bh);
>>  
>>  	return ret;
>>  }
>>
>>
>> On 10/20/21 1:18 PM, Gautham Ananthakrishna wrote:
>>> Encountered a race between ocfs2_test_bg_bit_allocatable() and
>>> jbd2_journal_put_journal_head() resulting in the below vmcore.
>>>
>>> PID: 106879  TASK: ffff880244ba9c00  CPU: 2   COMMAND: "loop3"
>>>  0 [ffff8802435ff1c0] panic at ffffffff816ed175
>>>  1 [ffff8802435ff240] oops_end at ffffffff8101a7c9
>>>  2 [ffff8802435ff270] no_context at ffffffff8106eccf
>>>  3 [ffff8802435ff2e0] __bad_area_nosemaphore at ffffffff8106ef9d
>>>  4 [ffff8802435ff330] bad_area_nosemaphore at ffffffff8106f143
>>>  5 [ffff8802435ff340] __do_page_fault at ffffffff8106f80b
>>>  6 [ffff8802435ff3a0] do_page_fault at ffffffff8106fc2f
>>>  7 [ffff8802435ff3e0] page_fault at ffffffff816fd667
>>>     [exception RIP: ocfs2_block_group_find_clear_bits+316]
>>>     RIP: ffffffffc11ef6fc  RSP: ffff8802435ff498  RFLAGS: 00010206
>>>     RAX: 0000000000003918  RBX: 0000000000000001  RCX: 0000000000000018
>>>     RDX: 0000000000003918  RSI: 0000000000000000  RDI: ffff880060194040
>>>     RBP: ffff8802435ff4f8   R8: ffffffffff000000   R9: ffffffffffffffff
>>>     R10: ffff8802435ff730  R11: ffff8802a94e5800  R12: 0000000000000007
>>>     R13: 0000000000007e00  R14: 0000000000003918  R15: ffff88017c973a28
>>>     ORIG_RAX: ffffffffffffffff  CS: e030  SS: e02b
>>>  8 [ffff8802435ff490] ocfs2_block_group_find_clear_bits at
>>> ffffffffc11ef680 [ocfs2]
>>>  9 [ffff8802435ff500] ocfs2_cluster_group_search at ffffffffc11ef916 
>>> [ocfs2]
>>> 10 [ffff8802435ff580] ocfs2_search_chain at ffffffffc11f0fb6 [ocfs2]
>>> 11 [ffff8802435ff660] ocfs2_claim_suballoc_bits at ffffffffc11f1b1b 
>>> [ocfs2]
>>> 12 [ffff8802435ff6f0] __ocfs2_claim_clusters at ffffffffc11f32cb 
>>> [ocfs2]
>>> 13 [ffff8802435ff770] ocfs2_claim_clusters at ffffffffc11f5caf 
>>> [ocfs2]
>>> 14 [ffff8802435ff780] ocfs2_local_alloc_slide_window at 
>>> ffffffffc11cc0db [ocfs2]
>>> 15 [ffff8802435ff820] ocfs2_reserve_local_alloc_bits at 
>>> ffffffffc11ce53f [ocfs2]
>>> 16 [ffff8802435ff890] ocfs2_reserve_clusters_with_limit at
>>> ffffffffc11f59b5 [ocfs2]
>>> 17 [ffff8802435ff8e0] ocfs2_reserve_clusters at ffffffffc11f5c88 
>>> [ocfs2]
>>> 18 [ffff8802435ff8f0] ocfs2_lock_refcount_allocators at
>>> ffffffffc11dc169 [ocfs2]
>>> 19 [ffff8802435ff960] ocfs2_make_clusters_writable at 
>>> ffffffffc11e4274 [ocfs2]
>>> 20 [ffff8802435ffa50] ocfs2_replace_cow at ffffffffc11e4df1 [ocfs2]
>>> 21 [ffff8802435ffac0] ocfs2_refcount_cow at ffffffffc11e54b1 [ocfs2]
>>> 22 [ffff8802435ffb80] ocfs2_file_write_iter at ffffffffc11bf8f4 
>>> [ocfs2]
>>> 23 [ffff8802435ffcd0] lo_rw_aio at ffffffff814a1b5d
>>> 24 [ffff8802435ffd80] loop_queue_work at ffffffff814a2802
>>> 25 [ffff8802435ffe60] kthread_worker_fn at ffffffff810a80d2
>>> 26 [ffff8802435ffec0] kthread at ffffffff810a7afb
>>> 27 [ffff8802435fff50] ret_from_fork at ffffffff816f7da1
>>>
>>> When ocfs2_test_bg_bit_allocatable() called bh2jh(bg_bh), the 
>>> bg_bh->b_private NULL as jbd2_journal_put_journal_head() raced and 
>>> released the jounal head from the buffer head. Needed to take bit lock for the bit 'BH_JournalHead'
>>> to fix this race.
>>>
>>> Signed-off-by: Gautham Ananthakrishna 
>>> <gautham.ananthakrishna@oracle.com>
>>> ---
>>>  fs/ocfs2/suballoc.c | 9 +++++++++
>>>  1 file changed, 9 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index
>>> 8521942..86f33f2 100644
>>> --- a/fs/ocfs2/suballoc.c
>>> +++ b/fs/ocfs2/suballoc.c
>>> @@ -1256,9 +1256,17 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>>>  	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>>>  		return 0;
>>>  
>>> +	/* Fast path */
>>>  	if (!buffer_jbd(bg_bh))
>>>  		return 1;
>>>  
>>> +	/* Slow path */
>>> +	jbd_lock_bh_journal_head(bg_bh);
>>> +	if (!buffer_jbd(bg_bh)){
>>> +		jbd_unlock_bh_journal_head(bg_bh);
>>> +		return 1;
>>> +	}
>>> +
>>>  	jh = bh2jh(bg_bh);
>>>  	spin_lock(&jh->b_state_lock);
>>>  	bg = (struct ocfs2_group_desc *) jh->b_committed_data; @@ -1267,6
>>> +1275,7 @@ static int ocfs2_test_bg_bit_allocatable(struct 
>>> +buffer_head *bg_bh,
>>>  	else
>>>  		ret = 1;
>>>  	spin_unlock(&jh->b_state_lock);
>>> +	jbd_unlock_bh_journal_head(bg_bh);
>>>  
>>>  	return ret;
>>>  }
>>>
Gautham Ananthakrishna Oct. 21, 2021, 7:40 a.m. UTC | #6
Oops.. I had missed it.... thank you. I will make changes and send V2.
Thanks,
Gautham.

-----Original Message-----
From: Joseph Qi <joseph.qi@linux.alibaba.com> 
Sent: Thursday, October 21, 2021 1:03 PM
To: Gautham Ananthakrishna <gautham.ananthakrishna@oracle.com>; ocfs2-devel@oss.oracle.com
Cc: Junxiao Bi <junxiao.bi@oracle.com>; Rajesh Sivaramasubramaniom <rajesh.sivaramasubramaniom@oracle.com>
Subject: Re: [PATCH V1 RFC 1/1] ocfs2: race between searching chunks and release journal_head from buffer_head

I've kept the 'if (!buffer_jbd(bg_bh))', that's the exactly 'fast path'.

Thanks,
Joseph

On 10/21/21 3:30 PM, Gautham Ananthakrishna wrote:
> Hi Joseph,
> 
> No, I don’t see any issue with your suggestion. I thought it didn’t have the fast path which you suggested earlier.
> 
> Thanks,
> Gautham.
> 
> -----Original Message-----
> From: Joseph Qi <joseph.qi@linux.alibaba.com>
> Sent: Thursday, October 21, 2021 12:57 PM
> To: Gautham Ananthakrishna <gautham.ananthakrishna@oracle.com>; 
> ocfs2-devel@oss.oracle.com
> Cc: Junxiao Bi <junxiao.bi@oracle.com>; Rajesh Sivaramasubramaniom 
> <rajesh.sivaramasubramaniom@oracle.com>
> Subject: Re: [PATCH V1 RFC 1/1] ocfs2: race between searching chunks 
> and release journal_head from buffer_head
> 
> Seems it has one more code intent level.
> Is there any issue on my suggested change?
> 
> Thanks,
> Joseph
> 
> On 10/20/21 9:46 PM, Gautham Ananthakrishna wrote:
>> Hi Joseph
>>
>> The following would retain the fast path,  as per your earlier comment:
>>
>> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index
>> 8521942..048e532 100644
>> --- a/fs/ocfs2/suballoc.c
>> +++ b/fs/ocfs2/suballoc.c
>> @@ -1251,22 +1251,25 @@ static int
>> ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,  {
>>         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
>>         struct journal_head *jh;
>> -       int ret;
>> +       int ret = 1;
>>  
>>         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>>                 return 0;
>>  
>> -       if (!buffer_jbd(bg_bh))
>> -               return 1;
>> -
>> -       jh = bh2jh(bg_bh);
>> -       spin_lock(&jh->b_state_lock);
>> -       bg = (struct ocfs2_group_desc *) jh->b_committed_data;
>> -       if (bg)
>> -               ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
>> -       else
>> -               ret = 1;
>> -       spin_unlock(&jh->b_state_lock);
>> +       if (buffer_jbd(bg_bh)) {
>> +               jbd_lock_bh_journal_head(bg_bh);
>> +               if (buffer_jbd(bg_bh)){
>> +                       jh = bh2jh(bg_bh);
>> +                       spin_lock(&jh->b_state_lock);
>> +                       bg = (struct ocfs2_group_desc *) jh->b_committed_data;
>> +                       if (bg)
>> +                               ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
>> +                       else
>> +                               ret = 1;
>> +                       spin_unlock(&jh->b_state_lock);
>> +               }
>> +               jbd_unlock_bh_journal_head(bg_bh);
>> +       }
>>  
>>         return ret;
>>  }
>>
>> We can also remove the re-initialization of ret = 1 in the 'else' part, as 'ret' is already initialized to 1 (However, personally I would like to keep this).
>>
>> Could you please take a look and comment?
>>
>> Thanks,
>> Gautham.
>>
>> -----Original Message-----
>> From: Joseph Qi <joseph.qi@linux.alibaba.com>
>> Sent: Wednesday, October 20, 2021 1:57 PM
>> To: Gautham Ananthakrishna <gautham.ananthakrishna@oracle.com>;
>> ocfs2-devel@oss.oracle.com
>> Cc: Junxiao Bi <junxiao.bi@oracle.com>; Rajesh Sivaramasubramaniom 
>> <rajesh.sivaramasubramaniom@oracle.com>
>> Subject: Re: [PATCH V1 RFC 1/1] ocfs2: race between searching chunks 
>> and release journal_head from buffer_head
>>
>> Hi,
>>
>> How about make the change like following?
>>
>> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 
>> 8521942..481017e 100644
>> --- a/fs/ocfs2/suballoc.c
>> +++ b/fs/ocfs2/suballoc.c
>> @@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,  {
>>  	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
>>  	struct journal_head *jh;
>> -	int ret;
>> +	int ret = 1;
>>  
>>  	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>>  		return 0;
>> @@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>>  	if (!buffer_jbd(bg_bh))
>>  		return 1;
>>  
>> -	jh = bh2jh(bg_bh);
>> -	spin_lock(&jh->b_state_lock);
>> -	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
>> -	if (bg)
>> -		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
>> -	else
>> -		ret = 1;
>> -	spin_unlock(&jh->b_state_lock);
>> +	jbd_lock_bh_journal_head(bg_bh);
>> +	if (buffer_jbd(bg_bh)) {
>> +		jh = bh2jh(bg_bh);
>> +		spin_lock(&jh->b_state_lock);
>> +		bg = (struct ocfs2_group_desc *) jh->b_committed_data;
>> +		if (bg)
>> +			ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
>> +		else
>> +			ret = 1;
>> +		spin_unlock(&jh->b_state_lock);
>> +	}
>> +	jbd_unlock_bh_journal_head(bg_bh);
>>  
>>  	return ret;
>>  }
>>
>>
>> On 10/20/21 1:18 PM, Gautham Ananthakrishna wrote:
>>> Encountered a race between ocfs2_test_bg_bit_allocatable() and
>>> jbd2_journal_put_journal_head() resulting in the below vmcore.
>>>
>>> PID: 106879  TASK: ffff880244ba9c00  CPU: 2   COMMAND: "loop3"
>>>  0 [ffff8802435ff1c0] panic at ffffffff816ed175
>>>  1 [ffff8802435ff240] oops_end at ffffffff8101a7c9
>>>  2 [ffff8802435ff270] no_context at ffffffff8106eccf
>>>  3 [ffff8802435ff2e0] __bad_area_nosemaphore at ffffffff8106ef9d
>>>  4 [ffff8802435ff330] bad_area_nosemaphore at ffffffff8106f143
>>>  5 [ffff8802435ff340] __do_page_fault at ffffffff8106f80b
>>>  6 [ffff8802435ff3a0] do_page_fault at ffffffff8106fc2f
>>>  7 [ffff8802435ff3e0] page_fault at ffffffff816fd667
>>>     [exception RIP: ocfs2_block_group_find_clear_bits+316]
>>>     RIP: ffffffffc11ef6fc  RSP: ffff8802435ff498  RFLAGS: 00010206
>>>     RAX: 0000000000003918  RBX: 0000000000000001  RCX: 0000000000000018
>>>     RDX: 0000000000003918  RSI: 0000000000000000  RDI: ffff880060194040
>>>     RBP: ffff8802435ff4f8   R8: ffffffffff000000   R9: ffffffffffffffff
>>>     R10: ffff8802435ff730  R11: ffff8802a94e5800  R12: 0000000000000007
>>>     R13: 0000000000007e00  R14: 0000000000003918  R15: ffff88017c973a28
>>>     ORIG_RAX: ffffffffffffffff  CS: e030  SS: e02b
>>>  8 [ffff8802435ff490] ocfs2_block_group_find_clear_bits at
>>> ffffffffc11ef680 [ocfs2]
>>>  9 [ffff8802435ff500] ocfs2_cluster_group_search at ffffffffc11ef916 
>>> [ocfs2]
>>> 10 [ffff8802435ff580] ocfs2_search_chain at ffffffffc11f0fb6 [ocfs2]
>>> 11 [ffff8802435ff660] ocfs2_claim_suballoc_bits at ffffffffc11f1b1b 
>>> [ocfs2]
>>> 12 [ffff8802435ff6f0] __ocfs2_claim_clusters at ffffffffc11f32cb 
>>> [ocfs2]
>>> 13 [ffff8802435ff770] ocfs2_claim_clusters at ffffffffc11f5caf 
>>> [ocfs2]
>>> 14 [ffff8802435ff780] ocfs2_local_alloc_slide_window at 
>>> ffffffffc11cc0db [ocfs2]
>>> 15 [ffff8802435ff820] ocfs2_reserve_local_alloc_bits at 
>>> ffffffffc11ce53f [ocfs2]
>>> 16 [ffff8802435ff890] ocfs2_reserve_clusters_with_limit at
>>> ffffffffc11f59b5 [ocfs2]
>>> 17 [ffff8802435ff8e0] ocfs2_reserve_clusters at ffffffffc11f5c88 
>>> [ocfs2]
>>> 18 [ffff8802435ff8f0] ocfs2_lock_refcount_allocators at
>>> ffffffffc11dc169 [ocfs2]
>>> 19 [ffff8802435ff960] ocfs2_make_clusters_writable at
>>> ffffffffc11e4274 [ocfs2]
>>> 20 [ffff8802435ffa50] ocfs2_replace_cow at ffffffffc11e4df1 [ocfs2]
>>> 21 [ffff8802435ffac0] ocfs2_refcount_cow at ffffffffc11e54b1 [ocfs2]
>>> 22 [ffff8802435ffb80] ocfs2_file_write_iter at ffffffffc11bf8f4 
>>> [ocfs2]
>>> 23 [ffff8802435ffcd0] lo_rw_aio at ffffffff814a1b5d
>>> 24 [ffff8802435ffd80] loop_queue_work at ffffffff814a2802
>>> 25 [ffff8802435ffe60] kthread_worker_fn at ffffffff810a80d2
>>> 26 [ffff8802435ffec0] kthread at ffffffff810a7afb
>>> 27 [ffff8802435fff50] ret_from_fork at ffffffff816f7da1
>>>
>>> When ocfs2_test_bg_bit_allocatable() called bh2jh(bg_bh), the 
>>> bg_bh->b_private NULL as jbd2_journal_put_journal_head() raced and 
>>> released the jounal head from the buffer head. Needed to take bit lock for the bit 'BH_JournalHead'
>>> to fix this race.
>>>
>>> Signed-off-by: Gautham Ananthakrishna 
>>> <gautham.ananthakrishna@oracle.com>
>>> ---
>>>  fs/ocfs2/suballoc.c | 9 +++++++++
>>>  1 file changed, 9 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index
>>> 8521942..86f33f2 100644
>>> --- a/fs/ocfs2/suballoc.c
>>> +++ b/fs/ocfs2/suballoc.c
>>> @@ -1256,9 +1256,17 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
>>>  	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
>>>  		return 0;
>>>  
>>> +	/* Fast path */
>>>  	if (!buffer_jbd(bg_bh))
>>>  		return 1;
>>>  
>>> +	/* Slow path */
>>> +	jbd_lock_bh_journal_head(bg_bh);
>>> +	if (!buffer_jbd(bg_bh)){
>>> +		jbd_unlock_bh_journal_head(bg_bh);
>>> +		return 1;
>>> +	}
>>> +
>>>  	jh = bh2jh(bg_bh);
>>>  	spin_lock(&jh->b_state_lock);
>>>  	bg = (struct ocfs2_group_desc *) jh->b_committed_data; @@ -1267,6
>>> +1275,7 @@ static int ocfs2_test_bg_bit_allocatable(struct
>>> +buffer_head *bg_bh,
>>>  	else
>>>  		ret = 1;
>>>  	spin_unlock(&jh->b_state_lock);
>>> +	jbd_unlock_bh_journal_head(bg_bh);
>>>  
>>>  	return ret;
>>>  }
>>>
diff mbox series

Patch

diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8521942..86f33f2 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1256,9 +1256,17 @@  static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
 	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
 		return 0;
 
+	/* Fast path */
 	if (!buffer_jbd(bg_bh))
 		return 1;
 
+	/* Slow path */
+	jbd_lock_bh_journal_head(bg_bh);
+	if (!buffer_jbd(bg_bh)){
+		jbd_unlock_bh_journal_head(bg_bh);
+		return 1;
+	}
+
 	jh = bh2jh(bg_bh);
 	spin_lock(&jh->b_state_lock);
 	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
@@ -1267,6 +1275,7 @@  static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
 	else
 		ret = 1;
 	spin_unlock(&jh->b_state_lock);
+	jbd_unlock_bh_journal_head(bg_bh);
 
 	return ret;
 }