
mm: memory-failure: use rcu lock instead of tasklist_lock when collect_procs()

Message ID: 20230821022534.1381092-1-tongtiangen@huawei.com

Commit Message

Tong Tiangen Aug. 21, 2023, 2:25 a.m. UTC
We found a softlockup issue in our tests and, after analyzing the logs,
identified the relevant call traces on each CPU as follows:

CPU0:
  _do_fork
    -> copy_process()
      -> write_lock_irq(&tasklist_lock)  // Disables irq, waiting for
                                         // tasklist_lock

CPU1:
  wp_page_copy()
    -> pte_offset_map_lock()
      -> spin_lock(&page->ptl)           // Holds page->ptl
    -> ptep_clear_flush()
      -> flush_tlb_others() ...
        -> smp_call_function_many()
          -> arch_send_call_function_ipi_mask()
            -> csd_lock_wait()           // Waiting for other CPUs to
                                         // respond to the IPI

CPU2:
  collect_procs_anon()
    -> read_lock(&tasklist_lock)         // Holds tasklist_lock
      -> for_each_process(tsk)
        -> page_mapped_in_vma()
          -> page_vma_mapped_walk()
            -> map_pte()
              -> spin_lock(&page->ptl)   // Waiting for page->ptl

We can see that CPU1 is waiting for CPU0 to respond to the IPI, CPU0 is
waiting for CPU2 to release tasklist_lock, and CPU2 is waiting for CPU1
to release page->ptl. As a result, a softlockup is triggered.

For collect_procs_anon(), we do not modify the tasklist; we only traverse
it for reading. Since the process list is RCU-protected, it can be walked
safely under rcu_read_lock(), so we can take the RCU read lock instead of
read-locking tasklist_lock and thereby break the softlockup chain above.

The same logic can also be applied to:
 - collect_procs_file()
 - collect_procs_fsdax()
 - collect_procs_ksm()
 - find_early_kill_thread()
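
For reference, the conversion follows the same simple pattern at each of
these sites; a minimal before/after sketch of the collect_procs_anon()
loop is shown below (simplified, see the full diff for the exact context):

	/* Before: readers serialized against tasklist writers */
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		/* ... collect tasks mapping the poisoned page ... */
	}
	read_unlock(&tasklist_lock);

	/* After: read-only walk of the RCU-protected task list */
	rcu_read_lock();
	for_each_process(tsk) {
		/* ... collect tasks mapping the poisoned page ... */
	}
	rcu_read_unlock();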

Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
---
Changes since RFC[1]:
 - 1. Per Naoya's suggestion, update the locking-order comment in
      mm/filemap.c to drop the tasklist_lock entry.
 - 2. Per Kefeng's suggestion, optimize the implementation of
      find_early_kill_thread() without functional changes.
 - 3. Reword the patch title and description.

[1] https://lore.kernel.org/lkml/20230815130154.1100779-1-tongtiangen@huawei.com/
---
 mm/filemap.c        |  3 ---
 mm/ksm.c            |  4 ++--
 mm/memory-failure.c | 26 ++++++++++++++------------
 3 files changed, 16 insertions(+), 17 deletions(-)

Comments

Naoya Horiguchi Aug. 21, 2023, 4:13 a.m. UTC | #1
On Mon, Aug 21, 2023 at 10:25:34AM +0800, Tong Tiangen wrote:
> We found a softlockup issue in our tests and, after analyzing the logs,
> identified the relevant call traces on each CPU as follows:
> 
> CPU0:
>   _do_fork
>     -> copy_process()
>       -> write_lock_irq(&tasklist_lock)  //Disable irq,waiting for
>       					 //tasklist_lock
> 
> CPU1:
>   wp_page_copy()
>     ->pte_offset_map_lock()
>       -> spin_lock(&page->ptl);        //Hold page->ptl
>     -> ptep_clear_flush()
>       -> flush_tlb_others() ...
>         -> smp_call_function_many()
>           -> arch_send_call_function_ipi_mask()
>             -> csd_lock_wait()         //Waiting for other CPUs respond
> 	                               //IPI
> 
> CPU2:
>   collect_procs_anon()
>     -> read_lock(&tasklist_lock)       //Hold tasklist_lock
>       ->for_each_process(tsk)
>         -> page_mapped_in_vma()
>           -> page_vma_mapped_walk()
> 	    -> map_pte()
>               ->spin_lock(&page->ptl)  //Waiting for page->ptl
> 
> We can see that CPU1 is waiting for CPU0 to respond to the IPI, CPU0 is
> waiting for CPU2 to release tasklist_lock, and CPU2 is waiting for CPU1
> to release page->ptl. As a result, a softlockup is triggered.
> 
> For collect_procs_anon(), we do not modify the tasklist; we only traverse
> it for reading. Since the process list is RCU-protected, it can be walked
> safely under rcu_read_lock(), so we can take the RCU read lock instead of
> read-locking tasklist_lock and thereby break the softlockup chain above.
> 
> The same logic can also be applied to:
>  - collect_procs_file()
>  - collect_procs_fsdax()
>  - collect_procs_ksm()
>  - find_early_kill_thread()
> 
> Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>

Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>

> ---
> Changes since RFC[1]:
>  - 1. According to Naoya's suggestion, modify the tasklist_lock in the
>       comment about locking order in mm/filemap.c.
>  - 2. According to Kefeng's suggestion, optimize the implementation of
>       find_early_kill_thread() without functional changes.
>  - 3. Modify the title description.
> 
> [1] https://lore.kernel.org/lkml/20230815130154.1100779-1-tongtiangen@huawei.com/
Matthew Wilcox Aug. 21, 2023, 4:34 a.m. UTC | #2
On Mon, Aug 21, 2023 at 10:25:34AM +0800, Tong Tiangen wrote:
> +++ b/mm/memory-failure.c
> @@ -546,24 +546,26 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
>   * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
>   * on behalf of the thread group. Return task_struct of the (first found)
>   * dedicated thread if found, and return NULL otherwise.
> - *
> - * We already hold read_lock(&tasklist_lock) in the caller, so we don't
> - * have to call rcu_read_lock/unlock() in this function.
>   */
>  static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
>  {
>  	struct task_struct *t;
>  
> +	rcu_read_lock();
>  	for_each_thread(tsk, t) {
>  		if (t->flags & PF_MCE_PROCESS) {
>  			if (t->flags & PF_MCE_EARLY)
> -				return t;
> +				goto found;
>  		} else {
>  			if (sysctl_memory_failure_early_kill)
> -				return t;
> +				goto found;
>  		}
>  	}
> -	return NULL;
> +
> +	t = NULL;
> +found:
> +	rcu_read_unlock();
> +	return t;
>  }

I don't understand why you need to modify find_early_kill_thread() at
all.  It's still true that the caller holds _a_ lock protecting it; the
comment needs to be updated to reflect that it might be the RCU lock
or the tasklist_lock (or did you change all callers?), but there's no
need for this function to take the RCU lock itself, afaics?
Tong Tiangen Aug. 21, 2023, 6:35 a.m. UTC | #3
On 2023/8/21 12:34, Matthew Wilcox wrote:
> On Mon, Aug 21, 2023 at 10:25:34AM +0800, Tong Tiangen wrote:
>> +++ b/mm/memory-failure.c
>> @@ -546,24 +546,26 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
>>    * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
>>    * on behalf of the thread group. Return task_struct of the (first found)
>>    * dedicated thread if found, and return NULL otherwise.
>> - *
>> - * We already hold read_lock(&tasklist_lock) in the caller, so we don't
>> - * have to call rcu_read_lock/unlock() in this function.
>>    */
>>   static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
>>   {
>>   	struct task_struct *t;
>>   
>> +	rcu_read_lock();
>>   	for_each_thread(tsk, t) {
>>   		if (t->flags & PF_MCE_PROCESS) {
>>   			if (t->flags & PF_MCE_EARLY)
>> -				return t;
>> +				goto found;
>>   		} else {
>>   			if (sysctl_memory_failure_early_kill)
>> -				return t;
>> +				goto found;
>>   		}
>>   	}
>> -	return NULL;
>> +
>> +	t = NULL;
>> +found:
>> +	rcu_read_unlock();
>> +	return t;
>>   }
> 
> I don't understand why you need to modify find_early_kill_thread() at
> all.  It's still true that the caller holds _a_ lock protecting it; the
> comment needs to be updated to reflect that it might be the RCU lock
> or the tasklist_lock (or did you change all callers?), but there's no
> need for this function to take the RCU lock itself, afaics?
> 
> .

I've checked that all the paths that call find_early_kill_thread()
already hold the RCU read lock, so there's really no need to take it
again in this function.
In the next version of the patch, only the comment will be modified here.

Thanks,
Tong.
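
For reference, a comment-only version along the lines Tong describes
might look like the sketch below (hypothetical; the body is the original
code quoted above with only the locking comment changed, and the exact
wording in the next revision may differ):

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the (first found)
 * dedicated thread if found, and return NULL otherwise.
 *
 * All callers already hold the RCU read lock, which protects the thread
 * list traversal, so we don't have to call rcu_read_lock/unlock() in
 * this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t) {
		if (t->flags & PF_MCE_PROCESS) {
			if (t->flags & PF_MCE_EARLY)
				return t;
		} else {
			if (sysctl_memory_failure_early_kill)
				return t;
		}
	}
	return NULL;
}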

Patch

diff --git a/mm/filemap.c b/mm/filemap.c
index 014b73eb96a1..dfade1ef1765 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -121,9 +121,6 @@ 
  *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->block_dirty_folio)
- *
- * ->i_mmap_rwsem
- *   ->tasklist_lock            (memory_failure, collect_procs_ao)
  */
 
 static void page_cache_delete(struct address_space *mapping,
diff --git a/mm/ksm.c b/mm/ksm.c
index 8d6aee05421d..981af9c72e7a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2925,7 +2925,7 @@  void collect_procs_ksm(struct page *page, struct list_head *to_kill,
 		struct anon_vma *av = rmap_item->anon_vma;
 
 		anon_vma_lock_read(av);
-		read_lock(&tasklist_lock);
+		rcu_read_lock();
 		for_each_process(tsk) {
 			struct anon_vma_chain *vmac;
 			unsigned long addr;
@@ -2944,7 +2944,7 @@  void collect_procs_ksm(struct page *page, struct list_head *to_kill,
 				}
 			}
 		}
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 		anon_vma_unlock_read(av);
 	}
 }
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7b01fffe7a79..3e4fd8beec31 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -546,24 +546,26 @@  static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
  * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
  * on behalf of the thread group. Return task_struct of the (first found)
  * dedicated thread if found, and return NULL otherwise.
- *
- * We already hold read_lock(&tasklist_lock) in the caller, so we don't
- * have to call rcu_read_lock/unlock() in this function.
  */
 static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
 {
 	struct task_struct *t;
 
+	rcu_read_lock();
 	for_each_thread(tsk, t) {
 		if (t->flags & PF_MCE_PROCESS) {
 			if (t->flags & PF_MCE_EARLY)
-				return t;
+				goto found;
 		} else {
 			if (sysctl_memory_failure_early_kill)
-				return t;
+				goto found;
 		}
 	}
-	return NULL;
+
+	t = NULL;
+found:
+	rcu_read_unlock();
+	return t;
 }
 
 /*
@@ -609,7 +611,7 @@  static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 		return;
 
 	pgoff = page_to_pgoff(page);
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	for_each_process(tsk) {
 		struct anon_vma_chain *vmac;
 		struct task_struct *t = task_early_kill(tsk, force_early);
@@ -626,7 +628,7 @@  static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 			add_to_kill_anon_file(t, page, vma, to_kill);
 		}
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	anon_vma_unlock_read(av);
 }
 
@@ -642,7 +644,7 @@  static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	pgoff_t pgoff;
 
 	i_mmap_lock_read(mapping);
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	pgoff = page_to_pgoff(page);
 	for_each_process(tsk) {
 		struct task_struct *t = task_early_kill(tsk, force_early);
@@ -662,7 +664,7 @@  static void collect_procs_file(struct page *page, struct list_head *to_kill,
 				add_to_kill_anon_file(t, page, vma, to_kill);
 		}
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	i_mmap_unlock_read(mapping);
 }
 
@@ -685,7 +687,7 @@  static void collect_procs_fsdax(struct page *page,
 	struct task_struct *tsk;
 
 	i_mmap_lock_read(mapping);
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	for_each_process(tsk) {
 		struct task_struct *t = task_early_kill(tsk, true);
 
@@ -696,7 +698,7 @@  static void collect_procs_fsdax(struct page *page,
 				add_to_kill_fsdax(t, page, vma, to_kill, pgoff);
 		}
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	i_mmap_unlock_read(mapping);
 }
 #endif /* CONFIG_FS_DAX */