Message ID: 20220128131006.67712-24-michel@lespinasse.org (mailing list archive)
State: New
Series: Speculative page faults
On Fri, Jan 28, 2022 at 05:09:54AM -0800, Michel Lespinasse wrote:
> Introduce mmu_notifier_lock as a per-mm percpu_rw_semaphore,
> as well as the code to initialize and destroy it together with the mm.
>
> This lock will be used to prevent races between mmu_notifier_register()
> and speculative fault handlers that need to fire MMU notifications
> without holding any of the mmap or rmap locks.
>
> Signed-off-by: Michel Lespinasse <michel@lespinasse.org>
> ---
>  include/linux/mm_types.h     |  6 +++++-
>  include/linux/mmu_notifier.h | 27 +++++++++++++++++++++++++--
>  kernel/fork.c                |  3 ++-
>  3 files changed, 32 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 305f05d2a4bc..f77e2dec038d 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -462,6 +462,7 @@ struct vm_area_struct {
>  } __randomize_layout;
>
>  struct kioctx_table;
> +struct percpu_rw_semaphore;
>  struct mm_struct {
>  	struct {
>  		struct vm_area_struct *mmap;	/* list of VMAs */
> @@ -608,7 +609,10 @@ struct mm_struct {
>  		struct file __rcu *exe_file;
>  #ifdef CONFIG_MMU_NOTIFIER
>  		struct mmu_notifier_subscriptions *notifier_subscriptions;
> -#endif
> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> +		struct percpu_rw_semaphore *mmu_notifier_lock;
> +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
> +#endif /* CONFIG_MMU_NOTIFIER */
>  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
>  		pgtable_t pmd_huge_pte; /* protected by page_table_lock */
>  #endif
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> index 45fc2c81e370..ace76fe91c0c 100644
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -6,6 +6,8 @@
>  #include <linux/spinlock.h>
>  #include <linux/mm_types.h>
>  #include <linux/mmap_lock.h>
> +#include <linux/percpu-rwsem.h>
> +#include <linux/slab.h>
>  #include <linux/srcu.h>
>  #include <linux/interval_tree.h>
>
> @@ -499,15 +501,35 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
>  	__mmu_notifier_invalidate_range(mm, start, end);
>  }
>
> -static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
> +static inline bool mmu_notifier_subscriptions_init(struct mm_struct *mm)
>  {
> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> +	mm->mmu_notifier_lock = kzalloc(sizeof(struct percpu_rw_semaphore), GFP_KERNEL);
> +	if (!mm->mmu_notifier_lock)
> +		return false;
> +	if (percpu_init_rwsem(mm->mmu_notifier_lock)) {
> +		kfree(mm->mmu_notifier_lock);
> +		return false;
> +	}
> +#endif
> +
>  	mm->notifier_subscriptions = NULL;
> +	return true;
>  }
>
>  static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
>  {
>  	if (mm_has_notifiers(mm))
>  		__mmu_notifier_subscriptions_destroy(mm);
> +
> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> +	if (!in_atomic()) {
> +		percpu_free_rwsem(mm->mmu_notifier_lock);
> +		kfree(mm->mmu_notifier_lock);
> +	} else {
> +		percpu_rwsem_async_destroy(mm->mmu_notifier_lock);
> +	}
> +#endif
>  }

We have received a bug report from our customer running the Android GKI kernel
android-13-5.15 branch, where this series is included. As the callstack [1]
indicates, the non-atomic test itself is not sufficient to free the percpu
rwsem.

The scenario deduced from the callstack:

- Context switch on CPU#0 from 'A' to idle. The idle thread took A's mm.

- 'A' later ran on another CPU and exited. A's mm still holds a reference.

- Now CPU#0 is being hotplugged out. As part of this, the idle thread's
  mm is switched (in idle_task_exit()) but freeing its active_mm is
  deferred to finish_cpu(), which gets called later from the control processor
  (the thread which initiated the CPU hotplug). Please see the reasoning
  on why mmdrop() is not called in idle_task_exit() at
  commit bf2c59fce4074 ('sched/core: Fix illegal RCU from offline CPUs').

- Now, when finish_cpu() calls percpu_free_rwsem() directly (we are not in an
  atomic context, but we are on the hotplug path where cpus_write_lock() is
  held), we hit the deadlock.

I am not sure if there is a clean way other than freeing the per-cpu
rwsemaphore asynchronously all the time.

[1]

-001|context_switch(inline)
-001|__schedule()
-002|__preempt_count_sub(inline)
-002|schedule()
-003|_raw_spin_unlock_irq(inline)
-003|spin_unlock_irq(inline)
-003|percpu_rwsem_wait()
-004|__preempt_count_add(inline)
-004|__percpu_down_read()
-005|percpu_down_read(inline)
-005|cpus_read_lock() // trying to get cpu_hotplug_lock again
-006|rcu_barrier()
-007|rcu_sync_dtor()
-008|mmu_notifier_subscriptions_destroy(inline)
-008|__mmdrop()
-009|mmdrop(inline)
-009|finish_cpu()
-010|cpuhp_invoke_callback()
-011|cpuhp_invoke_callback_range(inline)
-011|cpuhp_down_callbacks()
-012|_cpu_down() // acquired cpu_hotplug_lock (write lock)

Thanks,
Pavan
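
A minimal sketch of the fix Pavan proposes above (unconditionally deferring the
teardown) could look as follows. This is only an illustration, assuming that
percpu_rwsem_async_destroy(), which is added by another patch in this series,
also frees the rwsem allocation from its deferred worker:

static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
	if (mm_has_notifiers(mm))
		__mmu_notifier_subscriptions_destroy(mm);

#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
	/*
	 * __mmdrop() may run from finish_cpu() with cpu_hotplug_lock
	 * write-held; a synchronous percpu_free_rwsem() would then
	 * deadlock on cpus_read_lock() inside rcu_barrier(). Always
	 * tear the semaphore down asynchronously instead of testing
	 * in_atomic(). Assumes the async helper also kfree()s the
	 * allocation once its work item runs.
	 */
	percpu_rwsem_async_destroy(mm->mmu_notifier_lock);
#endif
}
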
On Wed, Jul 27, 2022 at 12:34 AM Pavan Kondeti <quic_pkondeti@quicinc.com> wrote:
>
> On Fri, Jan 28, 2022 at 05:09:54AM -0800, Michel Lespinasse wrote:
> > Introduce mmu_notifier_lock as a per-mm percpu_rw_semaphore,
> > as well as the code to initialize and destroy it together with the mm.
> >
> > This lock will be used to prevent races between mmu_notifier_register()
> > and speculative fault handlers that need to fire MMU notifications
> > without holding any of the mmap or rmap locks.
> >
> > Signed-off-by: Michel Lespinasse <michel@lespinasse.org>
> > ---
> >  include/linux/mm_types.h     |  6 +++++-
> >  include/linux/mmu_notifier.h | 27 +++++++++++++++++++++++++--
> >  kernel/fork.c                |  3 ++-
> >  3 files changed, 32 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 305f05d2a4bc..f77e2dec038d 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -462,6 +462,7 @@ struct vm_area_struct {
> >  } __randomize_layout;
> >
> >  struct kioctx_table;
> > +struct percpu_rw_semaphore;
> >  struct mm_struct {
> >  	struct {
> >  		struct vm_area_struct *mmap;	/* list of VMAs */
> > @@ -608,7 +609,10 @@ struct mm_struct {
> >  		struct file __rcu *exe_file;
> >  #ifdef CONFIG_MMU_NOTIFIER
> >  		struct mmu_notifier_subscriptions *notifier_subscriptions;
> > -#endif
> > +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> > +		struct percpu_rw_semaphore *mmu_notifier_lock;
> > +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
> > +#endif /* CONFIG_MMU_NOTIFIER */
> >  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
> >  		pgtable_t pmd_huge_pte; /* protected by page_table_lock */
> >  #endif
> > diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> > index 45fc2c81e370..ace76fe91c0c 100644
> > --- a/include/linux/mmu_notifier.h
> > +++ b/include/linux/mmu_notifier.h
> > @@ -6,6 +6,8 @@
> >  #include <linux/spinlock.h>
> >  #include <linux/mm_types.h>
> >  #include <linux/mmap_lock.h>
> > +#include <linux/percpu-rwsem.h>
> > +#include <linux/slab.h>
> >  #include <linux/srcu.h>
> >  #include <linux/interval_tree.h>
> >
> > @@ -499,15 +501,35 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
> >  	__mmu_notifier_invalidate_range(mm, start, end);
> >  }
> >
> > -static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
> > +static inline bool mmu_notifier_subscriptions_init(struct mm_struct *mm)
> >  {
> > +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> > +	mm->mmu_notifier_lock = kzalloc(sizeof(struct percpu_rw_semaphore), GFP_KERNEL);
> > +	if (!mm->mmu_notifier_lock)
> > +		return false;
> > +	if (percpu_init_rwsem(mm->mmu_notifier_lock)) {
> > +		kfree(mm->mmu_notifier_lock);
> > +		return false;
> > +	}
> > +#endif
> > +
> >  	mm->notifier_subscriptions = NULL;
> > +	return true;
> >  }
> >
> >  static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
> >  {
> >  	if (mm_has_notifiers(mm))
> >  		__mmu_notifier_subscriptions_destroy(mm);
> > +
> > +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> > +	if (!in_atomic()) {
> > +		percpu_free_rwsem(mm->mmu_notifier_lock);
> > +		kfree(mm->mmu_notifier_lock);
> > +	} else {
> > +		percpu_rwsem_async_destroy(mm->mmu_notifier_lock);
> > +	}
> > +#endif
> >  }
> >
>
> We have received a bug report from our customer running the Android GKI kernel
> android-13-5.15 branch, where this series is included. As the callstack [1]
> indicates, the non-atomic test itself is not sufficient to free the percpu
> rwsem.
>
> The scenario deduced from the callstack:
>
> - Context switch on CPU#0 from 'A' to idle. The idle thread took A's mm.
>
> - 'A' later ran on another CPU and exited. A's mm still holds a reference.
>
> - Now CPU#0 is being hotplugged out. As part of this, the idle thread's
>   mm is switched (in idle_task_exit()) but freeing its active_mm is
>   deferred to finish_cpu(), which gets called later from the control processor
>   (the thread which initiated the CPU hotplug). Please see the reasoning
>   on why mmdrop() is not called in idle_task_exit() at
>   commit bf2c59fce4074 ('sched/core: Fix illegal RCU from offline CPUs').
>
> - Now, when finish_cpu() calls percpu_free_rwsem() directly (we are not in an
>   atomic context, but we are on the hotplug path where cpus_write_lock() is
>   held), we hit the deadlock.
>
> I am not sure if there is a clean way other than freeing the per-cpu
> rwsemaphore asynchronously all the time.

Thanks for reporting this issue, Pavan. I think your suggestion of doing
unconditional async destruction of mmu_notifier_lock would be fine here.
percpu_rwsem_async_destroy has a bit of an overhead to schedule that work,
but I don't think the exit path is so performance-critical that it would
suffer from that. Michel, WDYT?

>
> [1]
>
> -001|context_switch(inline)
> -001|__schedule()
> -002|__preempt_count_sub(inline)
> -002|schedule()
> -003|_raw_spin_unlock_irq(inline)
> -003|spin_unlock_irq(inline)
> -003|percpu_rwsem_wait()
> -004|__preempt_count_add(inline)
> -004|__percpu_down_read()
> -005|percpu_down_read(inline)
> -005|cpus_read_lock() // trying to get cpu_hotplug_lock again
> -006|rcu_barrier()
> -007|rcu_sync_dtor()
> -008|mmu_notifier_subscriptions_destroy(inline)
> -008|__mmdrop()
> -009|mmdrop(inline)
> -009|finish_cpu()
> -010|cpuhp_invoke_callback()
> -011|cpuhp_invoke_callback_range(inline)
> -011|cpuhp_down_callbacks()
> -012|_cpu_down() // acquired cpu_hotplug_lock (write lock)
>
> Thanks,
> Pavan
>
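
For context on the overhead mentioned above: an asynchronous destroy generally
has to set up a work item and defer percpu_free_rwsem() to a workqueue. The
sketch below only illustrates that pattern; the struct and function names here
are assumptions for illustration and do not reflect the series' actual
percpu_rwsem_async_destroy() implementation.

#include <linux/percpu-rwsem.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Hypothetical wrapper carrying the rwsem to a workqueue for teardown. */
struct percpu_rwsem_destroy_work {
	struct work_struct work;
	struct percpu_rw_semaphore *sem;
};

static void percpu_rwsem_destroy_workfn(struct work_struct *work)
{
	struct percpu_rwsem_destroy_work *dw =
		container_of(work, struct percpu_rwsem_destroy_work, work);

	/* Workqueue context may block and take cpu_hotplug_lock safely. */
	percpu_free_rwsem(dw->sem);
	kfree(dw->sem);
	kfree(dw);
}

static void percpu_rwsem_destroy_deferred(struct percpu_rw_semaphore *sem)
{
	struct percpu_rwsem_destroy_work *dw;

	dw = kmalloc(sizeof(*dw), GFP_ATOMIC);
	if (!dw)
		return;	/* error handling elided in this sketch */

	dw->sem = sem;
	INIT_WORK(&dw->work, percpu_rwsem_destroy_workfn);
	schedule_work(&dw->work);
}
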
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 305f05d2a4bc..f77e2dec038d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -462,6 +462,7 @@ struct vm_area_struct {
 } __randomize_layout;
 
 struct kioctx_table;
+struct percpu_rw_semaphore;
 struct mm_struct {
 	struct {
 		struct vm_area_struct *mmap;	/* list of VMAs */
@@ -608,7 +609,10 @@ struct mm_struct {
 		struct file __rcu *exe_file;
 #ifdef CONFIG_MMU_NOTIFIER
 		struct mmu_notifier_subscriptions *notifier_subscriptions;
-#endif
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+		struct percpu_rw_semaphore *mmu_notifier_lock;
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+#endif /* CONFIG_MMU_NOTIFIER */
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 		pgtable_t pmd_huge_pte; /* protected by page_table_lock */
 #endif
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 45fc2c81e370..ace76fe91c0c 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -6,6 +6,8 @@
 #include <linux/spinlock.h>
 #include <linux/mm_types.h>
 #include <linux/mmap_lock.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/slab.h>
 #include <linux/srcu.h>
 #include <linux/interval_tree.h>
 
@@ -499,15 +501,35 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
 	__mmu_notifier_invalidate_range(mm, start, end);
 }
 
-static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
+static inline bool mmu_notifier_subscriptions_init(struct mm_struct *mm)
 {
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	mm->mmu_notifier_lock = kzalloc(sizeof(struct percpu_rw_semaphore), GFP_KERNEL);
+	if (!mm->mmu_notifier_lock)
+		return false;
+	if (percpu_init_rwsem(mm->mmu_notifier_lock)) {
+		kfree(mm->mmu_notifier_lock);
+		return false;
+	}
+#endif
+
 	mm->notifier_subscriptions = NULL;
+	return true;
 }
 
 static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
 {
 	if (mm_has_notifiers(mm))
 		__mmu_notifier_subscriptions_destroy(mm);
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	if (!in_atomic()) {
+		percpu_free_rwsem(mm->mmu_notifier_lock);
+		kfree(mm->mmu_notifier_lock);
+	} else {
+		percpu_rwsem_async_destroy(mm->mmu_notifier_lock);
+	}
+#endif
 }
 
@@ -724,8 +746,9 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
 {
 }
 
-static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
+static inline bool mmu_notifier_subscriptions_init(struct mm_struct *mm)
 {
+	return true;
 }
 
 static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2e5f2e8de31a..db92e42d0087 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1069,7 +1069,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm_init_owner(mm, p);
 	mm_init_pasid(mm);
 	RCU_INIT_POINTER(mm->exe_file, NULL);
-	mmu_notifier_subscriptions_init(mm);
+	if (!mmu_notifier_subscriptions_init(mm))
+		goto fail_nopgd;
 	init_tlb_flush_pending(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	mm->pmd_huge_pte = NULL;
Introduce mmu_notifier_lock as a per-mm percpu_rw_semaphore,
as well as the code to initialize and destroy it together with the mm.

This lock will be used to prevent races between mmu_notifier_register()
and speculative fault handlers that need to fire MMU notifications
without holding any of the mmap or rmap locks.

Signed-off-by: Michel Lespinasse <michel@lespinasse.org>
---
 include/linux/mm_types.h     |  6 +++++-
 include/linux/mmu_notifier.h | 27 +++++++++++++++++++++++++--
 kernel/fork.c                |  3 ++-
 3 files changed, 32 insertions(+), 4 deletions(-)
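
For context, the commit message above implies a read/write split: speculative
fault handlers take the read side of mm->mmu_notifier_lock, while
mmu_notifier_register() takes the write side around publishing the
subscriptions. A rough sketch of that pairing follows; the helper names
(mmu_notifier_trylock/mmu_notifier_unlock) are assumptions for illustration
and not necessarily the interface defined elsewhere in the series.

#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
/* Speculative fault path: read side, must not sleep, so only trylock. */
static inline bool mmu_notifier_trylock(struct mm_struct *mm)
{
	return percpu_down_read_trylock(mm->mmu_notifier_lock);
}

static inline void mmu_notifier_unlock(struct mm_struct *mm)
{
	percpu_up_read(mm->mmu_notifier_lock);
}

/*
 * mmu_notifier_register() would wrap the point where it publishes
 * mm->notifier_subscriptions with the write side, so a speculative
 * fault either observes the new subscriptions or backs off and
 * retries under the mmap lock:
 *
 *	percpu_down_write(mm->mmu_notifier_lock);
 *	... install subscriptions ...
 *	percpu_up_write(mm->mmu_notifier_lock);
 */
#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */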