
[RESEND,2/2] mm: zswap: use SRCU to synchronize with CPU hotunplug

Message ID 20250107074724.1756696-2-yosryahmed@google.com (mailing list archive)
State New
Series [RESEND,1/2] Revert "mm: zswap: fix race between [de]compression and CPU hotunplug"

Commit Message

Yosry Ahmed Jan. 7, 2025, 7:47 a.m. UTC
In zswap_compress() and zswap_decompress(), the per-CPU acomp_ctx of the
current CPU at the beginning of the operation is retrieved and used
throughout.  However, since neither preemption nor migration are disabled,
it is possible that the operation continues on a different CPU.

If the original CPU is hotunplugged while the acomp_ctx is still in use,
we run into a UAF bug as the resources attached to the acomp_ctx are freed
during hotunplug in zswap_cpu_comp_dead().

The problem was introduced in commit 1ec3b5fe6eec ("mm/zswap: move to use
crypto_acomp API for hardware acceleration") when the switch to the
crypto_acomp API was made.  Prior to that, the per-CPU crypto_comp was
retrieved using get_cpu_ptr() which disables preemption and makes sure the
CPU cannot go away from under us.  Preemption cannot be disabled with the
crypto_acomp API as a sleepable context is needed.
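
For illustration, the pre-1ec3b5fe6eec decompression path looked roughly like
the sketch below (a from-memory approximation, not a verbatim copy of the old
code):

/* per-CPU crypto_comp pinned with preemption disabled, so the CPU stays put */
struct crypto_comp *tfm;

tfm = *get_cpu_ptr(entry->pool->tfm);	/* disables preemption */
ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
put_cpu_ptr(entry->pool->tfm);		/* re-enables preemption */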

Commit 8ba2f844f050 ("mm/zswap: change per-cpu mutex and buffer to
per-acomp_ctx") increased the UAF surface area by making the per-CPU
buffers dynamic, adding yet another resource that can be freed from under
zswap compression/decompression by CPU hotunplug.

There are a few ways to fix this:
(a) Add a refcount for acomp_ctx.
(b) Disable migration while using the per-CPU acomp_ctx.
(c) Use SRCU to wait for other CPUs using the acomp_ctx of the CPU being
hotunplugged. Normal RCU cannot be used as a sleepable context is
required.

Implement (c) since it's simpler than (a), and (b) involves using
migrate_disable() which is apparently undesired (see huge comment in
include/linux/preempt.h).

Fixes: 1ec3b5fe6eec ("mm/zswap: move to use crypto_acomp API for hardware acceleration")
Cc: <stable@vger.kernel.org>
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reported-by: Johannes Weiner <hannes@cmpxchg.org>
Closes: https://lore.kernel.org/lkml/20241113213007.GB1564047@cmpxchg.org/
Reported-by: Sam Sun <samsun1006219@gmail.com>
Closes: https://lore.kernel.org/lkml/CAEkJfYMtSdM5HceNsXUDf5haghD5+o2e7Qv4OcuruL4tPg6OaQ@mail.gmail.com/
---
 mm/zswap.c | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)
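
Only the teardown-side hunk of this patch is quoted in the replies below; the
reader side of approach (c) would presumably wrap each per-CPU acomp_ctx use
in an SRCU read-side critical section, roughly as in this sketch (not the
exact hunks from the patch):

/* zswap_compress()/zswap_decompress(): pin acomp_ctx against hotunplug teardown */
int srcu_idx = srcu_read_lock(&acomp_srcu);

acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
mutex_lock(&acomp_ctx->mutex);
/* ... [de]compression using acomp_ctx->acomp, ->req, ->buffer ... */
mutex_unlock(&acomp_ctx->mutex);
srcu_read_unlock(&acomp_srcu, srcu_idx);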

Comments

Johannes Weiner Jan. 7, 2025, 6:03 p.m. UTC | #1
On Tue, Jan 07, 2025 at 07:47:24AM +0000, Yosry Ahmed wrote:
> [...]
> 
> diff --git a/mm/zswap.c b/mm/zswap.c
> index f6316b66fb236..add1406d693b8 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -864,12 +864,22 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>  	return ret;
>  }
>  
> +DEFINE_STATIC_SRCU(acomp_srcu);
> +
>  static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
>  {
>  	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
>  	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
>  
>  	if (!IS_ERR_OR_NULL(acomp_ctx)) {
> +		/*
> +		 * Even though the acomp_ctx should not be currently in use on
> +		 * @cpu, it may still be used by compress/decompress operations
> +		 * that started on @cpu and migrated to a different CPU. Wait
> +		 * for such usages to complete, any new usages would be a bug.
> +		 */
> +		synchronize_srcu(&acomp_srcu);

The docs suggest you can't solve it like that :(

Documentation/RCU/Design/Requirements/Requirements.rst:

  Also unlike other RCU flavors, synchronize_srcu() may **not** be
  invoked from CPU-hotplug notifiers, due to the fact that SRCU grace
  periods make use of timers and the possibility of timers being
  temporarily “stranded” on the outgoing CPU. This stranding of timers
  means that timers posted to the outgoing CPU will not fire until
  late in the CPU-hotplug process. The problem is that if a notifier
  is waiting on an SRCU grace period, that grace period is waiting on
  a timer, and that timer is stranded on the outgoing CPU, then the
  notifier will never be awakened, in other words, deadlock has
  occurred. This same situation of course also prohibits
  srcu_barrier() from being invoked from CPU-hotplug notifiers.
Yosry Ahmed Jan. 7, 2025, 6:13 p.m. UTC | #2
On Tue, Jan 7, 2025 at 10:03 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Jan 07, 2025 at 07:47:24AM +0000, Yosry Ahmed wrote:
> > [...]
>
> The docs suggest you can't solve it like that :(
>
> Documentation/RCU/Design/Requirements/Requirements.rst: [...]

Thanks for checking, I completely missed this. I guess it only works
with SRCU if we use call_srcu(), but then we need to copy the pointers
to a new struct to avoid racing with the CPU getting onlined again.
Otherwise we can just bite the bullet and add a refcount, or use
migrate_disable() despite that being undesirable.

Do you have a favorite? :)
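
A minimal sketch of the call_srcu() variant floated above, with hypothetical
names: the hotunplug callback copies the pointers into a private holder and
defers the actual freeing to an SRCU callback, so a later CPU online cannot
race with it (holder allocation and error handling omitted; whether freeing
crypto resources from callback context is acceptable is left open):

struct acomp_ctx_free_holder {		/* hypothetical */
	struct rcu_head rcu;
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	u8 *buffer;
};

static void acomp_ctx_free_cb(struct rcu_head *rcu)
{
	struct acomp_ctx_free_holder *h =
		container_of(rcu, struct acomp_ctx_free_holder, rcu);

	if (!IS_ERR_OR_NULL(h->req))
		acomp_request_free(h->req);
	if (!IS_ERR_OR_NULL(h->acomp))
		crypto_free_acomp(h->acomp);
	kfree(h->buffer);
	kfree(h);
}

/* in zswap_cpu_comp_dead(): copy acomp_ctx's pointers into h, then: */
call_srcu(&acomp_srcu, &h->rcu, acomp_ctx_free_cb);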
Yosry Ahmed Jan. 7, 2025, 8:16 p.m. UTC | #3
On Tue, Jan 7, 2025 at 10:13 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Tue, Jan 7, 2025 at 10:03 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > [...]
>
> Thanks for checking, I completely missed this. I guess it only works
> with SRCU if we use call_srcu(), but then we need to copy the pointers
> to a new struct to avoid racing with the CPU getting onlined again.
> Otherwise we can just bite the bullet and add a refcount, or use
> migrate_disable() despite that being undesirable.
>
> Do you have a favorite? :)

I briefly looked into refcounting. The annoying thing is that we need
to handle the race between putting the last refcount in
zswap_compress()/zswap_decompress(), and the CPU getting onlined again
and re-initializing the refcount. One way to do it would be to put all
dynamically allocated resources in the same struct as the new refcount,
and use RCU + refcounts to allocate and free the struct as a whole.

I am leaning toward just disabling migration for now tbh unless there
are objections to that, especially this close to the v6.13 release.
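
Roughly what the refcount idea above would entail, as a hedged sketch with
hypothetical names: the dynamically allocated resources and the refcount live
in one RCU-managed object, so a CPU coming back online installs a fresh object
instead of re-initializing one whose last reference is still being dropped:

struct acomp_ctx_res {			/* hypothetical */
	struct kref refs;
	struct rcu_head rcu;
	struct crypto_acomp *acomp;
	struct acomp_req *req;
	u8 *buffer;
};

/* readers: take a reference under RCU before using the resources */
static struct acomp_ctx_res *acomp_ctx_res_get(struct acomp_ctx_res __rcu **slot)
{
	struct acomp_ctx_res *res;

	rcu_read_lock();
	res = rcu_dereference(*slot);
	if (res && !kref_get_unless_zero(&res->refs))
		res = NULL;
	rcu_read_unlock();
	return res;	/* caller does kref_put() when done */
}

/*
 * zswap_cpu_comp_dead() would rcu_assign_pointer(*slot, NULL) and drop its
 * reference; the kref release callback frees acomp/req/buffer after the last
 * in-flight user drops out.
 */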
Yosry Ahmed Jan. 7, 2025, 9:42 p.m. UTC | #4
On Tue, Jan 7, 2025 at 12:16 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> [...]
>
> I briefly looked into refcounting. The annoying thing is that we need
> to handle the race between putting the last refcount in
> zswap_compress()/zswap_decompress(), and the CPU getting onlined again
> and re-initializing the refcount. One way to do it would be to put all
> dynamically allocated resources in a struct with the same struct with
> the new refcount, and use RCU + refcounts to allocate and free the
> struct as a whole.
>
> I am leaning toward just disabling migration for now tbh unless there
> are objections to that, especially this close to the v6.13 release.

(Sorry for going back and forth on this, I am essentially thinking out loud)

Actually, as Kanchana mentioned before, we should be able to just hold
the mutex in zswap_cpu_comp_dead() before freeing the dynamic
resources. The mutex is allocated when the pool is created and will
not go away during CPU hotunplug AFAICT. It confused me before because
we call mutex_init() in zswap_cpu_comp_prepare(), but it really should
be in zswap_pool_create() after we allocate the pool->acomp_ctx.
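
The idea as stated, roughly (a sketch; as the next message points out, this
alone still leaves a window between raw_cpu_ptr() and mutex_lock() on the
compress/decompress side):

/* zswap_cpu_comp_dead(): serialize teardown against in-flight [de]compression */
mutex_lock(&acomp_ctx->mutex);
if (!IS_ERR_OR_NULL(acomp_ctx->req))
	acomp_request_free(acomp_ctx->req);
if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
	crypto_free_acomp(acomp_ctx->acomp);
kfree(acomp_ctx->buffer);
mutex_unlock(&acomp_ctx->mutex);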
Yosry Ahmed Jan. 7, 2025, 9:58 p.m. UTC | #5
On Tue, Jan 7, 2025 at 1:42 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> [...]
>
> (Sorry for going back and forth on this, I am essentially thinking out loud)
>
> Actually, as Kanchana mentioned before, we should be able to just hold
> the mutex in zswap_cpu_comp_dead() before freeing the dynamic
> resources. The mutex is allocated when the pool is created and will
> not go away during CPU hotunplug AFAICT. It confused me before because
> we call mutex_init() in zswap_cpu_comp_prepare(), but it really should
> be in zswap_pool_create() after we allocate the pool->acomp_ctx.

Nope. It's possible for zswap_cpu_comp_dead() to hold the mutex and
free the resources after zswap_[de]compress() calls raw_cpu_ptr() but
before it calls mutex_lock().
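
Spelled out, the window is between fetching the per-CPU pointer and taking the
mutex:

/*
 *   zswap_compress()                      zswap_cpu_comp_dead()
 *
 *   acomp_ctx = raw_cpu_ptr(...);
 *                                         mutex_lock(&acomp_ctx->mutex);
 *                                         free req, buffer, acomp
 *                                         mutex_unlock(&acomp_ctx->mutex);
 *   mutex_lock(&acomp_ctx->mutex);        <- succeeds, but the resources
 *   dst = acomp_ctx->buffer;                 are already freed (UAF)
 */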
Nhat Pham Jan. 8, 2025, 3:56 a.m. UTC | #6
On Wed, Jan 8, 2025 at 3:17 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> [...]
>
> I briefly looked into refcounting. The annoying thing is that we need
> to handle the race between putting the last refcount in
> zswap_compress()/zswap_decompress(), and the CPU getting onlined again
> and re-initializing the refcount. One way to do it would be to put all
> dynamically allocated resources in a struct with the same struct with
> the new refcount, and use RCU + refcounts to allocate and free the
> struct as a whole.
>
> I am leaning toward just disabling migration for now tbh unless there
> are objections to that, especially this close to the v6.13 release.

I much prefer the refcounting solution. IMO it's the "proper" fix -
disabling migration is such a heavy-handed resolution. A massive
hammer for a tiny nail, so to speak.

Is this a frequently occurring problem in the wild? If so, we can
disable migration to firefight, and then do the proper thing down the
line.

(was also gonna suggest looking at zram, but it seems they have a
custom compression API now...)
Yosry Ahmed Jan. 8, 2025, 4:14 a.m. UTC | #7
On Tue, Jan 7, 2025 at 7:56 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> [...]
>
> I much prefer the refcounting solution. IMO it's the "proper" fix -
> disabling migration is such a heavy-handed resolution. A massive
> hammer for a tiny nail, so to speak.

I may have found a simpler "proper" fix than disabling migration,
please see my suggestion in:
https://lore.kernel.org/lkml/CAJD7tkYpNNsbTZZqFoRh-FkXDgxONZEUPKk1YQv7-TFMWWQRzQ@mail.gmail.com/

>
> Is this a frequently occurring problem in the wild? If so, we can
> disable migration to firefight, and then do the proper thing down the
> line.

I don't believe so. Actually, I think the deadlock introduced by the
previous fix is more problematic than the UAF it fixes.

Andrew, could you please pick up patch 1 (the revert) while we figure
out the alternative fix? It's important that it lands in v6.13 to
avoid the possibility of deadlock. Figuring out an alternative fix is
less important.
Nhat Pham Jan. 8, 2025, 4:34 a.m. UTC | #8
On Wed, Jan 8, 2025 at 11:14 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Tue, Jan 7, 2025 at 7:56 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> I may have found a simpler "proper" fix than disabling migration,
> please see my suggestion in:
> https://lore.kernel.org/lkml/CAJD7tkYpNNsbTZZqFoRh-FkXDgxONZEUPKk1YQv7-TFMWWQRzQ@mail.gmail.com/

Discovered that thread just now - sorry, too many emails to catch up
on :) Taking a look now.

>
> >
> > Is this a frequently occured problem in the wild? If so, we can
> > disable migration to firefight, and then do the proper thing down the
> > line.
>
> I don't believe so. Actually, I think the deadlock introduced by the
> previous fix is more problematic than the UAF it fixes.
>
> Andrew, could you please pick up patch 1 (the revert) while we figure
> out the alternative fix? It's important that it lands in v6.13 to
> avoid the possibility of deadlock. Figuring out an alternative fix is
> less important.

Agree. Let's revert the "fix" first. CPU offlining is a much rarer
event than this deadlocking scenario discovered by syzbot.
Andrew Morton Jan. 8, 2025, 6:42 a.m. UTC | #9
> 
> Andrew, could you please pick up patch 1 (the revert) while we figure
> out the alternative fix? It's important that it lands in v6.13 to
> avoid the possibility of deadlock. Figuring out an alternative fix is
> less important.

I have the below patch in mm-hotfixes-unstable.

I also have
https://lkml.kernel.org/r/20250107222236.2715883-2-yosryahmed@google.com
in mm-hotfixes-unstable.  Don't know what to do with it.

I have no patch "mm: zswap: use SRCU to synchronize with CPU hotunplug"
in mm-unstable.



From: Yosry Ahmed <yosryahmed@google.com>
Subject: Revert "mm: zswap: fix race between [de]compression and CPU hotunplug"
Date: Tue, 7 Jan 2025 22:22:34 +0000

This reverts commit eaebeb93922ca6ab0dd92027b73d0112701706ef.

Commit eaebeb93922c ("mm: zswap: fix race between [de]compression and CPU
hotunplug") used the CPU hotplug lock in zswap compress/decompress
operations to protect against a race with CPU hotunplug making some
per-CPU resources go away.

However, zswap compress/decompress can be reached through reclaim while
the lock is held, resulting in a potential deadlock as reported by syzbot:
======================================================
WARNING: possible circular locking dependency detected
6.13.0-rc6-syzkaller-00006-g5428dc1906dd #0 Not tainted
------------------------------------------------------
kswapd0/89 is trying to acquire lock:
 ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: acomp_ctx_get_cpu mm/zswap.c:886 [inline]
 ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_compress mm/zswap.c:908 [inline]
 ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_store_page mm/zswap.c:1439 [inline]
 ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_store+0xa74/0x1ba0 mm/zswap.c:1546

but task is already holding lock:
 ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat mm/vmscan.c:6871 [inline]
 ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: kswapd+0xb58/0x2f30 mm/vmscan.c:7253

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (fs_reclaim){+.+.}-{0:0}:
        lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
        __fs_reclaim_acquire mm/page_alloc.c:3853 [inline]
        fs_reclaim_acquire+0x88/0x130 mm/page_alloc.c:3867
        might_alloc include/linux/sched/mm.h:318 [inline]
        slab_pre_alloc_hook mm/slub.c:4070 [inline]
        slab_alloc_node mm/slub.c:4148 [inline]
        __kmalloc_cache_node_noprof+0x40/0x3a0 mm/slub.c:4337
        kmalloc_node_noprof include/linux/slab.h:924 [inline]
        alloc_worker kernel/workqueue.c:2638 [inline]
        create_worker+0x11b/0x720 kernel/workqueue.c:2781
        workqueue_prepare_cpu+0xe3/0x170 kernel/workqueue.c:6628
        cpuhp_invoke_callback+0x48d/0x830 kernel/cpu.c:194
        __cpuhp_invoke_callback_range kernel/cpu.c:965 [inline]
        cpuhp_invoke_callback_range kernel/cpu.c:989 [inline]
        cpuhp_up_callbacks kernel/cpu.c:1020 [inline]
        _cpu_up+0x2b3/0x580 kernel/cpu.c:1690
        cpu_up+0x184/0x230 kernel/cpu.c:1722
        cpuhp_bringup_mask+0xdf/0x260 kernel/cpu.c:1788
        cpuhp_bringup_cpus_parallel+0xf9/0x160 kernel/cpu.c:1878
        bringup_nonboot_cpus+0x2b/0x50 kernel/cpu.c:1892
        smp_init+0x34/0x150 kernel/smp.c:1009
        kernel_init_freeable+0x417/0x5d0 init/main.c:1569
        kernel_init+0x1d/0x2b0 init/main.c:1466
        ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
        ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244

-> #0 (cpu_hotplug_lock){++++}-{0:0}:
        check_prev_add kernel/locking/lockdep.c:3161 [inline]
        check_prevs_add kernel/locking/lockdep.c:3280 [inline]
        validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904
        __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226
        lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
        percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
        cpus_read_lock+0x42/0x150 kernel/cpu.c:490
        acomp_ctx_get_cpu mm/zswap.c:886 [inline]
        zswap_compress mm/zswap.c:908 [inline]
        zswap_store_page mm/zswap.c:1439 [inline]
        zswap_store+0xa74/0x1ba0 mm/zswap.c:1546
        swap_writepage+0x647/0xce0 mm/page_io.c:279
        shmem_writepage+0x1248/0x1610 mm/shmem.c:1579
        pageout mm/vmscan.c:696 [inline]
        shrink_folio_list+0x35ee/0x57e0 mm/vmscan.c:1374
        shrink_inactive_list mm/vmscan.c:1967 [inline]
        shrink_list mm/vmscan.c:2205 [inline]
        shrink_lruvec+0x16db/0x2f30 mm/vmscan.c:5734
        mem_cgroup_shrink_node+0x385/0x8e0 mm/vmscan.c:6575
        mem_cgroup_soft_reclaim mm/memcontrol-v1.c:312 [inline]
        memcg1_soft_limit_reclaim+0x346/0x810 mm/memcontrol-v1.c:362
        balance_pgdat mm/vmscan.c:6975 [inline]
        kswapd+0x17b3/0x2f30 mm/vmscan.c:7253
        kthread+0x2f0/0x390 kernel/kthread.c:389
        ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
        ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(fs_reclaim);
                               lock(cpu_hotplug_lock);
                               lock(fs_reclaim);
  rlock(cpu_hotplug_lock);

 *** DEADLOCK ***

1 lock held by kswapd0/89:
  #0: ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat mm/vmscan.c:6871 [inline]
  #0: ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: kswapd+0xb58/0x2f30 mm/vmscan.c:7253

stack backtrace:
CPU: 0 UID: 0 PID: 89 Comm: kswapd0 Not tainted 6.13.0-rc6-syzkaller-00006-g5428dc1906dd #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024
Call Trace:
 <TASK>
  __dump_stack lib/dump_stack.c:94 [inline]
  dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
  print_circular_bug+0x13a/0x1b0 kernel/locking/lockdep.c:2074
  check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2206
  check_prev_add kernel/locking/lockdep.c:3161 [inline]
  check_prevs_add kernel/locking/lockdep.c:3280 [inline]
  validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904
  __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226
  lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
  percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
  cpus_read_lock+0x42/0x150 kernel/cpu.c:490
  acomp_ctx_get_cpu mm/zswap.c:886 [inline]
  zswap_compress mm/zswap.c:908 [inline]
  zswap_store_page mm/zswap.c:1439 [inline]
  zswap_store+0xa74/0x1ba0 mm/zswap.c:1546
  swap_writepage+0x647/0xce0 mm/page_io.c:279
  shmem_writepage+0x1248/0x1610 mm/shmem.c:1579
  pageout mm/vmscan.c:696 [inline]
  shrink_folio_list+0x35ee/0x57e0 mm/vmscan.c:1374
  shrink_inactive_list mm/vmscan.c:1967 [inline]
  shrink_list mm/vmscan.c:2205 [inline]
  shrink_lruvec+0x16db/0x2f30 mm/vmscan.c:5734
  mem_cgroup_shrink_node+0x385/0x8e0 mm/vmscan.c:6575
  mem_cgroup_soft_reclaim mm/memcontrol-v1.c:312 [inline]
  memcg1_soft_limit_reclaim+0x346/0x810 mm/memcontrol-v1.c:362
  balance_pgdat mm/vmscan.c:6975 [inline]
  kswapd+0x17b3/0x2f30 mm/vmscan.c:7253
  kthread+0x2f0/0x390 kernel/kthread.c:389
  ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
  ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
 </TASK>

Revert the change. A different fix for the race with CPU hotunplug will
follow.

Link: https://lkml.kernel.org/r/20250107222236.2715883-1-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sam Sun <samsun1006219@gmail.com>
Cc: Vitaly Wool <vitalywool@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/zswap.c |   19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

--- a/mm/zswap.c~revert-mm-zswap-fix-race-between-compression-and-cpu-hotunplug
+++ a/mm/zswap.c
@@ -880,18 +880,6 @@ static int zswap_cpu_comp_dead(unsigned
 	return 0;
 }
 
-/* Prevent CPU hotplug from freeing up the per-CPU acomp_ctx resources */
-static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx)
-{
-	cpus_read_lock();
-	return raw_cpu_ptr(acomp_ctx);
-}
-
-static void acomp_ctx_put_cpu(void)
-{
-	cpus_read_unlock();
-}
-
 static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 			   struct zswap_pool *pool)
 {
@@ -905,7 +893,8 @@ static bool zswap_compress(struct page *
 	gfp_t gfp;
 	u8 *dst;
 
-	acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx);
+	acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+
 	mutex_lock(&acomp_ctx->mutex);
 
 	dst = acomp_ctx->buffer;
@@ -961,7 +950,6 @@ unlock:
 		zswap_reject_alloc_fail++;
 
 	mutex_unlock(&acomp_ctx->mutex);
-	acomp_ctx_put_cpu();
 	return comp_ret == 0 && alloc_ret == 0;
 }
 
@@ -972,7 +960,7 @@ static void zswap_decompress(struct zswa
 	struct crypto_acomp_ctx *acomp_ctx;
 	u8 *src;
 
-	acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx);
+	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
 	mutex_lock(&acomp_ctx->mutex);
 
 	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
@@ -1002,7 +990,6 @@ static void zswap_decompress(struct zswa
 
 	if (src != acomp_ctx->buffer)
 		zpool_unmap_handle(zpool, entry->handle);
-	acomp_ctx_put_cpu();
 }
 
 /*********************************
Yosry Ahmed Jan. 8, 2025, 7:24 a.m. UTC | #10
On Tue, Jan 7, 2025 at 10:42 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> >
> > Andrew, could you please pick up patch 1 (the revert) while we figure
> > out the alternative fix? It's important that it lands in v6.13 to
> > avoid the possibility of deadlock. Figuring out an alternative fix is
> > less important.
>
> I have the below patch in mm-hotfixes-unstable.

Please only keep that patch, "Revert "mm: zswap: fix race between
[de]compression and CPU hotunplug", in mm-hotfixes-unstable.

>
> I also have
> https://lkml.kernel.org/r/20250107222236.2715883-2-yosryahmed@google.com
> in mm-hotfixes-unstable.  Don't know what to do with it.

Please drop this one.

>
> I have no patch "mm: zswap: use SRCU to synchronize with CPU hotunplug"
> in mm-unstable.

, and keep that one dropped.

I will send another patch this week on top of mm-hotfixes-unstable,
but that should be separate. The revert should land in v6.13
regardless.

Thanks!

> [...]
>   print_circular_bug+0x13a/0x1b0 kernel/locking/lockdep.c:2074
>   check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2206
>   check_prev_add kernel/locking/lockdep.c:3161 [inline]
>   check_prevs_add kernel/locking/lockdep.c:3280 [inline]
>   validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904
>   __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226
>   lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
>   percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
>   cpus_read_lock+0x42/0x150 kernel/cpu.c:490
>   acomp_ctx_get_cpu mm/zswap.c:886 [inline]
>   zswap_compress mm/zswap.c:908 [inline]
>   zswap_store_page mm/zswap.c:1439 [inline]
>   zswap_store+0xa74/0x1ba0 mm/zswap.c:1546
>   swap_writepage+0x647/0xce0 mm/page_io.c:279
>   shmem_writepage+0x1248/0x1610 mm/shmem.c:1579
>   pageout mm/vmscan.c:696 [inline]
>   shrink_folio_list+0x35ee/0x57e0 mm/vmscan.c:1374
>   shrink_inactive_list mm/vmscan.c:1967 [inline]
>   shrink_list mm/vmscan.c:2205 [inline]
>   shrink_lruvec+0x16db/0x2f30 mm/vmscan.c:5734
>   mem_cgroup_shrink_node+0x385/0x8e0 mm/vmscan.c:6575
>   mem_cgroup_soft_reclaim mm/memcontrol-v1.c:312 [inline]
>   memcg1_soft_limit_reclaim+0x346/0x810 mm/memcontrol-v1.c:362
>   balance_pgdat mm/vmscan.c:6975 [inline]
>   kswapd+0x17b3/0x2f30 mm/vmscan.c:7253
>   kthread+0x2f0/0x390 kernel/kthread.c:389
>   ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
>   ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
>  </TASK>
>
> Revert the change. A different fix for the race with CPU hotunplug will
> follow.
>
> Link: https://lkml.kernel.org/r/20250107222236.2715883-1-yosryahmed@google.com
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> Reported-by: syzbot <syzkaller@googlegroups.com>
> Cc: Barry Song <baohua@kernel.org>
> Cc: Chengming Zhou <chengming.zhou@linux.dev>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
> Cc: Nhat Pham <nphamcs@gmail.com>
> Cc: Sam Sun <samsun1006219@gmail.com>
> Cc: Vitaly Wool <vitalywool@gmail.com>
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
>
>  mm/zswap.c |   19 +++----------------
>  1 file changed, 3 insertions(+), 16 deletions(-)
>
> --- a/mm/zswap.c~revert-mm-zswap-fix-race-between-compression-and-cpu-hotunplug
> +++ a/mm/zswap.c
> @@ -880,18 +880,6 @@ static int zswap_cpu_comp_dead(unsigned
>         return 0;
>  }
>
> -/* Prevent CPU hotplug from freeing up the per-CPU acomp_ctx resources */
> -static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx)
> -{
> -       cpus_read_lock();
> -       return raw_cpu_ptr(acomp_ctx);
> -}
> -
> -static void acomp_ctx_put_cpu(void)
> -{
> -       cpus_read_unlock();
> -}
> -
>  static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>                            struct zswap_pool *pool)
>  {
> @@ -905,7 +893,8 @@ static bool zswap_compress(struct page *
>         gfp_t gfp;
>         u8 *dst;
>
> -       acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx);
> +       acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
> +
>         mutex_lock(&acomp_ctx->mutex);
>
>         dst = acomp_ctx->buffer;
> @@ -961,7 +950,6 @@ unlock:
>                 zswap_reject_alloc_fail++;
>
>         mutex_unlock(&acomp_ctx->mutex);
> -       acomp_ctx_put_cpu();
>         return comp_ret == 0 && alloc_ret == 0;
>  }
>
> @@ -972,7 +960,7 @@ static void zswap_decompress(struct zswa
>         struct crypto_acomp_ctx *acomp_ctx;
>         u8 *src;
>
> -       acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx);
> +       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
>         mutex_lock(&acomp_ctx->mutex);
>
>         src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
> @@ -1002,7 +990,6 @@ static void zswap_decompress(struct zswa
>
>         if (src != acomp_ctx->buffer)
>                 zpool_unmap_handle(zpool, entry->handle);
> -       acomp_ctx_put_cpu();
>  }
>
>  /*********************************
> _
>
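For context, a minimal sketch of the SRCU pattern that the patch below
applies. All names here (example_*) are illustrative placeholders, not the
actual zswap symbols, and the hotunplug callback signature is simplified:
readers bracket their use of the per-CPU context with srcu_read_lock() /
srcu_read_unlock(), and the teardown path calls synchronize_srcu() so it
frees the per-CPU resources only after every in-flight reader has finished.

#include <linux/srcu.h>
#include <linux/percpu.h>
#include <linux/slab.h>

DEFINE_STATIC_SRCU(example_srcu);

struct example_ctx {
	void *buffer;		/* freed from the CPU hotunplug callback */
};

static struct example_ctx __percpu *example_ctx;

/* Reader: sleepable, and may migrate to another CPU mid-operation. */
static void example_use(void)
{
	struct example_ctx *ctx;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&example_srcu);
	ctx = raw_cpu_ptr(example_ctx);
	/* ... use ctx->buffer; the original CPU may be hotunplugged meanwhile ... */
	srcu_read_unlock(&example_srcu, srcu_idx);
}

/* Hotunplug teardown: wait for in-flight readers before freeing. */
static int example_cpu_dead(unsigned int cpu)
{
	struct example_ctx *ctx = per_cpu_ptr(example_ctx, cpu);

	synchronize_srcu(&example_srcu);	/* all SRCU read sections done */
	kfree(ctx->buffer);
	ctx->buffer = NULL;
	return 0;
}

Unlike a plain rcu_read_lock() section, an SRCU read-side section may sleep,
which is why the mutex_lock() and the compression/decompression calls in the
real code can remain inside it.
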
diff mbox series

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index f6316b66fb236..add1406d693b8 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -864,12 +864,22 @@  static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
 	return ret;
 }
 
+DEFINE_STATIC_SRCU(acomp_srcu);
+
 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
 {
 	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
 	struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
 
 	if (!IS_ERR_OR_NULL(acomp_ctx)) {
+		/*
+		 * Even though the acomp_ctx should not be currently in use on
+		 * @cpu, it may still be used by compress/decompress operations
+		 * that started on @cpu and migrated to a different CPU. Wait
+		 * for such usages to complete; any new usages would be a bug.
+		 */
+		synchronize_srcu(&acomp_srcu);
+
 		if (!IS_ERR_OR_NULL(acomp_ctx->req))
 			acomp_request_free(acomp_ctx->req);
 		if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
@@ -880,6 +890,18 @@  static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
 	return 0;
 }
 
+static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx,
+						  int *srcu_idx)
+{
+	*srcu_idx = srcu_read_lock(&acomp_srcu);
+	return raw_cpu_ptr(acomp_ctx);
+}
+
+static void acomp_ctx_put_cpu(int srcu_idx)
+{
+	srcu_read_unlock(&acomp_srcu, srcu_idx);
+}
+
 static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 			   struct zswap_pool *pool)
 {
@@ -889,12 +911,12 @@  static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 	unsigned int dlen = PAGE_SIZE;
 	unsigned long handle;
 	struct zpool *zpool;
+	int srcu_idx;
 	char *buf;
 	gfp_t gfp;
 	u8 *dst;
 
-	acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
-
+	acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx, &srcu_idx);
 	mutex_lock(&acomp_ctx->mutex);
 
 	dst = acomp_ctx->buffer;
@@ -950,6 +972,7 @@  static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 		zswap_reject_alloc_fail++;
 
 	mutex_unlock(&acomp_ctx->mutex);
+	acomp_ctx_put_cpu(srcu_idx);
 	return comp_ret == 0 && alloc_ret == 0;
 }
 
@@ -958,9 +981,10 @@  static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 	struct zpool *zpool = entry->pool->zpool;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
+	int srcu_idx;
 	u8 *src;
 
-	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+	acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx, &srcu_idx);
 	mutex_lock(&acomp_ctx->mutex);
 
 	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
@@ -990,6 +1014,7 @@  static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
 
 	if (src != acomp_ctx->buffer)
 		zpool_unmap_handle(zpool, entry->handle);
+	acomp_ctx_put_cpu(srcu_idx);
 }
 
 /*********************************