
Possible race in obj_stock_flush_required() vs drain_obj_stock()

Message ID 1664546131660.1777662787.1655319815@gmail.com (mailing list archive)
State New
Series Possible race in obj_stock_flush_required() vs drain_obj_stock()

Commit Message

Alexander Fedorov Sept. 30, 2022, 2:06 p.m. UTC
Hi,

reposting this to the mainline list as requested and with updated patch.

I've encountered a race on kernel version 5.10.131-rt72 when running the
LTP cgroup_fj_stress_memory* tests and need help with understanding the
synchronization in mm/memcontrol.c; it seems really non-trivial...
I have also checked the patches in the latest mainline kernel but do not
see anything addressing this problem.  The realtime patch also does not
seem to be the culprit: it only changes preemption disabling to migration
disabling and irq_disable to local_lock.

It goes as follows:

1) First CPU:
    css_killed_work_fn() -> mem_cgroup_css_offline() ->
drain_all_stock() -> obj_stock_flush_required()
         if (stock->cached_objcg) {

This check sees a non-NULL pointer for *another* CPU's `memcg_stock` 
instance.

2) Second CPU:
   css_free_rwork_fn() -> __mem_cgroup_free() -> free_percpu() ->
obj_cgroup_uncharge() -> drain_obj_stock()
It frees `cached_objcg` pointer in its own `memcg_stock` instance:
         struct obj_cgroup *old = stock->cached_objcg;
         < ... >
         obj_cgroup_put(old);
         stock->cached_objcg = NULL;

3) First CPU continues after the 'if' check and re-reads the pointer
again, now it is NULL and dereferencing it leads to kernel panic:
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
                                      struct mem_cgroup *root_memcg)
{
< ... >
         if (stock->cached_objcg) {
                 memcg = obj_cgroup_memcg(stock->cached_objcg);

Since it seems that the `cached_objcg` field is protected by RCU, I've
also tried the patch below (against 5.10.131-rt72) and confirmed that it
seems to fix the problem (at least the same LTP tests finish successfully),
but I am not sure whether that's the right fix.  The patch does not apply
cleanly to the mainline kernel, but I can try rebasing it if needed.
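
For illustration, here is a minimal standalone sketch of the double-read
pattern and the snapshot-style fix under discussion.  The READ_ONCE()
macro, the struct layout and the objcg_to_memcg() helper below are
simplified stand-ins defined only for this sketch; they are not the
actual mm/memcontrol.c code:

#include <stddef.h>

/* Simplified stand-in for the kernel macro: force a single volatile load. */
#define READ_ONCE(x)	(*(const volatile __typeof__(x) *)&(x))

struct obj_cgroup;			/* opaque in this sketch */
struct mem_cgroup;			/* opaque in this sketch */

struct memcg_stock_pcp {		/* cut down for this sketch */
	struct obj_cgroup *cached_objcg;	/* cleared by the owning CPU */
	unsigned int nr_bytes;
};

/* Placeholder for obj_cgroup_memcg(); always NULL in this sketch. */
static struct mem_cgroup *objcg_to_memcg(struct obj_cgroup *objcg)
{
	(void)objcg;
	return NULL;
}

/* Racy: the field is read twice; the owning CPU can clear it in between,
 * so the value handed to the dereferencing helper may be NULL. */
static struct mem_cgroup *flush_check_racy(struct memcg_stock_pcp *stock)
{
	if (stock->cached_objcg)				/* read #1 */
		return objcg_to_memcg(stock->cached_objcg);	/* read #2 */
	return NULL;
}

/* Fixed: take one snapshot and only ever use the snapshot. */
static struct mem_cgroup *flush_check_fixed(struct memcg_stock_pcp *stock)
{
	struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);

	return objcg ? objcg_to_memcg(objcg) : NULL;
}

The snapshot shape is what both fixes discussed below converge on: only
the local copy is ever dereferenced, never the shared field a second time.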


     mm/memcg: fix race in obj_stock_flush_required() vs drain_obj_stock()
     
     When obj_stock_flush_required() is called from drain_all_stock() it
     reads the `memcg_stock->cached_objcg` field twice for another CPU's
     per-cpu variable, leading to TOCTTOU race: another CPU can
     simultaneously enter drain_obj_stock() and clear its own instance of
     `memcg_stock->cached_objcg`.
     
     To fix it mark `cached_objcg` as RCU protected field and use
     rcu_*() accessors everywhere.

Signed-off-by: Alexander Fedorov <halcien@gmail.com>

  		ret = true;
  	}
@@ -3160,7 +3160,8 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
  
  static void drain_obj_stock(struct memcg_stock_pcp *stock)
  {
-	struct obj_cgroup *old = stock->cached_objcg;
+	struct obj_cgroup *old = rcu_replace_pointer(stock->cached_objcg, NULL,
+						lockdep_is_held(&stock->lock));
  
  	if (!old)
  		return;
@@ -3198,16 +3199,16 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
  	}
  
  	obj_cgroup_put(old);
-	stock->cached_objcg = NULL;
  }
  
  static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
  				     struct mem_cgroup *root_memcg)
  {
  	struct mem_cgroup *memcg;
+	struct obj_cgroup *cached_objcg = rcu_dereference(stock->cached_objcg);
  
-	if (stock->cached_objcg) {
-		memcg = obj_cgroup_memcg(stock->cached_objcg);
+	if (cached_objcg) {
+		memcg = obj_cgroup_memcg(cached_objcg);
  		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
  			return true;
  	}
@@ -3223,11 +3224,11 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
  	local_lock_irqsave(&memcg_stock.lock, flags);
  
  	stock = this_cpu_ptr(&memcg_stock);
-	if (stock->cached_objcg != objcg) { /* reset if necessary */
+	if (rcu_access_pointer(stock->cached_objcg) != objcg) { /* reset if necessary */
  		drain_obj_stock(stock);
  		obj_cgroup_get(objcg);
-		stock->cached_objcg = objcg;
  		stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+		rcu_assign_pointer(stock->cached_objcg, objcg);
  	}
  	stock->nr_bytes += nr_bytes;
  
@@ -7162,6 +7163,7 @@ static int __init mem_cgroup_init(void)
  
  		stock = per_cpu_ptr(&memcg_stock, cpu);
  		INIT_WORK(&stock->work, drain_local_stock);
+		RCU_INIT_POINTER(stock->cached_objcg, NULL);
  		local_lock_init(&stock->lock);
  	}

Comments

Roman Gushchin Sept. 30, 2022, 6:26 p.m. UTC | #1
On Fri, Sep 30, 2022 at 02:06:48PM +0000, Alexander Fedorov wrote:
> Hi,
> 
> reposting this to the mainline list as requested and with updated patch.
> 
> I've encountered a race on kernel version 5.10.131-rt72 when running
> LTP cgroup_fj_stress_memory* tests and need help with understanding
> synchronization in mm/memcontrol.c, it seems really not-trivial...
> Have also checked patches in the latest mainline kernel but do not see
> anything similar to the problem.  Realtime patch also does not seem to
> be the culprit: it just changed preemption to migration disabling and
> irq_disable to local_lock.
> 
> It goes as follows:
> 
> 1) First CPU:
>    css_killed_work_fn() -> mem_cgroup_css_offline() ->
> drain_all_stock() -> obj_stock_flush_required()
>         if (stock->cached_objcg) {
> 
> This check sees a non-NULL pointer for *another* CPU's `memcg_stock`
> instance.
> 
> 2) Second CPU:
>   css_free_rwork_fn() -> __mem_cgroup_free() -> free_percpu() ->
> obj_cgroup_uncharge() -> drain_obj_stock()
> It frees `cached_objcg` pointer in its own `memcg_stock` instance:
>         struct obj_cgroup *old = stock->cached_objcg;
>         < ... >
>         obj_cgroup_put(old);
>         stock->cached_objcg = NULL;
> 
> 3) First CPU continues after the 'if' check and re-reads the pointer
> again, now it is NULL and dereferencing it leads to kernel panic:
> static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
>                                      struct mem_cgroup *root_memcg)
> {
> < ... >
>         if (stock->cached_objcg) {
>                 memcg = obj_cgroup_memcg(stock->cached_objcg);

Great catch!

I'm not sure about switching to rcu primitives though. In all other cases
stock->cached_objcg is accessed only from a local cpu, so using rcu_*
functions is overkill.

How about something like this? (completely untested)

Also, please add
Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API")

Thank you!

--

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b69979c9ced5..93e9637108f0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3245,10 +3245,18 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
                                     struct mem_cgroup *root_memcg)
 {
+       struct obj_cgroup *objcg;
        struct mem_cgroup *memcg;

-       if (stock->cached_objcg) {
-               memcg = obj_cgroup_memcg(stock->cached_objcg);
+       /*
+        * stock->cached_objcg can be changed asynchronously, so read
+        * it using READ_ONCE(). The objcg can't go away though because
+        * obj_stock_flush_required() is called from within a rcu read
+        * section.
+        */
+       objcg = READ_ONCE(stock->cached_objcg);
+       if (objcg) {
+               memcg = obj_cgroup_memcg(objcg);
                if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
                        return true;
        }

Thank you!
Alexander Fedorov Oct. 1, 2022, 12:38 p.m. UTC | #2
On 30.09.2022 21:26, Roman Gushchin wrote:
> On Fri, Sep 30, 2022 at 02:06:48PM +0000, Alexander Fedorov wrote:
>> 1) First CPU:
>>    css_killed_work_fn() -> mem_cgroup_css_offline() ->
>> drain_all_stock() -> obj_stock_flush_required()
>>         if (stock->cached_objcg) {
>>
>> This check sees a non-NULL pointer for *another* CPU's `memcg_stock`
>> instance.
>>
>> 2) Second CPU:
>>   css_free_rwork_fn() -> __mem_cgroup_free() -> free_percpu() ->
>> obj_cgroup_uncharge() -> drain_obj_stock()
>> It frees `cached_objcg` pointer in its own `memcg_stock` instance:
>>         struct obj_cgroup *old = stock->cached_objcg;
>>         < ... >
>>         obj_cgroup_put(old);
>>         stock->cached_objcg = NULL;
>>
>> 3) First CPU continues after the 'if' check and re-reads the pointer
>> again, now it is NULL and dereferencing it leads to kernel panic:
>> static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
>>                                      struct mem_cgroup *root_memcg)
>> {
>> < ... >
>>         if (stock->cached_objcg) {
>>                 memcg = obj_cgroup_memcg(stock->cached_objcg);
> 
> Great catch!
> 
> I'm not sure about switching to rcu primitives though. In all other cases
> stock->cached_objcg is accessed only from a local cpu, so using rcu_*
> function is an overkill.
> 
> How's something about this? (completely untested)

Tested the READ_ONCE() patch and it works.  But are rcu primitives really
overkill?  For me they document how complex the synchronization here
actually is.  For example, only after converting to rcu did I notice this
(5.10.131-rt72):

static void drain_obj_stock(struct memcg_stock_pcp *stock)
{
	struct obj_cgroup *old = stock->cached_objcg;
< ... >
	obj_cgroup_put(old);
	stock->cached_objcg = NULL;
}

On a kernel with preemption enabled this function can be preempted between
the obj_cgroup_put() -> kfree_rcu() call and the `cached_objcg` assignment,
and since scheduling marks the end of the grace period, another task
running drain_all_stock() could access freed memory.

Since `cached_objcg` was not marked as a synchronized variable, this
problem could not be seen just by reading drain_obj_stock(), but after
adding rcu markings it is easier to notice (and fix with
rcu_replace_pointer()).
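
To make the window concrete, here is a minimal standalone sketch of an
ordering that avoids it (clear the published pointer before dropping the
reference), roughly the shape the 5.10 fix takes later in this thread.
The types, the objcg_put() helper and the local WRITE_ONCE() definition
are simplified stand-ins for this sketch only, not the actual kernel code:

#include <stddef.h>

/* Local stand-in: a single, untorn volatile store. */
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

struct objcg_sketch {
	int refcnt;
};

struct stock_sketch {
	struct objcg_sketch *cached_objcg;
	unsigned int nr_bytes;
};

/* Stand-in for obj_cgroup_put(); the real one frees via kfree_rcu(). */
static void objcg_put(struct objcg_sketch *objcg)
{
	objcg->refcnt--;
}

static void drain_sketch(struct stock_sketch *stock)
{
	struct objcg_sketch *old = stock->cached_objcg;

	if (!old)
		return;

	/* ... flush stock->nr_bytes back into *old ... */
	stock->nr_bytes = 0;

	/*
	 * Clear the published pointer before dropping the reference.  If
	 * the put came first and this task were preempted before the
	 * store, a remote reader doing the speculative flush check could
	 * still pick up the stale pointer after the object was freed.
	 */
	WRITE_ONCE(stock->cached_objcg, NULL);
	objcg_put(old);
}

A real fix also has to consider store tearing and ordering, which is where
the WRITE_ONCE()/barrier discussion further down in this thread comes in.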

I checked mainline: there this use-after-free was already fixed as a side
effect of another fix, commit 5675114623872300aa9fcd72aef2b8b7f421fe12
("mm/memcg: protect memcg_stock with a local_lock_t").

> Also, please add
> Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API")

Done, and I have reposted the original patch below since my email client
malformed it (plus a comment about the use-after-free)...


    mm/memcg: fix race in obj_stock_flush_required() vs drain_obj_stock()
    
    When obj_stock_flush_required() is called from drain_all_stock() it
    reads the `memcg_stock->cached_objcg` field twice for another CPU's
    per-cpu variable, leading to TOCTTOU race: another CPU can
    simultaneously enter drain_obj_stock() and clear its own instance of
    `memcg_stock->cached_objcg`.
    
    Another problem is in drain_obj_stock() which sets `cached_objcg` to
    NULL after freeing which might lead to use-after-free.
    
    To fix it mark `cached_objcg` as RCU protected field and use rcu_*()
    accessors everywhere.

Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API")
Signed-off-by: Alexander Fedorov <halcien@gmail.com>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c1152f8747..2ab205f529 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2215,7 +2215,7 @@ struct memcg_stock_pcp {
 	unsigned int nr_pages;
 
 #ifdef CONFIG_MEMCG_KMEM
-	struct obj_cgroup *cached_objcg;
+	struct obj_cgroup __rcu *cached_objcg;
 	unsigned int nr_bytes;
 #endif
 
@@ -3148,7 +3148,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
 	local_lock_irqsave(&memcg_stock.lock, flags);
 
 	stock = this_cpu_ptr(&memcg_stock);
-	if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+	if (objcg == rcu_access_pointer(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
 		stock->nr_bytes -= nr_bytes;
 		ret = true;
 	}
@@ -3160,7 +3160,8 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
 
 static void drain_obj_stock(struct memcg_stock_pcp *stock)
 {
-	struct obj_cgroup *old = stock->cached_objcg;
+	struct obj_cgroup *old = rcu_replace_pointer(stock->cached_objcg, NULL,
+						lockdep_is_held(&stock->lock));
 
 	if (!old)
 		return;
@@ -3198,16 +3199,16 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
 	}
 
 	obj_cgroup_put(old);
-	stock->cached_objcg = NULL;
 }
 
 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
 				     struct mem_cgroup *root_memcg)
 {
 	struct mem_cgroup *memcg;
+	struct obj_cgroup *cached_objcg = rcu_dereference(stock->cached_objcg);
 
-	if (stock->cached_objcg) {
-		memcg = obj_cgroup_memcg(stock->cached_objcg);
+	if (cached_objcg) {
+		memcg = obj_cgroup_memcg(cached_objcg);
 		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
 			return true;
 	}
@@ -3223,11 +3224,11 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
 	local_lock_irqsave(&memcg_stock.lock, flags);
 
 	stock = this_cpu_ptr(&memcg_stock);
-	if (stock->cached_objcg != objcg) { /* reset if necessary */
+	if (rcu_access_pointer(stock->cached_objcg) != objcg) { /* reset if necessary */
 		drain_obj_stock(stock);
 		obj_cgroup_get(objcg);
-		stock->cached_objcg = objcg;
 		stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+		rcu_assign_pointer(stock->cached_objcg, objcg);
 	}
 	stock->nr_bytes += nr_bytes;
 
@@ -7162,6 +7163,7 @@ static int __init mem_cgroup_init(void)
 
 		stock = per_cpu_ptr(&memcg_stock, cpu);
 		INIT_WORK(&stock->work, drain_local_stock);
+		RCU_INIT_POINTER(stock->cached_objcg, NULL);
 		local_lock_init(&stock->lock);
 	}
Roman Gushchin Oct. 2, 2022, 4:16 p.m. UTC | #3
On Sat, Oct 01, 2022 at 03:38:43PM +0300, Alexander Fedorov wrote:
> On 30.09.2022 21:26, Roman Gushchin wrote:
> > On Fri, Sep 30, 2022 at 02:06:48PM +0000, Alexander Fedorov wrote:
> >> 1) First CPU:
> >>    css_killed_work_fn() -> mem_cgroup_css_offline() ->
> >> drain_all_stock() -> obj_stock_flush_required()
> >>         if (stock->cached_objcg) {
> >>
> >> This check sees a non-NULL pointer for *another* CPU's `memcg_stock`
> >> instance.
> >>
> >> 2) Second CPU:
> >>   css_free_rwork_fn() -> __mem_cgroup_free() -> free_percpu() ->
> >> obj_cgroup_uncharge() -> drain_obj_stock()
> >> It frees `cached_objcg` pointer in its own `memcg_stock` instance:
> >>         struct obj_cgroup *old = stock->cached_objcg;
> >>         < ... >
> >>         obj_cgroup_put(old);
> >>         stock->cached_objcg = NULL;
> >>
> >> 3) First CPU continues after the 'if' check and re-reads the pointer
> >> again, now it is NULL and dereferencing it leads to kernel panic:
> >> static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
> >>                                      struct mem_cgroup *root_memcg)
> >> {
> >> < ... >
> >>         if (stock->cached_objcg) {
> >>                 memcg = obj_cgroup_memcg(stock->cached_objcg);
> > 
> > Great catch!
> > 
> > I'm not sure about switching to rcu primitives though. In all other cases
> > stock->cached_objcg is accessed only from a local cpu, so using rcu_*
> > function is an overkill.
> > 
> > How's something about this? (completely untested)
> 
> Tested READ_ONCE() patch and it works.

Thank you!

> But are rcu primitives an overkill?
> For me they are documenting how actually complex is synchronization here.

I agree, however rcu primitives will add unnecessary barriers on hot paths.
In this particular case most accesses to stock->cached_objcg are done from
a local cpu, so no rcu primitives are needed. So in my opinion using a
READ_ONCE() is preferred.

Thanks!
Alexander Fedorov Oct. 3, 2022, 12:47 p.m. UTC | #4
On 02.10.2022 19:16, Roman Gushchin wrote:
> On Sat, Oct 01, 2022 at 03:38:43PM +0300, Alexander Fedorov wrote:
>> Tested READ_ONCE() patch and it works.
> 
> Thank you!
> 
>> But are rcu primitives an overkill?
>> For me they are documenting how actually complex is synchronization here.
> 
> I agree, however rcu primitives will add unnecessary barriers on hot paths.
> In this particular case most accesses to stock->cached_objcg are done from
> a local cpu, so no rcu primitives are needed. So in my opinion using a
> READ_ONCE() is preferred.

Understood; here, then, is a patch that besides READ_ONCE() also fixes the
use-after-free mentioned above, which exists in 5.10.  In mainline the
drain_obj_stock() part of the patch should be skipped.

It should probably also carry
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>,
but I am not sure if I have the right to add that :)


    mm/memcg: fix race in obj_stock_flush_required() vs drain_obj_stock()
    
    When obj_stock_flush_required() is called from drain_all_stock() it
    reads the `memcg_stock->cached_objcg` field twice for another CPU's
    per-cpu variable, leading to TOCTTOU race: another CPU can
    simultaneously enter drain_obj_stock() and clear its own instance of
    `memcg_stock->cached_objcg`.
    
    Another problem is in drain_obj_stock() which sets `cached_objcg` to
    NULL after freeing which might lead to use-after-free.
    
    To fix it use READ_ONCE() for TOCTTOU problem and also clear the
    `cached_objcg` pointer earlier in drain_obj_stock() for use-after-free
    problem.

Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API")
Signed-off-by: Alexander Fedorov <halcien@gmail.com>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c1152f8747..56bd5ea6d3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3197,17 +3197,30 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
 		stock->nr_bytes = 0;
 	}
 
-	obj_cgroup_put(old);
+	/*
+	 * Clear pointer before freeing memory so that
+	 * drain_all_stock() -> obj_stock_flush_required()
+	 * does not see a freed pointer.
+	 */
 	stock->cached_objcg = NULL;
+	obj_cgroup_put(old);
 }
 
 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
 				     struct mem_cgroup *root_memcg)
 {
+	struct obj_cgroup *objcg;
 	struct mem_cgroup *memcg;
 
-	if (stock->cached_objcg) {
-		memcg = obj_cgroup_memcg(stock->cached_objcg);
+	/*
+	 * stock->cached_objcg can be changed asynchronously, so read
+	 * it using READ_ONCE(). The objcg can't go away though because
+	 * obj_stock_flush_required() is called from within a rcu read
+	 * section.
+	 */
+	objcg = READ_ONCE(stock->cached_objcg);
+	if (objcg) {
+		memcg = obj_cgroup_memcg(objcg);
 		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
 			return true;
 	}
Michal Hocko Oct. 3, 2022, 1:32 p.m. UTC | #5
On Mon 03-10-22 15:47:10, Alexander Fedorov wrote:
> On 02.10.2022 19:16, Roman Gushchin wrote:
> > On Sat, Oct 01, 2022 at 03:38:43PM +0300, Alexander Fedorov wrote:
> >> Tested READ_ONCE() patch and it works.
> > 
> > Thank you!
> > 
> >> But are rcu primitives an overkill?
> >> For me they are documenting how actually complex is synchronization here.
> > 
> > I agree, however rcu primitives will add unnecessary barriers on hot paths.
> > In this particular case most accesses to stock->cached_objcg are done from
> > a local cpu, so no rcu primitives are needed. So in my opinion using a
> > READ_ONCE() is preferred.
> 
> Understood, then here is patch that besides READ_ONCE() also fixes mentioned
> use-after-free that exists in 5.10.  In mainline the drain_obj_stock() part
> of the patch should be skipped.
> 
> Should probably be also Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
> but I am not sure if I have rights to add that :)
> 
> 
>     mm/memcg: fix race in obj_stock_flush_required() vs drain_obj_stock()
>     
>     When obj_stock_flush_required() is called from drain_all_stock() it
>     reads the `memcg_stock->cached_objcg` field twice for another CPU's
>     per-cpu variable, leading to TOCTTOU race: another CPU can
>     simultaneously enter drain_obj_stock() and clear its own instance of
>     `memcg_stock->cached_objcg`.
>     
>     Another problem is in drain_obj_stock() which sets `cached_objcg` to
>     NULL after freeing which might lead to use-after-free.
>     
>     To fix it use READ_ONCE() for TOCTTOU problem and also clear the
>     `cached_objcg` pointer earlier in drain_obj_stock() for use-after-free
>     problem.
> 
> Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API")
> Signed-off-by: Alexander Fedorov <halcien@gmail.com>
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c1152f8747..56bd5ea6d3 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3197,17 +3197,30 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
>  		stock->nr_bytes = 0;
>  	}
>  
> -	obj_cgroup_put(old);
> +	/*
> +	 * Clear pointer before freeing memory so that
> +	 * drain_all_stock() -> obj_stock_flush_required()
> +	 * does not see a freed pointer.
> +	 */
>  	stock->cached_objcg = NULL;
> +	obj_cgroup_put(old);

Do we need barrier() or something else to ensure there is no reordering?
I am not really sure what kind of barriers are implied by the pcp ref
counting.
Alexander Fedorov Oct. 3, 2022, 2:09 p.m. UTC | #6
On 03.10.2022 16:32, Michal Hocko wrote:
> On Mon 03-10-22 15:47:10, Alexander Fedorov wrote:
>> @@ -3197,17 +3197,30 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
>>  		stock->nr_bytes = 0;
>>  	}
>>  
>> -	obj_cgroup_put(old);
>> +	/*
>> +	 * Clear pointer before freeing memory so that
>> +	 * drain_all_stock() -> obj_stock_flush_required()
>> +	 * does not see a freed pointer.
>> +	 */
>>  	stock->cached_objcg = NULL;
>> +	obj_cgroup_put(old);
> 
> Do we need barrier() or something else to ensure there is no reordering?
> I am not reallyu sure what kind of barriers are implied by the pcp ref
> counting.

obj_cgroup_put() -> kfree_rcu() -> synchronize_rcu() should take care
of this:

3670  * Furthermore, if CPU A invoked synchronize_rcu(), which returned
3671  * to its caller on CPU B, then both CPU A and CPU B are guaranteed
3672  * to have executed a full memory barrier during the execution of
3673  * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
3674  * again only if the system has more than one CPU).
3675  */
3676 void synchronize_rcu(void)

If I'm reading this correctly:
 - on SMP A==B and there will be a full memory barrier;
 - on UP we instead rely on the guarantee that schedule() implies a full
   memory barrier (and if there is no schedule() then there is no race).
Michal Hocko Oct. 3, 2022, 2:27 p.m. UTC | #7
On Mon 03-10-22 17:09:15, Alexander Fedorov wrote:
> On 03.10.2022 16:32, Michal Hocko wrote:
> > On Mon 03-10-22 15:47:10, Alexander Fedorov wrote:
> >> @@ -3197,17 +3197,30 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
> >>  		stock->nr_bytes = 0;
> >>  	}
> >>  
> >> -	obj_cgroup_put(old);
> >> +	/*
> >> +	 * Clear pointer before freeing memory so that
> >> +	 * drain_all_stock() -> obj_stock_flush_required()
> >> +	 * does not see a freed pointer.
> >> +	 */
> >>  	stock->cached_objcg = NULL;
> >> +	obj_cgroup_put(old);
> > 
> > Do we need barrier() or something else to ensure there is no reordering?
> > I am not reallyu sure what kind of barriers are implied by the pcp ref
> > counting.
> 
> obj_cgroup_put() -> kfree_rcu() -> synchronize_rcu() should take care
> of this:

This is a very subtle guarantee. Also it would only apply if this is the
last reference, right? Is there any reason to not use
	WRITE_ONCE(stock->cached_objcg, NULL);
	obj_cgroup_put(old);

IIRC this should prevent any reordering.
Alexander Fedorov Oct. 3, 2022, 3:01 p.m. UTC | #8
On 03.10.2022 17:27, Michal Hocko wrote:
> On Mon 03-10-22 17:09:15, Alexander Fedorov wrote:
>> On 03.10.2022 16:32, Michal Hocko wrote:
>>> On Mon 03-10-22 15:47:10, Alexander Fedorov wrote:
>>>> @@ -3197,17 +3197,30 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
>>>>  		stock->nr_bytes = 0;
>>>>  	}
>>>>  
>>>> -	obj_cgroup_put(old);
>>>> +	/*
>>>> +	 * Clear pointer before freeing memory so that
>>>> +	 * drain_all_stock() -> obj_stock_flush_required()
>>>> +	 * does not see a freed pointer.
>>>> +	 */
>>>>  	stock->cached_objcg = NULL;
>>>> +	obj_cgroup_put(old);
>>>
>>> Do we need barrier() or something else to ensure there is no reordering?
>>> I am not reallyu sure what kind of barriers are implied by the pcp ref
>>> counting.
>>
>> obj_cgroup_put() -> kfree_rcu() -> synchronize_rcu() should take care
>> of this:
> 
> This is a very subtle guarantee. Also it would only apply if this is the
> last reference, right?

Hmm, yes, for the last reference only, also not sure about pcp ref
counter ordering rules for previous references.

> Is there any reason to not use
> 	WRITE_ONCE(stock->cached_objcg, NULL);
> 	obj_cgroup_put(old);
> 
> IIRC this should prevent any reordering. 

Now that I think about it, we actually must use WRITE_ONCE everywhere
when writing cached_objcg, because otherwise the compiler might split the
pointer-sized store into several smaller ones (store tearing), and
obj_stock_flush_required() would read garbage instead of a pointer.

And thinking about memory barriers, maybe we need them too alongside
WRITE_ONCE when setting the pointer to a non-NULL value?  Otherwise
drain_all_stock() -> obj_stock_flush_required() might read stale data.
Since that's exactly what rcu_assign_pointer() does, it seems that we are
going back to using rcu_*() primitives everywhere?
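
As a rough illustration of the distinction being weighed here, the
standalone sketch below uses C11 atomics as approximations (an assumption
made for the sketch, not the kernel's actual macro definitions): a relaxed
store is roughly what WRITE_ONCE() provides (no store tearing, but no
publication ordering), while a release store is roughly what
rcu_assign_pointer() adds on top:

#include <stdatomic.h>

struct obj_cgroup_sketch {
	unsigned int nr_charged_bytes;	/* initialized before publishing */
};

struct stock_sketch {
	struct obj_cgroup_sketch *_Atomic cached_objcg;
};

/*
 * Roughly WRITE_ONCE(): a single, untorn store, but with no ordering
 * against earlier initialization of *objcg.
 */
static void publish_relaxed(struct stock_sketch *stock,
			    struct obj_cgroup_sketch *objcg)
{
	atomic_store_explicit(&stock->cached_objcg, objcg,
			      memory_order_relaxed);
}

/*
 * Roughly rcu_assign_pointer(): a release store, so a reader that sees
 * the new pointer also sees the initialization done before it.
 */
static void publish_release(struct stock_sketch *stock,
			    struct obj_cgroup_sketch *objcg)
{
	objcg->nr_charged_bytes = 0;	/* initialize before publishing */
	atomic_store_explicit(&stock->cached_objcg, objcg,
			      memory_order_release);
}

Which of the two is enough depends on whether a remote reader may look at
fields of the newly published object before they are initialized; the rest
of the thread settles this in favour of plain READ_ONCE()/WRITE_ONCE(),
since the speculative check only needs basic lifetime, not state coherency.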
Roman Gushchin Oct. 4, 2022, 4:18 p.m. UTC | #9
On Mon, Oct 03, 2022 at 06:01:35PM +0300, Alexander Fedorov wrote:
> On 03.10.2022 17:27, Michal Hocko wrote:
> > On Mon 03-10-22 17:09:15, Alexander Fedorov wrote:
> >> On 03.10.2022 16:32, Michal Hocko wrote:
> >>> On Mon 03-10-22 15:47:10, Alexander Fedorov wrote:
> >>>> @@ -3197,17 +3197,30 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
> >>>>  		stock->nr_bytes = 0;
> >>>>  	}
> >>>>  
> >>>> -	obj_cgroup_put(old);
> >>>> +	/*
> >>>> +	 * Clear pointer before freeing memory so that
> >>>> +	 * drain_all_stock() -> obj_stock_flush_required()
> >>>> +	 * does not see a freed pointer.
> >>>> +	 */
> >>>>  	stock->cached_objcg = NULL;
> >>>> +	obj_cgroup_put(old);
> >>>
> >>> Do we need barrier() or something else to ensure there is no reordering?
> >>> I am not reallyu sure what kind of barriers are implied by the pcp ref
> >>> counting.
> >>
> >> obj_cgroup_put() -> kfree_rcu() -> synchronize_rcu() should take care
> >> of this:
> > 
> > This is a very subtle guarantee. Also it would only apply if this is the
> > last reference, right?
> 
> Hmm, yes, for the last reference only, also not sure about pcp ref
> counter ordering rules for previous references.
> 
> > Is there any reason to not use
> > 	WRITE_ONCE(stock->cached_objcg, NULL);
> > 	obj_cgroup_put(old);
> > 
> > IIRC this should prevent any reordering. 
> 
> Now that I think about it we actually must use WRITE_ONCE everywhere
> when writing cached_objcg because otherwise compiler might split the
> pointer-sized store into several smaller-sized ones (store tearing),
> and obj_stock_flush_required() would read garbage instead of pointer.
>
> And thinking about memory barriers, maybe we need them too alongside
> WRITE_ONCE when setting pointer to non-null value?  Otherwise
> drain_all_stock() -> obj_stock_flush_required() might read old data.
> Since that's exactly what rcu_assign_pointer() does, it seems
> that we are going back to using rcu_*() primitives everywhere?

Hm, I don't know, I'm still somewhat resistant to the idea of putting in
rcu primitives, but maybe it's the right thing. Maybe we should instead
always schedule draining on all cpus and perform a cpu-local check,
bailing out if a flush is not required? Michal, Johannes, what do you
think?

Btw the same approach is used for the memcg part (stock->cached),
so if we're going to change anything, we need to change it too.

Thanks!
Johannes Weiner Oct. 12, 2022, 5:23 p.m. UTC | #10
On Tue, Oct 04, 2022 at 09:18:26AM -0700, Roman Gushchin wrote:
> On Mon, Oct 03, 2022 at 06:01:35PM +0300, Alexander Fedorov wrote:
> > On 03.10.2022 17:27, Michal Hocko wrote:
> > > On Mon 03-10-22 17:09:15, Alexander Fedorov wrote:
> > >> On 03.10.2022 16:32, Michal Hocko wrote:
> > >>> On Mon 03-10-22 15:47:10, Alexander Fedorov wrote:
> > >>>> @@ -3197,17 +3197,30 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
> > >>>>  		stock->nr_bytes = 0;
> > >>>>  	}
> > >>>>  
> > >>>> -	obj_cgroup_put(old);
> > >>>> +	/*
> > >>>> +	 * Clear pointer before freeing memory so that
> > >>>> +	 * drain_all_stock() -> obj_stock_flush_required()
> > >>>> +	 * does not see a freed pointer.
> > >>>> +	 */
> > >>>>  	stock->cached_objcg = NULL;
> > >>>> +	obj_cgroup_put(old);
> > >>>
> > >>> Do we need barrier() or something else to ensure there is no reordering?
> > >>> I am not reallyu sure what kind of barriers are implied by the pcp ref
> > >>> counting.
> > >>
> > >> obj_cgroup_put() -> kfree_rcu() -> synchronize_rcu() should take care
> > >> of this:
> > > 
> > > This is a very subtle guarantee. Also it would only apply if this is the
> > > last reference, right?
> > 
> > Hmm, yes, for the last reference only, also not sure about pcp ref
> > counter ordering rules for previous references.
> > 
> > > Is there any reason to not use
> > > 	WRITE_ONCE(stock->cached_objcg, NULL);
> > > 	obj_cgroup_put(old);
> > > 
> > > IIRC this should prevent any reordering. 
> > 
> > Now that I think about it we actually must use WRITE_ONCE everywhere
> > when writing cached_objcg because otherwise compiler might split the
> > pointer-sized store into several smaller-sized ones (store tearing),
> > and obj_stock_flush_required() would read garbage instead of pointer.
> >
> > And thinking about memory barriers, maybe we need them too alongside
> > WRITE_ONCE when setting pointer to non-null value?  Otherwise
> > drain_all_stock() -> obj_stock_flush_required() might read old data.
> > Since that's exactly what rcu_assign_pointer() does, it seems
> > that we are going back to using rcu_*() primitives everywhere?
> 
> Hm, Idk, I'm still somewhat resistant to the idea of putting rcu primitives,
> but maybe it's the right thing. Maybe instead we should always schedule draining
> on all cpus instead and perform a cpu-local check and bail out if a flush is not
> required? Michal, Johannes, what do you think?

I agree it's overkill.

This is a speculative check, and we don't need any state coherency,
just basic lifetime. READ_ONCE should fully address this problem. That
said, I think the code could be a bit clearer and better documented.

How about the below?

(Never mind the ifdef; I'm working on removing CONFIG_MEMCG_KMEM
altogether, as it's a really strange way to say !SLOB at this point.)

---

From 22855af38b116ec030286975ed2aa06851680296 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 12 Oct 2022 12:59:07 -0400
Subject: [PATCH] mm: memcontrol: fix NULL deref race condition during cgroup
 deletion

Alexander Fedorov reports a race condition between two concurrent
stock draining operations, where the first one clears the stock's obj
pointer between the pointer test and deref of the second. Analysis:

1) First CPU:
   css_killed_work_fn() -> mem_cgroup_css_offline() ->
drain_all_stock() -> obj_stock_flush_required()
	if (stock->cached_objcg) {

This check sees a non-NULL pointer for *another* CPU's `memcg_stock` instance.

2) Second CPU:
  css_free_rwork_fn() -> __mem_cgroup_free() -> free_percpu() ->
obj_cgroup_uncharge() -> drain_obj_stock()
It frees `cached_objcg` pointer in its own `memcg_stock` instance:
	struct obj_cgroup *old = stock->cached_objcg;
	< ... >
	obj_cgroup_put(old);
	stock->cached_objcg = NULL;

3) First CPU continues after the 'if' check and re-reads the pointer
again, now it is NULL and dereferencing it leads to kernel panic:
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
				     struct mem_cgroup *root_memcg)
{
< ... >
	if (stock->cached_objcg) {
		memcg = obj_cgroup_memcg(stock->cached_objcg);

There is already RCU protection in place to ensure lifetime. Add the
missing READ_ONCE to the cgroup pointers to fix the TOCTOU, and
consolidate and document the speculative code.

Reported-by: Alexander Fedorov <halcien@gmail.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/memcontrol.c | 44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d8549ae1b30..09ac2f8991ee 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2190,8 +2190,6 @@ static DEFINE_MUTEX(percpu_charge_mutex);
 
 #ifdef CONFIG_MEMCG_KMEM
 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
-				     struct mem_cgroup *root_memcg);
 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
 
 #else
@@ -2199,11 +2197,6 @@ static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
 {
 	return NULL;
 }
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
-				     struct mem_cgroup *root_memcg)
-{
-	return false;
-}
 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
 {
 }
@@ -2339,13 +2332,30 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 		struct mem_cgroup *memcg;
 		bool flush = false;
 
+		/*
+		 * Speculatively check up front if this CPU has any
+		 * cached charges that belong to the specified
+		 * root_memcg. The state may change from under us -
+		 * which is okay, because the draining itself is a
+		 * best-effort operation. Just ensure lifetime of
+		 * whatever we end up looking at.
+		 */
 		rcu_read_lock();
-		memcg = stock->cached;
+		memcg = READ_ONCE(stock->cached);
 		if (memcg && stock->nr_pages &&
 		    mem_cgroup_is_descendant(memcg, root_memcg))
 			flush = true;
-		else if (obj_stock_flush_required(stock, root_memcg))
-			flush = true;
+#ifdef CONFIG_MEMCG_KMEM
+		else {
+			struct obj_cgroup *objcg;
+
+			objcg = READ_ONCE(stock->cached_objcg);
+			if (objcg && stock->nr_bytes &&
+			    mem_cgroup_is_descendant(obj_cgroup_memcg(objcg),
+						     root_memcg))
+				flush = true;
+		}
+#endif
 		rcu_read_unlock();
 
 		if (flush &&
@@ -3297,20 +3307,6 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
 	return old;
 }
 
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
-				     struct mem_cgroup *root_memcg)
-{
-	struct mem_cgroup *memcg;
-
-	if (stock->cached_objcg) {
-		memcg = obj_cgroup_memcg(stock->cached_objcg);
-		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
-			return true;
-	}
-
-	return false;
-}
-
 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
 			     bool allow_uncharge)
 {
Roman Gushchin Oct. 12, 2022, 6:49 p.m. UTC | #11
On Wed, Oct 12, 2022 at 01:23:11PM -0400, Johannes Weiner wrote:
> On Tue, Oct 04, 2022 at 09:18:26AM -0700, Roman Gushchin wrote:
> > On Mon, Oct 03, 2022 at 06:01:35PM +0300, Alexander Fedorov wrote:
> > > On 03.10.2022 17:27, Michal Hocko wrote:
> > > > On Mon 03-10-22 17:09:15, Alexander Fedorov wrote:
> > > >> On 03.10.2022 16:32, Michal Hocko wrote:
> > > >>> On Mon 03-10-22 15:47:10, Alexander Fedorov wrote:
> > > >>>> @@ -3197,17 +3197,30 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
> > > >>>>  		stock->nr_bytes = 0;
> > > >>>>  	}
> > > >>>>  
> > > >>>> -	obj_cgroup_put(old);
> > > >>>> +	/*
> > > >>>> +	 * Clear pointer before freeing memory so that
> > > >>>> +	 * drain_all_stock() -> obj_stock_flush_required()
> > > >>>> +	 * does not see a freed pointer.
> > > >>>> +	 */
> > > >>>>  	stock->cached_objcg = NULL;
> > > >>>> +	obj_cgroup_put(old);
> > > >>>
> > > >>> Do we need barrier() or something else to ensure there is no reordering?
> > > >>> I am not reallyu sure what kind of barriers are implied by the pcp ref
> > > >>> counting.
> > > >>
> > > >> obj_cgroup_put() -> kfree_rcu() -> synchronize_rcu() should take care
> > > >> of this:
> > > > 
> > > > This is a very subtle guarantee. Also it would only apply if this is the
> > > > last reference, right?
> > > 
> > > Hmm, yes, for the last reference only, also not sure about pcp ref
> > > counter ordering rules for previous references.
> > > 
> > > > Is there any reason to not use
> > > > 	WRITE_ONCE(stock->cached_objcg, NULL);
> > > > 	obj_cgroup_put(old);
> > > > 
> > > > IIRC this should prevent any reordering. 
> > > 
> > > Now that I think about it we actually must use WRITE_ONCE everywhere
> > > when writing cached_objcg because otherwise compiler might split the
> > > pointer-sized store into several smaller-sized ones (store tearing),
> > > and obj_stock_flush_required() would read garbage instead of pointer.
> > >
> > > And thinking about memory barriers, maybe we need them too alongside
> > > WRITE_ONCE when setting pointer to non-null value?  Otherwise
> > > drain_all_stock() -> obj_stock_flush_required() might read old data.
> > > Since that's exactly what rcu_assign_pointer() does, it seems
> > > that we are going back to using rcu_*() primitives everywhere?
> > 
> > Hm, Idk, I'm still somewhat resistant to the idea of putting rcu primitives,
> > but maybe it's the right thing. Maybe instead we should always schedule draining
> > on all cpus instead and perform a cpu-local check and bail out if a flush is not
> > required? Michal, Johannes, what do you think?
> 
> I agree it's overkill.
> 
> This is a speculative check, and we don't need any state coherency,
> just basic lifetime. READ_ONCE should fully address this problem. That
> said, I think the code could be a bit clearer and better documented.
> 
> How about the below?

I'm fine with using READ_ONCE() to fix this immediate issue (I suggested it
in the thread above), please feel free to add my ack:
Acked-by: Roman Gushchin <roman.gushchin@linux.dev> .

We might need a barrier() between zeroing stock->cached and dropping the last
reference, as discussed above; however, I don't think this issue can be
realistically triggered in real life.

However I think our overall approach to flushing is questionable:
1) we often don't flush when it's necessary: if there is a concurrent flushing
we just bail out, even if that flushing is related to a completely different
part of the cgroup tree (e.g. a leaf node belonging to a distant branch).
2) we can race and flush when it's not necessary: if another cpu is busy,
then by the time the work is executed there will likely already be another
memcg cached. So IMO we need to move this check into the flushing thread.

I'm working on a different approach, but it will take time and also likely be
too invasive for @stable, so fixing the crash discovered by Alexander with
READ_ONCE() is a good idea.

Thanks!
Johannes Weiner Oct. 12, 2022, 7:18 p.m. UTC | #12
On Wed, Oct 12, 2022 at 11:49:20AM -0700, Roman Gushchin wrote:
> On Wed, Oct 12, 2022 at 01:23:11PM -0400, Johannes Weiner wrote:
> > On Tue, Oct 04, 2022 at 09:18:26AM -0700, Roman Gushchin wrote:
> > > On Mon, Oct 03, 2022 at 06:01:35PM +0300, Alexander Fedorov wrote:
> > > > On 03.10.2022 17:27, Michal Hocko wrote:
> > > > > On Mon 03-10-22 17:09:15, Alexander Fedorov wrote:
> > > > >> On 03.10.2022 16:32, Michal Hocko wrote:
> > > > >>> On Mon 03-10-22 15:47:10, Alexander Fedorov wrote:
> > > > >>>> @@ -3197,17 +3197,30 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
> > > > >>>>  		stock->nr_bytes = 0;
> > > > >>>>  	}
> > > > >>>>  
> > > > >>>> -	obj_cgroup_put(old);
> > > > >>>> +	/*
> > > > >>>> +	 * Clear pointer before freeing memory so that
> > > > >>>> +	 * drain_all_stock() -> obj_stock_flush_required()
> > > > >>>> +	 * does not see a freed pointer.
> > > > >>>> +	 */
> > > > >>>>  	stock->cached_objcg = NULL;
> > > > >>>> +	obj_cgroup_put(old);
> > > > >>>
> > > > >>> Do we need barrier() or something else to ensure there is no reordering?
> > > > >>> I am not reallyu sure what kind of barriers are implied by the pcp ref
> > > > >>> counting.
> > > > >>
> > > > >> obj_cgroup_put() -> kfree_rcu() -> synchronize_rcu() should take care
> > > > >> of this:
> > > > > 
> > > > > This is a very subtle guarantee. Also it would only apply if this is the
> > > > > last reference, right?
> > > > 
> > > > Hmm, yes, for the last reference only, also not sure about pcp ref
> > > > counter ordering rules for previous references.
> > > > 
> > > > > Is there any reason to not use
> > > > > 	WRITE_ONCE(stock->cached_objcg, NULL);
> > > > > 	obj_cgroup_put(old);
> > > > > 
> > > > > IIRC this should prevent any reordering. 
> > > > 
> > > > Now that I think about it we actually must use WRITE_ONCE everywhere
> > > > when writing cached_objcg because otherwise compiler might split the
> > > > pointer-sized store into several smaller-sized ones (store tearing),
> > > > and obj_stock_flush_required() would read garbage instead of pointer.
> > > >
> > > > And thinking about memory barriers, maybe we need them too alongside
> > > > WRITE_ONCE when setting pointer to non-null value?  Otherwise
> > > > drain_all_stock() -> obj_stock_flush_required() might read old data.
> > > > Since that's exactly what rcu_assign_pointer() does, it seems
> > > > that we are going back to using rcu_*() primitives everywhere?
> > > 
> > > Hm, Idk, I'm still somewhat resistant to the idea of putting rcu primitives,
> > > but maybe it's the right thing. Maybe instead we should always schedule draining
> > > on all cpus instead and perform a cpu-local check and bail out if a flush is not
> > > required? Michal, Johannes, what do you think?
> > 
> > I agree it's overkill.
> > 
> > This is a speculative check, and we don't need any state coherency,
> > just basic lifetime. READ_ONCE should fully address this problem. That
> > said, I think the code could be a bit clearer and better documented.
> > 
> > How about the below?
> 
> I'm fine with using READ_ONCE() to fix this immediate issue (I suggested it
> in the thread above), please feel free to add my ack:
> Acked-by: Roman Gushchin <roman.gushchin@linux.dev> .

Thanks!

> We might need a barrier() between zeroing stock->cached and dropping the last
> reference, as discussed above, however I don't think this issue can be
> realistically trgiggered in the real life.

Hm, plus the load tearing.

We can do WRITE_ONCE() just for ->cached and ->cached_objcg. That will
take care of both: load tearing, as well as the compile-time order
with the RCU free call. RCU will then handle the SMP effects.

I still prefer it over rcuifying the pointers completely just for that
one (questionable) optimization.

Updated patch below.

> However I think our overall approach to flushing is questionable:
> 1) we often don't flush when it's necessary: if there is a concurrent flushing
> we just bail out, even if that flushing is related to a completely different
> part of the cgroup tree (e.g. a leaf node belonging to a distant branch).

Right.

> 2) we can race and flush when it's not necessarily: if another cpu is busy,
> likely by the time when work will be executed there will be already another
> memcg cached. So IMO we need to move this check into the flushing thread.

We might just be able to remove all the speculative
checks. drain_all_stock() is slowpath after all...

> I'm working on a different approach, but it will take time and also likely be
> too invasive for @stable, so fixing the crash discovered by Alexander with
> READ_ONCE() is a good idea.

Sounds good, I'm looking forward to those changes.

---

From c9b940db5f75160b5e80c4ae83ea760ad29e8ef9 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 12 Oct 2022 12:59:07 -0400
Subject: [PATCH] mm: memcontrol: fix NULL deref race condition during cgroup
 deletion

Alexander Fedorov reports a race condition between two concurrent
stock draining operations, where the first one clears the stock's obj
pointer between the pointer test and deref of the second. Analysis:

1) First CPU:
   css_killed_work_fn() -> mem_cgroup_css_offline() ->
drain_all_stock() -> obj_stock_flush_required()
	if (stock->cached_objcg) {

This check sees a non-NULL pointer for *another* CPU's `memcg_stock` instance.

2) Second CPU:
  css_free_rwork_fn() -> __mem_cgroup_free() -> free_percpu() ->
obj_cgroup_uncharge() -> drain_obj_stock()
It frees `cached_objcg` pointer in its own `memcg_stock` instance:
	struct obj_cgroup *old = stock->cached_objcg;
	< ... >
	obj_cgroup_put(old);
	stock->cached_objcg = NULL;

3) First CPU continues after the 'if' check and re-reads the pointer
again, now it is NULL and dereferencing it leads to kernel panic:
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
				     struct mem_cgroup *root_memcg)
{
< ... >
	if (stock->cached_objcg) {
		memcg = obj_cgroup_memcg(stock->cached_objcg);

There is already RCU protection in place to ensure lifetime. Add the
missing READ_ONCE to the cgroup pointers to fix the TOCTOU, and
consolidate and document the speculative code.

Reported-by: Alexander Fedorov <halcien@gmail.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
---
 mm/memcontrol.c | 54 +++++++++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 29 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d8549ae1b30..4357dadae95d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2190,8 +2190,6 @@ static DEFINE_MUTEX(percpu_charge_mutex);
 
 #ifdef CONFIG_MEMCG_KMEM
 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
-				     struct mem_cgroup *root_memcg);
 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
 
 #else
@@ -2199,11 +2197,6 @@ static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
 {
 	return NULL;
 }
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
-				     struct mem_cgroup *root_memcg)
-{
-	return false;
-}
 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
 {
 }
@@ -2259,8 +2252,8 @@ static void drain_stock(struct memcg_stock_pcp *stock)
 		stock->nr_pages = 0;
 	}
 
+	WRITE_ONCE(stock->cached, NULL);
 	css_put(&old->css);
-	stock->cached = NULL;
 }
 
 static void drain_local_stock(struct work_struct *dummy)
@@ -2298,7 +2291,7 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	if (stock->cached != memcg) { /* reset if necessary */
 		drain_stock(stock);
 		css_get(&memcg->css);
-		stock->cached = memcg;
+		WRITE_ONCE(stock->cached, memcg);
 	}
 	stock->nr_pages += nr_pages;
 
@@ -2339,13 +2332,30 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 		struct mem_cgroup *memcg;
 		bool flush = false;
 
+		/*
+		 * Speculatively check up front if this CPU has any
+		 * cached charges that belong to the specified
+		 * root_memcg. The state may change from under us -
+		 * which is okay, because the draining itself is a
+		 * best-effort operation. Just ensure lifetime of
+		 * whatever we end up looking at.
+		 */
 		rcu_read_lock();
-		memcg = stock->cached;
+		memcg = READ_ONCE(stock->cached);
 		if (memcg && stock->nr_pages &&
 		    mem_cgroup_is_descendant(memcg, root_memcg))
 			flush = true;
-		else if (obj_stock_flush_required(stock, root_memcg))
-			flush = true;
+#ifdef CONFIG_MEMCG_KMEM
+		else {
+			struct obj_cgroup *objcg;
+
+			objcg = READ_ONCE(stock->cached_objcg);
+			if (objcg && stock->nr_bytes &&
+			    mem_cgroup_is_descendant(obj_cgroup_memcg(objcg),
+						     root_memcg))
+				flush = true;
+		}
+#endif
 		rcu_read_unlock();
 
 		if (flush &&
@@ -3170,7 +3180,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
 		obj_cgroup_get(objcg);
 		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
 				? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
-		stock->cached_objcg = objcg;
+		WRITE_ONCE(stock->cached_objcg, objcg);
 		stock->cached_pgdat = pgdat;
 	} else if (stock->cached_pgdat != pgdat) {
 		/* Flush the existing cached vmstat data */
@@ -3289,7 +3299,7 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
 		stock->cached_pgdat = NULL;
 	}
 
-	stock->cached_objcg = NULL;
+	WRITE_ONCE(stock->cached_objcg, NULL);
 	/*
 	 * The `old' objects needs to be released by the caller via
 	 * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
@@ -3297,20 +3307,6 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
 	return old;
 }
 
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
-				     struct mem_cgroup *root_memcg)
-{
-	struct mem_cgroup *memcg;
-
-	if (stock->cached_objcg) {
-		memcg = obj_cgroup_memcg(stock->cached_objcg);
-		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
-			return true;
-	}
-
-	return false;
-}
-
 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
 			     bool allow_uncharge)
 {
@@ -3325,7 +3321,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
 	if (stock->cached_objcg != objcg) { /* reset if necessary */
 		old = drain_obj_stock(stock);
 		obj_cgroup_get(objcg);
-		stock->cached_objcg = objcg;
+		WRITE_ONCE(stock->cached_objcg, objcg);
 		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
 				? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
 		allow_uncharge = true;	/* Allow uncharge when objcg changes */

Patch

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c1152f8747..2ab205f529 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2215,7 +2215,7 @@  struct memcg_stock_pcp {
  	unsigned int nr_pages;
  
  #ifdef CONFIG_MEMCG_KMEM
-	struct obj_cgroup *cached_objcg;
+	struct obj_cgroup __rcu *cached_objcg;
  	unsigned int nr_bytes;
  #endif
  
@@ -3148,7 +3148,7 @@  static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
  	local_lock_irqsave(&memcg_stock.lock, flags);
  
  	stock = this_cpu_ptr(&memcg_stock);
-	if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+	if (objcg == rcu_access_pointer(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
  		stock->nr_bytes -= nr_bytes;