diff mbox series

mm: fix pgdat->kswap accessed concurrently

Message ID 20220820032506.126860-1-wangkefeng.wang@huawei.com (mailing list archive)
State New
Headers show
Series mm: fix pgdat->kswap accessed concurrently | expand

Commit Message

Kefeng Wang Aug. 20, 2022, 3:25 a.m. UTC
The pgdat->kswap could be accessed concurrently by kswapd_run() and
kcompactd(), it don't be protected by any lock, which leads to the
following null-ptr-deref,

  vmscan: Failed to start kswapd on node 0
  ...
  BUG: KASAN: null-ptr-deref in kcompactd+0x440/0x504
  Read of size 8 at addr 0000000000000024 by task kcompactd0/37

  CPU: 0 PID: 37 Comm: kcompactd0 Kdump: loaded Tainted: G           OE     5.10.60 #1
  Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
  Call trace:
   dump_backtrace+0x0/0x394
   show_stack+0x34/0x4c
   dump_stack+0x158/0x1e4
   __kasan_report+0x138/0x140
   kasan_report+0x44/0xdc
   __asan_load8+0x94/0xd0
   kcompactd+0x440/0x504
   kthread+0x1a4/0x1f0
   ret_from_fork+0x10/0x18

Fix it by adding READ_ONCE()|WRITE_ONCE().

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 mm/compaction.c |  4 +++-
 mm/vmscan.c     | 15 +++++++++------
 2 files changed, 12 insertions(+), 7 deletions(-)

Comments

Muchun Song Aug. 20, 2022, 7:33 a.m. UTC | #1
> On Aug 20, 2022, at 11:25, Kefeng Wang <wangkefeng.wang@huawei.com> wrote:
> 
> The pgdat->kswap could be accessed concurrently by kswapd_run() and
> kcompactd(), it don't be protected by any lock, which leads to the
> following null-ptr-deref,
> 
>  vmscan: Failed to start kswapd on node 0
>  ...
>  BUG: KASAN: null-ptr-deref in kcompactd+0x440/0x504
>  Read of size 8 at addr 0000000000000024 by task kcompactd0/37
> 
>  CPU: 0 PID: 37 Comm: kcompactd0 Kdump: loaded Tainted: G           OE     5.10.60 #1
>  Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
>  Call trace:
>   dump_backtrace+0x0/0x394
>   show_stack+0x34/0x4c
>   dump_stack+0x158/0x1e4
>   __kasan_report+0x138/0x140
>   kasan_report+0x44/0xdc
>   __asan_load8+0x94/0xd0
>   kcompactd+0x440/0x504
>   kthread+0x1a4/0x1f0
>   ret_from_fork+0x10/0x18
> 
> Fix it by adding READ_ONCE()|WRITE_ONCE().
> 
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> ---
> mm/compaction.c |  4 +++-
> mm/vmscan.c     | 15 +++++++++------
> 2 files changed, 12 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 640fa76228dd..aa1cfe47f046 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1983,7 +1983,9 @@ static inline bool is_via_compact_memory(int order)
> 
> static bool kswapd_is_running(pg_data_t *pgdat)
> {
> -	return pgdat->kswapd && task_is_running(pgdat->kswapd);
> +	struct task_struct *t = READ_ONCE(pgdat->kswapd);
> +
> +	return t && task_is_running(t);
> }
> 
> /*
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b2b1431352dc..9abba714249e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4642,16 +4642,19 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
> void kswapd_run(int nid)
> {
> 	pg_data_t *pgdat = NODE_DATA(nid);
> +	struct task_struct *t;
> 
> -	if (pgdat->kswapd)
> +	if (READ_ONCE(pgdat->kswapd))
> 		return;
> 
> -	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
> -	if (IS_ERR(pgdat->kswapd)) {
> +	t = kthread_run(kswapd, pgdat, "kswapd%d", nid);
> +	if (IS_ERR(t)) {
> 		/* failure at boot is fatal */
> 		BUG_ON(system_state < SYSTEM_RUNNING);
> 		pr_err("Failed to start kswapd on node %d\n", nid);
> -		pgdat->kswapd = NULL;
> +		WRITE_ONCE(pgdat->kswapd, NULL);
> +	} else {
> +		WRITE_ONCE(pgdat->kswapd, t);
> 	}
> }

IIUC, the race is like the followings:

CPU 0:					CPU 1:

kswapd_run()
	pgdat->kswapd = kthread_run()
	if (IS_ERR(pgdat->kswapd))
					kswapd_is_running
						// load pgdat->kswapd and it is NOT NULL.
		pgdat->kswapd = NULL
						task_is_running(pgdat->kswapd); // NULL pointer dereference

So

Reviewed-by: Muchun Song <songmuchun@bytedance.com>

Thanks.

> 
> @@ -4661,11 +4664,11 @@ void kswapd_run(int nid)
>  */
> void kswapd_stop(int nid)
> {
> -	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
> +	struct task_struct *kswapd = READ_ONCE(NODE_DATA(nid)->kswapd);
> 
> 	if (kswapd) {
> 		kthread_stop(kswapd);
> -		NODE_DATA(nid)->kswapd = NULL;
> +		WRITE_ONCE(NODE_DATA(nid)->kswapd, NULL);
> 	}
> }
> 
> -- 
> 2.35.3
> 
>
Andrew Morton Aug. 20, 2022, 8:59 p.m. UTC | #2
On Sat, 20 Aug 2022 15:33:04 +0800 Muchun Song <muchun.song@linux.dev> wrote:

> 
> 
> > +	if (IS_ERR(t)) {
> > 		/* failure at boot is fatal */
> > 		BUG_ON(system_state < SYSTEM_RUNNING);
> > 		pr_err("Failed to start kswapd on node %d\n", nid);
> > -		pgdat->kswapd = NULL;
> > +		WRITE_ONCE(pgdat->kswapd, NULL);
> > +	} else {
> > +		WRITE_ONCE(pgdat->kswapd, t);
> > 	}
> > }
> 
> IIUC, the race is like the followings:
> 
> CPU 0:					CPU 1:
> 
> kswapd_run()
> 	pgdat->kswapd = kthread_run()
> 	if (IS_ERR(pgdat->kswapd))
> 					kswapd_is_running
> 						// load pgdat->kswapd and it is NOT NULL.
> 		pgdat->kswapd = NULL
> 						task_is_running(pgdat->kswapd); // NULL pointer dereference
> 

But don't we still have a bug?  Sure, kswapd_is_running() will no
longer deref a null pointer.  But it now runs kswapd_is_running()
against a task which has exited - a use-after-free?
Kefeng Wang Aug. 23, 2022, 1:07 a.m. UTC | #3
On 2022/8/21 4:59, Andrew Morton wrote:
> On Sat, 20 Aug 2022 15:33:04 +0800 Muchun Song <muchun.song@linux.dev> wrote:
>
>>
>>> +	if (IS_ERR(t)) {
>>> 		/* failure at boot is fatal */
>>> 		BUG_ON(system_state < SYSTEM_RUNNING);
>>> 		pr_err("Failed to start kswapd on node %d\n", nid);
>>> -		pgdat->kswapd = NULL;
>>> +		WRITE_ONCE(pgdat->kswapd, NULL);
>>> +	} else {
>>> +		WRITE_ONCE(pgdat->kswapd, t);
>>> 	}
>>> }
>> IIUC, the race is like the followings:
>>
>> CPU 0:					CPU 1:
>>
>> kswapd_run()
>> 	pgdat->kswapd = kthread_run()
>> 	if (IS_ERR(pgdat->kswapd))
>> 					kswapd_is_running
>> 						// load pgdat->kswapd and it is NOT NULL.
>> 		pgdat->kswapd = NULL
>> 						task_is_running(pgdat->kswapd); // NULL pointer dereference
>>
> But don't we still have a bug?  Sure, kswapd_is_running() will no
> longer deref a null pointer.  But it now runs kswapd_is_running()
> against a task which has exited - a use-after-free?
we could add get/put_task_struct() to avoid the UAF, will update, thanks.
> .
Kefeng Wang Aug. 23, 2022, 2:47 p.m. UTC | #4
On 2022/8/23 9:07, Kefeng Wang wrote:
>
> On 2022/8/21 4:59, Andrew Morton wrote:
>> On Sat, 20 Aug 2022 15:33:04 +0800 Muchun Song 
>> <muchun.song@linux.dev> wrote:
>>
>>>
>>>> +    if (IS_ERR(t)) {
>>>>         /* failure at boot is fatal */
>>>>         BUG_ON(system_state < SYSTEM_RUNNING);
>>>>         pr_err("Failed to start kswapd on node %d\n", nid);
>>>> -        pgdat->kswapd = NULL;
>>>> +        WRITE_ONCE(pgdat->kswapd, NULL);
>>>> +    } else {
>>>> +        WRITE_ONCE(pgdat->kswapd, t);
>>>>     }
>>>> }
>>> IIUC, the race is like the followings:
>>>
>>> CPU 0:                    CPU 1:
>>>
>>> kswapd_run()
>>>     pgdat->kswapd = kthread_run()
>>>     if (IS_ERR(pgdat->kswapd))
>>>                     kswapd_is_running
>>>                         // load pgdat->kswapd and it is NOT NULL.
>>>         pgdat->kswapd = NULL
>>>                         task_is_running(pgdat->kswapd); // NULL 
>>> pointer dereference
>>>
>> But don't we still have a bug?  Sure, kswapd_is_running() will no
>> longer deref a null pointer.  But it now runs kswapd_is_running()
>> against a task which has exited - a use-after-free?

The UAF is caused by race between kswapd_stop() and kcompactd(), right?

so  kcompactd() should be stop before kswapd_stop() to avoid the above UAF.

$ git diff
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fad6d1f2262a..2fd45ccbce45 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1940,8 +1940,8 @@ int __ref offline_pages(unsigned long start_pfn, 
unsigned long nr_pages,

         node_states_clear_node(node, &arg);
         if (arg.status_change_nid >= 0) {
-               kswapd_stop(node);
                 kcompactd_stop(node);
+               kswapd_stop(node);
         }

         writeback_set_ratelimit();

> we could add get/put_task_struct() to avoid the UAF, will update, 
> thanks.

sorry, the task refcount won't fix anything.


> .
Muchun Song Aug. 24, 2022, 6:59 a.m. UTC | #5
> On Aug 21, 2022, at 04:59, Andrew Morton <akpm@linux-foundation.org> wrote:
> 
> On Sat, 20 Aug 2022 15:33:04 +0800 Muchun Song <muchun.song@linux.dev> wrote:
> 
>> 
>> 
>>> +	if (IS_ERR(t)) {
>>> 		/* failure at boot is fatal */
>>> 		BUG_ON(system_state < SYSTEM_RUNNING);
>>> 		pr_err("Failed to start kswapd on node %d\n", nid);
>>> -		pgdat->kswapd = NULL;
>>> +		WRITE_ONCE(pgdat->kswapd, NULL);
>>> +	} else {
>>> +		WRITE_ONCE(pgdat->kswapd, t);
>>> 	}
>>> }
>> 
>> IIUC, the race is like the followings:
>> 
>> CPU 0:					CPU 1:
>> 
>> kswapd_run()
>> 	pgdat->kswapd = kthread_run()
>> 	if (IS_ERR(pgdat->kswapd))
>> 					kswapd_is_running
>> 						// load pgdat->kswapd and it is NOT NULL.
>> 		pgdat->kswapd = NULL
>> 						task_is_running(pgdat->kswapd); // NULL pointer dereference
>> 
> 
> But don't we still have a bug?  Sure, kswapd_is_running() will no
> longer deref a null pointer.  But it now runs kswapd_is_running()
> against a task which has exited - a use-after-free?
> 

Agree. I missed that.

Thanks,
Muchun
Muchun Song Aug. 24, 2022, 7:03 a.m. UTC | #6
> On Aug 23, 2022, at 22:47, Kefeng Wang <wangkefeng.wang@huawei.com> wrote:
> 
> 
> On 2022/8/23 9:07, Kefeng Wang wrote:
>> 
>> On 2022/8/21 4:59, Andrew Morton wrote:
>>> On Sat, 20 Aug 2022 15:33:04 +0800 Muchun Song <muchun.song@linux.dev> wrote:
>>> 
>>>> 
>>>>> +    if (IS_ERR(t)) {
>>>>>         /* failure at boot is fatal */
>>>>>         BUG_ON(system_state < SYSTEM_RUNNING);
>>>>>         pr_err("Failed to start kswapd on node %d\n", nid);
>>>>> -        pgdat->kswapd = NULL;
>>>>> +        WRITE_ONCE(pgdat->kswapd, NULL);
>>>>> +    } else {
>>>>> +        WRITE_ONCE(pgdat->kswapd, t);
>>>>>     }
>>>>> }
>>>> IIUC, the race is like the followings:
>>>> 
>>>> CPU 0:                    CPU 1:
>>>> 
>>>> kswapd_run()
>>>>     pgdat->kswapd = kthread_run()
>>>>     if (IS_ERR(pgdat->kswapd))
>>>>                     kswapd_is_running
>>>>                         // load pgdat->kswapd and it is NOT NULL.
>>>>         pgdat->kswapd = NULL
>>>>                         task_is_running(pgdat->kswapd); // NULL pointer dereference
>>>> 
>>> But don't we still have a bug?  Sure, kswapd_is_running() will no
>>> longer deref a null pointer.  But it now runs kswapd_is_running()
>>> against a task which has exited - a use-after-free?
> 
> The UAF is caused by race between kswapd_stop() and kcompactd(), right?
> 
> so  kcompactd() should be stop before kswapd_stop() to avoid the above UAF.
> 
> $ git diff
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index fad6d1f2262a..2fd45ccbce45 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1940,8 +1940,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
> 
>         node_states_clear_node(node, &arg);
>         if (arg.status_change_nid >= 0) {
> -               kswapd_stop(node);
>                 kcompactd_stop(node);
> +               kswapd_stop(node);
>         }
> 
>         writeback_set_ratelimit();

The changes make sense to me. Again:

Reviewed-by: Muchun Song <songmuchun@bytedance.com>

Thanks.

> 
>> we could add get/put_task_struct() to avoid the UAF, will update, thanks.
> 
> sorry, the task refcount won't fix anything.
> 
> 
>> .
diff mbox series

Patch

diff --git a/mm/compaction.c b/mm/compaction.c
index 640fa76228dd..aa1cfe47f046 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1983,7 +1983,9 @@  static inline bool is_via_compact_memory(int order)
 
 static bool kswapd_is_running(pg_data_t *pgdat)
 {
-	return pgdat->kswapd && task_is_running(pgdat->kswapd);
+	struct task_struct *t = READ_ONCE(pgdat->kswapd);
+
+	return t && task_is_running(t);
 }
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b2b1431352dc..9abba714249e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4642,16 +4642,19 @@  unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 void kswapd_run(int nid)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
+	struct task_struct *t;
 
-	if (pgdat->kswapd)
+	if (READ_ONCE(pgdat->kswapd))
 		return;
 
-	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
-	if (IS_ERR(pgdat->kswapd)) {
+	t = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+	if (IS_ERR(t)) {
 		/* failure at boot is fatal */
 		BUG_ON(system_state < SYSTEM_RUNNING);
 		pr_err("Failed to start kswapd on node %d\n", nid);
-		pgdat->kswapd = NULL;
+		WRITE_ONCE(pgdat->kswapd, NULL);
+	} else {
+		WRITE_ONCE(pgdat->kswapd, t);
 	}
 }
 
@@ -4661,11 +4664,11 @@  void kswapd_run(int nid)
  */
 void kswapd_stop(int nid)
 {
-	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
+	struct task_struct *kswapd = READ_ONCE(NODE_DATA(nid)->kswapd);
 
 	if (kswapd) {
 		kthread_stop(kswapd);
-		NODE_DATA(nid)->kswapd = NULL;
+		WRITE_ONCE(NODE_DATA(nid)->kswapd, NULL);
 	}
 }