
[RFC,V1,01/13] mm: Add kmmscand kernel daemon

Message ID 20250319193028.29514-2-raghavendra.kt@amd.com (mailing list archive)
State New
Series mm: slowtier page promotion based on PTE A bit

Commit Message

Raghavendra K T March 19, 2025, 7:30 p.m. UTC
Add a skeleton to support scanning and migration.
Also add a config option for the same.

High level design:

While (1):
  scan the slowtier pages belonging to VMAs of a task.
  Add to migration list

Separate thread:
  migrate scanned pages to a toptier node based on heuristics

The overall code is heavily influenced by khugepaged design.

Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
---
 mm/Kconfig    |   8 +++
 mm/Makefile   |   1 +
 mm/kmmscand.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 185 insertions(+)
 create mode 100644 mm/kmmscand.c

Comments

Jonathan Cameron March 21, 2025, 4:06 p.m. UTC | #1
On Wed, 19 Mar 2025 19:30:16 +0000
Raghavendra K T <raghavendra.kt@amd.com> wrote:

> Add a skeleton to support scanning and migration.
> Also add a config option for the same.
> 
> High level design:
> 
> While (1):
>   scan the slowtier pages belonging to VMAs of a task.
>   Add to migation list
> 
> Separate thread:
>   migrate scanned pages to a toptier node based on heuristics
> 
> The overall code is heavily influenced by khugepaged design.
> 
> Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>


I'm really bad at reading code and not commenting on the 'small'
stuff.  So feel free to ignore this given the RFC status!
This sort of read through helps me get my head around a series.

> ---
>  mm/Kconfig    |   8 +++
>  mm/Makefile   |   1 +
>  mm/kmmscand.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 185 insertions(+)
>  create mode 100644 mm/kmmscand.c
> 
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 1b501db06417..5a4931633e15 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -783,6 +783,14 @@ config KSM
>  	  until a program has madvised that an area is MADV_MERGEABLE, and
>  	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
>  
> +config KMMSCAND
> +	bool "Enable PTE A bit scanning and Migration"
> +	depends on NUMA_BALANCING
> +	help
> +	  Enable PTE A bit scanning of page. CXL pages accessed are migrated to

Trivial but don't mention CXL.  "Other memory tier solutions are available"

> +	  a regular NUMA node. The option creates a separate kthread for
> +	  scanning and migration.
> +

> diff --git a/mm/kmmscand.c b/mm/kmmscand.c
> new file mode 100644
> index 000000000000..6c55250b5cfb
> --- /dev/null
> +++ b/mm/kmmscand.c

> +
> +struct kmmscand_scan kmmscand_scan = {
> +	.mm_head = LIST_HEAD_INIT(kmmscand_scan.mm_head),
> +};
> +
> +static int kmmscand_has_work(void)
> +{

Unless this is going to get more complex, I'd just put
the implementation inline.  Kind of obvious what it is doing
so the wrapper doesn't add much.
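i.e. (untested) just open-code it at the call site:

	if (!list_empty(&kmmscand_scan.mm_head))
		kmmscand_scan_mm_slot();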

> +	return !list_empty(&kmmscand_scan.mm_head);
> +}
> +
> +static bool kmmscand_should_wakeup(void)
> +{
> +	bool wakeup =  kthread_should_stop() || need_wakeup ||

bonus space after =

> +	       time_after_eq(jiffies, kmmscand_sleep_expire);
> +	if (need_wakeup)
> +		need_wakeup = false;

Why not set it unconditionally?  If it is false already, no
harm done and it removes the need for the check.
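Untested, but roughly:

	static bool kmmscand_should_wakeup(void)
	{
		bool wakeup = kthread_should_stop() || need_wakeup ||
			      time_after_eq(jiffies, kmmscand_sleep_expire);

		/* Clear unconditionally; harmless if it was already false. */
		need_wakeup = false;

		return wakeup;
	}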

> +
> +	return wakeup;
> +}
> +
> +static void kmmscand_wait_work(void)
> +{
> +	const unsigned long scan_sleep_jiffies =
> +		msecs_to_jiffies(kmmscand_scan_sleep_ms);
> +
> +	if (!scan_sleep_jiffies)
> +		return;
> +
> +	kmmscand_sleep_expire = jiffies + scan_sleep_jiffies;
> +	wait_event_timeout(kmmscand_wait,
> +			kmmscand_should_wakeup(),
> +			scan_sleep_jiffies);

strange wrap.  Maybe add a comment on why we don't care if
this timed out or not.
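Something along these lines perhaps (untested, and assuming the only
intent here is to throttle the scan loop):

	kmmscand_sleep_expire = jiffies + scan_sleep_jiffies;
	/*
	 * The return value is deliberately ignored: timing out just means
	 * the full sleep interval elapsed before anything asked for an
	 * early wakeup, which is the normal case.
	 */
	wait_event_timeout(kmmscand_wait, kmmscand_should_wakeup(),
			   scan_sleep_jiffies);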

> +	return;
> +}
> +
> +static unsigned long kmmscand_scan_mm_slot(void)
> +{
> +	/* placeholder for scanning */

I guess this will make sense later in series!

> +	msleep(100);
> +	return 0;
> +}
> +
> +static void kmmscand_do_scan(void)
> +{
> +	unsigned long iter = 0, mms_to_scan;
> +

	unsigned long mms_to_scan = READ_ONCE(kmmscand_mms_to_scan);

> +	mms_to_scan = READ_ONCE(kmmscand_mms_to_scan);
> +
> +	while (true) {
> +		cond_resched();

Odd to do this at start. Maybe at end of loop?

> +
> +		if (unlikely(kthread_should_stop()) ||
> +			!READ_ONCE(kmmscand_scan_enabled))
> +			break;
return;  Then we don't need to read on to see if anything else happens.
> +
> +		if (kmmscand_has_work())
> +			kmmscand_scan_mm_slot();
> +
> +		iter++;
> +		if (iter >= mms_to_scan)
> +			break;
			return;
Same argument as above.
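i.e. something like this overall (untested sketch):

	static void kmmscand_do_scan(void)
	{
		unsigned long iter = 0;
		unsigned long mms_to_scan = READ_ONCE(kmmscand_mms_to_scan);

		while (true) {
			if (unlikely(kthread_should_stop()) ||
			    !READ_ONCE(kmmscand_scan_enabled))
				return;

			if (kmmscand_has_work())
				kmmscand_scan_mm_slot();

			if (++iter >= mms_to_scan)
				return;

			cond_resched();
		}
	}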

> +	}
> +}
> +
> +static int kmmscand(void *none)
> +{
> +	for (;;) {

while (true) maybe.  Feels more natural to me for a loop
with no terminating condition.   Obviously same thing in practice.

> +		if (unlikely(kthread_should_stop()))
			return;
> +			break;
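i.e. with both changes (untested):

	static int kmmscand(void *none)
	{
		while (true) {
			if (unlikely(kthread_should_stop()))
				return 0;

			kmmscand_do_scan();

			while (!READ_ONCE(kmmscand_scan_enabled)) {
				cpu_relax();
				kmmscand_wait_work();
			}

			kmmscand_wait_work();
		}
	}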
> +
> +		kmmscand_do_scan();
> +
> +		while (!READ_ONCE(kmmscand_scan_enabled)) {
> +			cpu_relax();
> +			kmmscand_wait_work();
> +		}
> +
> +		kmmscand_wait_work();
> +	}
> +	return 0;
> +}
> +
> +static int start_kmmscand(void)
> +{
> +	int err = 0;
> +
> +	guard(mutex)(&kmmscand_mutex);
> +
> +	/* Some one already succeeded in starting daemon */
> +	if (kmmscand_thread)
return 0;
> +		goto end;
> +
> +	kmmscand_thread = kthread_run(kmmscand, NULL, "kmmscand");
> +	if (IS_ERR(kmmscand_thread)) {
> +		pr_err("kmmscand: kthread_run(kmmscand) failed\n");
> +		err = PTR_ERR(kmmscand_thread);
> +		kmmscand_thread = NULL;

Use a local variable instead and only assign on success. That
way you don't need to null it out in this path.

> +		goto end;

return PTR_ERR(kmmscand_thread_local);

> +	} else {
> +		pr_info("kmmscand: Successfully started kmmscand");
No need for else given the other path exits.
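Putting those three comments together, roughly (untested sketch, using
a local as suggested above):

	static int start_kmmscand(void)
	{
		struct task_struct *kmmscand_thread_local;

		guard(mutex)(&kmmscand_mutex);

		/* Someone already succeeded in starting the daemon */
		if (kmmscand_thread)
			return 0;

		kmmscand_thread_local = kthread_run(kmmscand, NULL, "kmmscand");
		if (IS_ERR(kmmscand_thread_local)) {
			pr_err("kmmscand: kthread_run(kmmscand) failed\n");
			return PTR_ERR(kmmscand_thread_local);
		}

		kmmscand_thread = kmmscand_thread_local;
		pr_info("kmmscand: Successfully started kmmscand\n");

		if (!list_empty(&kmmscand_scan.mm_head))
			wake_up_interruptible(&kmmscand_wait);

		return 0;
	}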

> +	}
> +
> +	if (!list_empty(&kmmscand_scan.mm_head))
> +		wake_up_interruptible(&kmmscand_wait);
> +
> +end:
> +	return err;
> +}
> +
> +static int stop_kmmscand(void)
> +{
> +	int err = 0;

No point in err if always 0.
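i.e. drop err and return 0 directly (untested):

	static int stop_kmmscand(void)
	{
		guard(mutex)(&kmmscand_mutex);

		if (kmmscand_thread) {
			kthread_stop(kmmscand_thread);
			kmmscand_thread = NULL;
		}

		return 0;
	}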

> +
> +	guard(mutex)(&kmmscand_mutex);
> +
> +	if (kmmscand_thread) {
> +		kthread_stop(kmmscand_thread);
> +		kmmscand_thread = NULL;
> +	}
> +
> +	return err;
> +}
> +
> +static int __init kmmscand_init(void)
> +{
> +	int err;
> +
> +	err = start_kmmscand();
> +	if (err)
> +		goto err_kmmscand;

start_kmmscand() should be side effect free if it is returning an
error.  Not doing that makes for hard to read code.

Superficially looks like it is already side effect free so you
can probably just return here.
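i.e. (untested):

	static int __init kmmscand_init(void)
	{
		return start_kmmscand();
	}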


> +
> +	return 0;
> +
> +err_kmmscand:
> +	stop_kmmscand();
> +
> +	return err;
> +}
> +subsys_initcall(kmmscand_init);
Raghavendra K T March 24, 2025, 3:09 p.m. UTC | #2
On 3/21/2025 9:36 PM, Jonathan Cameron wrote:
> On Wed, 19 Mar 2025 19:30:16 +0000
> Raghavendra K T <raghavendra.kt@amd.com> wrote:
> 
>> Add a skeleton to support scanning and migration.
>> Also add a config option for the same.
>>
>> High level design:
>>
>> While (1):
>>    scan the slowtier pages belonging to VMAs of a task.
>>    Add to migation list
>>
>> Separate thread:
>>    migrate scanned pages to a toptier node based on heuristics
>>
>> The overall code is heavily influenced by khugepaged design.
>>
>> Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
> 
> 
> I'm really bad and reading code and not commenting on the 'small'
> stuff.  So feel free to ignore this given the RFC status!
> This sort of read through helps me get my head around a series.
> 

Hello Jonathan,
I do agree that my goal till now was mostly a POC, and a lot of the
code is yet to be hardened. But your effort reviewing this code will
go miles in converging to good code faster.

Thank you a lot, much appreciated.

>> ---
>>   mm/Kconfig    |   8 +++
>>   mm/Makefile   |   1 +
>>   mm/kmmscand.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 185 insertions(+)
>>   create mode 100644 mm/kmmscand.c
>>
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index 1b501db06417..5a4931633e15 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -783,6 +783,14 @@ config KSM
>>   	  until a program has madvised that an area is MADV_MERGEABLE, and
>>   	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
>>   
>> +config KMMSCAND
>> +	bool "Enable PTE A bit scanning and Migration"
>> +	depends on NUMA_BALANCING
>> +	help
>> +	  Enable PTE A bit scanning of page. CXL pages accessed are migrated to
> 
> Trivial but don't mention CXL.  "Other memory tier solutions are available"

Sure.

> 
>> +	  a regular NUMA node. The option creates a separate kthread for
>> +	  scanning and migration.
>> +
> 
>> diff --git a/mm/kmmscand.c b/mm/kmmscand.c
>> new file mode 100644
>> index 000000000000..6c55250b5cfb
>> --- /dev/null
>> +++ b/mm/kmmscand.c
> 
>> +
>> +struct kmmscand_scan kmmscand_scan = {
>> +	.mm_head = LIST_HEAD_INIT(kmmscand_scan.mm_head),
>> +};
>> +
>> +static int kmmscand_has_work(void)
>> +{
> 
> Unless this is going to get more complex, I'd just put
> the implementation inline.  Kind of obvious what is doing
> so the wrapper doesn't add much.
> 

Sure.

>> +	return !list_empty(&kmmscand_scan.mm_head);
>> +}
>> +
>> +static bool kmmscand_should_wakeup(void)
>> +{
>> +	bool wakeup =  kthread_should_stop() || need_wakeup ||
> 
> bonus space after =
> 

+1

>> +	       time_after_eq(jiffies, kmmscand_sleep_expire);
>> +	if (need_wakeup)
>> +		need_wakeup = false;
> 
> Why not set it unconditionally?  If it is false already, no
> harm done and removes need to check.
>

Agree, will change. This code had a wakeup from a sysfs variable setting
in mind :).

>> +
>> +	return wakeup;
>> +}
>> +
>> +static void kmmscand_wait_work(void)
>> +{
>> +	const unsigned long scan_sleep_jiffies =
>> +		msecs_to_jiffies(kmmscand_scan_sleep_ms);
>> +
>> +	if (!scan_sleep_jiffies)
>> +		return;
>> +
>> +	kmmscand_sleep_expire = jiffies + scan_sleep_jiffies;
>> +	wait_event_timeout(kmmscand_wait,
>> +			kmmscand_should_wakeup(),
>> +			scan_sleep_jiffies);
> 
> strange wrap.  Maybe add a comment on why we don't care if
> this timed out or not.
> 

You mean why the timeout is not harmful? Sure, will do.

>> +	return;
>> +}
>> +
>> +static unsigned long kmmscand_scan_mm_slot(void)
>> +{
>> +	/* placeholder for scanning */
> 
> I guess this will make sense later in series!
> 

Agree.
I will surely have to think about the right way to split the series so
that it does not hog when bisected separately.

>> +	msleep(100);
>> +	return 0;
>> +}
>> +
>> +static void kmmscand_do_scan(void)
>> +{
>> +	unsigned long iter = 0, mms_to_scan;
>> +
> 
> 	unsigned long mms_to_scan = READ_ONCE(kmmscand_mms_to_scan);
> 
>> +	mms_to_scan = READ_ONCE(kmmscand_mms_to_scan);
>> +
>> +	while (true) {
>> +		cond_resched();
> 
> Odd to do this at start. Maybe at end of loop?
> 

+1

>> +
>> +		if (unlikely(kthread_should_stop()) ||
>> +			!READ_ONCE(kmmscand_scan_enabled))
>> +			break;
> return;  Then we don't need to read on to see if anything else happens.
>> +
>> +		if (kmmscand_has_work())
>> +			kmmscand_scan_mm_slot();
>> +
>> +		iter++;
>> +		if (iter >= mms_to_scan)
>> +			break;
> 			return;
> Same argument as above.
> 

Thanks. Will think about the above.

>> +	}
>> +}
>> +
>> +static int kmmscand(void *none)
>> +{
>> +	for (;;) {
> 
> while (true) maybe.  Feels more natural to me for a loop
> with no terminating condition.   Obviously same thing in practice.
> 

+1

>> +		if (unlikely(kthread_should_stop()))
> 			return;
>> +			break;
>> +
>> +		kmmscand_do_scan();
>> +
>> +		while (!READ_ONCE(kmmscand_scan_enabled)) {
>> +			cpu_relax();
>> +			kmmscand_wait_work();
>> +		}
>> +
>> +		kmmscand_wait_work();
>> +	}
>> +	return 0;
>> +}
>> +
>> +static int start_kmmscand(void)
>> +{
>> +	int err = 0;
>> +
>> +	guard(mutex)(&kmmscand_mutex);
>> +
>> +	/* Some one already succeeded in starting daemon */
>> +	if (kmmscand_thread)
> return 0;
+1

>> +		goto end;
>> +
>> +	kmmscand_thread = kthread_run(kmmscand, NULL, "kmmscand");
>> +	if (IS_ERR(kmmscand_thread)) {
>> +		pr_err("kmmscand: kthread_run(kmmscand) failed\n");
>> +		err = PTR_ERR(kmmscand_thread);
>> +		kmmscand_thread = NULL;
> 
> Use a local variable instead and only assign on success. That
> way you don't need to null it out in this path.
> 

Agree

>> +		goto end;
> 
> return PTR_ERR(kmmscand_thread_local);
> 
>> +	} else {
>> +		pr_info("kmmscand: Successfully started kmmscand");
> No need for else give the other path exits.
> 

Agree.

>> +	}
>> +
>> +	if (!list_empty(&kmmscand_scan.mm_head))
>> +		wake_up_interruptible(&kmmscand_wait);
>> +
>> +end:
>> +	return err;
>> +}
>> +
>> +static int stop_kmmscand(void)
>> +{
>> +	int err = 0;
> 
> No point in err if always 0.
> 

Yes.

>> +
>> +	guard(mutex)(&kmmscand_mutex);
>> +
>> +	if (kmmscand_thread) {
>> +		kthread_stop(kmmscand_thread);
>> +		kmmscand_thread = NULL;
>> +	}
>> +
>> +	return err;
>> +}
>> +
>> +static int __init kmmscand_init(void)
>> +{
>> +	int err;
>> +
>> +	err = start_kmmscand();
>> +	if (err)
>> +		goto err_kmmscand;
> 
> start_kmmscand() should be side effect free if it is returning an
> error.  Not doing that makes for hard to read code.
> 
> Superficially looks like it is already side effect free so you
> can probably just return here.
> 

There is one scanctrl free added later in the stop_kmmscand() part.

> 
>> +
>> +	return 0;
>> +
>> +err_kmmscand:
>> +	stop_kmmscand();
>> +
>> +	return err;
>> +}
>> +subsys_initcall(kmmscand_init);
>

Patch

diff --git a/mm/Kconfig b/mm/Kconfig
index 1b501db06417..5a4931633e15 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -783,6 +783,14 @@  config KSM
 	  until a program has madvised that an area is MADV_MERGEABLE, and
 	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
 
+config KMMSCAND
+	bool "Enable PTE A bit scanning and Migration"
+	depends on NUMA_BALANCING
+	help
+	  Enable PTE A bit scanning of page. CXL pages accessed are migrated to
+	  a regular NUMA node. The option creates a separate kthread for
+	  scanning and migration.
+
 config DEFAULT_MMAP_MIN_ADDR
 	int "Low address space to protect from user allocation"
 	depends on MMU
diff --git a/mm/Makefile b/mm/Makefile
index 850386a67b3e..45e2f8cc8fd6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -94,6 +94,7 @@  obj-$(CONFIG_FAIL_PAGE_ALLOC) += fail_page_alloc.o
 obj-$(CONFIG_MEMTEST)		+= memtest.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_NUMA) += memory-tiers.o
+obj-$(CONFIG_KMMSCAND) += kmmscand.o
 obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
diff --git a/mm/kmmscand.c b/mm/kmmscand.c
new file mode 100644
index 000000000000..6c55250b5cfb
--- /dev/null
+++ b/mm/kmmscand.c
@@ -0,0 +1,176 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/mmu_notifier.h>
+#include <linux/swap.h>
+#include <linux/mm_inline.h>
+#include <linux/kthread.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/cleanup.h>
+
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+
+static struct task_struct *kmmscand_thread __read_mostly;
+static DEFINE_MUTEX(kmmscand_mutex);
+
+/* How long to pause between two scan and migration cycle */
+static unsigned int kmmscand_scan_sleep_ms __read_mostly = 16;
+
+/* Max number of mms to scan in one scan and migration cycle */
+#define KMMSCAND_MMS_TO_SCAN	(4 * 1024UL)
+static unsigned long kmmscand_mms_to_scan __read_mostly = KMMSCAND_MMS_TO_SCAN;
+
+bool kmmscand_scan_enabled = true;
+static bool need_wakeup;
+
+static unsigned long kmmscand_sleep_expire;
+
+static DECLARE_WAIT_QUEUE_HEAD(kmmscand_wait);
+
+struct kmmscand_scan {
+	struct list_head mm_head;
+};
+
+struct kmmscand_scan kmmscand_scan = {
+	.mm_head = LIST_HEAD_INIT(kmmscand_scan.mm_head),
+};
+
+static int kmmscand_has_work(void)
+{
+	return !list_empty(&kmmscand_scan.mm_head);
+}
+
+static bool kmmscand_should_wakeup(void)
+{
+	bool wakeup =  kthread_should_stop() || need_wakeup ||
+	       time_after_eq(jiffies, kmmscand_sleep_expire);
+	if (need_wakeup)
+		need_wakeup = false;
+
+	return wakeup;
+}
+
+static void kmmscand_wait_work(void)
+{
+	const unsigned long scan_sleep_jiffies =
+		msecs_to_jiffies(kmmscand_scan_sleep_ms);
+
+	if (!scan_sleep_jiffies)
+		return;
+
+	kmmscand_sleep_expire = jiffies + scan_sleep_jiffies;
+	wait_event_timeout(kmmscand_wait,
+			kmmscand_should_wakeup(),
+			scan_sleep_jiffies);
+	return;
+}
+
+static unsigned long kmmscand_scan_mm_slot(void)
+{
+	/* placeholder for scanning */
+	msleep(100);
+	return 0;
+}
+
+static void kmmscand_do_scan(void)
+{
+	unsigned long iter = 0, mms_to_scan;
+
+	mms_to_scan = READ_ONCE(kmmscand_mms_to_scan);
+
+	while (true) {
+		cond_resched();
+
+		if (unlikely(kthread_should_stop()) ||
+			!READ_ONCE(kmmscand_scan_enabled))
+			break;
+
+		if (kmmscand_has_work())
+			kmmscand_scan_mm_slot();
+
+		iter++;
+		if (iter >= mms_to_scan)
+			break;
+	}
+}
+
+static int kmmscand(void *none)
+{
+	for (;;) {
+		if (unlikely(kthread_should_stop()))
+			break;
+
+		kmmscand_do_scan();
+
+		while (!READ_ONCE(kmmscand_scan_enabled)) {
+			cpu_relax();
+			kmmscand_wait_work();
+		}
+
+		kmmscand_wait_work();
+	}
+	return 0;
+}
+
+static int start_kmmscand(void)
+{
+	int err = 0;
+
+	guard(mutex)(&kmmscand_mutex);
+
+	/* Some one already succeeded in starting daemon */
+	if (kmmscand_thread)
+		goto end;
+
+	kmmscand_thread = kthread_run(kmmscand, NULL, "kmmscand");
+	if (IS_ERR(kmmscand_thread)) {
+		pr_err("kmmscand: kthread_run(kmmscand) failed\n");
+		err = PTR_ERR(kmmscand_thread);
+		kmmscand_thread = NULL;
+		goto end;
+	} else {
+		pr_info("kmmscand: Successfully started kmmscand");
+	}
+
+	if (!list_empty(&kmmscand_scan.mm_head))
+		wake_up_interruptible(&kmmscand_wait);
+
+end:
+	return err;
+}
+
+static int stop_kmmscand(void)
+{
+	int err = 0;
+
+	guard(mutex)(&kmmscand_mutex);
+
+	if (kmmscand_thread) {
+		kthread_stop(kmmscand_thread);
+		kmmscand_thread = NULL;
+	}
+
+	return err;
+}
+
+static int __init kmmscand_init(void)
+{
+	int err;
+
+	err = start_kmmscand();
+	if (err)
+		goto err_kmmscand;
+
+	return 0;
+
+err_kmmscand:
+	stop_kmmscand();
+
+	return err;
+}
+subsys_initcall(kmmscand_init);