[RFC,4/5] mm/migrate: introduce multi-threaded page copy routine

Message ID: 20250103172419.4148674-5-ziy@nvidia.com (mailing list archive)
State: New
Series: Accelerate page migration with batching and multi threads

Commit Message

Zi Yan Jan. 3, 2025, 5:24 p.m. UTC
Now that page copies are batched, multi-threaded page copy can be used to
increase page copy throughput. Add copy_page_lists_mt() to copy pages in a
multi-threaded manner. Empirical data show that more than 32 base pages are
needed to see the benefit of multi-threaded page copy, so use 32 as the
threshold.

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 include/linux/migrate.h |   3 +
 mm/Makefile             |   2 +-
 mm/copy_pages.c         | 186 ++++++++++++++++++++++++++++++++++++++++
 mm/migrate.c            |  19 ++--
 4 files changed, 199 insertions(+), 11 deletions(-)
 create mode 100644 mm/copy_pages.c

Comments

Hyeonggon Yoo Jan. 6, 2025, 1:18 a.m. UTC | #1
On 2025-01-04 2:24 AM, Zi Yan wrote:
> Now page copies are batched, multi-threaded page copy can be used to
> increase page copy throughput. Add copy_page_lists_mt() to copy pages in
> multi-threaded manners. Empirical data show more than 32 base pages are
> needed to show the benefit of using multi-threaded page copy, so use 32 as
> the threshold.
 >
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
>   include/linux/migrate.h |   3 +
>   mm/Makefile             |   2 +-
>   mm/copy_pages.c         | 186 ++++++++++++++++++++++++++++++++++++++++
>   mm/migrate.c            |  19 ++--
>   4 files changed, 199 insertions(+), 11 deletions(-)
>   create mode 100644 mm/copy_pages.c
> 

[...snip...]

> +++ b/mm/copy_pages.c
> @@ -0,0 +1,186 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Parallel page copy routine.
> + */
> +
> +#include <linux/sysctl.h>
> +#include <linux/highmem.h>
> +#include <linux/workqueue.h>
> +#include <linux/slab.h>
> +#include <linux/migrate.h>
> +
> +
> +unsigned int limit_mt_num = 4;
> +
> +struct copy_item {
> +	char *to;
> +	char *from;
> +	unsigned long chunk_size;
> +};
> +
> +struct copy_page_info {
> +	struct work_struct copy_page_work;
> +	unsigned long num_items;
> +	struct copy_item item_list[];
> +};
> +
> +static void copy_page_routine(char *vto, char *vfrom,
> +	unsigned long chunk_size)
> +{
> +	memcpy(vto, vfrom, chunk_size);
> +}
> +
> +static void copy_page_work_queue_thread(struct work_struct *work)
> +{
> +	struct copy_page_info *my_work = (struct copy_page_info *)work;
> +	int i;
> +
> +	for (i = 0; i < my_work->num_items; ++i)
> +		copy_page_routine(my_work->item_list[i].to,
> +						  my_work->item_list[i].from,
> +						  my_work->item_list[i].chunk_size);
> +}
> +
> +int copy_page_lists_mt(struct list_head *dst_folios,
> +		struct list_head *src_folios, int nr_items)
> +{
> +	int err = 0;
> +	unsigned int total_mt_num = limit_mt_num;
> +	int to_node = folio_nid(list_first_entry(dst_folios, struct folio, lru));
> +	int i;
> +	struct copy_page_info *work_items[32] = {0};
> +	const struct cpumask *per_node_cpumask = cpumask_of_node(to_node);

What happens here if to_node is a NUMA node without CPUs? (e.g. CXL
node).

And even with a NUMA node that has CPUs, I think offloading copies to CPUs
of either the "from" node or the "to" node will end up with a CPU touching
two pages in two different NUMA nodes anyway: one page in the local node
and the other in the remote node.

In that sense, I don't understand when push_0_pull_1 (introduced in
patch 5) should be 0 or 1. Am I missing something?

> +	int cpu_id_list[32] = {0};
> +	int cpu;
> +	int max_items_per_thread;
> +	int item_idx;
> +	struct folio *src, *src2, *dst, *dst2;
> +
> +	total_mt_num = min_t(unsigned int, total_mt_num,
> +			cpumask_weight(per_node_cpumask));
> +
> +	if (total_mt_num > 32)
> +		total_mt_num = 32;
> +
> +	/* Each threads get part of each page, if nr_items < totla_mt_num */
> +	if (nr_items < total_mt_num)
> +		max_items_per_thread = nr_items;
> +	else
> +		max_items_per_thread = (nr_items / total_mt_num) +
> +				((nr_items % total_mt_num) ? 1 : 0);
> +
> +
> +	for (cpu = 0; cpu < total_mt_num; ++cpu) {
> +		work_items[cpu] = kzalloc(sizeof(struct copy_page_info) +
> +					sizeof(struct copy_item) * max_items_per_thread,
> +					GFP_NOWAIT);
 > +
> +		if (!work_items[cpu]) {
> +			err = -ENOMEM;
> +			goto free_work_items;
> +		}
> +	}

[...snip...]

> +
> +	/* Wait until it finishes  */
> +	for (i = 0; i < total_mt_num; ++i)
> +		flush_work((struct work_struct *)work_items[i]);
> +
> +free_work_items:
> +	for (cpu = 0; cpu < total_mt_num; ++cpu)
> +		kfree(work_items[cpu]);
> +
> +	return err;

Should the kernel re-try migration without multi-threading if it failed
to allocate memory?

---
Hyeonggon
Zi Yan Jan. 6, 2025, 2:01 a.m. UTC | #2
On 5 Jan 2025, at 20:18, Hyeonggon Yoo wrote:

> On 2025-01-04 2:24 AM, Zi Yan wrote:
>> Now page copies are batched, multi-threaded page copy can be used to
>> increase page copy throughput. Add copy_page_lists_mt() to copy pages in
>> multi-threaded manners. Empirical data show more than 32 base pages are
>> needed to show the benefit of using multi-threaded page copy, so use 32 as
>> the threshold.
>>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>>   include/linux/migrate.h |   3 +
>>   mm/Makefile             |   2 +-
>>   mm/copy_pages.c         | 186 ++++++++++++++++++++++++++++++++++++++++
>>   mm/migrate.c            |  19 ++--
>>   4 files changed, 199 insertions(+), 11 deletions(-)
>>   create mode 100644 mm/copy_pages.c
>>
>
> [...snip...]
>
>> +++ b/mm/copy_pages.c
>> @@ -0,0 +1,186 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Parallel page copy routine.
>> + */
>> +
>> +#include <linux/sysctl.h>
>> +#include <linux/highmem.h>
>> +#include <linux/workqueue.h>
>> +#include <linux/slab.h>
>> +#include <linux/migrate.h>
>> +
>> +
>> +unsigned int limit_mt_num = 4;
>> +
>> +struct copy_item {
>> +	char *to;
>> +	char *from;
>> +	unsigned long chunk_size;
>> +};
>> +
>> +struct copy_page_info {
>> +	struct work_struct copy_page_work;
>> +	unsigned long num_items;
>> +	struct copy_item item_list[];
>> +};
>> +
>> +static void copy_page_routine(char *vto, char *vfrom,
>> +	unsigned long chunk_size)
>> +{
>> +	memcpy(vto, vfrom, chunk_size);
>> +}
>> +
>> +static void copy_page_work_queue_thread(struct work_struct *work)
>> +{
>> +	struct copy_page_info *my_work = (struct copy_page_info *)work;
>> +	int i;
>> +
>> +	for (i = 0; i < my_work->num_items; ++i)
>> +		copy_page_routine(my_work->item_list[i].to,
>> +						  my_work->item_list[i].from,
>> +						  my_work->item_list[i].chunk_size);
>> +}
>> +
>> +int copy_page_lists_mt(struct list_head *dst_folios,
>> +		struct list_head *src_folios, int nr_items)
>> +{
>> +	int err = 0;
>> +	unsigned int total_mt_num = limit_mt_num;
>> +	int to_node = folio_nid(list_first_entry(dst_folios, struct folio, lru));
>> +	int i;
>> +	struct copy_page_info *work_items[32] = {0};
>> +	const struct cpumask *per_node_cpumask = cpumask_of_node(to_node);
>
> What happens here if to_node is a NUMA node without CPUs? (e.g. CXL
> node).

I did not think about that case. In that case, from_node should be used
instead. If both the from and to nodes are CPUless, maybe the node of the
executing CPU should be used to select the cpumask here.
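
Roughly what I have in mind, as an untested sketch (the helper name is made
up and the fallback order is up for discussion; the usual <linux/cpumask.h>
and <linux/topology.h> includes are assumed):

/*
 * Pick the cpumask for the copy threads, falling back when the preferred
 * node has no CPUs (e.g. a CXL memory-only node).
 */
static const struct cpumask *select_copy_cpumask(int to_node, int from_node)
{
	const struct cpumask *mask = cpumask_of_node(to_node);

	if (cpumask_empty(mask))	/* to_node is CPUless */
		mask = cpumask_of_node(from_node);
	if (cpumask_empty(mask))	/* both nodes are CPUless */
		mask = cpumask_of_node(numa_node_id());

	return mask;
}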

>
> And even with a NUMA node with CPUs I think offloading copies to CPUs
> of either "from node" or "to node" will end up a CPU touching two pages
> in two different NUMA nodes anyway, one page in the local node
> and the other page in the remote node.
>
> In that sense, I don't understand when push_0_pull_1 (introduced in
> patch 5) should be 0 or 1. Am I missing something?

From my experiments, copy throughput differs between pushing data from the
local CPUs and pulling data from the remote CPUs. On NVIDIA Grace CPUs,
pushing data has higher throughput. Back in 2019, when I tested it on Intel
Xeon Broadwell, pulling data had higher throughput. In the final version, a
boot-time benchmark might be needed to decide whether to push or pull data.
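
For illustration only, an untested sketch (this assumes the push_0_pull_1
knob from patch 5, with 0 meaning the source node's CPUs push the data and
1 meaning the destination node's CPUs pull it; the knob could eventually be
set from such a boot-time benchmark):

/* Choose which node's CPUs perform the copy. */
static int select_copy_node(int from_node, int to_node)
{
	return push_0_pull_1 ? to_node : from_node;
}

The cpumask would then come from cpumask_of_node(select_copy_node(...)),
with the CPUless-node fallback from the sketch above applied on top.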

>> +	int cpu_id_list[32] = {0};
>> +	int cpu;
>> +	int max_items_per_thread;
>> +	int item_idx;
>> +	struct folio *src, *src2, *dst, *dst2;
>> +
>> +	total_mt_num = min_t(unsigned int, total_mt_num,
>> +			cpumask_weight(per_node_cpumask));
>> +
>> +	if (total_mt_num > 32)
>> +		total_mt_num = 32;
>> +
>> +	/* Each threads get part of each page, if nr_items < totla_mt_num */
>> +	if (nr_items < total_mt_num)
>> +		max_items_per_thread = nr_items;
>> +	else
>> +		max_items_per_thread = (nr_items / total_mt_num) +
>> +				((nr_items % total_mt_num) ? 1 : 0);
>> +
>> +
>> +	for (cpu = 0; cpu < total_mt_num; ++cpu) {
>> +		work_items[cpu] = kzalloc(sizeof(struct copy_page_info) +
>> +					sizeof(struct copy_item) * max_items_per_thread,
>> +					GFP_NOWAIT);
>> +
>> +		if (!work_items[cpu]) {
>> +			err = -ENOMEM;
>> +			goto free_work_items;
>> +		}
>> +	}
>
> [...snip...]
>
>> +
>> +	/* Wait until it finishes  */
>> +	for (i = 0; i < total_mt_num; ++i)
>> +		flush_work((struct work_struct *)work_items[i]);
>> +
>> +free_work_items:
>> +	for (cpu = 0; cpu < total_mt_num; ++cpu)
>> +		kfree(work_items[cpu]);
>> +
>> +	return err;
>
> Should the kernel re-try migration without multi-threading if it failed
> to allocate memory?

Sure. Will add it in the next version.
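
Roughly like the following in migrate_folios_batch_move(), as a sketch only
(folio_copy() stands in here for whatever the existing single-threaded batch
copy loop does):

	/* Batch copy the folios */
	rc = -EAGAIN;
	if (total_nr_pages > 32)
		rc = copy_page_lists_mt(dst_folios, src_folios, total_nr_folios);
	if (rc) {
		/*
		 * Multi-threaded copy was skipped or failed (e.g. -ENOMEM
		 * from the GFP_NOWAIT allocations), so fall back to copying
		 * the folios one by one on the current CPU.
		 */
		dst = list_first_entry(dst_folios, struct folio, lru);
		dst2 = list_next_entry(dst, lru);
		list_for_each_entry_safe(folio, folio2, src_folios, lru) {
			folio_copy(dst, folio);
			dst = dst2;
			dst2 = list_next_entry(dst, lru);
		}
	}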

Thank you for the reviews.

--
Best Regards,
Yan, Zi

Patch

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 29919faea2f1..a0124f4893b0 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -80,6 +80,9 @@  void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
 int folio_migrate_mapping(struct address_space *mapping,
 		struct folio *newfolio, struct folio *folio, int extra_count);
 
+int copy_page_lists_mt(struct list_head *dst_folios,
+		struct list_head *src_folios, int nr_items);
+
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
diff --git a/mm/Makefile b/mm/Makefile
index 850386a67b3e..f8c7f6b4cebb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -92,7 +92,7 @@  obj-$(CONFIG_KMSAN)	+= kmsan/
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_FAIL_PAGE_ALLOC) += fail_page_alloc.o
 obj-$(CONFIG_MEMTEST)		+= memtest.o
-obj-$(CONFIG_MIGRATION) += migrate.o
+obj-$(CONFIG_MIGRATION) += migrate.o copy_pages.o
 obj-$(CONFIG_NUMA) += memory-tiers.o
 obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
diff --git a/mm/copy_pages.c b/mm/copy_pages.c
new file mode 100644
index 000000000000..0e2231199f66
--- /dev/null
+++ b/mm/copy_pages.c
@@ -0,0 +1,186 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Parallel page copy routine.
+ */
+
+#include <linux/sysctl.h>
+#include <linux/highmem.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/migrate.h>
+
+
+unsigned int limit_mt_num = 4;
+
+struct copy_item {
+	char *to;
+	char *from;
+	unsigned long chunk_size;
+};
+
+struct copy_page_info {
+	struct work_struct copy_page_work;
+	unsigned long num_items;
+	struct copy_item item_list[];
+};
+
+static void copy_page_routine(char *vto, char *vfrom,
+	unsigned long chunk_size)
+{
+	memcpy(vto, vfrom, chunk_size);
+}
+
+static void copy_page_work_queue_thread(struct work_struct *work)
+{
+	struct copy_page_info *my_work = (struct copy_page_info *)work;
+	int i;
+
+	for (i = 0; i < my_work->num_items; ++i)
+		copy_page_routine(my_work->item_list[i].to,
+						  my_work->item_list[i].from,
+						  my_work->item_list[i].chunk_size);
+}
+
+int copy_page_lists_mt(struct list_head *dst_folios,
+		struct list_head *src_folios, int nr_items)
+{
+	int err = 0;
+	unsigned int total_mt_num = limit_mt_num;
+	int to_node = folio_nid(list_first_entry(dst_folios, struct folio, lru));
+	int i;
+	struct copy_page_info *work_items[32] = {0};
+	const struct cpumask *per_node_cpumask = cpumask_of_node(to_node);
+	int cpu_id_list[32] = {0};
+	int cpu;
+	int max_items_per_thread;
+	int item_idx;
+	struct folio *src, *src2, *dst, *dst2;
+
+	total_mt_num = min_t(unsigned int, total_mt_num,
+			cpumask_weight(per_node_cpumask));
+
+	if (total_mt_num > 32)
+		total_mt_num = 32;
+
+	/* Each thread gets part of each page if nr_items < total_mt_num */
+	if (nr_items < total_mt_num)
+		max_items_per_thread = nr_items;
+	else
+		max_items_per_thread = (nr_items / total_mt_num) +
+				((nr_items % total_mt_num) ? 1 : 0);
+
+
+	for (cpu = 0; cpu < total_mt_num; ++cpu) {
+		work_items[cpu] = kzalloc(sizeof(struct copy_page_info) +
+					sizeof(struct copy_item) * max_items_per_thread,
+					GFP_NOWAIT);
+		if (!work_items[cpu]) {
+			err = -ENOMEM;
+			goto free_work_items;
+		}
+	}
+
+	i = 0;
+	/* TODO: need a better cpu selection method */
+	for_each_cpu(cpu, per_node_cpumask) {
+		if (i >= total_mt_num)
+			break;
+		cpu_id_list[i] = cpu;
+		++i;
+	}
+
+	if (nr_items < total_mt_num) {
+		for (cpu = 0; cpu < total_mt_num; ++cpu) {
+			INIT_WORK((struct work_struct *)work_items[cpu],
+					  copy_page_work_queue_thread);
+			work_items[cpu]->num_items = max_items_per_thread;
+		}
+
+		item_idx = 0;
+		dst = list_first_entry(dst_folios, struct folio, lru);
+		dst2 = list_next_entry(dst, lru);
+		list_for_each_entry_safe(src, src2, src_folios, lru) {
+			unsigned long chunk_size = PAGE_SIZE * folio_nr_pages(src) / total_mt_num;
+			/* XXX: not working in HIGHMEM */
+			char *vfrom = page_address(&src->page);
+			char *vto = page_address(&dst->page);
+
+			VM_WARN_ON(PAGE_SIZE * folio_nr_pages(src) % total_mt_num);
+			VM_WARN_ON(folio_nr_pages(dst) != folio_nr_pages(src));
+
+			for (cpu = 0; cpu < total_mt_num; ++cpu) {
+				work_items[cpu]->item_list[item_idx].to =
+					vto + chunk_size * cpu;
+				work_items[cpu]->item_list[item_idx].from =
+					vfrom + chunk_size * cpu;
+				work_items[cpu]->item_list[item_idx].chunk_size =
+					chunk_size;
+			}
+
+			item_idx++;
+			dst = dst2;
+			dst2 = list_next_entry(dst, lru);
+		}
+
+		for (cpu = 0; cpu < total_mt_num; ++cpu)
+			queue_work_on(cpu_id_list[cpu],
+						  system_unbound_wq,
+						  (struct work_struct *)work_items[cpu]);
+	} else {
+		int num_xfer_per_thread = nr_items / total_mt_num;
+		int per_cpu_item_idx;
+
+
+		for (cpu = 0; cpu < total_mt_num; ++cpu) {
+			INIT_WORK((struct work_struct *)work_items[cpu],
+					  copy_page_work_queue_thread);
+
+			work_items[cpu]->num_items = num_xfer_per_thread +
+					(cpu < (nr_items % total_mt_num));
+		}
+
+		cpu = 0;
+		per_cpu_item_idx = 0;
+		item_idx = 0;
+		dst = list_first_entry(dst_folios, struct folio, lru);
+		dst2 = list_next_entry(dst, lru);
+		list_for_each_entry_safe(src, src2, src_folios, lru) {
+			/* XXX: not working in HIGHMEM */
+			work_items[cpu]->item_list[per_cpu_item_idx].to =
+				page_address(&dst->page);
+			work_items[cpu]->item_list[per_cpu_item_idx].from =
+				page_address(&src->page);
+			work_items[cpu]->item_list[per_cpu_item_idx].chunk_size =
+				PAGE_SIZE * folio_nr_pages(src);
+
+			VM_WARN_ON(folio_nr_pages(dst) !=
+				   folio_nr_pages(src));
+
+			per_cpu_item_idx++;
+			item_idx++;
+			dst = dst2;
+			dst2 = list_next_entry(dst, lru);
+
+			if (per_cpu_item_idx == work_items[cpu]->num_items) {
+				queue_work_on(cpu_id_list[cpu],
+					system_unbound_wq,
+					(struct work_struct *)work_items[cpu]);
+				per_cpu_item_idx = 0;
+				cpu++;
+			}
+		}
+		if (item_idx != nr_items)
+			pr_warn("%s: only %d out of %d pages are transferred\n",
+				__func__, item_idx, nr_items);
+	}
+
+	/* Wait until all copy work items finish */
+	for (i = 0; i < total_mt_num; ++i)
+		flush_work((struct work_struct *)work_items[i]);
+
+free_work_items:
+	for (cpu = 0; cpu < total_mt_num; ++cpu)
+		kfree(work_items[cpu]);
+
+	return err;
+}
diff --git a/mm/migrate.c b/mm/migrate.c
index 95c4cc4a7823..18440180d747 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1799,7 +1799,7 @@  static void migrate_folios_batch_move(struct list_head *src_folios,
 		int *nr_retry_pages)
 {
 	struct folio *folio, *folio2, *dst, *dst2;
-	int rc, nr_pages = 0, nr_mig_folios = 0;
+	int rc, nr_pages = 0, total_nr_pages = 0, total_nr_folios = 0;
 	int old_page_state = 0;
 	struct anon_vma *anon_vma = NULL;
 	bool is_lru;
@@ -1807,11 +1807,6 @@  static void migrate_folios_batch_move(struct list_head *src_folios,
 	LIST_HEAD(err_src);
 	LIST_HEAD(err_dst);
 
-	if (mode != MIGRATE_ASYNC) {
-		*retry += 1;
-		return;
-	}
-
 	/*
 	 * Iterate over the list of locked src/dst folios to copy the metadata
 	 */
@@ -1859,19 +1854,23 @@  static void migrate_folios_batch_move(struct list_head *src_folios,
 			migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
 					anon_vma, true, ret_folios);
 			migrate_folio_undo_dst(dst, true, put_new_folio, private);
-		} else /* MIGRATEPAGE_SUCCESS */
-			nr_mig_folios++;
+		} else { /* MIGRATEPAGE_SUCCESS */
+			total_nr_pages += nr_pages;
+			total_nr_folios++;
+		}
 
 		dst = dst2;
 		dst2 = list_next_entry(dst, lru);
 	}
 
 	/* Exit if folio list for batch migration is empty */
-	if (!nr_mig_folios)
+	if (!total_nr_pages)
 		goto out;
 
 	/* Batch copy the folios */
-	{
+	if (total_nr_pages > 32) {
+		copy_page_lists_mt(dst_folios, src_folios, total_nr_folios);
+	} else {
 		dst = list_first_entry(dst_folios, struct folio, lru);
 		dst2 = list_next_entry(dst, lru);
 		list_for_each_entry_safe(folio, folio2, src_folios, lru) {