diff mbox series

[4/6] mm: introduce per-node proactive reclaim interface

Message ID 20220416053902.68517-5-dave@stgolabs.net (mailing list archive)
State New
Headers show
Series mm: proactive reclaim and memory tiering topics | expand

Commit Message

Davidlohr Bueso April 16, 2022, 5:39 a.m. UTC
This patch introduces a mechanism to trigger memory reclaim
as a per-node sysfs interface, inspired by compaction's
equivalent; ie:

	 echo 1G > /sys/devices/system/node/nodeX/reclaim

It is based on the discussions from David's thread[1] as
well as the current upstreaming of the memcg[2] interface
(which has nice explanations for the benefits of userspace
reclaim overall). In both cases conclusions were that either
way of inducing proactive reclaim should be KISS, and can be
later extended. So this patch does not allow the user much
fine tuning beyond the size of the reclaim, such as anon/file
or the semantics of demotion.

[1] https://lore.kernel.org/all/5df21376-7dd1-bf81-8414-32a73cea45dd@google.com/
[2] https://lore.kernel.org/all/20220408045743.1432968-1-yosryahmed@google.com/

Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
---
 Documentation/ABI/stable/sysfs-devices-node | 10 ++++
 drivers/base/node.c                         |  2 +
 include/linux/swap.h                        | 16 ++++++
 mm/vmscan.c                                 | 59 +++++++++++++++++++++
 4 files changed, 87 insertions(+)

Comments

Tim Chen April 19, 2022, midnight UTC | #1
On Fri, 2022-04-15 at 22:39 -0700, Davidlohr Bueso wrote:
> This patch introduces a mechanism to trigger memory reclaim
> as a per-node sysfs interface, inspired by compaction's
> equivalent; ie:
> 
> 	 echo 1G > /sys/devices/system/node/nodeX/reclaim
> 

I think it will be more flexible to specify a node mask
as a parameter, along with the amount of memory, in the
memory.reclaim memcg interface proposed by Yosry.  Doing it node
by node is more cumbersome.  It is just a special case
of reclaiming from root cgroup for a specific node.

Wei Gu, Ying and I have had some discussions on this:
https://lore.kernel.org/all/df6110a09cacc80ee1cbe905a71273a5f3953e16.camel@linux.intel.com/  

 
Tim

> It is based on the discussions from David's thread[1] as
> well as the current upstreaming of the memcg[2] interface
> (which has nice explanations for the benefits of userspace
> reclaim overall). In both cases conclusions were that either
> way of inducing proactive reclaim should be KISS, and can be
> later extended. So this patch does not allow the user much
> fine tuning beyond the size of the reclaim, such as anon/file
> or whether or semantics of demotion.
> 
> [1] https://lore.kernel.org/all/5df21376-7dd1-bf81-8414-32a73cea45dd@google.com/
> [2] https://lore.kernel.org/all/20220408045743.1432968-1-yosryahmed@google.com/
> 
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
> ---
>  Documentation/ABI/stable/sysfs-devices-node | 10 ++++
>  drivers/base/node.c                         |  2 +
>  include/linux/swap.h                        | 16 ++++++
>  mm/vmscan.c                                 | 59 +++++++++++++++++++++
>  4 files changed, 87 insertions(+)
> 
> diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
> index 8db67aa472f1..3c935e1334f7 100644
> --- a/Documentation/ABI/stable/sysfs-devices-node
> +++ b/Documentation/ABI/stable/sysfs-devices-node
> @@ -182,3 +182,13 @@ Date:		November 2021
>  Contact:	Jarkko Sakkinen <jarkko@kernel.org>
>  Description:
>  		The total amount of SGX physical memory in bytes.
> +
> +What:		/sys/devices/system/node/nodeX/reclaim
> +Date:		April 2022
> +Contact:	Davidlohr Bueso <dave@stgolabs.net>
> +Description:
> +		Write the amount of bytes to induce memory reclaim in this node.
> +		This file accepts a single key, the number of bytes to reclaim.
> +		When it completes successfully, the specified amount or more memory
> +		will have been reclaimed, and -EAGAIN if less bytes are reclaimed
> +		than the specified amount.
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index 6cdf25fd26c3..d80c478e2a6e 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -670,6 +670,7 @@ static int register_node(struct node *node, int num)
>  
>  	hugetlb_register_node(node);
>  	compaction_register_node(node);
> +	reclaim_register_node(node);
>  	return 0;
>  }
>  
> @@ -685,6 +686,7 @@ void unregister_node(struct node *node)
>  	hugetlb_unregister_node(node);		/* no-op, if memoryless node */
>  	node_remove_accesses(node);
>  	node_remove_caches(node);
> +	reclaim_unregister_node(node);
>  	device_unregister(&node->dev);
>  }
>  
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 27093b477c5f..cca43ae6d770 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -398,6 +398,22 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages);
>  extern int vm_swappiness;
>  long remove_mapping(struct address_space *mapping, struct folio *folio);
>  
> +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
> +extern int reclaim_register_node(struct node *node);
> +extern void reclaim_unregister_node(struct node *node);
> +
> +#else
> +
> +static inline int reclaim_register_node(struct node *node)
> +{
> +	return 0;
> +}
> +
> +static inline void reclaim_unregister_node(struct node *node)
> +{
> +}
> +#endif /* CONFIG_SYSFS && CONFIG_NUMA */
> +
>  extern unsigned long reclaim_pages(struct list_head *page_list);
>  #ifdef CONFIG_NUMA
>  extern int node_reclaim_mode;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 1735c302831c..3539f8a0f0ea 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4819,3 +4819,62 @@ void check_move_unevictable_pages(struct pagevec *pvec)
>  	}
>  }
>  EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
> +
> +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
> +static ssize_t reclaim_store(struct device *dev,
> +			     struct device_attribute *attr,
> +			     const char *buf, size_t count)
> +{
> +	int err, nid = dev->id;
> +	gfp_t gfp_mask = GFP_KERNEL;
> +	struct pglist_data *pgdat = NODE_DATA(nid);
> +	unsigned long nr_to_reclaim, nr_reclaimed = 0;
> +	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
> +	struct scan_control sc = {
> +		.gfp_mask = current_gfp_context(gfp_mask),
> +		.reclaim_idx = gfp_zone(gfp_mask),
> +		.priority = NODE_RECLAIM_PRIORITY,
> +		.may_writepage = !laptop_mode,
> +		.may_unmap = 1,
> +		.may_swap = 1,
> +	};
> +
> +	buf = strstrip((char *)buf);
> +	err = page_counter_memparse(buf, "", &nr_to_reclaim);
> +	if (err)
> +		return err;
> +
> +	sc.nr_to_reclaim = max(nr_to_reclaim, SWAP_CLUSTER_MAX);
> +
> +	while (nr_reclaimed < nr_to_reclaim) {
> +		unsigned long reclaimed;
> +
> +		if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
> +			return -EAGAIN;
> +
> +		/* does cond_resched() */
> +		reclaimed = __node_reclaim(pgdat, gfp_mask,
> +					   nr_to_reclaim - nr_reclaimed, &sc);
> +
> +		clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
> +
> +		if (!reclaimed && !nr_retries--)
> +			break;
> +
> +		nr_reclaimed += reclaimed;
> +	}
> +
> +	return nr_reclaimed < nr_to_reclaim ? -EAGAIN : count;
> +}
> +
> +static DEVICE_ATTR_WO(reclaim);
> +int reclaim_register_node(struct node *node)
> +{
> +	return device_create_file(&node->dev, &dev_attr_reclaim);
> +}
> +
> +void reclaim_unregister_node(struct node *node)
> +{
> +	return device_remove_file(&node->dev, &dev_attr_reclaim);
> +}
> +#endif
diff mbox series

Patch

diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index 8db67aa472f1..3c935e1334f7 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -182,3 +182,13 @@  Date:		November 2021
 Contact:	Jarkko Sakkinen <jarkko@kernel.org>
 Description:
 		The total amount of SGX physical memory in bytes.
+
+What:		/sys/devices/system/node/nodeX/reclaim
+Date:		April 2022
+Contact:	Davidlohr Bueso <dave@stgolabs.net>
+Description:
+		Write the amount of bytes to induce memory reclaim in this node.
+		This file accepts a single key, the number of bytes to reclaim.
+		On success, at least the specified amount of memory will have
+		been reclaimed. If fewer bytes are reclaimed than the specified
+		amount, the write fails with -EAGAIN.
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 6cdf25fd26c3..d80c478e2a6e 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -670,6 +670,7 @@  static int register_node(struct node *node, int num)
 
 	hugetlb_register_node(node);
 	compaction_register_node(node);
+	reclaim_register_node(node);
 	return 0;
 }
 
@@ -685,6 +686,7 @@  void unregister_node(struct node *node)
 	hugetlb_unregister_node(node);		/* no-op, if memoryless node */
 	node_remove_accesses(node);
 	node_remove_caches(node);
+	reclaim_unregister_node(node);
 	device_unregister(&node->dev);
 }
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 27093b477c5f..cca43ae6d770 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -398,6 +398,22 @@  extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 long remove_mapping(struct address_space *mapping, struct folio *folio);
 
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+extern int reclaim_register_node(struct node *node);
+extern void reclaim_unregister_node(struct node *node);
+
+#else
+
+static inline int reclaim_register_node(struct node *node)
+{
+	return 0;
+}
+
+static inline void reclaim_unregister_node(struct node *node)
+{
+}
+#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+
 extern unsigned long reclaim_pages(struct list_head *page_list);
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1735c302831c..3539f8a0f0ea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4819,3 +4819,62 @@  void check_move_unevictable_pages(struct pagevec *pvec)
 	}
 }
 EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+static ssize_t reclaim_store(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	int err, nid = dev->id;
+	gfp_t gfp_mask = GFP_KERNEL;
+	struct pglist_data *pgdat = NODE_DATA(nid);
+	unsigned long nr_to_reclaim, nr_reclaimed = 0;
+	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+	struct scan_control sc = {
+		.gfp_mask = current_gfp_context(gfp_mask),
+		.reclaim_idx = gfp_zone(gfp_mask),
+		.priority = NODE_RECLAIM_PRIORITY,
+		.may_writepage = !laptop_mode,
+		.may_unmap = 1,
+		.may_swap = 1,
+	};
+
+	buf = strstrip((char *)buf);
+	err = page_counter_memparse(buf, "", &nr_to_reclaim);
+	if (err)
+		return err;
+
+	sc.nr_to_reclaim = max(nr_to_reclaim, SWAP_CLUSTER_MAX);
+
+	while (nr_reclaimed < nr_to_reclaim) {
+		unsigned long reclaimed;
+
+		if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+			return -EAGAIN;
+
+		/* does cond_resched() */
+		reclaimed = __node_reclaim(pgdat, gfp_mask,
+					   nr_to_reclaim - nr_reclaimed, &sc);
+
+		clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+
+		if (!reclaimed && !nr_retries--)
+			break;
+
+		nr_reclaimed += reclaimed;
+	}
+
+	return nr_reclaimed < nr_to_reclaim ? -EAGAIN : count;
+}
+
+static DEVICE_ATTR_WO(reclaim);
+int reclaim_register_node(struct node *node)
+{
+	return device_create_file(&node->dev, &dev_attr_reclaim);
+}
+
+void reclaim_unregister_node(struct node *node)
+{
+	return device_remove_file(&node->dev, &dev_attr_reclaim);
+}
+#endif