
[RFC,v1,3/3] mm/damon/reclaim: Add per NUMA node proactive reclamation by DAMON_RECLAIM.

Message ID 20220218102611.31895-4-tome01@ajou.ac.kr (mailing list archive)
State New
Series Rebase DAMON_RECLAIM for NUMA system

Commit Message

Jonghyeon Kim Feb. 18, 2022, 10:26 a.m. UTC
To add DAMON_RECLAIM worker threads (kdamond) that do proactive
reclamation per NUMA node, each node must have its own context.
A 'per_node' parameter is added to enable this.

If 'per_node' is true, one kdamond per online NUMA node will be woken up
and will start monitoring to proactively reclaim memory. If 'per_node' is
false, only one kdamond thread will monitor all of system memory.

Signed-off-by: Jonghyeon Kim <tome01@ajou.ac.kr>
---
 mm/damon/reclaim.c | 147 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 104 insertions(+), 43 deletions(-)

Comments

SeongJae Park Feb. 22, 2022, 9:53 a.m. UTC | #1
On Fri, 18 Feb 2022 19:26:11 +0900 Jonghyeon Kim <tome01@ajou.ac.kr> wrote:

> To add DAMON_RECLAIM worker threads (kdamond) that do proactive
> reclamation per NUMA node, each node must have its own context.
> A 'per_node' parameter is added to enable this.
> 
> If 'per_node' is true, one kdamond per online NUMA node will be woken up
> and will start monitoring to proactively reclaim memory. If 'per_node' is
> false, only one kdamond thread will monitor all of system memory.
> 
> Signed-off-by: Jonghyeon Kim <tome01@ajou.ac.kr>
> ---
>  mm/damon/reclaim.c | 147 ++++++++++++++++++++++++++++++++-------------
>  1 file changed, 104 insertions(+), 43 deletions(-)
> 
> diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
> index b53d9c22fad1..85e8f97dd599 100644
> --- a/mm/damon/reclaim.c
> +++ b/mm/damon/reclaim.c
> @@ -177,13 +177,27 @@ static unsigned long monitor_region_end __read_mostly;
>  module_param(monitor_region_end, ulong, 0600);
>  
>  /*
> - * PID of the DAMON thread
> + * Enable monitoring memory regions per NUMA node.
>   *
> - * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
> + * By default, watermarks consist of based on total system memory.
> + */
> +static bool per_node __read_mostly;
> +module_param(per_node, bool, 0600);
> +
> +/*
> + * Number of currently running DAMON worker threads
> + */
> +static unsigned long nr_kdamond __read_mostly;
> +module_param(nr_kdamond, ulong, 0400);

I'd prefer to call this nr_kdamond*s*

> +
> +/*
> + * First PID of the DAMON threads
> + *
> + * If DAMON_RECLAIM is enabled, this becomes the first PID of the worker threads.
>   * Else, -1.
>   */
> -static int kdamond_pid __read_mostly = -1;
> -module_param(kdamond_pid, int, 0400);
> +static int kdamond_start_pid __read_mostly = -1;
> +module_param(kdamond_start_pid, int, 0400);

This change could break old users.  Let's keep the name as is and clarify the
fact that it's for only the first kdamond in the document.

As long as DAMON_RECLAIM works in the exclusive manner, users will still be
able to know all pids of kdamonds for DAMON_RECLAIM, as nr_kdamonds is also
provided. 

>  
>  /*
>   * Number of memory regions that tried to be reclaimed.
> @@ -215,8 +229,8 @@ module_param(bytes_reclaimed_regions, ulong, 0400);
>  static unsigned long nr_quota_exceeds __read_mostly;
>  module_param(nr_quota_exceeds, ulong, 0400);
>  
> -static struct damon_ctx *ctx;
> -static struct damon_target *target;
> +static struct damon_ctx *ctxs[MAX_NUMNODES];
> +static struct damon_target *targets[MAX_NUMNODES];
>  
>  struct damon_reclaim_ram_walk_arg {
>  	unsigned long start;
> @@ -251,7 +265,7 @@ static bool get_monitoring_region(unsigned long *start, unsigned long *end)
>  	return true;
>  }
>  
> -static struct damos *damon_reclaim_new_scheme(void)
> +static struct damos *damon_reclaim_new_scheme(int node)
>  {
>  	struct damos_watermarks wmarks = {
>  		.metric = DAMOS_WMARK_FREE_MEM_RATE,
> @@ -259,6 +273,7 @@ static struct damos *damon_reclaim_new_scheme(void)
>  		.high = wmarks_high,
>  		.mid = wmarks_mid,
>  		.low = wmarks_low,
> +		.node = node,
>  	};
>  	struct damos_quota quota = {
>  		/*
> @@ -290,56 +305,99 @@ static struct damos *damon_reclaim_new_scheme(void)
>  	return scheme;
>  }
>  
> -static int damon_reclaim_turn(bool on)
> +static int damon_reclaim_start(int nid)
>  {
>  	struct damon_region *region;
>  	struct damos *scheme;
>  	int err;
> +	unsigned long start, end;
>  
> -	if (!on) {
> -		err = damon_stop(&ctx, 1);
> -		if (!err)
> -			kdamond_pid = -1;
> -		return err;
> -	}
> -
> -	err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0,
> +	err = damon_set_attrs(ctxs[nid], sample_interval, aggr_interval, 0,
>  			min_nr_regions, max_nr_regions);
>  	if (err)
>  		return err;
>  
> -	if (monitor_region_start > monitor_region_end)
> -		return -EINVAL;
> -	if (!monitor_region_start && !monitor_region_end &&
> -			!get_monitoring_region(&monitor_region_start,
> -				&monitor_region_end))
> -		return -EINVAL;
> +	if (per_node) {
> +		monitor_region_start = monitor_region_end = 0;
> +
> +		start = PFN_PHYS(node_start_pfn(nid));
> +		end = PFN_PHYS(node_start_pfn(nid) + node_present_pages(nid) - 1);
> +		if (end <= start)
> +			return -EINVAL;
> +	} else {
> +		if (!monitor_region_start && !monitor_region_end &&
> +				!get_monitoring_region(&monitor_region_start,
> +					&monitor_region_end))
> +			return -EINVAL;
> +		start = monitor_region_start;
> +		end = monitor_region_end;
> +	}
> +
>  	/* DAMON will free this on its own when finish monitoring */
> -	region = damon_new_region(monitor_region_start, monitor_region_end);
> +	region = damon_new_region(start, end);
>  	if (!region)
>  		return -ENOMEM;
> -	damon_add_region(region, target);
> +	damon_add_region(region, targets[nid]);
>  
>  	/* Will be freed by 'damon_set_schemes()' below */
> -	scheme = damon_reclaim_new_scheme();
> +	scheme = damon_reclaim_new_scheme(nid);
>  	if (!scheme) {
>  		err = -ENOMEM;
>  		goto free_region_out;
>  	}
> -	err = damon_set_schemes(ctx, &scheme, 1);
> +
> +	err = damon_set_schemes(ctxs[nid], &scheme, 1);
>  	if (err)
>  		goto free_scheme_out;
>  
> -	err = damon_start(&ctx, 1);
> +	err = damon_start_one(ctxs[nid]);

This could surprise users assuming DAMON_RECLAIM would work in exclusive manner
as it was.

>  	if (!err) {
> -		kdamond_pid = ctx->kdamond->pid;
> +		if (kdamond_start_pid == -1)
> +			kdamond_start_pid = ctxs[nid]->kdamond->pid;
> +		nr_kdamond++;
>  		return 0;
>  	}
>  
>  free_scheme_out:
>  	damon_destroy_scheme(scheme);
>  free_region_out:
> -	damon_destroy_region(region, target);
> +	damon_destroy_region(region, targets[nid]);
> +
> +	return err;
> +}
> +
> +static int damon_reclaim_start_all(void)
> +{
> +	int nid, err;
> +
> +	if (!per_node)
> +		return damon_reclaim_start(0);
> +
> +	for_each_online_node(nid) {
> +		err = damon_reclaim_start(nid);
> +		if (err)
> +			break;

I'd prefer making contexts first and starting them at once in the exclusive
manner using damon_start().
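
Something roughly like the below, where damon_reclaim_prepare_ctx() is a
hypothetical helper that does only the per-context setup currently done in
damon_reclaim_start() (attrs, region, scheme) without starting a kdamond
(untested sketch):

    static int damon_reclaim_start_all(void)
    {
        /* compacted list of contexts to hand to damon_start() at once */
        static struct damon_ctx *running[MAX_NUMNODES];
        int nid, nr = 0, err;

        if (!per_node) {
            err = damon_reclaim_prepare_ctx(0);
            return err ? err : damon_start(&ctxs[0], 1);
        }

        for_each_online_node(nid) {
            err = damon_reclaim_prepare_ctx(nid);
            if (err)
                return err;
            running[nr++] = ctxs[nid];
        }

        /* one exclusive start for all prepared contexts */
        return damon_start(running, nr);
    }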

> +	}
> +
> +	return err;
> +}
> +
> +static int damon_reclaim_turn(bool on)
> +{
> +	int err;
> +
> +	if (!on) {
> +		err = damon_stop(ctxs, nr_kdamond);
> +		if (!err) {
> +			kdamond_start_pid = -1;
> +			nr_kdamond = 0;
> +			monitor_region_start = 0;
> +			monitor_region_end = 0;
> +		}
> +		return err;
> +	}
> +
> +	err = damon_reclaim_start_all();
>  	return err;
>  }
>  
> @@ -380,21 +438,24 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c)
>  
>  static int __init damon_reclaim_init(void)
>  {
> -	ctx = damon_new_ctx();
> -	if (!ctx)
> -		return -ENOMEM;
> -
> -	if (damon_select_ops(ctx, DAMON_OPS_PADDR))
> -		return -EINVAL;
> -
> -	ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
> -
> -	target = damon_new_target();
> -	if (!target) {
> -		damon_destroy_ctx(ctx);
> -		return -ENOMEM;
> +	int nid;
> +
> +	for_each_node(nid) {
> +		ctxs[nid] = damon_new_ctx();
> +		if (!ctxs[nid])
> +			return -ENOMEM;
> +
> +		if (damon_select_ops(ctxs[nid], DAMON_OPS_PADDR))
> +			return -EINVAL;
> +		ctxs[nid]->callback.after_aggregation = damon_reclaim_after_aggregation;
> +
> +		targets[nid] = damon_new_target();
> +		if (!targets[nid]) {
> +			damon_destroy_ctx(ctxs[nid]);

Shouldn't we also destroy previously allocated contexts?
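For example, the init path could unwind everything allocated in earlier
iterations, roughly like this (untested sketch):

    static int __init damon_reclaim_init(void)
    {
        int nid, i, err = -ENOMEM;

        for_each_node(nid) {
            ctxs[nid] = damon_new_ctx();
            if (!ctxs[nid])
                goto cleanup;

            err = damon_select_ops(ctxs[nid], DAMON_OPS_PADDR);
            if (err)
                goto cleanup;
            ctxs[nid]->callback.after_aggregation =
                damon_reclaim_after_aggregation;

            err = -ENOMEM;
            targets[nid] = damon_new_target();
            if (!targets[nid])
                goto cleanup;
            damon_add_target(ctxs[nid], targets[nid]);
        }

        schedule_delayed_work(&damon_reclaim_timer, 0);
        return 0;

    cleanup:
        /* destroy contexts allocated in earlier (and this) iterations */
        for_each_node(i) {
            if (!ctxs[i])
                break;
            damon_destroy_ctx(ctxs[i]);    /* also frees its target */
            ctxs[i] = NULL;
        }
        return err;
    }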

> +			return -ENOMEM;
> +		}
> +		damon_add_target(ctxs[nid], targets[nid]);
>  	}
> -	damon_add_target(ctx, target);
>  
>  	schedule_delayed_work(&damon_reclaim_timer, 0);
>  	return 0;
> -- 
> 2.17.1


Thanks,
SJ
Jonghyeon Kim Feb. 23, 2022, 7:12 a.m. UTC | #2
On Wed, Feb 23, 2022 at 02:11:27PM +0900, Jonghyeon Kim wrote:
> On Tue, Feb 22, 2022 at 09:53:38AM +0000, SeongJae Park wrote:
> > On Fri, 18 Feb 2022 19:26:11 +0900 Jonghyeon Kim <tome01@ajou.ac.kr> wrote:
> > 
> > > To add DAMON_RECLAIM worker threads (kdamond) that do proactive
> > > reclamation per NUMA node, each node must have its own context.
> > > A 'per_node' parameter is added to enable this.
> > > 
> > > If 'per_node' is true, one kdamond per online NUMA node will be woken up
> > > and will start monitoring to proactively reclaim memory. If 'per_node' is
> > > false, only one kdamond thread will monitor all of system memory.
> > > 
> > > Signed-off-by: Jonghyeon Kim <tome01@ajou.ac.kr>
> > > ---
> > >  mm/damon/reclaim.c | 147 ++++++++++++++++++++++++++++++++-------------
> > >  1 file changed, 104 insertions(+), 43 deletions(-)
> > > 
> > > diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
> > > index b53d9c22fad1..85e8f97dd599 100644
> > > --- a/mm/damon/reclaim.c
> > > +++ b/mm/damon/reclaim.c
> > > @@ -177,13 +177,27 @@ static unsigned long monitor_region_end __read_mostly;
> > >  module_param(monitor_region_end, ulong, 0600);
> > >  
> > >  /*
> > > - * PID of the DAMON thread
> > > + * Enable monitoring memory regions per NUMA node.
> > >   *
> > > - * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
> > > + * By default, watermarks consist of based on total system memory.
> > > + */
> > > +static bool per_node __read_mostly;
> > > +module_param(per_node, bool, 0600);
> > > +
> > > +/*
> > > + * Number of currently running DAMON worker threads
> > > + */
> > > +static unsigned long nr_kdamond __read_mostly;
> > > +module_param(nr_kdamond, ulong, 0400);
> > 
> > I'd prefer to call this nr_kdamond*s*
> > 
> 
> I see.
> 
> > > +
> > > +/*
> > > + * First PID of the DAMON threads
> > > + *
> > > + * If DAMON_RECLAIM is enabled, this becomes the first PID of the worker threads.
> > >   * Else, -1.
> > >   */
> > > -static int kdamond_pid __read_mostly = -1;
> > > -module_param(kdamond_pid, int, 0400);
> > > +static int kdamond_start_pid __read_mostly = -1;
> > > +module_param(kdamond_start_pid, int, 0400);
> > 
> > This change could break old users.  Let's keep the name as is and clarify the
> > fact that it's for only the first kdamond in the document.
> 
> Got it, I will keep that name and update the DAMON document.
> 
> > 
> > As long as DAMON_RECLAIM works in the exclusive manner, users will still be
> > able to know all pids of kdamonds for DAMON_RECLAIM, as nr_kdamonds is also
> > provided. 
> 
> I will find some kind of way to show all pids of kdamonds.
> 
> > 
> > >  
> > >  /*
> > >   * Number of memory regions that tried to be reclaimed.
> > > @@ -215,8 +229,8 @@ module_param(bytes_reclaimed_regions, ulong, 0400);
> > >  static unsigned long nr_quota_exceeds __read_mostly;
> > >  module_param(nr_quota_exceeds, ulong, 0400);
> > >  
> > > -static struct damon_ctx *ctx;
> > > -static struct damon_target *target;
> > > +static struct damon_ctx *ctxs[MAX_NUMNODES];
> > > +static struct damon_target *targets[MAX_NUMNODES];
> > >  
> > >  struct damon_reclaim_ram_walk_arg {
> > >  	unsigned long start;
> > > @@ -251,7 +265,7 @@ static bool get_monitoring_region(unsigned long *start, unsigned long *end)
> > >  	return true;
> > >  }
> > >  
> > > -static struct damos *damon_reclaim_new_scheme(void)
> > > +static struct damos *damon_reclaim_new_scheme(int node)
> > >  {
> > >  	struct damos_watermarks wmarks = {
> > >  		.metric = DAMOS_WMARK_FREE_MEM_RATE,
> > > @@ -259,6 +273,7 @@ static struct damos *damon_reclaim_new_scheme(void)
> > >  		.high = wmarks_high,
> > >  		.mid = wmarks_mid,
> > >  		.low = wmarks_low,
> > > +		.node = node,
> > >  	};
> > >  	struct damos_quota quota = {
> > >  		/*
> > > @@ -290,56 +305,99 @@ static struct damos *damon_reclaim_new_scheme(void)
> > >  	return scheme;
> > >  }
> > >  
> > > -static int damon_reclaim_turn(bool on)
> > > +static int damon_reclaim_start(int nid)
> > >  {
> > >  	struct damon_region *region;
> > >  	struct damos *scheme;
> > >  	int err;
> > > +	unsigned long start, end;
> > >  
> > > -	if (!on) {
> > > -		err = damon_stop(&ctx, 1);
> > > -		if (!err)
> > > -			kdamond_pid = -1;
> > > -		return err;
> > > -	}
> > > -
> > > -	err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0,
> > > +	err = damon_set_attrs(ctxs[nid], sample_interval, aggr_interval, 0,
> > >  			min_nr_regions, max_nr_regions);
> > >  	if (err)
> > >  		return err;
> > >  
> > > -	if (monitor_region_start > monitor_region_end)
> > > -		return -EINVAL;
> > > -	if (!monitor_region_start && !monitor_region_end &&
> > > -			!get_monitoring_region(&monitor_region_start,
> > > -				&monitor_region_end))
> > > -		return -EINVAL;
> > > +	if (per_node) {
> > > +		monitor_region_start = monitor_region_end = 0;
> > > +
> > > +		start = PFN_PHYS(node_start_pfn(nid));
> > > +		end = PFN_PHYS(node_start_pfn(nid) + node_present_pages(nid) - 1);
> > > +		if (end <= start)
> > > +			return -EINVAL;
> > > +	} else {
> > > +		if (!monitor_region_start && !monitor_region_end &&
> > > +				!get_monitoring_region(&monitor_region_start,
> > > +					&monitor_region_end))
> > > +			return -EINVAL;
> > > +		start = monitor_region_start;
> > > +		end = monitor_region_end;
> > > +	}
> > > +
> > >  	/* DAMON will free this on its own when finish monitoring */
> > > -	region = damon_new_region(monitor_region_start, monitor_region_end);
> > > +	region = damon_new_region(start, end);
> > >  	if (!region)
> > >  		return -ENOMEM;
> > > -	damon_add_region(region, target);
> > > +	damon_add_region(region, targets[nid]);
> > >  
> > >  	/* Will be freed by 'damon_set_schemes()' below */
> > > -	scheme = damon_reclaim_new_scheme();
> > > +	scheme = damon_reclaim_new_scheme(nid);
> > >  	if (!scheme) {
> > >  		err = -ENOMEM;
> > >  		goto free_region_out;
> > >  	}
> > > -	err = damon_set_schemes(ctx, &scheme, 1);
> > > +
> > > +	err = damon_set_schemes(ctxs[nid], &scheme, 1);
> > >  	if (err)
> > >  		goto free_scheme_out;
> > >  
> > > -	err = damon_start(&ctx, 1);
> > > +	err = damon_start_one(ctxs[nid]);
> > 
> > This could surprise users assuming DAMON_RECLAIM would work in exclusive manner
> > as it was.
> 
> Yes, I will drop this function following the next version.
> 
> > 
> > >  	if (!err) {
> > > -		kdamond_pid = ctx->kdamond->pid;
> > > +		if (kdamond_start_pid == -1)
> > > +			kdamond_start_pid = ctxs[nid]->kdamond->pid;
> > > +		nr_kdamond++;
> > >  		return 0;
> > >  	}
> > >  
> > >  free_scheme_out:
> > >  	damon_destroy_scheme(scheme);
> > >  free_region_out:
> > > -	damon_destroy_region(region, target);
> > > +	damon_destroy_region(region, targets[nid]);
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +static int damon_reclaim_start_all(void)
> > > +{
> > > +	int nid, err;
> > > +
> > > +	if (!per_node)
> > > +		return damon_reclaim_start(0);
> > > +
> > > +	for_each_online_node(nid) {
> > > +		err = damon_reclaim_start(nid);
> > > +		if (err)
> > > +			break;
> > 
> > I'd prefer making contexts first and starting them at once in the exclusive
> > manner using damon_start().
> > 
> 
> Agree.
> 
> > > +	}
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +static int damon_reclaim_turn(bool on)
> > > +{
> > > +	int err;
> > > +
> > > +	if (!on) {
> > > +		err = damon_stop(ctxs, nr_kdamond);
> > > +		if (!err) {
> > > +			kdamond_start_pid = -1;
> > > +			nr_kdamond = 0;
> > > +			monitor_region_start = 0;
> > > +			monitor_region_end = 0;
> > > +		}
> > > +		return err;
> > > +	}
> > > +
> > > +	err = damon_reclaim_start_all();
> > >  	return err;
> > >  }
> > >  
> > > @@ -380,21 +438,24 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c)
> > >  
> > >  static int __init damon_reclaim_init(void)
> > >  {
> > > -	ctx = damon_new_ctx();
> > > -	if (!ctx)
> > > -		return -ENOMEM;
> > > -
> > > -	if (damon_select_ops(ctx, DAMON_OPS_PADDR))
> > > -		return -EINVAL;
> > > -
> > > -	ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
> > > -
> > > -	target = damon_new_target();
> > > -	if (!target) {
> > > -		damon_destroy_ctx(ctx);
> > > -		return -ENOMEM;
> > > +	int nid;
> > > +
> > > +	for_each_node(nid) {
> > > +		ctxs[nid] = damon_new_ctx();
> > > +		if (!ctxs[nid])
> > > +			return -ENOMEM;
> > > +
> > > +		if (damon_select_ops(ctxs[nid], DAMON_OPS_PADDR))
> > > +			return -EINVAL;
> > > +		ctxs[nid]->callback.after_aggregation = damon_reclaim_after_aggregation;
> > > +
> > > +		targets[nid] = damon_new_target();
> > > +		if (!targets[nid]) {
> > > +			damon_destroy_ctx(ctxs[nid]);
> > 
> > Shouldn't we also destroy previously allocated contexts?
> 
> Yes, I think so. I will fix.
> 
> > 
> > > +			return -ENOMEM;
> > > +		}
> > > +		damon_add_target(ctxs[nid], targets[nid]);
> > >  	}
> > > -	damon_add_target(ctx, target);
> > >  
> > >  	schedule_delayed_work(&damon_reclaim_timer, 0);
> > >  	return 0;
> > > -- 
> > > 2.17.1
> > 
> > 
> > Thanks,
> > SJ
> 
> 
> Thanks for the comments!
> Jonghyeon

Patch

diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index b53d9c22fad1..85e8f97dd599 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -177,13 +177,27 @@  static unsigned long monitor_region_end __read_mostly;
 module_param(monitor_region_end, ulong, 0600);
 
 /*
- * PID of the DAMON thread
+ * Enable monitoring memory regions per NUMA node.
  *
- * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
+ * By default, watermarks consist of based on total system memory.
+ */
+static bool per_node __read_mostly;
+module_param(per_node, bool, 0600);
+
+/*
+ * Number of currently running DAMON worker threads
+ */
+static unsigned long nr_kdamond __read_mostly;
+module_param(nr_kdamond, ulong, 0400);
+
+/*
+ * First PID of the DAMON threads
+ *
+ * If DAMON_RECLAIM is enabled, this becomes the first PID of the worker threads.
  * Else, -1.
  */
-static int kdamond_pid __read_mostly = -1;
-module_param(kdamond_pid, int, 0400);
+static int kdamond_start_pid __read_mostly = -1;
+module_param(kdamond_start_pid, int, 0400);
 
 /*
  * Number of memory regions that tried to be reclaimed.
@@ -215,8 +229,8 @@  module_param(bytes_reclaimed_regions, ulong, 0400);
 static unsigned long nr_quota_exceeds __read_mostly;
 module_param(nr_quota_exceeds, ulong, 0400);
 
-static struct damon_ctx *ctx;
-static struct damon_target *target;
+static struct damon_ctx *ctxs[MAX_NUMNODES];
+static struct damon_target *targets[MAX_NUMNODES];
 
 struct damon_reclaim_ram_walk_arg {
 	unsigned long start;
@@ -251,7 +265,7 @@  static bool get_monitoring_region(unsigned long *start, unsigned long *end)
 	return true;
 }
 
-static struct damos *damon_reclaim_new_scheme(void)
+static struct damos *damon_reclaim_new_scheme(int node)
 {
 	struct damos_watermarks wmarks = {
 		.metric = DAMOS_WMARK_FREE_MEM_RATE,
@@ -259,6 +273,7 @@  static struct damos *damon_reclaim_new_scheme(void)
 		.high = wmarks_high,
 		.mid = wmarks_mid,
 		.low = wmarks_low,
+		.node = node,
 	};
 	struct damos_quota quota = {
 		/*
@@ -290,56 +305,99 @@  static struct damos *damon_reclaim_new_scheme(void)
 	return scheme;
 }
 
-static int damon_reclaim_turn(bool on)
+static int damon_reclaim_start(int nid)
 {
 	struct damon_region *region;
 	struct damos *scheme;
 	int err;
+	unsigned long start, end;
 
-	if (!on) {
-		err = damon_stop(&ctx, 1);
-		if (!err)
-			kdamond_pid = -1;
-		return err;
-	}
-
-	err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0,
+	err = damon_set_attrs(ctxs[nid], sample_interval, aggr_interval, 0,
 			min_nr_regions, max_nr_regions);
 	if (err)
 		return err;
 
-	if (monitor_region_start > monitor_region_end)
-		return -EINVAL;
-	if (!monitor_region_start && !monitor_region_end &&
-			!get_monitoring_region(&monitor_region_start,
-				&monitor_region_end))
-		return -EINVAL;
+	if (per_node) {
+		monitor_region_start = monitor_region_end = 0;
+
+		start = PFN_PHYS(node_start_pfn(nid));
+		end = PFN_PHYS(node_start_pfn(nid) + node_present_pages(nid) - 1);
+		if (end <= start)
+			return -EINVAL;
+	} else {
+		if (!monitor_region_start && !monitor_region_end &&
+				!get_monitoring_region(&monitor_region_start,
+					&monitor_region_end))
+			return -EINVAL;
+		start = monitor_region_start;
+		end = monitor_region_end;
+	}
+
 	/* DAMON will free this on its own when finish monitoring */
-	region = damon_new_region(monitor_region_start, monitor_region_end);
+	region = damon_new_region(start, end);
 	if (!region)
 		return -ENOMEM;
-	damon_add_region(region, target);
+	damon_add_region(region, targets[nid]);
 
 	/* Will be freed by 'damon_set_schemes()' below */
-	scheme = damon_reclaim_new_scheme();
+	scheme = damon_reclaim_new_scheme(nid);
 	if (!scheme) {
 		err = -ENOMEM;
 		goto free_region_out;
 	}
-	err = damon_set_schemes(ctx, &scheme, 1);
+
+	err = damon_set_schemes(ctxs[nid], &scheme, 1);
 	if (err)
 		goto free_scheme_out;
 
-	err = damon_start(&ctx, 1);
+	err = damon_start_one(ctxs[nid]);
 	if (!err) {
-		kdamond_pid = ctx->kdamond->pid;
+		if (kdamond_start_pid == -1)
+			kdamond_start_pid = ctxs[nid]->kdamond->pid;
+		nr_kdamond++;
 		return 0;
 	}
 
 free_scheme_out:
 	damon_destroy_scheme(scheme);
 free_region_out:
-	damon_destroy_region(region, target);
+	damon_destroy_region(region, targets[nid]);
+
+	return err;
+}
+
+static int damon_reclaim_start_all(void)
+{
+	int nid, err;
+
+	if (!per_node)
+		return damon_reclaim_start(0);
+
+	for_each_online_node(nid) {
+		err = damon_reclaim_start(nid);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+static int damon_reclaim_turn(bool on)
+{
+	int err;
+
+	if (!on) {
+		err = damon_stop(ctxs, nr_kdamond);
+		if (!err) {
+			kdamond_start_pid = -1;
+			nr_kdamond = 0;
+			monitor_region_start = 0;
+			monitor_region_end = 0;
+		}
+		return err;
+	}
+
+	err = damon_reclaim_start_all();
 	return err;
 }
 
@@ -380,21 +438,24 @@  static int damon_reclaim_after_aggregation(struct damon_ctx *c)
 
 static int __init damon_reclaim_init(void)
 {
-	ctx = damon_new_ctx();
-	if (!ctx)
-		return -ENOMEM;
-
-	if (damon_select_ops(ctx, DAMON_OPS_PADDR))
-		return -EINVAL;
-
-	ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
-
-	target = damon_new_target();
-	if (!target) {
-		damon_destroy_ctx(ctx);
-		return -ENOMEM;
+	int nid;
+
+	for_each_node(nid) {
+		ctxs[nid] = damon_new_ctx();
+		if (!ctxs[nid])
+			return -ENOMEM;
+
+		if (damon_select_ops(ctxs[nid], DAMON_OPS_PADDR))
+			return -EINVAL;
+		ctxs[nid]->callback.after_aggregation = damon_reclaim_after_aggregation;
+
+		targets[nid] = damon_new_target();
+		if (!targets[nid]) {
+			damon_destroy_ctx(ctxs[nid]);
+			return -ENOMEM;
+		}
+		damon_add_target(ctxs[nid], targets[nid]);
 	}
-	damon_add_target(ctx, target);
 
 	schedule_delayed_work(&damon_reclaim_timer, 0);
 	return 0;