[RFC,v3,5/7] mm/damon/paddr: introduce DAMOS_MIGRATE_COLD action for demotion

Message ID 20240405060858.2818-6-honggyu.kim@sk.com (mailing list archive)
State New
Series DAMON based tiered memory management for CXL memory

Commit Message

Honggyu Kim April 5, 2024, 6:08 a.m. UTC
This patch introduces DAMOS_MIGRATE_COLD action, which is similar to
DAMOS_PAGEOUT, but migrates folios to the given 'target_nid' in the sysfs
instead of swapping them out.

The 'target_nid' sysfs knob is created by this patch to inform the
migration target node ID.

Here is an example usage of this 'migrate_cold' action.

  $ cd /sys/kernel/mm/damon/admin/kdamonds/<N>
  $ cat contexts/<N>/schemes/<N>/action
  migrate_cold
  $ echo 2 > contexts/<N>/schemes/<N>/target_nid
  $ echo commit > state
  $ numactl -p 0 ./hot_cold 500M 600M &
  $ numastat -c -p hot_cold

  Per-node process memory usage (in MBs)
  PID             Node 0 Node 1 Node 2 Total
  --------------  ------ ------ ------ -----
  701 (hot_cold)     501      0    601  1101

Since there are some common routines with pageout, many functions have
similar logic between pageout and migrate cold.

damon_pa_migrate_folio_list() is a minimized version of
shrink_folio_list(), but it's minified only for demotion.

Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
---
 include/linux/damon.h    |   2 +
 mm/damon/paddr.c         | 146 ++++++++++++++++++++++++++++++++++++++-
 mm/damon/sysfs-schemes.c |   4 ++
 3 files changed, 151 insertions(+), 1 deletion(-)

Comments

SeongJae Park April 5, 2024, 7:24 p.m. UTC | #1
On Fri,  5 Apr 2024 15:08:54 +0900 Honggyu Kim <honggyu.kim@sk.com> wrote:

> This patch introduces DAMOS_MIGRATE_COLD action, which is similar to
> DAMOS_PAGEOUT, but migrate folios to the given 'target_nid' in the sysfs
> instead of swapping them out.
> 
> The 'target_nid' sysfs knob is created by this patch to inform the
> migration target node ID.

Isn't it created by the previous patch?

> 
> Here is one of the example usage of this 'migrate_cold' action.
> 
>   $ cd /sys/kernel/mm/damon/admin/kdamonds/<N>
>   $ cat contexts/<N>/schemes/<N>/action
>   migrate_cold
>   $ echo 2 > contexts/<N>/schemes/<N>/target_nid
>   $ echo commit > state
>   $ numactl -p 0 ./hot_cold 500M 600M &
>   $ numastat -c -p hot_cold
> 
>   Per-node process memory usage (in MBs)
>   PID             Node 0 Node 1 Node 2 Total
>   --------------  ------ ------ ------ -----
>   701 (hot_cold)     501      0    601  1101
> 
> Since there are some common routines with pageout, many functions have
> similar logics between pageout and migrate cold.
> 
> damon_pa_migrate_folio_list() is a minimized version of
> shrink_folio_list(), but it's minified only for demotion.

MIGRATE_COLD is not only for demotion, right?  I think the last two words are
better to be removed for reducing unnecessary confusion.

> 
> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
> ---
>  include/linux/damon.h    |   2 +
>  mm/damon/paddr.c         | 146 ++++++++++++++++++++++++++++++++++++++-
>  mm/damon/sysfs-schemes.c |   4 ++
>  3 files changed, 151 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/damon.h b/include/linux/damon.h
> index 24ea33a03d5d..df8671e69a70 100644
> --- a/include/linux/damon.h
> +++ b/include/linux/damon.h
> @@ -105,6 +105,7 @@ struct damon_target {
>   * @DAMOS_NOHUGEPAGE:	Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
>   * @DAMOS_LRU_PRIO:	Prioritize the region on its LRU lists.
>   * @DAMOS_LRU_DEPRIO:	Deprioritize the region on its LRU lists.
> + * @DAMOS_MIGRATE_COLD: Migrate for the given cold region.

Whether it will be for cold regions or not depends on the target access
pattern.  What about 'Migrate the regions in coldest regions first manner.'?
Or, simply 'Migrate the regions (prioritize cold)' here, and explain the
prioritization under quota in the detailed comments part?

Also, let's use tab consistently.
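
For reference, the resulting kerneldoc line might end up looking something
like this (wording not final, tab after the colon as in the neighboring
entries):

 * @DAMOS_MIGRATE_COLD:	Migrate the regions in coldest regions first manner under quota.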

>   * @DAMOS_STAT:		Do nothing but count the stat.
>   * @NR_DAMOS_ACTIONS:	Total number of DAMOS actions
>   *
> @@ -122,6 +123,7 @@ enum damos_action {
>  	DAMOS_NOHUGEPAGE,
>  	DAMOS_LRU_PRIO,
>  	DAMOS_LRU_DEPRIO,
> +	DAMOS_MIGRATE_COLD,
>  	DAMOS_STAT,		/* Do nothing but only record the stat */
>  	NR_DAMOS_ACTIONS,
>  };
> diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
> index 277a1c4d833c..fe217a26f788 100644
> --- a/mm/damon/paddr.c
> +++ b/mm/damon/paddr.c
> @@ -12,6 +12,9 @@
>  #include <linux/pagemap.h>
>  #include <linux/rmap.h>
>  #include <linux/swap.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/migrate.h>
> +#include <linux/mm_inline.h>
>  
>  #include "../internal.h"
>  #include "ops-common.h"
> @@ -226,8 +229,137 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
>  
>  enum migration_mode {
>  	MIG_PAGEOUT,
> +	MIG_MIGRATE_COLD,
>  };
>  
> +static unsigned int migrate_folio_list(struct list_head *migrate_folios,
> +				       struct pglist_data *pgdat,
> +				       int target_nid)

To avoid name collisions, I'd prefer having a damon_pa_ prefix.  I see this
patch is defining damon_pa_migrate_folio_list() below, though.  What about
__damon_pa_migrate_folio_list()?
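
That is, something along these lines (sketch of the renamed helper only):

static unsigned int __damon_pa_migrate_folio_list(struct list_head *migrate_folios,
						  struct pglist_data *pgdat,
						  int target_nid);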

> +{
> +	unsigned int nr_succeeded;
> +	nodemask_t allowed_mask = NODE_MASK_NONE;
> +

I personally prefer not having empty lines in the middle of variable
declarations/definitions.  Could we remove this empty line?

> +	struct migration_target_control mtc = {
> +		/*
> +		 * Allocate from 'node', or fail quickly and quietly.
> +		 * When this happens, 'page' will likely just be discarded
> +		 * instead of migrated.
> +		 */
> +		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
> +			__GFP_NOMEMALLOC | GFP_NOWAIT,
> +		.nid = target_nid,
> +		.nmask = &allowed_mask
> +	};
> +
> +	if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE)
> +		return 0;
> +
> +	if (list_empty(migrate_folios))
> +		return 0;

Can't these checks be done by the caller?
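
For example, the caller could guard the call itself, roughly like below
(untested sketch, assuming the __damon_pa_migrate_folio_list() rename
suggested above):

	/* Migrate folios selected for migration */
	if (!list_empty(&migrate_folios) && target_nid != NUMA_NO_NODE &&
	    pgdat->node_id != target_nid)
		nr_migrated += __damon_pa_migrate_folio_list(&migrate_folios,
							     pgdat, target_nid);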

> +
> +	/* Migration ignores all cpuset and mempolicy settings */
> +	migrate_pages(migrate_folios, alloc_migrate_folio, NULL,
> +		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON,
> +		      &nr_succeeded);
> +
> +	return nr_succeeded;
> +}
> +
> +static unsigned int damon_pa_migrate_folio_list(struct list_head *folio_list,
> +						struct pglist_data *pgdat,
> +						enum migration_mode mm,

Again, 'mm' makes my poor brain a bit confused.  What about 'mode'?
And, seems this is not used at all in this function?  Can we just drop this?

> +						int target_nid)
> +{
> +	unsigned int nr_migrated = 0;
> +	struct folio *folio;
> +	LIST_HEAD(ret_folios);
> +	LIST_HEAD(migrate_folios);
> +
> +	cond_resched();

We will do this again at the beginning of the loop.  Do we need this here?

> +
> +	while (!list_empty(folio_list)) {
> +		struct folio *folio;
> +
> +		cond_resched();
> +
> +		folio = lru_to_folio(folio_list);
> +		list_del(&folio->lru);
> +
> +		if (!folio_trylock(folio))
> +			goto keep;
> +
> +		VM_BUG_ON_FOLIO(folio_test_active(folio), folio);

Why?  I think we could want to migrate active pages in some use case, e.g., to
reduce memory bandwidth?

> +
> +		/* Relocate its contents to another node. */
> +		list_add(&folio->lru, &migrate_folios);
> +		folio_unlock(folio);
> +		continue;
> +keep:
> +		list_add(&folio->lru, &ret_folios);
> +		VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

Can this happen?  I think this could be too much of a test?  checkpatch.pl also
warns.

> +	}
> +	/* 'folio_list' is always empty here */
> +
> +	/* Migrate folios selected for migration */
> +	nr_migrated += migrate_folio_list(&migrate_folios, pgdat, target_nid);
> +	/* Folios that could not be migrated are still in @migrate_folios */
> +	if (!list_empty(&migrate_folios)) {
> +		/* Folios which weren't migrated go back on @folio_list */
> +		list_splice_init(&migrate_folios, folio_list);
> +	}

Let's not use braces for single statement
(https://docs.kernel.org/process/coding-style.html#placing-braces-and-spaces).

> +
> +	try_to_unmap_flush();
> +
> +	list_splice(&ret_folios, folio_list);

Can't we move remaining folios in migrate_folios to ret_folios at once?
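
For instance, the tail of damon_pa_migrate_folio_list() could collapse the two
splices into one path, roughly (untested sketch):

	/* Folios which weren't migrated are handled together with the kept folios */
	list_splice_init(&migrate_folios, &ret_folios);

	try_to_unmap_flush();

	list_splice(&ret_folios, folio_list);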

> +
> +	while (!list_empty(folio_list)) {
> +		folio = lru_to_folio(folio_list);
> +		list_del(&folio->lru);
> +		folio_putback_lru(folio);
> +	}
> +
> +	return nr_migrated;
> +}
> +
> +static unsigned long damon_pa_migrate_pages(struct list_head *folio_list,
> +					    enum migration_mode mm,

Again, I'd prefer calling this 'mode' or something other than 'mm'.
And, seems 'mm' is not really used in this function.  It is passed to
'damon_pa_migrate_folio_list()' but it doesn't really use it.  Can we drop
this?
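
Dropping the unused parameter would shrink both the signature and the call
site, e.g. (sketch only):

	static unsigned long damon_pa_migrate_pages(struct list_head *folio_list,
						    int target_nid);

	case MIG_MIGRATE_COLD:
		applied = damon_pa_migrate_pages(&folio_list, s->target_nid);
		break;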

> +					    int target_nid)
> +{
> +	int nid;
> +	unsigned int nr_migrated = 0;

Let's make this match the return type of this function.

> +	LIST_HEAD(node_folio_list);
> +	unsigned int noreclaim_flag;
> +
> +	if (list_empty(folio_list))
> +		return nr_migrated;
> +
> +	noreclaim_flag = memalloc_noreclaim_save();
> +
> +	nid = folio_nid(lru_to_folio(folio_list));
> +	do {
> +		struct folio *folio = lru_to_folio(folio_list);
> +
> +		if (nid == folio_nid(folio)) {
> +			folio_clear_active(folio);

I think this was necessary for demotion, but now this should be removed since
this function is no longer for demotion but for migrating random pages, right?

> +			list_move(&folio->lru, &node_folio_list);
> +			continue;
> +		}
> +
> +		nr_migrated += damon_pa_migrate_folio_list(&node_folio_list,
> +							   NODE_DATA(nid), mm,
> +							   target_nid);
> +		nid = folio_nid(lru_to_folio(folio_list));
> +	} while (!list_empty(folio_list));
> +
> +	nr_migrated += damon_pa_migrate_folio_list(&node_folio_list,
> +						   NODE_DATA(nid), mm,
> +						   target_nid);
> +
> +	memalloc_noreclaim_restore(noreclaim_flag);
> +
> +	return nr_migrated;
> +}
> +
>  static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
>  				      enum migration_mode mm)
>  {
> @@ -247,7 +379,11 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
>  		folio_test_clear_young(folio);
>  		if (!folio_isolate_lru(folio))
>  			goto put_folio;
> -		if (folio_test_unevictable(folio))
> +		/*
> +		 * Since unevictable folios can be demoted or promoted,

Let's use the term 'migrated' instead of 'demoted' or 'promoted'.

> +		 * unevictable test is needed only for pageout.
> +		 */
> +		if (mm == MIG_PAGEOUT && folio_test_unevictable(folio))
>  			folio_putback_lru(folio);
>  		else
>  			list_add(&folio->lru, &folio_list);
> @@ -258,6 +394,10 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
>  	case MIG_PAGEOUT:
>  		applied = reclaim_pages(&folio_list);
>  		break;
> +	case MIG_MIGRATE_COLD:
> +		applied = damon_pa_migrate_pages(&folio_list, mm,
> +						 s->target_nid);
> +		break;
>  	default:
>  		/* Unexpected migration mode. */
>  		return 0;
> @@ -314,6 +454,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
>  		return damon_pa_mark_accessed(r, scheme);
>  	case DAMOS_LRU_DEPRIO:
>  		return damon_pa_deactivate_pages(r, scheme);
> +	case DAMOS_MIGRATE_COLD:
> +		return damon_pa_migrate(r, scheme, MIG_MIGRATE_COLD);
>  	case DAMOS_STAT:
>  		break;
>  	default:
> @@ -334,6 +476,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
>  		return damon_hot_score(context, r, scheme);
>  	case DAMOS_LRU_DEPRIO:
>  		return damon_cold_score(context, r, scheme);
> +	case DAMOS_MIGRATE_COLD:
> +		return damon_cold_score(context, r, scheme);
>  	default:
>  		break;
>  	}
> diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
> index 1a30ea82c890..18b7d054c748 100644
> --- a/mm/damon/sysfs-schemes.c
> +++ b/mm/damon/sysfs-schemes.c
> @@ -1406,6 +1406,7 @@ static const char * const damon_sysfs_damos_action_strs[] = {
>  	"nohugepage",
>  	"lru_prio",
>  	"lru_deprio",
> +	"migrate_cold",
>  	"stat",
>  };
>  
> @@ -1659,6 +1660,9 @@ static ssize_t target_nid_store(struct kobject *kobj,
>  			struct damon_sysfs_scheme, kobj);
>  	int err = 0;
>  
> +        if (scheme->action != DAMOS_MIGRATE_COLD)
> +                return -EINVAL;
> +

I think a user could set target_nid first, and then the action.  So I think this
should not return an error?
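
In that case target_nid_store() would simply parse the value regardless of the
currently selected action, i.e. roughly (only the relevant lines shown):

	int err = 0;

	/* TODO: error handling for target_nid range. */
	err = kstrtoint(buf, 0, &scheme->target_nid);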

>  	/* TODO: error handling for target_nid range. */
>  	err = kstrtoint(buf, 0, &scheme->target_nid);
>  
> -- 
> 2.34.1
> 
> 


Thanks,
SJ
Honggyu Kim April 8, 2024, 12:06 p.m. UTC | #2
On Fri,  5 Apr 2024 12:24:30 -0700 SeongJae Park <sj@kernel.org> wrote:
> On Fri,  5 Apr 2024 15:08:54 +0900 Honggyu Kim <honggyu.kim@sk.com> wrote:
> 
> > This patch introduces DAMOS_MIGRATE_COLD action, which is similar to
> > DAMOS_PAGEOUT, but migrate folios to the given 'target_nid' in the sysfs
> > instead of swapping them out.
> > 
> > The 'target_nid' sysfs knob is created by this patch to inform the
> > migration target node ID.
> 
> Isn't it created by the previous patch?

Right.  I didn't fix the commit message after splitting this patch.  I will
fix it.

> > 
> > Here is one of the example usage of this 'migrate_cold' action.
> > 
> >   $ cd /sys/kernel/mm/damon/admin/kdamonds/<N>
> >   $ cat contexts/<N>/schemes/<N>/action
> >   migrate_cold
> >   $ echo 2 > contexts/<N>/schemes/<N>/target_nid
> >   $ echo commit > state
> >   $ numactl -p 0 ./hot_cold 500M 600M &
> >   $ numastat -c -p hot_cold
> > 
> >   Per-node process memory usage (in MBs)
> >   PID             Node 0 Node 1 Node 2 Total
> >   --------------  ------ ------ ------ -----
> >   701 (hot_cold)     501      0    601  1101
> > 
> > Since there are some common routines with pageout, many functions have
> > similar logics between pageout and migrate cold.
> > 
> > damon_pa_migrate_folio_list() is a minimized version of
> > shrink_folio_list(), but it's minified only for demotion.
> 
> MIGRATE_COLD is not only for demotion, right?  I think the last two words are
> better to be removed for reducing unnecessary confuses.

You mean the last two sentences?  I will remove them if you feel it's
confusing.

> > 
> > Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> > Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
> > ---
> >  include/linux/damon.h    |   2 +
> >  mm/damon/paddr.c         | 146 ++++++++++++++++++++++++++++++++++++++-
> >  mm/damon/sysfs-schemes.c |   4 ++
> >  3 files changed, 151 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/linux/damon.h b/include/linux/damon.h
> > index 24ea33a03d5d..df8671e69a70 100644
> > --- a/include/linux/damon.h
> > +++ b/include/linux/damon.h
> > @@ -105,6 +105,7 @@ struct damon_target {
> >   * @DAMOS_NOHUGEPAGE:	Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
> >   * @DAMOS_LRU_PRIO:	Prioritize the region on its LRU lists.
> >   * @DAMOS_LRU_DEPRIO:	Deprioritize the region on its LRU lists.
> > + * @DAMOS_MIGRATE_COLD: Migrate for the given cold region.
> 
> Whether it will be for cold region or not is depending on the target access
> pattern.  What about 'Migrate the regions in coldest regions first manner.'?
> Or, simply 'Migrate the regions (prioritize cold)' here, and explain about the
> prioritization under quota on the detailed comments part?

"Migrate the regions in coldest regions first manner under quota" sounds
better.  I will change it.

> Also, let's use tab consistently.

Yeah, it's a mistake.  will fix it.

> >   * @DAMOS_STAT:		Do nothing but count the stat.
> >   * @NR_DAMOS_ACTIONS:	Total number of DAMOS actions
> >   *
> > @@ -122,6 +123,7 @@ enum damos_action {
> >  	DAMOS_NOHUGEPAGE,
> >  	DAMOS_LRU_PRIO,
> >  	DAMOS_LRU_DEPRIO,
> > +	DAMOS_MIGRATE_COLD,
> >  	DAMOS_STAT,		/* Do nothing but only record the stat */
> >  	NR_DAMOS_ACTIONS,
> >  };
> > diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
> > index 277a1c4d833c..fe217a26f788 100644
> > --- a/mm/damon/paddr.c
> > +++ b/mm/damon/paddr.c
> > @@ -12,6 +12,9 @@
> >  #include <linux/pagemap.h>
> >  #include <linux/rmap.h>
> >  #include <linux/swap.h>
> > +#include <linux/memory-tiers.h>
> > +#include <linux/migrate.h>
> > +#include <linux/mm_inline.h>
> >  
> >  #include "../internal.h"
> >  #include "ops-common.h"
> > @@ -226,8 +229,137 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
> >  
> >  enum migration_mode {
> >  	MIG_PAGEOUT,
> > +	MIG_MIGRATE_COLD,
> >  };
> >  
> > +static unsigned int migrate_folio_list(struct list_head *migrate_folios,
> > +				       struct pglist_data *pgdat,
> > +				       int target_nid)
> 
> To avoid name collisions, I'd prefer having damon_pa_prefix.  I show this patch
> is defining damon_pa_migrate_folio_list() below, though.  What about
> __damon_pa_migrate_folio_list()?

Ack.  I will change it to __damon_pa_migrate_folio_list().

> > +{
> > +	unsigned int nr_succeeded;
> > +	nodemask_t allowed_mask = NODE_MASK_NONE;
> > +
> 
> I personally prefer not having empty lines in the middle of variable
> declarations/definitions.  Could we remove this empty line?

I can remove it, but I would like to have more discussion about this
issue.  The current implementation allows only a single migration
target with "target_nid", but users might want to provide fallback
migration target nids.

For example, if more than two CXL nodes exist in the system, users might
want to migrate cold pages to any CXL node.  In such cases, we might
have to make "target_nid" accept comma-separated node IDs.  A nodemask can
be better, but we should provide a way to change the scanning order.

I would like to hear how you think about this.
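
As a rough illustration of the fallback idea (hypothetical, not part of this
series; 'target_nids' and 'nr_target_nids' are made-up names), the
already-declared allowed_mask could be filled from a list of nodes:

	/* hypothetical: 'target_nids'/'nr_target_nids' are not in the posted patches */
	int i;

	for (i = 0; i < nr_target_nids; i++)
		node_set(target_nids[i], allowed_mask);
	/* mtc.nmask already points at allowed_mask; mtc.nid stays the preferred node */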

> > +	struct migration_target_control mtc = {
> > +		/*
> > +		 * Allocate from 'node', or fail quickly and quietly.
> > +		 * When this happens, 'page' will likely just be discarded
> > +		 * instead of migrated.
> > +		 */
> > +		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
> > +			__GFP_NOMEMALLOC | GFP_NOWAIT,
> > +		.nid = target_nid,
> > +		.nmask = &allowed_mask
> > +	};
> > +
> > +	if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE)
> > +		return 0;
> > +
> > +	if (list_empty(migrate_folios))
> > +		return 0;
> 
> Can't these checks be done by the caller?

Sure.  I will move them to the caller.

> > +
> > +	/* Migration ignores all cpuset and mempolicy settings */
> > +	migrate_pages(migrate_folios, alloc_migrate_folio, NULL,
> > +		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON,
> > +		      &nr_succeeded);
> > +
> > +	return nr_succeeded;
> > +}
> > +
> > +static unsigned int damon_pa_migrate_folio_list(struct list_head *folio_list,
> > +						struct pglist_data *pgdat,
> > +						enum migration_mode mm,
> 
> Again, 'mm' makes my poor brain a bit confused.  What about 'mode'?
> And, seems this is not used at all in this function?  Can we just drop this?

Ack.  I will remove it in this patch and introduce it in the patch where
it's used.

> > +						int target_nid)
> > +{
> > +	unsigned int nr_migrated = 0;
> > +	struct folio *folio;
> > +	LIST_HEAD(ret_folios);
> > +	LIST_HEAD(migrate_folios);
> > +
> > +	cond_resched();
> 
> We will do this again at the beginning of the loop.  Do we need this here?

This comes from shrink_folio_list() but this function is way simpler so
it can be removed.

> > +
> > +	while (!list_empty(folio_list)) {
> > +		struct folio *folio;
> > +
> > +		cond_resched();
> > +
> > +		folio = lru_to_folio(folio_list);
> > +		list_del(&folio->lru);
> > +
> > +		if (!folio_trylock(folio))
> > +			goto keep;
> > +
> > +		VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
> 
> Why?  I think we could want to migrate active pages in some use case, e.g., to
> reduce memory bandwidth?

Yeah, I will remove it.

> > +
> > +		/* Relocate its contents to another node. */
> > +		list_add(&folio->lru, &migrate_folios);
> > +		folio_unlock(folio);
> > +		continue;
> > +keep:
> > +		list_add(&folio->lru, &ret_folios);
> > +		VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
> 
> Can this happen?  I think this could be too much test?  checkpatch.pl also
> warns.

Likewise, the current shrink_folio_list() does this, so I brought it into this
patch as well, but I think we can remove it here.

> > +	}
> > +	/* 'folio_list' is always empty here */
> > +
> > +	/* Migrate folios selected for migration */
> > +	nr_migrated += migrate_folio_list(&migrate_folios, pgdat, target_nid);
> > +	/* Folios that could not be migrated are still in @migrate_folios */
> > +	if (!list_empty(&migrate_folios)) {
> > +		/* Folios which weren't migrated go back on @folio_list */
> > +		list_splice_init(&migrate_folios, folio_list);
> > +	}
> 
> Let's not use braces for single statement
> (https://docs.kernel.org/process/coding-style.html#placing-braces-and-spaces).

Hmm.. I know the convention but left it as is because of the comment.
If I remove the braces, the comment line and the statement line would
end up with a weird alignment.

> > +
> > +	try_to_unmap_flush();
> > +
> > +	list_splice(&ret_folios, folio_list);
> 
> Can't we move remaining folios in migrate_folios to ret_folios at once?

I will see if it's possible.

> > +
> > +	while (!list_empty(folio_list)) {
> > +		folio = lru_to_folio(folio_list);
> > +		list_del(&folio->lru);
> > +		folio_putback_lru(folio);
> > +	}
> > +
> > +	return nr_migrated;
> > +}
> > +
> > +static unsigned long damon_pa_migrate_pages(struct list_head *folio_list,
> > +					    enum migration_mode mm,
> 
> Again, I'd prefer calling this 'mode' or something other than 'mm'.
> And, seems 'mm' is not really used in this function.  It is passed to
> 'damon_pa_migrate_folio_list()' but it deosn't really use it.  Can we drop
> this?

Sure.  I will drop it here and rename it to "mode" where it's used.

> > +					    int target_nid)
> > +{
> > +	int nid;
> > +	unsigned int nr_migrated = 0;
> 
> Let's make this matches with the return type of this function.

Ack.  will change it to unsigned long.

> > +	LIST_HEAD(node_folio_list);
> > +	unsigned int noreclaim_flag;
> > +
> > +	if (list_empty(folio_list))
> > +		return nr_migrated;
> > +
> > +	noreclaim_flag = memalloc_noreclaim_save();
> > +
> > +	nid = folio_nid(lru_to_folio(folio_list));
> > +	do {
> > +		struct folio *folio = lru_to_folio(folio_list);
> > +
> > +		if (nid == folio_nid(folio)) {
> > +			folio_clear_active(folio);
> 
> I think this was necessary for demotion, but now this should be removed since
> this function is no more for demotion but for migrating random pages, right?

Yeah, it can be removed because we do migration instead of demotion,
but I need to make sure it doesn't change the performance evaluation
results.

> > +			list_move(&folio->lru, &node_folio_list);
> > +			continue;
> > +		}
> > +
> > +		nr_migrated += damon_pa_migrate_folio_list(&node_folio_list,
> > +							   NODE_DATA(nid), mm,
> > +							   target_nid);
> > +		nid = folio_nid(lru_to_folio(folio_list));
> > +	} while (!list_empty(folio_list));
> > +
> > +	nr_migrated += damon_pa_migrate_folio_list(&node_folio_list,
> > +						   NODE_DATA(nid), mm,
> > +						   target_nid);
> > +
> > +	memalloc_noreclaim_restore(noreclaim_flag);
> > +
> > +	return nr_migrated;
> > +}
> > +
> >  static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
> >  				      enum migration_mode mm)
> >  {
> > @@ -247,7 +379,11 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
> >  		folio_test_clear_young(folio);
> >  		if (!folio_isolate_lru(folio))
> >  			goto put_folio;
> > -		if (folio_test_unevictable(folio))
> > +		/*
> > +		 * Since unevictable folios can be demoted or promoted,
> 
> Let's use the term 'migrated' instead of 'demoted' or 'promoted'.

Ack.

> > +		 * unevictable test is needed only for pageout.
> > +		 */
> > +		if (mm == MIG_PAGEOUT && folio_test_unevictable(folio))
> >  			folio_putback_lru(folio);
> >  		else
> >  			list_add(&folio->lru, &folio_list);
> > @@ -258,6 +394,10 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
> >  	case MIG_PAGEOUT:
> >  		applied = reclaim_pages(&folio_list);
> >  		break;
> > +	case MIG_MIGRATE_COLD:
> > +		applied = damon_pa_migrate_pages(&folio_list, mm,
> > +						 s->target_nid);
> > +		break;
> >  	default:
> >  		/* Unexpected migration mode. */
> >  		return 0;
> > @@ -314,6 +454,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
> >  		return damon_pa_mark_accessed(r, scheme);
> >  	case DAMOS_LRU_DEPRIO:
> >  		return damon_pa_deactivate_pages(r, scheme);
> > +	case DAMOS_MIGRATE_COLD:
> > +		return damon_pa_migrate(r, scheme, MIG_MIGRATE_COLD);
> >  	case DAMOS_STAT:
> >  		break;
> >  	default:
> > @@ -334,6 +476,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
> >  		return damon_hot_score(context, r, scheme);
> >  	case DAMOS_LRU_DEPRIO:
> >  		return damon_cold_score(context, r, scheme);
> > +	case DAMOS_MIGRATE_COLD:
> > +		return damon_cold_score(context, r, scheme);
> >  	default:
> >  		break;
> >  	}
> > diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
> > index 1a30ea82c890..18b7d054c748 100644
> > --- a/mm/damon/sysfs-schemes.c
> > +++ b/mm/damon/sysfs-schemes.c
> > @@ -1406,6 +1406,7 @@ static const char * const damon_sysfs_damos_action_strs[] = {
> >  	"nohugepage",
> >  	"lru_prio",
> >  	"lru_deprio",
> > +	"migrate_cold",
> >  	"stat",
> >  };
> >  
> > @@ -1659,6 +1660,9 @@ static ssize_t target_nid_store(struct kobject *kobj,
> >  			struct damon_sysfs_scheme, kobj);
> >  	int err = 0;
> >  
> > +        if (scheme->action != DAMOS_MIGRATE_COLD)
> > +                return -EINVAL;
> > +
> 
> I think user could set target_nid first, and then action.  So I think this
> should not return error?

Makes sense.  I will drop this check.

Thanks,
Honggyu

> >  	/* TODO: error handling for target_nid range. */
> >  	err = kstrtoint(buf, 0, &scheme->target_nid);
> >  
> > -- 
> > 2.34.1
> > 
> > 
> 
> 
> Thanks,
> SJ
SeongJae Park April 8, 2024, 5:52 p.m. UTC | #3
On Mon,  8 Apr 2024 21:06:44 +0900 Honggyu Kim <honggyu.kim@sk.com> wrote:

> On Fri,  5 Apr 2024 12:24:30 -0700 SeongJae Park <sj@kernel.org> wrote:
> > On Fri,  5 Apr 2024 15:08:54 +0900 Honggyu Kim <honggyu.kim@sk.com> wrote:
[...]
> > > Here is one of the example usage of this 'migrate_cold' action.
> > > 
> > >   $ cd /sys/kernel/mm/damon/admin/kdamonds/<N>
> > >   $ cat contexts/<N>/schemes/<N>/action
> > >   migrate_cold
> > >   $ echo 2 > contexts/<N>/schemes/<N>/target_nid
> > >   $ echo commit > state
> > >   $ numactl -p 0 ./hot_cold 500M 600M &
> > >   $ numastat -c -p hot_cold
> > > 
> > >   Per-node process memory usage (in MBs)
> > >   PID             Node 0 Node 1 Node 2 Total
> > >   --------------  ------ ------ ------ -----
> > >   701 (hot_cold)     501      0    601  1101
> > > 
> > > Since there are some common routines with pageout, many functions have
> > > similar logics between pageout and migrate cold.
> > > 
> > > damon_pa_migrate_folio_list() is a minimized version of
> > > shrink_folio_list(), but it's minified only for demotion.
> > 
> > MIGRATE_COLD is not only for demotion, right?  I think the last two words are
> > better to be removed for reducing unnecessary confuses.
> 
> You mean the last two sentences?  I will remove them if you feel it's
> confusing.

Yes.  My real intended suggestion was 's/only for demotion/only for
migration/', but entirely removing the sentences is also ok for me.

> 
> > > 
> > > Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> > > Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
> > > ---
> > >  include/linux/damon.h    |   2 +
> > >  mm/damon/paddr.c         | 146 ++++++++++++++++++++++++++++++++++++++-
> > >  mm/damon/sysfs-schemes.c |   4 ++
> > >  3 files changed, 151 insertions(+), 1 deletion(-)
[...]
> > > --- a/mm/damon/paddr.c
> > > +++ b/mm/damon/paddr.c
[...]
> > > +{
> > > +	unsigned int nr_succeeded;
> > > +	nodemask_t allowed_mask = NODE_MASK_NONE;
> > > +
> > 
> > I personally prefer not having empty lines in the middle of variable
> > declarations/definitions.  Could we remove this empty line?
> 
> I can remove it, but I would like to have more discussion about this
> issue.  The current implementation allows only a single migration
> target with "target_nid", but users might want to provide fall back
> migration target nids.
> 
> For example, if more than two CXL nodes exist in the system, users might
> want to migrate cold pages to any CXL nodes.  In such cases, we might
> have to make "target_nid" accept comma separated node IDs.  nodemask can
> be better but we should provide a way to change the scanning order.
> 
> I would like to hear how you think about this.

Good point.  I think we could later extend the sysfs file to receive the
comma-separated numbers, or even a mask.  For simplicity, adding sysfs files
dedicated to the different input formats could also be an option (e.g.,
target_nids_list, target_nids_mask).  But starting from this single node as is
now looks ok to me.
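
If the file is later extended to take a comma-separated list, a small parser
along these lines could do the job (purely illustrative; the helper and its
name are made up and not part of this series):

	/* hypothetical helper, not in the posted patches */
	static int damon_sysfs_parse_nid_list(const char *buf, int *nids, int max_nids)
	{
		char *input, *pos, *tok;
		int i = 0, err = 0;

		input = kstrdup(buf, GFP_KERNEL);
		if (!input)
			return -ENOMEM;
		pos = input;
		while (i < max_nids && (tok = strsep(&pos, ",")) != NULL) {
			err = kstrtoint(strim(tok), 0, &nids[i]);
			if (err)
				break;
			i++;
		}
		kfree(input);
		return err ? err : i;	/* number of parsed node IDs on success */
	}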

[...]
> > > +	/* 'folio_list' is always empty here */
> > > +
> > > +	/* Migrate folios selected for migration */
> > > +	nr_migrated += migrate_folio_list(&migrate_folios, pgdat, target_nid);
> > > +	/* Folios that could not be migrated are still in @migrate_folios */
> > > +	if (!list_empty(&migrate_folios)) {
> > > +		/* Folios which weren't migrated go back on @folio_list */
> > > +		list_splice_init(&migrate_folios, folio_list);
> > > +	}
> > 
> > Let's not use braces for single statement
> > (https://docs.kernel.org/process/coding-style.html#placing-braces-and-spaces).
> 
> Hmm.. I know the convention but left it as is because of the comment.
> If I remove the braces, it would have a weird alignment for the two
> lines for comment and statement lines.

I don't really hate such alignment.  But if you don't like it, how about moving
the comment out of the if statement?  Having one comment for a one-line if
statement doesn't look bad to me.
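
For example:

	/* Folios which weren't migrated go back on @folio_list */
	if (!list_empty(&migrate_folios))
		list_splice_init(&migrate_folios, folio_list);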

> 
> > > +
> > > +	try_to_unmap_flush();
> > > +
> > > +	list_splice(&ret_folios, folio_list);
> > 
> > Can't we move remaining folios in migrate_folios to ret_folios at once?
> 
> I will see if it's possible.

Thank you.  Not a strict request, though.

[...]
> > > +	nid = folio_nid(lru_to_folio(folio_list));
> > > +	do {
> > > +		struct folio *folio = lru_to_folio(folio_list);
> > > +
> > > +		if (nid == folio_nid(folio)) {
> > > +			folio_clear_active(folio);
> > 
> > I think this was necessary for demotion, but now this should be removed since
> > this function is no more for demotion but for migrating random pages, right?
> 
> Yeah,  it can be removed because we do migration instead of demotion,
> but I need to make sure if it doesn't change the performance evaluation
> results.

Yes, please ensure the test results are valid :)


Thanks,
SJ

[...]
Honggyu Kim April 9, 2024, 9:54 a.m. UTC | #4
Hi SeongJae,

On Mon,  8 Apr 2024 10:52:28 -0700 SeongJae Park <sj@kernel.org> wrote:
> On Mon,  8 Apr 2024 21:06:44 +0900 Honggyu Kim <honggyu.kim@sk.com> wrote:
> 
> > On Fri,  5 Apr 2024 12:24:30 -0700 SeongJae Park <sj@kernel.org> wrote:
> > > On Fri,  5 Apr 2024 15:08:54 +0900 Honggyu Kim <honggyu.kim@sk.com> wrote:
> [...]
> > > > Here is one of the example usage of this 'migrate_cold' action.
> > > > 
> > > >   $ cd /sys/kernel/mm/damon/admin/kdamonds/<N>
> > > >   $ cat contexts/<N>/schemes/<N>/action
> > > >   migrate_cold
> > > >   $ echo 2 > contexts/<N>/schemes/<N>/target_nid
> > > >   $ echo commit > state
> > > >   $ numactl -p 0 ./hot_cold 500M 600M &
> > > >   $ numastat -c -p hot_cold
> > > > 
> > > >   Per-node process memory usage (in MBs)
> > > >   PID             Node 0 Node 1 Node 2 Total
> > > >   --------------  ------ ------ ------ -----
> > > >   701 (hot_cold)     501      0    601  1101
> > > > 
> > > > Since there are some common routines with pageout, many functions have
> > > > similar logics between pageout and migrate cold.
> > > > 
> > > > damon_pa_migrate_folio_list() is a minimized version of
> > > > shrink_folio_list(), but it's minified only for demotion.
> > > 
> > > MIGRATE_COLD is not only for demotion, right?  I think the last two words are
> > > better to be removed for reducing unnecessary confuses.
> > 
> > You mean the last two sentences?  I will remove them if you feel it's
> > confusing.
> 
> Yes.  My real intended suggestion was 's/only for demotion/only for
> migration/', but entirely removing the sentences is also ok for me.

Ack.

> > 
> > > > 
> > > > Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> > > > Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
> > > > ---
> > > >  include/linux/damon.h    |   2 +
> > > >  mm/damon/paddr.c         | 146 ++++++++++++++++++++++++++++++++++++++-
> > > >  mm/damon/sysfs-schemes.c |   4 ++
> > > >  3 files changed, 151 insertions(+), 1 deletion(-)
> [...]
> > > > --- a/mm/damon/paddr.c
> > > > +++ b/mm/damon/paddr.c
> [...]
> > > > +{
> > > > +	unsigned int nr_succeeded;
> > > > +	nodemask_t allowed_mask = NODE_MASK_NONE;
> > > > +
> > > 
> > > I personally prefer not having empty lines in the middle of variable
> > > declarations/definitions.  Could we remove this empty line?
> > 
> > I can remove it, but I would like to have more discussion about this
> > issue.  The current implementation allows only a single migration
> > target with "target_nid", but users might want to provide fall back
> > migration target nids.
> > 
> > For example, if more than two CXL nodes exist in the system, users might
> > want to migrate cold pages to any CXL nodes.  In such cases, we might
> > have to make "target_nid" accept comma separated node IDs.  nodemask can
> > be better but we should provide a way to change the scanning order.
> > 
> > I would like to hear how you think about this.
> 
> Good point.  I think we could later extend the sysfs file to receive the
> comma-separated numbers, or even mask.  For simplicity, adding sysfs files
> dedicated for the different format of inputs could also be an option (e.g.,
> target_nids_list, target_nids_mask).  But starting from this single node as is
> now looks ok to me.

If you think we can start from a single node, then I will keep it as is.
But are you okay if I change the same 'target_nid' to accept
comma-separated numbers later?  Or do you want to introduce another knob
such as 'target_nids_list'?  What about renaming 'target_nid' to
'target_nids' in the first place?
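
For example, usage might then look like this (hypothetical interface):

  $ echo 2,3 > contexts/<N>/schemes/<N>/target_nids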

> [...]
> > > > +	/* 'folio_list' is always empty here */
> > > > +
> > > > +	/* Migrate folios selected for migration */
> > > > +	nr_migrated += migrate_folio_list(&migrate_folios, pgdat, target_nid);
> > > > +	/* Folios that could not be migrated are still in @migrate_folios */
> > > > +	if (!list_empty(&migrate_folios)) {
> > > > +		/* Folios which weren't migrated go back on @folio_list */
> > > > +		list_splice_init(&migrate_folios, folio_list);
> > > > +	}
> > > 
> > > Let's not use braces for single statement
> > > (https://docs.kernel.org/process/coding-style.html#placing-braces-and-spaces).
> > 
> > Hmm.. I know the convention but left it as is because of the comment.
> > If I remove the braces, it would have a weird alignment for the two
> > lines for comment and statement lines.
> 
> I don't really hate such alignment.  But if you don't like it, how about moving
> the comment out of the if statement?  Having one comment for one-line if
> statement looks not bad to me.

Ack. I will manage this in the next revision.

> > 
> > > > +
> > > > +	try_to_unmap_flush();
> > > > +
> > > > +	list_splice(&ret_folios, folio_list);
> > > 
> > > Can't we move remaining folios in migrate_folios to ret_folios at once?
> > 
> > I will see if it's possible.
> 
> Thank you.  Not a strict request, though.
> 
> [...]
> > > > +	nid = folio_nid(lru_to_folio(folio_list));
> > > > +	do {
> > > > +		struct folio *folio = lru_to_folio(folio_list);
> > > > +
> > > > +		if (nid == folio_nid(folio)) {
> > > > +			folio_clear_active(folio);
> > > 
> > > I think this was necessary for demotion, but now this should be removed since
> > > this function is no more for demotion but for migrating random pages, right?
> > 
> > Yeah,  it can be removed because we do migration instead of demotion,
> > but I need to make sure if it doesn't change the performance evaluation
> > results.
> 
> Yes, please ensure the test results are valid :)

Sure. Thanks for your detailed review!

Please note that I will be out of office this week so won't be able to
answer quickly.

Thanks,
Honggyu

> 
> Thanks,
> SJ
> 
> [...]
>
SeongJae Park April 9, 2024, 4:18 p.m. UTC | #5
Hi Honggyu,

On Tue,  9 Apr 2024 18:54:14 +0900 Honggyu Kim <honggyu.kim@sk.com> wrote:
> On Mon,  8 Apr 2024 10:52:28 -0700 SeongJae Park <sj@kernel.org> wrote:
> > On Mon,  8 Apr 2024 21:06:44 +0900 Honggyu Kim <honggyu.kim@sk.com> wrote:
> > > On Fri,  5 Apr 2024 12:24:30 -0700 SeongJae Park <sj@kernel.org> wrote:
> > > > On Fri,  5 Apr 2024 15:08:54 +0900 Honggyu Kim <honggyu.kim@sk.com> wrote:
[...]
> > > I can remove it, but I would like to have more discussion about this
> > > issue.  The current implementation allows only a single migration
> > > target with "target_nid", but users might want to provide fall back
> > > migration target nids.
> > > 
> > > For example, if more than two CXL nodes exist in the system, users might
> > > want to migrate cold pages to any CXL nodes.  In such cases, we might
> > > have to make "target_nid" accept comma separated node IDs.  nodemask can
> > > be better but we should provide a way to change the scanning order.
> > > 
> > > I would like to hear how you think about this.
> > 
> > Good point.  I think we could later extend the sysfs file to receive the
> > comma-separated numbers, or even mask.  For simplicity, adding sysfs files
> > dedicated for the different format of inputs could also be an option (e.g.,
> > target_nids_list, target_nids_mask).  But starting from this single node as is
> > now looks ok to me.
> 
> If you think we can start from a single node, then I will keep it as is.
> But are you okay if I change the same 'target_nid' to accept
> comma-separated numbers later?  Or do you want to introduce another knob
> such as 'target_nids_list'?  What about rename 'target_nid' to
> 'target_nids' at the first place?

I have no strong concern or opinion about this at the moment.  Please feel free
to rename it to 'target_nids' if you think that's better.

[...]
> Please note that I will be out of office this week so won't be able to
> answer quickly.

No problem, I hope you take and enjoy your time :)


Thanks,
SJ

[...]

Patch

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 24ea33a03d5d..df8671e69a70 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -105,6 +105,7 @@  struct damon_target {
  * @DAMOS_NOHUGEPAGE:	Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
  * @DAMOS_LRU_PRIO:	Prioritize the region on its LRU lists.
  * @DAMOS_LRU_DEPRIO:	Deprioritize the region on its LRU lists.
+ * @DAMOS_MIGRATE_COLD: Migrate for the given cold region.
  * @DAMOS_STAT:		Do nothing but count the stat.
  * @NR_DAMOS_ACTIONS:	Total number of DAMOS actions
  *
@@ -122,6 +123,7 @@  enum damos_action {
 	DAMOS_NOHUGEPAGE,
 	DAMOS_LRU_PRIO,
 	DAMOS_LRU_DEPRIO,
+	DAMOS_MIGRATE_COLD,
 	DAMOS_STAT,		/* Do nothing but only record the stat */
 	NR_DAMOS_ACTIONS,
 };
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 277a1c4d833c..fe217a26f788 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -12,6 +12,9 @@ 
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/memory-tiers.h>
+#include <linux/migrate.h>
+#include <linux/mm_inline.h>
 
 #include "../internal.h"
 #include "ops-common.h"
@@ -226,8 +229,137 @@  static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
 
 enum migration_mode {
 	MIG_PAGEOUT,
+	MIG_MIGRATE_COLD,
 };
 
+static unsigned int migrate_folio_list(struct list_head *migrate_folios,
+				       struct pglist_data *pgdat,
+				       int target_nid)
+{
+	unsigned int nr_succeeded;
+	nodemask_t allowed_mask = NODE_MASK_NONE;
+
+	struct migration_target_control mtc = {
+		/*
+		 * Allocate from 'node', or fail quickly and quietly.
+		 * When this happens, 'page' will likely just be discarded
+		 * instead of migrated.
+		 */
+		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
+			__GFP_NOMEMALLOC | GFP_NOWAIT,
+		.nid = target_nid,
+		.nmask = &allowed_mask
+	};
+
+	if (pgdat->node_id == target_nid || target_nid == NUMA_NO_NODE)
+		return 0;
+
+	if (list_empty(migrate_folios))
+		return 0;
+
+	/* Migration ignores all cpuset and mempolicy settings */
+	migrate_pages(migrate_folios, alloc_migrate_folio, NULL,
+		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DAMON,
+		      &nr_succeeded);
+
+	return nr_succeeded;
+}
+
+static unsigned int damon_pa_migrate_folio_list(struct list_head *folio_list,
+						struct pglist_data *pgdat,
+						enum migration_mode mm,
+						int target_nid)
+{
+	unsigned int nr_migrated = 0;
+	struct folio *folio;
+	LIST_HEAD(ret_folios);
+	LIST_HEAD(migrate_folios);
+
+	cond_resched();
+
+	while (!list_empty(folio_list)) {
+		struct folio *folio;
+
+		cond_resched();
+
+		folio = lru_to_folio(folio_list);
+		list_del(&folio->lru);
+
+		if (!folio_trylock(folio))
+			goto keep;
+
+		VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
+
+		/* Relocate its contents to another node. */
+		list_add(&folio->lru, &migrate_folios);
+		folio_unlock(folio);
+		continue;
+keep:
+		list_add(&folio->lru, &ret_folios);
+		VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+	}
+	/* 'folio_list' is always empty here */
+
+	/* Migrate folios selected for migration */
+	nr_migrated += migrate_folio_list(&migrate_folios, pgdat, target_nid);
+	/* Folios that could not be migrated are still in @migrate_folios */
+	if (!list_empty(&migrate_folios)) {
+		/* Folios which weren't migrated go back on @folio_list */
+		list_splice_init(&migrate_folios, folio_list);
+	}
+
+	try_to_unmap_flush();
+
+	list_splice(&ret_folios, folio_list);
+
+	while (!list_empty(folio_list)) {
+		folio = lru_to_folio(folio_list);
+		list_del(&folio->lru);
+		folio_putback_lru(folio);
+	}
+
+	return nr_migrated;
+}
+
+static unsigned long damon_pa_migrate_pages(struct list_head *folio_list,
+					    enum migration_mode mm,
+					    int target_nid)
+{
+	int nid;
+	unsigned int nr_migrated = 0;
+	LIST_HEAD(node_folio_list);
+	unsigned int noreclaim_flag;
+
+	if (list_empty(folio_list))
+		return nr_migrated;
+
+	noreclaim_flag = memalloc_noreclaim_save();
+
+	nid = folio_nid(lru_to_folio(folio_list));
+	do {
+		struct folio *folio = lru_to_folio(folio_list);
+
+		if (nid == folio_nid(folio)) {
+			folio_clear_active(folio);
+			list_move(&folio->lru, &node_folio_list);
+			continue;
+		}
+
+		nr_migrated += damon_pa_migrate_folio_list(&node_folio_list,
+							   NODE_DATA(nid), mm,
+							   target_nid);
+		nid = folio_nid(lru_to_folio(folio_list));
+	} while (!list_empty(folio_list));
+
+	nr_migrated += damon_pa_migrate_folio_list(&node_folio_list,
+						   NODE_DATA(nid), mm,
+						   target_nid);
+
+	memalloc_noreclaim_restore(noreclaim_flag);
+
+	return nr_migrated;
+}
+
 static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
 				      enum migration_mode mm)
 {
@@ -247,7 +379,11 @@  static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
 		folio_test_clear_young(folio);
 		if (!folio_isolate_lru(folio))
 			goto put_folio;
-		if (folio_test_unevictable(folio))
+		/*
+		 * Since unevictable folios can be demoted or promoted,
+		 * unevictable test is needed only for pageout.
+		 */
+		if (mm == MIG_PAGEOUT && folio_test_unevictable(folio))
 			folio_putback_lru(folio);
 		else
 			list_add(&folio->lru, &folio_list);
@@ -258,6 +394,10 @@  static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
 	case MIG_PAGEOUT:
 		applied = reclaim_pages(&folio_list);
 		break;
+	case MIG_MIGRATE_COLD:
+		applied = damon_pa_migrate_pages(&folio_list, mm,
+						 s->target_nid);
+		break;
 	default:
 		/* Unexpected migration mode. */
 		return 0;
@@ -314,6 +454,8 @@  static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
 		return damon_pa_mark_accessed(r, scheme);
 	case DAMOS_LRU_DEPRIO:
 		return damon_pa_deactivate_pages(r, scheme);
+	case DAMOS_MIGRATE_COLD:
+		return damon_pa_migrate(r, scheme, MIG_MIGRATE_COLD);
 	case DAMOS_STAT:
 		break;
 	default:
@@ -334,6 +476,8 @@  static int damon_pa_scheme_score(struct damon_ctx *context,
 		return damon_hot_score(context, r, scheme);
 	case DAMOS_LRU_DEPRIO:
 		return damon_cold_score(context, r, scheme);
+	case DAMOS_MIGRATE_COLD:
+		return damon_cold_score(context, r, scheme);
 	default:
 		break;
 	}
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 1a30ea82c890..18b7d054c748 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -1406,6 +1406,7 @@  static const char * const damon_sysfs_damos_action_strs[] = {
 	"nohugepage",
 	"lru_prio",
 	"lru_deprio",
+	"migrate_cold",
 	"stat",
 };
 
@@ -1659,6 +1660,9 @@  static ssize_t target_nid_store(struct kobject *kobj,
 			struct damon_sysfs_scheme, kobj);
 	int err = 0;
 
+        if (scheme->action != DAMOS_MIGRATE_COLD)
+                return -EINVAL;
+
 	/* TODO: error handling for target_nid range. */
 	err = kstrtoint(buf, 0, &scheme->target_nid);