
[RFC,4/4] mm/damon: introduce DAMOS_PROMOTE action for promotion

Message ID 20240115045253.1775-5-honggyu.kim@sk.com (mailing list archive)
State Superseded
Series DAMON based 2-tier memory management for CXL memory

Commit Message

Honggyu Kim Jan. 15, 2024, 4:52 a.m. UTC
From: Hyeongtak Ji <hyeongtak.ji@sk.com>

This patch introduces DAMOS_PROMOTE action for paddr mode.

It also renames alloc_demote_folio to alloc_migrate_folio so that it can
be used for promotion as well.

The execution sequences of DAMOS_DEMOTE and DAMOS_PROMOTE look as
follows for comparison.

  DAMOS_DEMOTE action
    damon_pa_apply_scheme
    -> damon_pa_reclaim
    -> demote_pages
    -> do_demote_folio_list
    -> __demote_folio_list
    -> demote_folio_list

  DAMOS_PROMOTE action
    damon_pa_apply_scheme
    -> damon_pa_promote
    -> promote_pages
    -> do_promote_folio_list
    -> __promote_folio_list
    -> promote_folio_list
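
Both entry points funnel the isolated folios through
reclaim_or_migrate_folios(), which batches them per source node and calls
the given per-list handler; the two sequences differ only in that handler
(condensed from the diff below):

  unsigned long demote_pages(struct list_head *folio_list)
  {
  	return reclaim_or_migrate_folios(folio_list, do_demote_folio_list);
  }

  unsigned long promote_pages(struct list_head *folio_list)
  {
  	return reclaim_or_migrate_folios(folio_list, do_promote_folio_list);
  }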

Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
---
 include/linux/damon.h          |   2 +
 include/linux/migrate_mode.h   |   1 +
 include/linux/vm_event_item.h  |   1 +
 include/trace/events/migrate.h |   3 +-
 mm/damon/paddr.c               |  29 ++++++++
 mm/damon/sysfs-schemes.c       |   1 +
 mm/internal.h                  |   1 +
 mm/vmscan.c                    | 129 ++++++++++++++++++++++++++++++++-
 mm/vmstat.c                    |   1 +
 9 files changed, 165 insertions(+), 3 deletions(-)

Comments

SeongJae Park Jan. 16, 2024, 8:32 p.m. UTC | #1
On Mon, 15 Jan 2024 13:52:52 +0900 Honggyu Kim <honggyu.kim@sk.com> wrote:

> From: Hyeongtak Ji <hyeongtak.ji@sk.com>
> 
> This patch introduces DAMOS_PROMOTE action for paddr mode.
> 
> It also renames alloc_demote_folio to alloc_migrate_folio so that it can
> be used for promotion as well.
> 
> The execution sequences of DAMOS_DEMOTE and DAMOS_PROMOTE look as
> follows for comparison.
> 
>   DAMOS_DEMOTE action
>     damon_pa_apply_scheme
>     -> damon_pa_reclaim
>     -> demote_pages
>     -> do_demote_folio_list
>     -> __demote_folio_list
>     -> demote_folio_list
> 
>   DAMOS_PROMOTE action
>     damon_pa_apply_scheme
>     -> damon_pa_promote
>     -> promote_pages
>     -> do_promote_folio_list
>     -> __promote_folio_list
>     -> promote_folio_list
> 
> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> ---
>  include/linux/damon.h          |   2 +
>  include/linux/migrate_mode.h   |   1 +
>  include/linux/vm_event_item.h  |   1 +
>  include/trace/events/migrate.h |   3 +-
>  mm/damon/paddr.c               |  29 ++++++++
>  mm/damon/sysfs-schemes.c       |   1 +
>  mm/internal.h                  |   1 +
>  mm/vmscan.c                    | 129 ++++++++++++++++++++++++++++++++-
>  mm/vmstat.c                    |   1 +
>  9 files changed, 165 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/damon.h b/include/linux/damon.h
> index 4c0a0fef09c5..477060bb6718 100644
> --- a/include/linux/damon.h
> +++ b/include/linux/damon.h
> @@ -107,6 +107,7 @@ struct damon_target {
>   * @DAMOS_LRU_DEPRIO:	Deprioritize the region on its LRU lists.
>   * @DAMOS_STAT:		Do nothing but count the stat.
>   * @DAMOS_DEMOTE:	Do demotion for the current region.
> + * @DAMOS_PROMOTE:	Do promotion if possible, otherwise do nothing.

Just as LRU_PRIO is defined before LRU_DEPRIO, what about defining PROMOTE
before DEMOTE?
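
That is, something like (a sketch of the suggested ordering, not a tested
change; the damon_sysfs_damos_action_strs[] order would need to follow):

	DAMOS_LRU_DEPRIO,
	DAMOS_STAT,		/* Do nothing but only record the stat */
	DAMOS_PROMOTE,
	DAMOS_DEMOTE,
	NR_DAMOS_ACTIONS,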

>   * @NR_DAMOS_ACTIONS:	Total number of DAMOS actions
>   *
>   * The support of each action is up to running &struct damon_operations.
> @@ -125,6 +126,7 @@ enum damos_action {
>  	DAMOS_LRU_DEPRIO,
>  	DAMOS_STAT,		/* Do nothing but only record the stat */
>  	DAMOS_DEMOTE,
> +	DAMOS_PROMOTE,
>  	NR_DAMOS_ACTIONS,
>  };
>  
> diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
> index f37cc03f9369..63f75eb9abf3 100644
> --- a/include/linux/migrate_mode.h
> +++ b/include/linux/migrate_mode.h
> @@ -29,6 +29,7 @@ enum migrate_reason {
>  	MR_CONTIG_RANGE,
>  	MR_LONGTERM_PIN,
>  	MR_DEMOTION,
> +	MR_PROMOTION,
>  	MR_TYPES
>  };
>  
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index 8abfa1240040..63cf920afeaa 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -44,6 +44,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
>  		PGDEMOTE_KSWAPD,
>  		PGDEMOTE_DIRECT,
>  		PGDEMOTE_KHUGEPAGED,
> +		PGPROMOTE,
>  		PGSCAN_KSWAPD,
>  		PGSCAN_DIRECT,
>  		PGSCAN_KHUGEPAGED,
> diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
> index 0190ef725b43..f0dd569c1e62 100644
> --- a/include/trace/events/migrate.h
> +++ b/include/trace/events/migrate.h
> @@ -22,7 +22,8 @@
>  	EM( MR_NUMA_MISPLACED,	"numa_misplaced")		\
>  	EM( MR_CONTIG_RANGE,	"contig_range")			\
>  	EM( MR_LONGTERM_PIN,	"longterm_pin")			\
> -	EMe(MR_DEMOTION,	"demotion")
> +	EM( MR_DEMOTION,	"demotion")			\
> +	EMe(MR_PROMOTION,	"promotion")
>  
>  /*
>   * First define the enums in the above macros to be exported to userspace
> diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
> index d3e3f077cd00..360ce69d5898 100644
> --- a/mm/damon/paddr.c
> +++ b/mm/damon/paddr.c
> @@ -257,6 +257,32 @@ static unsigned long damon_pa_reclaim(struct damon_region *r, struct damos *s, b
>  	return applied * PAGE_SIZE;
>  }
>  
> +static unsigned long damon_pa_promote(struct damon_region *r, struct damos *s)
> +{
> +	unsigned long addr, applied;
> +	LIST_HEAD(folio_list);
> +
> +	for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
> +		struct folio *folio = damon_get_folio(PHYS_PFN(addr));
> +
> +		if (!folio)
> +			continue;
> +
> +		if (damos_pa_filter_out(s, folio))
> +			goto put_folio;
> +
> +		if (!folio_isolate_lru(folio))
> +			goto put_folio;
> +
> +		list_add(&folio->lru, &folio_list);
> +put_folio:
> +		folio_put(folio);
> +	}
> +	applied = promote_pages(&folio_list);
> +	cond_resched();
> +	return applied * PAGE_SIZE;
> +}
> +
>  static inline unsigned long damon_pa_mark_accessed_or_deactivate(
>  		struct damon_region *r, struct damos *s, bool mark_accessed)
>  {
> @@ -309,6 +335,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
>  		break;
>  	case DAMOS_DEMOTE:
>  		return damon_pa_reclaim(r, scheme, true);
> +	case DAMOS_PROMOTE:
> +		return damon_pa_promote(r, scheme);
>  	default:
>  		/* DAMOS actions that not yet supported by 'paddr'. */
>  		break;
> @@ -326,6 +354,7 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
>  	case DAMOS_DEMOTE:
>  		return damon_cold_score(context, r, scheme);
>  	case DAMOS_LRU_PRIO:
> +	case DAMOS_PROMOTE:
>  		return damon_hot_score(context, r, scheme);

As mentioned on the previous patch, I'd prefer keeping the order of operations
and having dedicated branches for each operation.
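
That is, something like (a hypothetical rework of the hunk above, with one
dedicated branch per action):

	case DAMOS_LRU_PRIO:
		return damon_hot_score(context, r, scheme);
	case DAMOS_DEMOTE:
		return damon_cold_score(context, r, scheme);
	case DAMOS_PROMOTE:
		return damon_hot_score(context, r, scheme);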

>  	default:
>  		break;
> diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
> index ac7cd3f17b12..1b84d0af7e1f 100644
> --- a/mm/damon/sysfs-schemes.c
> +++ b/mm/damon/sysfs-schemes.c
> @@ -1188,6 +1188,7 @@ static const char * const damon_sysfs_damos_action_strs[] = {
>  	"lru_deprio",
>  	"stat",
>  	"demote",
> +	"promote",
>  };
>  
>  static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
> diff --git a/mm/internal.h b/mm/internal.h
> index 2380397ec2f3..f159455e63d4 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -870,6 +870,7 @@ unsigned long reclaim_pages(struct list_head *folio_list);
>  unsigned int reclaim_clean_pages_from_list(struct zone *zone,
>  					    struct list_head *folio_list);
>  unsigned long demote_pages(struct list_head *folio_list);
> +unsigned long promote_pages(struct list_head *folio_list);
>  /* The ALLOC_WMARK bits are used as an index to zone->watermark */
>  #define ALLOC_WMARK_MIN		WMARK_MIN
>  #define ALLOC_WMARK_LOW		WMARK_LOW
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index eaa3dd6b7562..f03be320f9ad 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -910,7 +910,7 @@ static void folio_check_dirty_writeback(struct folio *folio,
>  		mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
>  }
>  
> -static struct folio *alloc_demote_folio(struct folio *src,
> +static struct folio *alloc_migrate_folio(struct folio *src,
>  		unsigned long private)

As also mentioned on the previous patch, I'm unsure if vmscan.c is the right
place for general migration.

>  {
>  	struct folio *dst;
> @@ -973,7 +973,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
>  	node_get_allowed_targets(pgdat, &allowed_mask);
>  
>  	/* Demotion ignores all cpuset and mempolicy settings */
> -	migrate_pages(demote_folios, alloc_demote_folio, NULL,
> +	migrate_pages(demote_folios, alloc_migrate_folio, NULL,
>  		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
>  		      &nr_succeeded);
>  
> @@ -982,6 +982,48 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
>  	return nr_succeeded;
>  }
>  
> +/*
> + * Take folios on @promote_folios and attempt to promote them to another node.
> + * Folios which are not promoted are left on @promote_folios.
> + */
> +static unsigned int promote_folio_list(struct list_head *promote_folios,
> +				     struct pglist_data *pgdat)
> +{
> +	int target_nid = next_promotion_node(pgdat->node_id);
> +	unsigned int nr_succeeded;
> +	nodemask_t allowed_mask = NODE_MASK_NONE;
> +
> +	struct migration_target_control mtc = {
> +		/*
> +		 * Allocate from the target node, or fail quickly and quietly.
> +		 * When this happens, the folio will likely stay on its
> +		 * current node instead of being migrated.
> +		 */
> +		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
> +			__GFP_NOMEMALLOC | GFP_NOWAIT,
> +		.nid = target_nid,
> +		.nmask = &allowed_mask
> +	};
> +
> +	if (pgdat->node_id == target_nid)
> +		return 0;
> +
> +	if (list_empty(promote_folios))
> +		return 0;
> +
> +	if (target_nid == NUMA_NO_NODE)
> +		return 0;
> +
> +	/* Promotion ignores all cpuset and mempolicy settings */
> +	migrate_pages(promote_folios, alloc_migrate_folio, NULL,
> +		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_PROMOTION,
> +		      &nr_succeeded);
> +
> +	__count_vm_events(PGPROMOTE, nr_succeeded);
> +
> +	return nr_succeeded;
> +}
> +
>  static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
>  {
>  	if (gfp_mask & __GFP_FS)
> @@ -1058,6 +1100,65 @@ static unsigned int __demote_folio_list(struct list_head *folio_list,
>  	return nr_demoted;
>  }
>  
> +/*
> + * __promote_folio_list() returns the number of promoted pages
> + */
> +static unsigned int __promote_folio_list(struct list_head *folio_list,
> +		struct pglist_data *pgdat, struct scan_control *sc)
> +{
> +	LIST_HEAD(ret_folios);
> +	LIST_HEAD(promote_folios);
> +	unsigned int nr_promoted = 0;
> +
> +	cond_resched();
> +
> +	while (!list_empty(folio_list)) {
> +		struct folio *folio;
> +		enum folio_references references;
> +
> +		cond_resched();
> +
> +		folio = lru_to_folio(folio_list);
> +		list_del(&folio->lru);
> +
> +		if (!folio_trylock(folio))
> +			goto keep;
> +
> +		VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
> +
> +		references = folio_check_references(folio, sc);
> +		if (references == FOLIOREF_KEEP ||
> +		    references == FOLIOREF_RECLAIM ||
> +		    references == FOLIOREF_RECLAIM_CLEAN)
> +			goto keep_locked;
> +
> +		/* Relocate its contents to another node. */
> +		list_add(&folio->lru, &promote_folios);
> +		folio_unlock(folio);
> +		continue;
> +keep_locked:
> +		folio_unlock(folio);
> +keep:
> +		list_add(&folio->lru, &ret_folios);
> +		VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
> +	}
> +	/* 'folio_list' is always empty here */
> +
> +	/* Migrate folios selected for promotion */
> +	nr_promoted += promote_folio_list(&promote_folios, pgdat);

Again as mentioned on the previous patch, I was thinking promote_folio_list()
may call __promote_folio_list().  Making the __ prefix usage consistent with
other functions might be better, in my opinion.
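
For example, a rough sketch of the suggested naming (not a tested change;
the bodies only indicate which half of this patch's logic would live where):

	/* low-level: just the migrate_pages(..., MR_PROMOTION, ...) step */
	static unsigned int __promote_folio_list(struct list_head *promote_folios,
			struct pglist_data *pgdat)
	{
		unsigned int nr_succeeded = 0;

		/* migrate_pages() call as in this patch */
		return nr_succeeded;
	}

	/* high-level: folio_check_references() based selection */
	static unsigned int promote_folio_list(struct list_head *folio_list,
			struct pglist_data *pgdat, struct scan_control *sc)
	{
		LIST_HEAD(promote_folios);

		/* select folios from @folio_list into @promote_folios, then: */
		return __promote_folio_list(&promote_folios, pgdat);
	}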

> +	/* Folios that could not be promoted are still in @promote_folios */
> +	if (!list_empty(&promote_folios)) {
> +		/* Folios which weren't promoted go back on @folio_list */
> +		list_splice_init(&promote_folios, folio_list);
> +	}
> +
> +	try_to_unmap_flush();
> +
> +	list_splice(&ret_folios, folio_list);
> +
> +	return nr_promoted;
> +}
> +
>  /*
>   * shrink_folio_list() returns the number of reclaimed pages
>   */
> @@ -2186,6 +2287,25 @@ static unsigned int do_demote_folio_list(struct list_head *folio_list,
>  	return nr_demoted;
>  }
>  
> +static unsigned int do_promote_folio_list(struct list_head *folio_list,
> +				      struct pglist_data *pgdat)
> +{
> +	unsigned int nr_promoted;
> +	struct folio *folio;
> +	struct scan_control sc = {
> +		.gfp_mask = GFP_KERNEL,
> +	};
> +
> +	nr_promoted = __promote_folio_list(folio_list, pgdat, &sc);
> +	while (!list_empty(folio_list)) {
> +		folio = lru_to_folio(folio_list);
> +		list_del(&folio->lru);
> +		folio_putback_lru(folio);
> +	}
> +
> +	return nr_promoted;
> +}
> +
>  static unsigned long reclaim_or_migrate_folios(struct list_head *folio_list,
>  		unsigned int (*handler)(struct list_head *, struct pglist_data *))
>  {
> @@ -2230,6 +2350,11 @@ unsigned long demote_pages(struct list_head *folio_list)
>  	return reclaim_or_migrate_folios(folio_list, do_demote_folio_list);
>  }
>  
> +unsigned long promote_pages(struct list_head *folio_list)
> +{
> +	return reclaim_or_migrate_folios(folio_list, do_promote_folio_list);
> +}
> +
>  static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
>  				 struct lruvec *lruvec, struct scan_control *sc)
>  {
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 359460deb377..c703abdb8137 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1282,6 +1282,7 @@ const char * const vmstat_text[] = {
>  	"pgdemote_kswapd",
>  	"pgdemote_direct",
>  	"pgdemote_khugepaged",
> +	"pgpromote",
>  	"pgscan_kswapd",
>  	"pgscan_direct",
>  	"pgscan_khugepaged",
> -- 
> 2.34.1

Patch

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 4c0a0fef09c5..477060bb6718 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -107,6 +107,7 @@  struct damon_target {
  * @DAMOS_LRU_DEPRIO:	Deprioritize the region on its LRU lists.
  * @DAMOS_STAT:		Do nothing but count the stat.
  * @DAMOS_DEMOTE:	Do demotion for the current region.
+ * @DAMOS_PROMOTE:	Do promotion if possible, otherwise do nothing.
  * @NR_DAMOS_ACTIONS:	Total number of DAMOS actions
  *
  * The support of each action is up to running &struct damon_operations.
@@ -125,6 +126,7 @@  enum damos_action {
 	DAMOS_LRU_DEPRIO,
 	DAMOS_STAT,		/* Do nothing but only record the stat */
 	DAMOS_DEMOTE,
+	DAMOS_PROMOTE,
 	NR_DAMOS_ACTIONS,
 };
 
diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index f37cc03f9369..63f75eb9abf3 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -29,6 +29,7 @@  enum migrate_reason {
 	MR_CONTIG_RANGE,
 	MR_LONGTERM_PIN,
 	MR_DEMOTION,
+	MR_PROMOTION,
 	MR_TYPES
 };
 
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 8abfa1240040..63cf920afeaa 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -44,6 +44,7 @@  enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		PGDEMOTE_KSWAPD,
 		PGDEMOTE_DIRECT,
 		PGDEMOTE_KHUGEPAGED,
+		PGPROMOTE,
 		PGSCAN_KSWAPD,
 		PGSCAN_DIRECT,
 		PGSCAN_KHUGEPAGED,
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 0190ef725b43..f0dd569c1e62 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -22,7 +22,8 @@ 
 	EM( MR_NUMA_MISPLACED,	"numa_misplaced")		\
 	EM( MR_CONTIG_RANGE,	"contig_range")			\
 	EM( MR_LONGTERM_PIN,	"longterm_pin")			\
-	EMe(MR_DEMOTION,	"demotion")
+	EM( MR_DEMOTION,	"demotion")			\
+	EMe(MR_PROMOTION,	"promotion")
 
 /*
  * First define the enums in the above macros to be exported to userspace
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index d3e3f077cd00..360ce69d5898 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -257,6 +257,32 @@  static unsigned long damon_pa_reclaim(struct damon_region *r, struct damos *s, b
 	return applied * PAGE_SIZE;
 }
 
+static unsigned long damon_pa_promote(struct damon_region *r, struct damos *s)
+{
+	unsigned long addr, applied;
+	LIST_HEAD(folio_list);
+
+	for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+		struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+
+		if (!folio)
+			continue;
+
+		if (damos_pa_filter_out(s, folio))
+			goto put_folio;
+
+		if (!folio_isolate_lru(folio))
+			goto put_folio;
+
+		list_add(&folio->lru, &folio_list);
+put_folio:
+		folio_put(folio);
+	}
+	applied = promote_pages(&folio_list);
+	cond_resched();
+	return applied * PAGE_SIZE;
+}
+
 static inline unsigned long damon_pa_mark_accessed_or_deactivate(
 		struct damon_region *r, struct damos *s, bool mark_accessed)
 {
@@ -309,6 +335,8 @@  static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
 		break;
 	case DAMOS_DEMOTE:
 		return damon_pa_reclaim(r, scheme, true);
+	case DAMOS_PROMOTE:
+		return damon_pa_promote(r, scheme);
 	default:
 		/* DAMOS actions that not yet supported by 'paddr'. */
 		break;
@@ -326,6 +354,7 @@  static int damon_pa_scheme_score(struct damon_ctx *context,
 	case DAMOS_DEMOTE:
 		return damon_cold_score(context, r, scheme);
 	case DAMOS_LRU_PRIO:
+	case DAMOS_PROMOTE:
 		return damon_hot_score(context, r, scheme);
 	default:
 		break;
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index ac7cd3f17b12..1b84d0af7e1f 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -1188,6 +1188,7 @@  static const char * const damon_sysfs_damos_action_strs[] = {
 	"lru_deprio",
 	"stat",
 	"demote",
+	"promote",
 };
 
 static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
diff --git a/mm/internal.h b/mm/internal.h
index 2380397ec2f3..f159455e63d4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -870,6 +870,7 @@  unsigned long reclaim_pages(struct list_head *folio_list);
 unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 					    struct list_head *folio_list);
 unsigned long demote_pages(struct list_head *folio_list);
+unsigned long promote_pages(struct list_head *folio_list);
 /* The ALLOC_WMARK bits are used as an index to zone->watermark */
 #define ALLOC_WMARK_MIN		WMARK_MIN
 #define ALLOC_WMARK_LOW		WMARK_LOW
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eaa3dd6b7562..f03be320f9ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -910,7 +910,7 @@  static void folio_check_dirty_writeback(struct folio *folio,
 		mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
 }
 
-static struct folio *alloc_demote_folio(struct folio *src,
+static struct folio *alloc_migrate_folio(struct folio *src,
 		unsigned long private)
 {
 	struct folio *dst;
@@ -973,7 +973,7 @@  static unsigned int demote_folio_list(struct list_head *demote_folios,
 	node_get_allowed_targets(pgdat, &allowed_mask);
 
 	/* Demotion ignores all cpuset and mempolicy settings */
-	migrate_pages(demote_folios, alloc_demote_folio, NULL,
+	migrate_pages(demote_folios, alloc_migrate_folio, NULL,
 		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
 		      &nr_succeeded);
 
@@ -982,6 +982,48 @@  static unsigned int demote_folio_list(struct list_head *demote_folios,
 	return nr_succeeded;
 }
 
+/*
+ * Take folios on @promote_folios and attempt to promote them to another node.
+ * Folios which are not promoted are left on @promote_folios.
+ */
+static unsigned int promote_folio_list(struct list_head *promote_folios,
+				     struct pglist_data *pgdat)
+{
+	int target_nid = next_promotion_node(pgdat->node_id);
+	unsigned int nr_succeeded;
+	nodemask_t allowed_mask = NODE_MASK_NONE;
+
+	struct migration_target_control mtc = {
+		/*
+		 * Allocate from the target node, or fail quickly and quietly.
+		 * When this happens, the folio will likely stay on its
+		 * current node instead of being migrated.
+		 */
+		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
+			__GFP_NOMEMALLOC | GFP_NOWAIT,
+		.nid = target_nid,
+		.nmask = &allowed_mask
+	};
+
+	if (pgdat->node_id == target_nid)
+		return 0;
+
+	if (list_empty(promote_folios))
+		return 0;
+
+	if (target_nid == NUMA_NO_NODE)
+		return 0;
+
+	/* Promotion ignores all cpuset and mempolicy settings */
+	migrate_pages(promote_folios, alloc_migrate_folio, NULL,
+		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_PROMOTION,
+		      &nr_succeeded);
+
+	__count_vm_events(PGPROMOTE, nr_succeeded);
+
+	return nr_succeeded;
+}
+
 static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
 {
 	if (gfp_mask & __GFP_FS)
@@ -1058,6 +1100,65 @@  static unsigned int __demote_folio_list(struct list_head *folio_list,
 	return nr_demoted;
 }
 
+/*
+ * __promote_folio_list() returns the number of promoted pages
+ */
+static unsigned int __promote_folio_list(struct list_head *folio_list,
+		struct pglist_data *pgdat, struct scan_control *sc)
+{
+	LIST_HEAD(ret_folios);
+	LIST_HEAD(promote_folios);
+	unsigned int nr_promoted = 0;
+
+	cond_resched();
+
+	while (!list_empty(folio_list)) {
+		struct folio *folio;
+		enum folio_references references;
+
+		cond_resched();
+
+		folio = lru_to_folio(folio_list);
+		list_del(&folio->lru);
+
+		if (!folio_trylock(folio))
+			goto keep;
+
+		VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
+
+		references = folio_check_references(folio, sc);
+		if (references == FOLIOREF_KEEP ||
+		    references == FOLIOREF_RECLAIM ||
+		    references == FOLIOREF_RECLAIM_CLEAN)
+			goto keep_locked;
+
+		/* Relocate its contents to another node. */
+		list_add(&folio->lru, &promote_folios);
+		folio_unlock(folio);
+		continue;
+keep_locked:
+		folio_unlock(folio);
+keep:
+		list_add(&folio->lru, &ret_folios);
+		VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+	}
+	/* 'folio_list' is always empty here */
+
+	/* Migrate folios selected for promotion */
+	nr_promoted += promote_folio_list(&promote_folios, pgdat);
+	/* Folios that could not be promoted are still in @promote_folios */
+	if (!list_empty(&promote_folios)) {
+		/* Folios which weren't promoted go back on @folio_list */
+		list_splice_init(&promote_folios, folio_list);
+	}
+
+	try_to_unmap_flush();
+
+	list_splice(&ret_folios, folio_list);
+
+	return nr_promoted;
+}
+
 /*
  * shrink_folio_list() returns the number of reclaimed pages
  */
@@ -2186,6 +2287,25 @@  static unsigned int do_demote_folio_list(struct list_head *folio_list,
 	return nr_demoted;
 }
 
+static unsigned int do_promote_folio_list(struct list_head *folio_list,
+				      struct pglist_data *pgdat)
+{
+	unsigned int nr_promoted;
+	struct folio *folio;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+	};
+
+	nr_promoted = __promote_folio_list(folio_list, pgdat, &sc);
+	while (!list_empty(folio_list)) {
+		folio = lru_to_folio(folio_list);
+		list_del(&folio->lru);
+		folio_putback_lru(folio);
+	}
+
+	return nr_promoted;
+}
+
 static unsigned long reclaim_or_migrate_folios(struct list_head *folio_list,
 		unsigned int (*handler)(struct list_head *, struct pglist_data *))
 {
@@ -2230,6 +2350,11 @@  unsigned long demote_pages(struct list_head *folio_list)
 	return reclaim_or_migrate_folios(folio_list, do_demote_folio_list);
 }
 
+unsigned long promote_pages(struct list_head *folio_list)
+{
+	return reclaim_or_migrate_folios(folio_list, do_promote_folio_list);
+}
+
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 				 struct lruvec *lruvec, struct scan_control *sc)
 {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 359460deb377..c703abdb8137 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1282,6 +1282,7 @@  const char * const vmstat_text[] = {
 	"pgdemote_kswapd",
 	"pgdemote_direct",
 	"pgdemote_khugepaged",
+	"pgpromote",
 	"pgscan_kswapd",
 	"pgscan_direct",
 	"pgscan_khugepaged",