
[v2,2/2] vmscan,cgroup: apply mems_effective to reclaim

Message ID 20250418031352.1277966-2-gourry@gourry.net (mailing list archive)
State New
Series [1/2] cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed

Commit Message

Gregory Price April 18, 2025, 3:13 a.m. UTC
It is possible for a reclaimer to demote pages from an lruvec belonging
to a cgroup whose cpuset.mems excludes some nodes. Apply this
limitation based on the lruvec's memcg and prevent such demotions.

Notably, this may still allow demotion of shared libraries or any memory
first instantiated in another cgroup. This means cpusets still cannot
guarantee complete isolation when demotion is enabled, and the
docs have been updated to reflect this.

This is useful for more consistently isolating workloads on a
multi-tenant system from certain classes of memory - with the noted
exceptions.

Signed-off-by: Gregory Price <gourry@gourry.net>
---
 .../ABI/testing/sysfs-kernel-mm-numa          | 14 ++++---
 include/linux/cgroup.h                        |  7 ++++
 include/linux/cpuset.h                        |  5 +++
 include/linux/memcontrol.h                    |  9 ++++
 kernel/cgroup/cgroup.c                        |  5 +++
 kernel/cgroup/cpuset.c                        | 22 ++++++++++
 mm/vmscan.c                                   | 41 +++++++++++--------
 7 files changed, 82 insertions(+), 21 deletions(-)
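
For orientation, the heart of the series is a new memcg argument to
can_demote() in mm/vmscan.c. Condensed from the patch below: the
demotion target is now checked against the owning cgroup's effective
mems, and reclaim falls back to swap when the target node is excluded.

static bool can_demote(int nid, struct scan_control *sc,
		       struct mem_cgroup *memcg)
{
	int demotion_nid;

	if (!numa_demotion_enabled)
		return false;
	if (sc && sc->no_demotion)
		return false;

	demotion_nid = next_demotion_node(nid);
	if (demotion_nid == NUMA_NO_NODE)
		return false;

	/* If the demotion node isn't in the cgroup's mems, fall back */
	return mem_cgroup_node_allowed(memcg, demotion_nid);
}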

Comments

Waiman Long April 19, 2025, 2:06 a.m. UTC | #1
On 4/17/25 11:13 PM, Gregory Price wrote:
> It is possible for a reclaimer to demote pages from an lruvec belonging
> to a cgroup whose cpuset.mems excludes some nodes. Apply this
> limitation based on the lruvec's memcg and prevent such demotions.
...
> +bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
> +{
> +	struct cgroup_subsys_state *css;
> +	unsigned long flags;
> +	struct cpuset *cs;
> +	bool allowed;
> +
> +	css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
> +	if (!css)
> +		return true;
> +
> +	cs = container_of(css, struct cpuset, css);
> +	spin_lock_irqsave(&callback_lock, flags);
> +	/* At least one parent must have a valid node list */
> +	while (nodes_empty(cs->effective_mems))
> +		cs = parent_cs(cs);

For cgroup v2, effective_mems should always be set and walking up the 
tree isn't necessary. For v1, it can be empty, but memory cgroup and 
cpuset are unlikely to be in the same hierarchy.

Cheers,
Longman
Tejun Heo April 19, 2025, 3:06 a.m. UTC | #2
Hello,

On Thu, Apr 17, 2025 at 11:13:52PM -0400, Gregory Price wrote:
...
> +static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
> +{
> +	return memcg ? cgroup_node_allowed(memcg->css.cgroup, nid) : true;
> +}
> +
...
> +bool cgroup_node_allowed(struct cgroup *cgroup, int nid)
> +{
> +	return cpuset_node_allowed(cgroup, nid);
> +}
...
> +bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
> +{

What does the indirection through cgroup_node_allowed() add? Why not just
call cpuset directly?

Thanks.
Gregory Price April 19, 2025, 3:27 a.m. UTC | #3
On Fri, Apr 18, 2025 at 05:06:20PM -1000, Tejun Heo wrote:
> Hello,
> 
> On Thu, Apr 17, 2025 at 11:13:52PM -0400, Gregory Price wrote:
> ...
> > +static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
> > +{
> > +	return memcg ? cgroup_node_allowed(memcg->css.cgroup, nid) : true;
> > +}
> > +
> ...
> > +bool cgroup_node_allowed(struct cgroup *cgroup, int nid)
> > +{
> > +	return cpuset_node_allowed(cgroup, nid);
> > +}
> ...
> > +bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
> > +{
> 
> What does the indirection through cgroup_node_allowed() add? Why not just
> call cpuset directly?
> 

This is an artifact of me trying to figure out how to get this to build
with allconfig (matrix of CPUSET and MEM_CGROUP).

I think you're right, I can probably drop it.  I was trying to write:

bool cpuset_node_allowed(struct cpuset *cs, int nid);

and just couldn't do it, so I eventually landed on passing the cgroup into
the cpuset function, which means I think I can drop the indirection now.

Will push it and see if allconfig builds.

Thanks

~Gregory
Gregory Price April 19, 2025, 3:41 a.m. UTC | #4
On Fri, Apr 18, 2025 at 11:27:55PM -0400, Gregory Price wrote:
> On Fri, Apr 18, 2025 at 05:06:20PM -1000, Tejun Heo wrote:
> > Hello,
> > 
> > On Thu, Apr 17, 2025 at 11:13:52PM -0400, Gregory Price wrote:
> > ...
> > 
> > What does the indirection through cgroup_node_allowed() add? Why not just
> > call cpuset directly?
> > 
> 
> This is an artifact of me trying to figure out how to get this to build
> with allconfig (matrix of CPUSET and MEM_CGROUP).
> 
... snip ...

Looking back through the include graph again:

The reason was lack of inclusion of cpuset.h in memcontrol.c while
chasing the allconfig solution.

I was trying to follow the current includes rather than making the
graph more complex - it wasn't clear to me whether going directly to
cpuset.h from memcontrol.c made sense, since memcontrol can be built
without cpuset.

The graph here is a head scratcher.  I'll still try a build with
cpuset.h included in memcontrol.c.

~Gregory
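
A sketch of what dropping the shim could look like, assuming
linux/memcontrol.h can pull in linux/cpuset.h cleanly (the include
placement here is an assumption; the allconfig matrix discussed above
is untested against it):

#include <linux/cpuset.h>	/* assumed includable from memcontrol.h */

static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
{
	/* No memcg means no cpuset restriction to apply. */
	return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
}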
Gregory Price April 19, 2025, 3:47 a.m. UTC | #5
On Fri, Apr 18, 2025 at 10:06:40PM -0400, Waiman Long wrote:
> > +bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
> > +{
> > +	struct cgroup_subsys_state *css;
> > +	unsigned long flags;
> > +	struct cpuset *cs;
> > +	bool allowed;
> > +
> > +	css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
> > +	if (!css)
> > +		return true;
> > +
> > +	cs = container_of(css, struct cpuset, css);
> > +	spin_lock_irqsave(&callback_lock, flags);
> > +	/* At least one parent must have a valid node list */
> > +	while (nodes_empty(cs->effective_mems))
> > +		cs = parent_cs(cs);
> 
> For cgroup v2, effective_mems should always be set and walking up the tree
> isn't necessary. For v1, it can be empty, but memory cgroup and cpuset are
> unlikely in the same hierarchy.
> 

Hm, do I need different paths here for v1 vs v2 then?  Or is it
sufficient to simply return true if effective_mems is empty (which
implies v1)?

Thanks,
~Gregory
Waiman Long April 19, 2025, 3:47 a.m. UTC | #6
On 4/18/25 11:27 PM, Gregory Price wrote:
> On Fri, Apr 18, 2025 at 05:06:20PM -1000, Tejun Heo wrote:
>> Hello,
>>
>> On Thu, Apr 17, 2025 at 11:13:52PM -0400, Gregory Price wrote:
>> ...
>> What does the indirection through cgroup_node_allowed() add? Why not just
>> call cpuset directly?
>>
> This is an artifact of me trying to figure out how to get this to build
> with allconfig (matrix of CPUSET and MEM_CGROUP).
>
> I think you're right, I can probably drop it.  I was trying to write :
>
> bool cpuset_node_allowed(struct cpuset *cs, int nid);

The cpuset structure isn't exposed externally, so you can't use a
struct cpuset pointer from outside cpuset.c. Passing the cgroup
structure is the right approach.

Cheers,
Longman

Waiman Long April 19, 2025, 3:53 a.m. UTC | #7
On 4/18/25 11:47 PM, Gregory Price wrote:
> On Fri, Apr 18, 2025 at 10:06:40PM -0400, Waiman Long wrote:
>>> +bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
>>> +{
>>> +	struct cgroup_subsys_state *css;
>>> +	unsigned long flags;
>>> +	struct cpuset *cs;
>>> +	bool allowed;
>>> +
>>> +	css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
>>> +	if (!css)
>>> +		return true;
>>> +
>>> +	cs = container_of(css, struct cpuset, css);
>>> +	spin_lock_irqsave(&callback_lock, flags);
>>> +	/* At least one parent must have a valid node list */
>>> +	while (nodes_empty(cs->effective_mems))
>>> +		cs = parent_cs(cs);
>> For cgroup v2, effective_mems should always be set and walking up the tree
>> isn't necessary. For v1, it can be empty, but memory cgroup and cpuset are
>> unlikely in the same hierarchy.
>>
> Hm, do i need different paths here for v1 vs v2 then?  Or is it
> sufficient to simply return true if effective_mems is empty (which
> implies v1)?

Yes, you can return true if it happens to be empty, but it is 
"unlikely". In v1,cpuset and memory cgroup are in separate hierarchies 
AFAIU. So the cgroup you pass into cpuset_node_allowed() won't have a 
matching cpuset.

Cheers,
Longman

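Putting Longman's feedback together, a sketch of the simplified check -
treating an empty effective_mems as "no restriction" (the v1 case)
instead of walking up the tree - might look like this. Illustrative
only, not necessarily the code merged from this posting:

bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
{
	struct cgroup_subsys_state *css;
	unsigned long flags;
	struct cpuset *cs;
	bool allowed = true;

	css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
	if (!css)
		return true;

	cs = container_of(css, struct cpuset, css);
	spin_lock_irqsave(&callback_lock, flags);
	/* Empty effective_mems implies v1 with no matching cpuset: allow */
	if (!nodes_empty(cs->effective_mems))
		allowed = node_isset(nid, cs->effective_mems);
	spin_unlock_irqrestore(&callback_lock, flags);
	css_put(css);
	return allowed;
}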

Patch

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-numa b/Documentation/ABI/testing/sysfs-kernel-mm-numa
index 77e559d4ed80..27cdcab901f7 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-numa
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-numa
@@ -16,9 +16,13 @@  Description:	Enable/disable demoting pages during reclaim
 		Allowing page migration during reclaim enables these
 		systems to migrate pages from fast tiers to slow tiers
 		when the fast tier is under pressure.  This migration
-		is performed before swap.  It may move data to a NUMA
-		node that does not fall into the cpuset of the
-		allocating process which might be construed to violate
-		the guarantees of cpusets.  This should not be enabled
-		on systems which need strict cpuset location
+		is performed before swap if an eligible NUMA node is
+		present in cpuset.mems for the cgroup. If cpuset.mems
+		changes at runtime, it may move data to a NUMA node
+		that is not present in the new cpuset.mems, which
+		might be construed to violate the guarantees of
+		cpusets.  Shared memory, such as libraries, owned by
+		another cgroup may still be demoted and result in memory
+		use on a node not present in cpuset.mems. This should not
+		be enabled on systems which need strict cpuset location
 		guarantees.
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f8ef47f8a634..2915250a3e5e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -632,6 +632,8 @@  static inline void cgroup_kthread_ready(void)
 
 void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen);
 struct cgroup *cgroup_get_from_id(u64 id);
+
+extern bool cgroup_node_allowed(struct cgroup *cgroup, int nid);
 #else /* !CONFIG_CGROUPS */
 
 struct cgroup_subsys_state;
@@ -681,6 +683,11 @@  static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
 
 static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
 {}
+
+static inline bool cgroup_node_allowed(struct cgroup *cgroup, int nid)
+{
+	return true;
+}
 #endif /* !CONFIG_CGROUPS */
 
 #ifdef CONFIG_CGROUPS
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 893a4c340d48..c64b4a174456 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -171,6 +171,7 @@  static inline void set_mems_allowed(nodemask_t nodemask)
 	task_unlock(current);
 }
 
+extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
@@ -282,6 +283,10 @@  static inline bool read_mems_allowed_retry(unsigned int seq)
 	return false;
 }
 
+static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+{
+	return true;
+}
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 53364526d877..2906e4bb12e9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1736,6 +1736,11 @@  static inline void count_objcg_events(struct obj_cgroup *objcg,
 	rcu_read_unlock();
 }
 
+static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+{
+	return memcg ? cgroup_node_allowed(memcg->css.cgroup, nid) : true;
+}
+
 #else
 static inline bool mem_cgroup_kmem_disabled(void)
 {
@@ -1793,6 +1798,10 @@  static inline void count_objcg_events(struct obj_cgroup *objcg,
 {
 }
 
+static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+{
+	return true;
+}
 #endif /* CONFIG_MEMCG */
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index afc665b7b1fe..ba0b90cd774c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -7038,6 +7038,11 @@  int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
 	return 0;
 }
 
+bool cgroup_node_allowed(struct cgroup *cgroup, int nid)
+{
+	return cpuset_node_allowed(cgroup, nid);
+}
+
 /*
  * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
  * definition in cgroup-defs.h.
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index d6ed3f053e62..31e4c4cbcdfc 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4163,6 +4163,28 @@  bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
 	return allowed;
 }
 
+bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+{
+	struct cgroup_subsys_state *css;
+	unsigned long flags;
+	struct cpuset *cs;
+	bool allowed;
+
+	css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
+	if (!css)
+		return true;
+
+	cs = container_of(css, struct cpuset, css);
+	spin_lock_irqsave(&callback_lock, flags);
+	/* At least one parent must have a valid node list */
+	while (nodes_empty(cs->effective_mems))
+		cs = parent_cs(cs);
+	allowed = node_isset(nid, cs->effective_mems);
+	spin_unlock_irqrestore(&callback_lock, flags);
+	css_put(css);
+	return allowed;
+}
+
 /**
  * cpuset_spread_node() - On which node to begin search for a page
  * @rotor: round robin rotor
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2b2ab386cab5..32a7ce421e42 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -342,16 +342,22 @@  static void flush_reclaim_state(struct scan_control *sc)
 	}
 }
 
-static bool can_demote(int nid, struct scan_control *sc)
+static bool can_demote(int nid, struct scan_control *sc,
+		       struct mem_cgroup *memcg)
 {
+	int demotion_nid;
+
 	if (!numa_demotion_enabled)
 		return false;
 	if (sc && sc->no_demotion)
 		return false;
-	if (next_demotion_node(nid) == NUMA_NO_NODE)
+
+	demotion_nid = next_demotion_node(nid);
+	if (demotion_nid == NUMA_NO_NODE)
 		return false;
 
-	return true;
+	/* If demotion node isn't in the cgroup's mems_allowed, fall back */
+	return mem_cgroup_node_allowed(memcg, demotion_nid);
 }
 
 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -376,7 +382,7 @@  static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
 	 *
 	 * Can it be reclaimed from this node via demotion?
 	 */
-	return can_demote(nid, sc);
+	return can_demote(nid, sc, memcg);
 }
 
 /*
@@ -1096,7 +1102,8 @@  static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
  */
 static unsigned int shrink_folio_list(struct list_head *folio_list,
 		struct pglist_data *pgdat, struct scan_control *sc,
-		struct reclaim_stat *stat, bool ignore_references)
+		struct reclaim_stat *stat, bool ignore_references,
+		struct mem_cgroup *memcg)
 {
 	struct folio_batch free_folios;
 	LIST_HEAD(ret_folios);
@@ -1109,7 +1116,7 @@  static unsigned int shrink_folio_list(struct list_head *folio_list,
 	folio_batch_init(&free_folios);
 	memset(stat, 0, sizeof(*stat));
 	cond_resched();
-	do_demote_pass = can_demote(pgdat->node_id, sc);
+	do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
 
 retry:
 	while (!list_empty(folio_list)) {
@@ -1658,7 +1665,7 @@  unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 	 */
 	noreclaim_flag = memalloc_noreclaim_save();
 	nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
-					&stat, true);
+					&stat, true, NULL);
 	memalloc_noreclaim_restore(noreclaim_flag);
 
 	list_splice(&clean_folios, folio_list);
@@ -2031,7 +2038,8 @@  static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	if (nr_taken == 0)
 		return 0;
 
-	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
+	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
+					 lruvec_memcg(lruvec));
 
 	spin_lock_irq(&lruvec->lru_lock);
 	move_folios_to_lru(lruvec, &folio_list);
@@ -2214,7 +2222,7 @@  static unsigned int reclaim_folio_list(struct list_head *folio_list,
 		.no_demotion = 1,
 	};
 
-	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
+	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL);
 	while (!list_empty(folio_list)) {
 		folio = lru_to_folio(folio_list);
 		list_del(&folio->lru);
@@ -2646,7 +2654,7 @@  static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
  * Anonymous LRU management is a waste if there is
  * ultimately no way to reclaim the memory.
  */
-static bool can_age_anon_pages(struct pglist_data *pgdat,
+static bool can_age_anon_pages(struct lruvec *lruvec,
 			       struct scan_control *sc)
 {
 	/* Aging the anon LRU is valuable if swap is present: */
@@ -2654,7 +2662,8 @@  static bool can_age_anon_pages(struct pglist_data *pgdat,
 		return true;
 
 	/* Also valuable if anon pages can be demoted: */
-	return can_demote(pgdat->node_id, sc);
+	return can_demote(lruvec_pgdat(lruvec)->node_id, sc,
+			  lruvec_memcg(lruvec));
 }
 
 #ifdef CONFIG_LRU_GEN
@@ -2732,7 +2741,7 @@  static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
 	if (!sc->may_swap)
 		return 0;
 
-	if (!can_demote(pgdat->node_id, sc) &&
+	if (!can_demote(pgdat->node_id, sc, memcg) &&
 	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
 		return 0;
 
@@ -4695,7 +4704,7 @@  static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 	if (list_empty(&list))
 		return scanned;
 retry:
-	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
 	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
 	sc->nr_reclaimed += reclaimed;
 	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
@@ -5850,7 +5859,7 @@  static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+	if (can_age_anon_pages(lruvec, sc) &&
 	    inactive_is_low(lruvec, LRU_INACTIVE_ANON))
 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 				   sc, LRU_ACTIVE_ANON);
@@ -6681,10 +6690,10 @@  static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 		return;
 	}
 
-	if (!can_age_anon_pages(pgdat, sc))
+	lruvec = mem_cgroup_lruvec(NULL, pgdat);
+	if (!can_age_anon_pages(lruvec, sc))
 		return;
 
-	lruvec = mem_cgroup_lruvec(NULL, pgdat);
 	if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
 		return;