
[2/2] mm: mempolicy: Interleave policy for tiered memory nodes

Message ID 20230927095002.10245-3-ravis.opensrc@micron.com
State New, archived
Series mm: mempolicy: Multi-tier interleaving

Commit Message

Ravi Jonnalagadda Sept. 27, 2023, 9:50 a.m. UTC
From: Srinivasulu Thanneeru <sthanneeru@micron.com>

Existing interleave policy spreads out pages evenly across a set of
specified nodes, i.e. 1:1 interleave. Upcoming tiered memory systems
have CPU-less memory nodes with different peak bandwidth and
latency-bandwidth characteristics. In such systems, we will want to
use the additional bandwidth provided by lowtier memory for
bandwidth-intensive applications. However, the default 1:1 interleave
can lead to suboptimal bandwidth distribution.

Introduce an interleave policy for multi-tiers that is based on
interleave weights, where pages are assigned from nodes of the tier
based on the tier weight.

For instance, 50:30:20 are the weights of tiers 0, 1, and 3, which
leads to a 50%/30%/20% traffic breakdown across the three tiers.

Signed-off-by: Srinivasulu Thanneeru <sthanneeru@micron.com>
Co-authored-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>
---
 include/linux/memory-tiers.h |  25 +++++++-
 include/linux/sched.h        |   2 +
 mm/memory-tiers.c            |  31 ++--------
 mm/mempolicy.c               | 107 +++++++++++++++++++++++++++++++++--
 4 files changed, 132 insertions(+), 33 deletions(-)

Comments

Huang, Ying Sept. 28, 2023, 6:56 a.m. UTC | #1
Ravi Jonnalagadda <ravis.opensrc@micron.com> writes:

> From: Srinivasulu Thanneeru <sthanneeru@micron.com>
>
> Existing interleave policy spreads out pages evenly across a set of
> specified nodes, i.e. 1:1 interleave. Upcoming tiered memory systems
> have CPU-less memory nodes with different peak bandwidth and
> latency-bandwidth characteristics. In such systems, we will want to
> use the additional bandwidth provided by lowtier memory for
> bandwidth-intensive applications. However, the default 1:1 interleave
> can lead to suboptimal bandwidth distribution.
>
> Introduce an interleave policy for multi-tiers that is based on
> interleave weights, where pages are assigned from nodes of the tier
> based on the tier weight.
>
> For instance, 50:30:20 are the weights of tiers 0, 1, and 3, which

Does this mean we will allocate 50 pages from node 0 before allocating from
node 1?  If so, do we need to divide by the GCD (greatest common divisor)
before using the weights?  That is, use 5:3:2?
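
For illustration, a minimal sketch of the reduction being suggested here; the
weight values are just the example from the commit message, and the gcd helper
is open-coded rather than using the kernel's linux/gcd.h:

	/* Illustrative only: reduce the example tier weights by their GCD,
	 * 50:30:20 -> 5:3:2, so the per-node allocation runs become shorter
	 * while the overall ratio stays the same.
	 */
	static unsigned int weight_gcd(unsigned int a, unsigned int b)
	{
		while (b) {
			unsigned int t = a % b;

			a = b;
			b = t;
		}
		return a;
	}

	static void reduce_weights(unsigned short *weights, int n)
	{
		unsigned int g = 0;
		int i;

		for (i = 0; i < n; i++)
			g = weight_gcd(g, weights[i]);
		if (g > 1) {
			for (i = 0; i < n; i++)
				weights[i] /= g;
		}
	}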

> leads to a 50%/30%/20% traffic breakdown across the three tiers.
>
> Signed-off-by: Srinivasulu Thanneeru <sthanneeru@micron.com>
> Co-authored-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>
> ---
>  include/linux/memory-tiers.h |  25 +++++++-
>  include/linux/sched.h        |   2 +
>  mm/memory-tiers.c            |  31 ++--------
>  mm/mempolicy.c               | 107 +++++++++++++++++++++++++++++++++--
>  4 files changed, 132 insertions(+), 33 deletions(-)
>
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index c62d286749d0..74be39cb56c4 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -2,6 +2,7 @@
>  #ifndef _LINUX_MEMORY_TIERS_H
>  #define _LINUX_MEMORY_TIERS_H
>  
> +#include <linux/device.h>
>  #include <linux/types.h>
>  #include <linux/nodemask.h>
>  #include <linux/kref.h>
> @@ -21,7 +22,27 @@
>  
>  #define MAX_TIER_INTERLEAVE_WEIGHT 100
>  
> -struct memory_tier;
> +struct memory_tier {
> +	/* hierarchy of memory tiers */
> +	struct list_head list;
> +	/* list of all memory types part of this tier */
> +	struct list_head memory_types;
> +	/*
> +	 * By default all tiers will have weight as 1, which means they
> +	 * follow default standard allocation.
> +	 */
> +	unsigned short interleave_weight;
> +	/*
> +	 * start value of abstract distance. memory tier maps
> +	 * an abstract distance  range,
> +	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
> +	 */
> +	int adistance_start;
> +	struct device dev;
> +	/* All the nodes that are part of all the lower memory tiers. */
> +	nodemask_t lower_tier_mask;
> +};
> +
>  struct memory_dev_type {
>  	/* list of memory types that are part of same tier as this type */
>  	struct list_head tier_sibiling;
> @@ -38,6 +59,8 @@ struct memory_dev_type *alloc_memory_type(int adistance);
>  void put_memory_type(struct memory_dev_type *memtype);
>  void init_node_memory_type(int node, struct memory_dev_type *default_type);
>  void clear_node_memory_type(int node, struct memory_dev_type *memtype);
> +struct memory_tier *node_get_memory_tier(int node);
> +nodemask_t get_memtier_nodemask(struct memory_tier *memtier);
>  #ifdef CONFIG_MIGRATION
>  int next_demotion_node(int node);
>  void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 77f01ac385f7..07ea837c3afb 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1252,7 +1252,9 @@ struct task_struct {
>  	/* Protected by alloc_lock: */
>  	struct mempolicy		*mempolicy;
>  	short				il_prev;
> +	unsigned short			il_count;
>  	short				pref_node_fork;
> +	unsigned int			current_node;

current_node is only used for interleave?  If so, how about following the
same naming convention, that is, il_XXX?

>  #endif
>  #ifdef CONFIG_NUMA_BALANCING
>  	int				numa_scan_seq;
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 7e06c9e0fa41..5e2ddc9f994a 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -8,27 +8,6 @@
>  
>  #include "internal.h"
>  
> -struct memory_tier {
> -	/* hierarchy of memory tiers */
> -	struct list_head list;
> -	/* list of all memory types part of this tier */
> -	struct list_head memory_types;
> -	/*
> -	 * By default all tiers will have weight as 1, which means they
> -	 * follow default standard allocation.
> -	 */
> -	unsigned short interleave_weight;
> -	/*
> -	 * start value of abstract distance. memory tier maps
> -	 * an abstract distance  range,
> -	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
> -	 */
> -	int adistance_start;
> -	struct device dev;
> -	/* All the nodes that are part of all the lower memory tiers. */
> -	nodemask_t lower_tier_mask;
> -};
> -
>  struct demotion_nodes {
>  	nodemask_t preferred;
>  };
> @@ -115,7 +94,7 @@ static inline struct memory_tier *to_memory_tier(struct device *device)
>  	return container_of(device, struct memory_tier, dev);
>  }
>  
> -static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
> +nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
>  {
>  	nodemask_t nodes = NODE_MASK_NONE;
>  	struct memory_dev_type *memtype;
> @@ -264,7 +243,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
>  	return memtier;
>  }
>  
> -static struct memory_tier *__node_get_memory_tier(int node)
> +struct memory_tier *node_get_memory_tier(int node)
>  {
>  	pg_data_t *pgdat;
>  
> @@ -380,7 +359,7 @@ static void disable_all_demotion_targets(void)
>  		 * We are holding memory_tier_lock, it is safe
>  		 * to access pgda->memtier.
>  		 */
> -		memtier = __node_get_memory_tier(node);
> +		memtier = node_get_memory_tier(node);
>  		if (memtier)
>  			memtier->lower_tier_mask = NODE_MASK_NONE;
>  	}
> @@ -417,7 +396,7 @@ static void establish_demotion_targets(void)
>  		best_distance = -1;
>  		nd = &node_demotion[node];
>  
> -		memtier = __node_get_memory_tier(node);
> +		memtier = node_get_memory_tier(node);
>  		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
>  			continue;
>  		/*
> @@ -562,7 +541,7 @@ static bool clear_node_memory_tier(int node)
>  	 * This also enables us to free the destroyed memory tier
>  	 * with kfree instead of kfree_rcu
>  	 */
> -	memtier = __node_get_memory_tier(node);
> +	memtier = node_get_memory_tier(node);
>  	if (memtier) {
>  		struct memory_dev_type *memtype;
>  
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 42b5567e3773..4f80c6ee1176 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -100,6 +100,8 @@
>  #include <linux/ctype.h>
>  #include <linux/mm_inline.h>
>  #include <linux/mmu_notifier.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/nodemask.h>
>  #include <linux/printk.h>
>  #include <linux/swapops.h>
>  
> @@ -882,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
>  
>  	old = current->mempolicy;
>  	current->mempolicy = new;
> -	if (new && new->mode == MPOL_INTERLEAVE)
> +	if (new && new->mode == MPOL_INTERLEAVE) {
>  		current->il_prev = MAX_NUMNODES-1;
> +		current->il_count = 0;
> +		current->current_node = MAX_NUMNODES;
> +	}
>  	task_unlock(current);
>  	mpol_put(old);
>  	ret = 0;
> @@ -1899,13 +1904,76 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
>  	return nd;
>  }
>  
> +/* Return interleave weight of node from tier's weight */
> +static unsigned short node_interleave_weight(int nid, nodemask_t pol_nodemask)
> +{
> +	struct memory_tier *memtier;
> +	nodemask_t tier_nodes, tier_and_pol;
> +	unsigned short avrg_weight = 0;
> +	int node, nnodes, reminder;
> +
> +	memtier = node_get_memory_tier(nid);

node_get_memory_tier() must be called with memory_tier_lock held.  Or
some other locking mechanism needs to be provided.  For example, RCU
read lock.
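
A minimal sketch of one shape that fix could take in node_interleave_weight(),
guarding both lookups with the RCU read lock; whether RCU alone is sufficient
for the memtier's lifetime (tiers are freed with kfree(), not kfree_rcu()) is
a separate question, and tier_weight below is a local added just for the
sketch so that memtier is not dereferenced after the unlock:

	rcu_read_lock();
	memtier = node_get_memory_tier(nid);
	if (!memtier) {
		rcu_read_unlock();
		return 0;
	}
	/* Snapshot everything we need from the tier while still under RCU. */
	tier_nodes = get_memtier_nodemask(memtier);
	tier_weight = memtier->interleave_weight;
	rcu_read_unlock();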

> +
> +	if (!memtier)
> +		return 0;
> +
> +	tier_nodes = get_memtier_nodemask(memtier);

get_memtier_nodemask() must be called with memory_tier_lock held too.

> +	nodes_and(tier_and_pol, tier_nodes, pol_nodemask);
> +	nnodes = nodes_weight(tier_and_pol);
> +	if (!nnodes)
> +		return 0;
> +
> +	avrg_weight = memtier->interleave_weight / nnodes;
> +	/* Set minimum weight of node as 1 so that at least one page
> +	 * is allocated.
> +	 */
> +	if (!avrg_weight)
> +		return 1;
> +
> +	reminder = memtier->interleave_weight % nnodes;
> +	if (reminder) {
> +		for_each_node_mask(node, tier_and_pol) {
> +			/* Increment target node's weight by 1, if it falls
> +			 * within remaining weightage 'reminder'.
> +			 */
> +			if (node == nid) {
> +				if (reminder > 0)
> +					avrg_weight = avrg_weight + 1;
> +				break;
> +			}
> +			reminder--;
> +		}
> +	}
> +	return avrg_weight;
> +}
> +
>  /* Do dynamic interleaving for a process */
>  static unsigned interleave_nodes(struct mempolicy *policy)
>  {
>  	unsigned next;
>  	struct task_struct *me = current;
> +	unsigned short node_weight = 0;
>  
> -	next = next_node_in(me->il_prev, policy->nodes);
> +	/* select current node or next node from nodelist based on
> +	 * available tier interleave weight.
> +	 */
> +	if (me->current_node == MAX_NUMNODES)
> +		next = next_node_in(me->il_prev, policy->nodes);
> +	else
> +		next = me->current_node;
> +	node_weight = node_interleave_weight(next, policy->nodes);
> +	if (!node_weight)
> +		goto set_il_prev;
> +	if (me->il_count < node_weight) {
> +		me->il_count++;
> +		me->current_node = next;
> +		if (me->il_count == node_weight) {
> +			me->current_node = MAX_NUMNODES;
> +			me->il_count = 0;
> +		}
> +	}
> +
> +set_il_prev:
>  	if (next < MAX_NUMNODES)
>  		me->il_prev = next;

I feel that we can implement the algorithm without introducing
current->current_node (using current->il_prev for it instead).  But I haven't
written the code, so I may be wrong here.
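
A rough sketch of that idea, keeping only il_prev and il_count and dropping
current_node entirely (untested, just to show the shape):

	/* Stay on il_prev until its weight is used up, then advance. */
	static unsigned interleave_nodes(struct mempolicy *policy)
	{
		struct task_struct *me = current;
		unsigned next = me->il_prev;
		unsigned short weight = 0;

		if (next < MAX_NUMNODES)
			weight = node_interleave_weight(next, policy->nodes);
		if (!weight || me->il_count >= weight) {
			next = next_node_in(me->il_prev, policy->nodes);
			me->il_count = 0;
		}
		if (next < MAX_NUMNODES) {
			me->il_prev = next;
			me->il_count++;
		}
		return next;
	}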

>  	return next;
> @@ -1966,9 +2034,10 @@ unsigned int mempolicy_slab_node(void)
>  static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
>  {
>  	nodemask_t nodemask = pol->nodes;
> -	unsigned int target, nnodes;
> -	int i;
> -	int nid;
> +	unsigned int target, nnodes, vnnodes = 0;
> +	unsigned short node_weight = 0;
> +	int nid, vtarget, i;
> +
>  	/*
>  	 * The barrier will stabilize the nodemask in a register or on
>  	 * the stack so that it will stop changing under the code.
> @@ -1981,7 +2050,33 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
>  	nnodes = nodes_weight(nodemask);
>  	if (!nnodes)
>  		return numa_node_id();
> -	target = (unsigned int)n % nnodes;
> +
> +	/*
> +	 * Calculate the virtual target for @n in a nodelist that is scaled
> +	 * with interleave weights....
> +	 */
> +	for_each_node_mask(nid, nodemask) {
> +		node_weight = node_interleave_weight(nid, nodemask);
> +		if (!node_weight)
> +			continue;
> +		vnnodes += node_weight;
> +	}
> +	if (!vnnodes)
> +		return numa_node_id();
> +	vtarget = (int)((unsigned int)n % vnnodes);
> +
> +	/* ...then map it back to the physical nodelist */
> +	target = 0;
> +	for_each_node_mask(nid, nodemask) {
> +		node_weight = node_interleave_weight(nid, nodemask);
> +		if (!node_weight)
> +			continue;
> +		vtarget -= node_weight;
> +		if (vtarget < 0)
> +			break;
> +		target++;
> +	}
> +
>  	nid = first_node(nodemask);
>  	for (i = 0; i < target; i++)
>  		nid = next_node(nid, nodemask);

--
Best Regards,
Huang, Ying
Gregory Price Sept. 28, 2023, 6:25 p.m. UTC | #2
On Wed, Sep 27, 2023 at 03:20:02PM +0530, Ravi Jonnalagadda wrote:
> From: Srinivasulu Thanneeru <sthanneeru@micron.com>
> 
> Existing interleave policy spreads out pages evenly across a set of
> specified nodes, i.e. 1:1 interleave. Upcoming tiered memory systems
> have CPU-less memory nodes with different peak bandwidth and
> latency-bandwidth characteristics. In such systems, we will want to
> use the additional bandwidth provided by lowtier memory for
> bandwidth-intensive applications. However, the default 1:1 interleave
> can lead to suboptimal bandwidth distribution.
> 
> Introduce an interleave policy for multi-tiers that is based on
> interleave weights, where pages are assigned from nodes of the tier
> based on the tier weight.
> 
> For instance, 50:30:20 are the weights of tiers 0, 1, and 3, which
> leads to a 50%/30%/20% traffic breakdown across the three tiers.
> 
> Signed-off-by: Srinivasulu Thanneeru <sthanneeru@micron.com>
> Co-authored-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>
> ---
>  include/linux/memory-tiers.h |  25 +++++++-
>  include/linux/sched.h        |   2 +
>  mm/memory-tiers.c            |  31 ++--------
>  mm/mempolicy.c               | 107 +++++++++++++++++++++++++++++++++--
>  4 files changed, 132 insertions(+), 33 deletions(-)
> 
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index c62d286749d0..74be39cb56c4 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -2,6 +2,7 @@
>  #ifndef _LINUX_MEMORY_TIERS_H
>  #define _LINUX_MEMORY_TIERS_H
>  
> +#include <linux/device.h>
>  #include <linux/types.h>
>  #include <linux/nodemask.h>
>  #include <linux/kref.h>
> @@ -21,7 +22,27 @@
>  
>  #define MAX_TIER_INTERLEAVE_WEIGHT 100
>  
> -struct memory_tier;
> +struct memory_tier {
> +	/* hierarchy of memory tiers */
> +	struct list_head list;
> +	/* list of all memory types part of this tier */
> +	struct list_head memory_types;
> +	/*
> +	 * By default all tiers will have weight as 1, which means they
> +	 * follow default standard allocation.
> +	 */
> +	unsigned short interleave_weight;
> +	/*
> +	 * start value of abstract distance. memory tier maps
> +	 * an abstract distance  range,
> +	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
> +	 */
> +	int adistance_start;
> +	struct device dev;
> +	/* All the nodes that are part of all the lower memory tiers. */
> +	nodemask_t lower_tier_mask;
> +};
> +

I would make this change in a separate pull-ahead patch, to make review
of the actual functional changes easier.

>  struct memory_dev_type {
>  	/* list of memory types that are part of same tier as this type */
>  	struct list_head tier_sibiling;
> @@ -38,6 +59,8 @@ struct memory_dev_type *alloc_memory_type(int adistance);
>  void put_memory_type(struct memory_dev_type *memtype);
>  void init_node_memory_type(int node, struct memory_dev_type *default_type);
>  void clear_node_memory_type(int node, struct memory_dev_type *memtype);
> +struct memory_tier *node_get_memory_tier(int node);
> +nodemask_t get_memtier_nodemask(struct memory_tier *memtier);
>  #ifdef CONFIG_MIGRATION
>  int next_demotion_node(int node);
>  void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 77f01ac385f7..07ea837c3afb 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1252,7 +1252,9 @@ struct task_struct {
>  	/* Protected by alloc_lock: */
>  	struct mempolicy		*mempolicy;
>  	short				il_prev;
> +	unsigned short			il_count;
>  	short				pref_node_fork;
> +	unsigned int			current_node;
>  #endif
>  #ifdef CONFIG_NUMA_BALANCING
>  	int				numa_scan_seq;
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 7e06c9e0fa41..5e2ddc9f994a 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -8,27 +8,6 @@
>  
>  #include "internal.h"
>  
> -struct memory_tier {
> -	/* hierarchy of memory tiers */
> -	struct list_head list;
> -	/* list of all memory types part of this tier */
> -	struct list_head memory_types;
> -	/*
> -	 * By default all tiers will have weight as 1, which means they
> -	 * follow default standard allocation.
> -	 */
> -	unsigned short interleave_weight;
> -	/*
> -	 * start value of abstract distance. memory tier maps
> -	 * an abstract distance  range,
> -	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
> -	 */
> -	int adistance_start;
> -	struct device dev;
> -	/* All the nodes that are part of all the lower memory tiers. */
> -	nodemask_t lower_tier_mask;
> -};
> -

See above

>  struct demotion_nodes {
>  	nodemask_t preferred;
>  };
> @@ -115,7 +94,7 @@ static inline struct memory_tier *to_memory_tier(struct device *device)
>  	return container_of(device, struct memory_tier, dev);
>  }
>  
> -static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
> +nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
>  {
>  	nodemask_t nodes = NODE_MASK_NONE;
>  	struct memory_dev_type *memtype;
> @@ -264,7 +243,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
>  	return memtier;
>  }
>  
> -static struct memory_tier *__node_get_memory_tier(int node)
> +struct memory_tier *node_get_memory_tier(int node)
>  {
>  	pg_data_t *pgdat;
>  
> @@ -380,7 +359,7 @@ static void disable_all_demotion_targets(void)
>  		 * We are holding memory_tier_lock, it is safe
>  		 * to access pgda->memtier.
>  		 */
> -		memtier = __node_get_memory_tier(node);
> +		memtier = node_get_memory_tier(node);
>  		if (memtier)
>  			memtier->lower_tier_mask = NODE_MASK_NONE;
>  	}
> @@ -417,7 +396,7 @@ static void establish_demotion_targets(void)
>  		best_distance = -1;
>  		nd = &node_demotion[node];
>  
> -		memtier = __node_get_memory_tier(node);
> +		memtier = node_get_memory_tier(node);
>  		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
>  			continue;
>  		/*
> @@ -562,7 +541,7 @@ static bool clear_node_memory_tier(int node)
>  	 * This also enables us to free the destroyed memory tier
>  	 * with kfree instead of kfree_rcu
>  	 */
> -	memtier = __node_get_memory_tier(node);
> +	memtier = node_get_memory_tier(node);
>  	if (memtier) {
>  		struct memory_dev_type *memtype;
>  

See above

> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 42b5567e3773..4f80c6ee1176 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -100,6 +100,8 @@
>  #include <linux/ctype.h>
>  #include <linux/mm_inline.h>
>  #include <linux/mmu_notifier.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/nodemask.h>
>  #include <linux/printk.h>
>  #include <linux/swapops.h>
>  
> @@ -882,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
>  
>  	old = current->mempolicy;
>  	current->mempolicy = new;
> -	if (new && new->mode == MPOL_INTERLEAVE)
> +	if (new && new->mode == MPOL_INTERLEAVE) {
>  		current->il_prev = MAX_NUMNODES-1;
> +		current->il_count = 0;
> +		current->current_node = MAX_NUMNODES;
> +	}
>  	task_unlock(current);
>  	mpol_put(old);
>  	ret = 0;
> @@ -1899,13 +1904,76 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
>  	return nd;
>  }
>  

Should this be a change to MPOL_INTERLEAVE, or should this be a new
memory policy:  MPOL_TIERING?

For the sake of retaining the existing behavior of MPOL_INTERLEAVE as-is
and preventing the introduction of bugs, maybe creating MPOL_TIERING is
preferable?

That would allow the rest of mempolicy to be changed cleanly if the
intent is to override mempolicy with memtier behavior.

> +/* Return interleave weight of node from tier's weight */
> +static unsigned short node_interleave_weight(int nid, nodemask_t pol_nodemask)
> +{
> +	struct memory_tier *memtier;
> +	nodemask_t tier_nodes, tier_and_pol;
> +	unsigned short avrg_weight = 0;
> +	int node, nnodes, reminder;
> +
> +	memtier = node_get_memory_tier(nid);
> +
> +	if (!memtier)
> +		return 0;
> +
> +	tier_nodes = get_memtier_nodemask(memtier);
> +	nodes_and(tier_and_pol, tier_nodes, pol_nodemask);
> +	nnodes = nodes_weight(tier_and_pol);
> +	if (!nnodes)
> +		return 0;
> +
> +	avrg_weight = memtier->interleave_weight / nnodes;
> +	/* Set minimum weight of node as 1 so that at least one page
> +	 * is allocated.
> +	 */
> +	if (!avrg_weight)
> +		return 1;
> +
> +	reminder = memtier->interleave_weight % nnodes;
> +	if (reminder) {
> +		for_each_node_mask(node, tier_and_pol) {
> +			/* Increment target node's weight by 1, if it falls
> +			 * within remaining weightage 'reminder'.
> +			 */
> +			if (node == nid) {
> +				if (reminder > 0)
> +					avrg_weight = avrg_weight + 1;
> +				break;
> +			}
> +			reminder--;
> +		}
> +	}
> +	return avrg_weight;
> +}
> +

With this, there are now 3 components with node masks that have to be
cross-referenced to figure out both what is preferred and what is
allowed:  Mempolicy, Cgroups, MemTier.

Is abstracting nodes with memtiers actually preferable to abstracting
devices with NUMA nodes and simply creating a new memory policy with
the weights described directly in the policy?

>  /* Do dynamic interleaving for a process */
>  static unsigned interleave_nodes(struct mempolicy *policy)
>  {
>  	unsigned next;
>  	struct task_struct *me = current;
> +	unsigned short node_weight = 0;
>  
> -	next = next_node_in(me->il_prev, policy->nodes);
> +	/* select current node or next node from nodelist based on
> +	 * available tier interleave weight.
> +	 */
> +	if (me->current_node == MAX_NUMNODES)
> +		next = next_node_in(me->il_prev, policy->nodes);
> +	else
> +		next = me->current_node;
> +	node_weight = node_interleave_weight(next, policy->nodes);
> +	if (!node_weight)
> +		goto set_il_prev;
> +	if (me->il_count < node_weight) {

What happens if a node weight changes to be less than il_count in the
middle of operation?  Is il_count reset when node weights are changed?

Since memtier's weights are global, this can't possibly work as intended.
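
For illustration only, one way to make this bookkeeping self-correcting when a
weight shrinks underneath it would be an else branch that resets the state;
this does not address the broader point about the weights being global:

	if (me->il_count < node_weight) {
		me->il_count++;
		me->current_node = next;
		if (me->il_count == node_weight) {
			me->current_node = MAX_NUMNODES;
			me->il_count = 0;
		}
	} else {
		/* Weight shrank below il_count; reset and advance next time. */
		me->current_node = MAX_NUMNODES;
		me->il_count = 0;
	}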

> +		me->il_count++;
> +		me->current_node = next;
> +		if (me->il_count == node_weight) {
> +			me->current_node = MAX_NUMNODES;
> +			me->il_count = 0;
> +		}
> +	}
> +
> +set_il_prev:
>  	if (next < MAX_NUMNODES)
>  		me->il_prev = next;
>  	return next;
> @@ -1966,9 +2034,10 @@ unsigned int mempolicy_slab_node(void)
>  static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
>  {
>  	nodemask_t nodemask = pol->nodes;
> -	unsigned int target, nnodes;
> -	int i;
> -	int nid;
> +	unsigned int target, nnodes, vnnodes = 0;
> +	unsigned short node_weight = 0;
> +	int nid, vtarget, i;
> +
>  	/*
>  	 * The barrier will stabilize the nodemask in a register or on
>  	 * the stack so that it will stop changing under the code.
> @@ -1981,7 +2050,33 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
>  	nnodes = nodes_weight(nodemask);
>  	if (!nnodes)
>  		return numa_node_id();
> -	target = (unsigned int)n % nnodes;
> +
> +	/*
> +	 * Calculate the virtual target for @n in a nodelist that is scaled
> +	 * with interleave weights....
> +	 */
> +	for_each_node_mask(nid, nodemask) {
> +		node_weight = node_interleave_weight(nid, nodemask);
> +		if (!node_weight)
> +			continue;
> +		vnnodes += node_weight;
> +	}

Should this be precomputed?  This seems like a considerable amount of
work in the page allocation hot path when there are many NUMA nodes.
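
A sketch of the kind of precomputation being suggested; il_total_weight is a
hypothetical field on struct mempolicy added purely for illustration, and it
would need refreshing whenever tier weights change via sysfs:

	/* Sketch only: compute the scaled node count once when the policy is
	 * installed rather than on every offset_il_node() call.
	 */
	static void mpol_precompute_il_weight(struct mempolicy *pol)
	{
		int nid;

		pol->il_total_weight = 0;	/* hypothetical field */
		for_each_node_mask(nid, pol->nodes)
			pol->il_total_weight += node_interleave_weight(nid, pol->nodes);
	}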

> +	if (!vnnodes)
> +		return numa_node_id();
> +	vtarget = (int)((unsigned int)n % vnnodes);
> +
> +	/* ...then map it back to the physical nodelist */
> +	target = 0;
> +	for_each_node_mask(nid, nodemask) {
> +		node_weight = node_interleave_weight(nid, nodemask);
> +		if (!node_weight)
> +			continue;
> +		vtarget -= node_weight;
> +		if (vtarget < 0)
> +			break;
> +		target++;
> +	}

See above, now you're double-counting.

> +
>  	nid = first_node(nodemask);
>  	for (i = 0; i < target; i++)
>  		nid = next_node(nid, nodemask);
> -- 
> 2.39.3
>
Jonathan Cameron Oct. 2, 2023, 10:48 a.m. UTC | #3
On Wed, 27 Sep 2023 15:20:02 +0530
Ravi Jonnalagadda <ravis.opensrc@micron.com> wrote:

> From: Srinivasulu Thanneeru <sthanneeru@micron.com>
> 
> Existing interleave policy spreads out pages evenly across a set of
> specified nodes, i.e. 1:1 interleave. Upcoming tiered memory systems
> have CPU-less memory nodes with different peak bandwidth and
> latency-bandwidth characteristics. In such systems, we will want to
> use the additional bandwidth provided by lowtier memory for
> bandwidth-intensive applications. However, the default 1:1 interleave
> can lead to suboptimal bandwidth distribution.
> 
> Introduce an interleave policy for multi-tiers that is based on
> interleave weights, where pages are assigned from nodes of the tier
> based on the tier weight.
> 
> For instance, 50:30:20 are the weights of tiers 0, 1, and 3, which
> leads to a 50%/30%/20% traffic breakdown across the three tiers.
> 
> Signed-off-by: Srinivasulu Thanneeru <sthanneeru@micron.com>
> Co-authored-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>

Generally I'd expect to see a sign-off from the co-author as well
as the Co-authored-by tag.

A few comments inline.  I'm far from an expert in this corner
of the kernel, so this is somewhat of a drive-by review, as I was curious.

Jonathan

> ---
>  include/linux/memory-tiers.h |  25 +++++++-
>  include/linux/sched.h        |   2 +
>  mm/memory-tiers.c            |  31 ++--------
>  mm/mempolicy.c               | 107 +++++++++++++++++++++++++++++++++--
>  4 files changed, 132 insertions(+), 33 deletions(-)
> 
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index c62d286749d0..74be39cb56c4 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -2,6 +2,7 @@
>  #ifndef _LINUX_MEMORY_TIERS_H
>  #define _LINUX_MEMORY_TIERS_H
>  
> +#include <linux/device.h>
>  #include <linux/types.h>
>  #include <linux/nodemask.h>
>  #include <linux/kref.h>
> @@ -21,7 +22,27 @@
>  
>  #define MAX_TIER_INTERLEAVE_WEIGHT 100
>  
> -struct memory_tier;
> +struct memory_tier {
> +	/* hierarchy of memory tiers */
> +	struct list_head list;
> +	/* list of all memory types part of this tier */
> +	struct list_head memory_types;
> +	/*
> +	 * By default all tiers will have weight as 1, which means they
> +	 * follow default standard allocation.
> +	 */
> +	unsigned short interleave_weight;
> +	/*
> +	 * start value of abstract distance. memory tier maps
> +	 * an abstract distance  range,
> +	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
> +	 */
> +	int adistance_start;
> +	struct device dev;
> +	/* All the nodes that are part of all the lower memory tiers. */
> +	nodemask_t lower_tier_mask;
> +};

As Gregory pointed out, doing these moves in a precursor patch would make this
one easier to follow.


> +
>  struct memory_dev_type {
>  	/* list of memory types that are part of same tier as this type */
>  	struct list_head tier_sibiling;
> @@ -38,6 +59,8 @@ struct memory_dev_type *alloc_memory_type(int adistance);
>  void put_memory_type(struct memory_dev_type *memtype);
>  void init_node_memory_type(int node, struct memory_dev_type *default_type);
>  void clear_node_memory_type(int node, struct memory_dev_type *memtype);
> +struct memory_tier *node_get_memory_tier(int node);
> +nodemask_t get_memtier_nodemask(struct memory_tier *memtier);
>  #ifdef CONFIG_MIGRATION
>  int next_demotion_node(int node);
>  void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 77f01ac385f7..07ea837c3afb 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1252,7 +1252,9 @@ struct task_struct {
>  	/* Protected by alloc_lock: */
>  	struct mempolicy		*mempolicy;
>  	short				il_prev;
> +	unsigned short			il_count;
>  	short				pref_node_fork;
> +	unsigned int			current_node;
>  #endif
>  #ifdef CONFIG_NUMA_BALANCING
>  	int				numa_scan_seq;



> @@ -115,7 +94,7 @@ static inline struct memory_tier *to_memory_tier(struct device *device)
>  	return container_of(device, struct memory_tier, dev);
>  }
>  
> -static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
> +nodemask_t get_memtier_nodemask(struct memory_tier *memtier)

Should remain static...

>  {
>  	nodemask_t nodes = NODE_MASK_NONE;
>  	struct memory_dev_type *memtype;
> @@ -264,7 +243,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
>  	return memtier;
>  }
>  
> -static struct memory_tier *__node_get_memory_tier(int node)
> +struct memory_tier *node_get_memory_tier(int node)

There is a bunch of renaming in here.  I would pull that out as a separate patch
(as with the structure move etc.) so that we can see the reasoning for
the renaming clearly.  I'm not sure in this case what the __ was conveying,
so it would be good to see an explanation of why it is being dropped.

>  {
>  	pg_data_t *pgdat;
>  
> @@ -380,7 +359,7 @@ static void disable_all_demotion_targets(void)
>  		 * We are holding memory_tier_lock, it is safe
>  		 * to access pgda->memtier.
>  		 */
> -		memtier = __node_get_memory_tier(node);
> +		memtier = node_get_memory_tier(node);
>  		if (memtier)
>  			memtier->lower_tier_mask = NODE_MASK_NONE;
>  	}
> @@ -417,7 +396,7 @@ static void establish_demotion_targets(void)
>  		best_distance = -1;
>  		nd = &node_demotion[node];
>  
> -		memtier = __node_get_memory_tier(node);
> +		memtier = node_get_memory_tier(node);
>  		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
>  			continue;
>  		/*
> @@ -562,7 +541,7 @@ static bool clear_node_memory_tier(int node)
>  	 * This also enables us to free the destroyed memory tier
>  	 * with kfree instead of kfree_rcu
>  	 */
> -	memtier = __node_get_memory_tier(node);
> +	memtier = node_get_memory_tier(node);
>  	if (memtier) {
>  		struct memory_dev_type *memtype;
>  
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 42b5567e3773..4f80c6ee1176 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -100,6 +100,8 @@
>  #include <linux/ctype.h>
>  #include <linux/mm_inline.h>
>  #include <linux/mmu_notifier.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/nodemask.h>
>  #include <linux/printk.h>
>  #include <linux/swapops.h>
>  
> @@ -882,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
>  
>  	old = current->mempolicy;
>  	current->mempolicy = new;
> -	if (new && new->mode == MPOL_INTERLEAVE)
> +	if (new && new->mode == MPOL_INTERLEAVE) {
>  		current->il_prev = MAX_NUMNODES-1;
> +		current->il_count = 0;
> +		current->current_node = MAX_NUMNODES;
> +	}
>  	task_unlock(current);
>  	mpol_put(old);
>  	ret = 0;
> @@ -1899,13 +1904,76 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
>  	return nd;
>  }
>  
> +/* Return interleave weight of node from tier's weight */
> +static unsigned short node_interleave_weight(int nid, nodemask_t pol_nodemask)
> +{
> +	struct memory_tier *memtier;
> +	nodemask_t tier_nodes, tier_and_pol;
> +	unsigned short avrg_weight = 0;
> +	int node, nnodes, reminder;
> +
> +	memtier = node_get_memory_tier(nid);
> +
Trivial: Generally I'd keep error checks right next to what they are checking.
So no blank line here.
> +	if (!memtier)
> +		return 0;
> +
> +	tier_nodes = get_memtier_nodemask(memtier);
> +	nodes_and(tier_and_pol, tier_nodes, pol_nodemask);
> +	nnodes = nodes_weight(tier_and_pol);
> +	if (!nnodes)
> +		return 0;
> +
> +	avrg_weight = memtier->interleave_weight / nnodes;
> +	/* Set minimum weight of node as 1 so that at least one page

Comment syntax as below.

> +	 * is allocated.
> +	 */
> +	if (!avrg_weight)
> +		return 1;
> +
> +	reminder = memtier->interleave_weight % nnodes;
> +	if (reminder) {

remainder?

> +		for_each_node_mask(node, tier_and_pol) {
> +			/* Increment target node's weight by 1, if it falls
Comment syntax
> +			 * within remaining weightage 'reminder'.
> +			 */
> +			if (node == nid) {
> +				if (reminder > 0)
> +					avrg_weight = avrg_weight + 1;

					avrg_weight++;

> +				break;
> +			}
> +			reminder--;
> +		}
> +	}
> +	return avrg_weight;
> +}
> +
>  /* Do dynamic interleaving for a process */
>  static unsigned interleave_nodes(struct mempolicy *policy)
>  {
>  	unsigned next;
>  	struct task_struct *me = current;
> +	unsigned short node_weight = 0;
>  
> -	next = next_node_in(me->il_prev, policy->nodes);
> +	/* select current node or next node from nodelist based on
Comment syntax should match surrounding code. Looks to use
/*
 * select...


> +	 * available tier interleave weight.
> +	 */
> +	if (me->current_node == MAX_NUMNODES)
> +		next = next_node_in(me->il_prev, policy->nodes);
> +	else
> +		next = me->current_node;
> +	node_weight = node_interleave_weight(next, policy->nodes);
> +	if (!node_weight)
> +		goto set_il_prev;
> +	if (me->il_count < node_weight) {
> +		me->il_count++;
> +		me->current_node = next;
> +		if (me->il_count == node_weight) {
> +			me->current_node = MAX_NUMNODES;
> +			me->il_count = 0;
> +		}
> +	}
> +
> +set_il_prev:
>  	if (next < MAX_NUMNODES)
>  		me->il_prev = next;
>  	return next;
> @@ -1966,9 +2034,10 @@ unsigned int mempolicy_slab_node(void)
>  static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
>  {
>  	nodemask_t nodemask = pol->nodes;
> -	unsigned int target, nnodes;
> -	int i;
> -	int nid;
> +	unsigned int target, nnodes, vnnodes = 0;
> +	unsigned short node_weight = 0;
> +	int nid, vtarget, i;
> +
>  	/*
>  	 * The barrier will stabilize the nodemask in a register or on
>  	 * the stack so that it will stop changing under the code.
> @@ -1981,7 +2050,33 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
>  	nnodes = nodes_weight(nodemask);

Does this save enough to bother with, rather than just running the check below?
I'm not quite sure what we have to have opted in on to get to this point.
If it is a useful optimization because !nnodes is common, then perhaps add a
comment?

>  	if (!nnodes)
>  		return numa_node_id();
> -	target = (unsigned int)n % nnodes;
> +
> +	/*
> +	 * Calculate the virtual target for @n in a nodelist that is scaled
> +	 * with interleave weights....
> +	 */
> +	for_each_node_mask(nid, nodemask) {
> +		node_weight = node_interleave_weight(nid, nodemask);
> +		if (!node_weight)
> +			continue;
Adding 0 is cheaper than checking for 0, so drop this check.

> +		vnnodes += node_weight;
> +	}
> +	if (!vnnodes)
> +		return numa_node_id();
> +	vtarget = (int)((unsigned int)n % vnnodes);

What sort of number of nodes do we ultimately expect?  Is it worth
a more sophisticated structure in here?

I guess it's likely to be small though, so maybe not.

> +
> +	/* ...then map it back to the physical nodelist */
> +	target = 0;
> +	for_each_node_mask(nid, nodemask) {
> +		node_weight = node_interleave_weight(nid, nodemask);
> +		if (!node_weight)
> +			continue;
Why allow a node weight of 0?
It's probably cheaper not to check this, though that depends on what
the compiler gets up to.  You'd hope the CPU will predict the following
branch not taken if it wasn't taken the previous time, so that's probably free.

> +		vtarget -= node_weight;
> +		if (vtarget < 0)
> +			break;
> +		target++;

If node_weight was 0, the code below still needs to go past the node.
So I think target == nnodes


> +	}
> +
>  	nid = first_node(nodemask);
>  	for (i = 0; i < target; i++)
>  		nid = next_node(nid, nodemask);
kernel test robot Oct. 20, 2023, 5:05 p.m. UTC | #4
Hello,

kernel test robot noticed "WARNING:suspicious_RCU_usage" on:

commit: 8536fe0dad4ce46984d207cd53583e7c36bc94b1 ("[PATCH 2/2] mm: mempolicy: Interleave policy for tiered memory nodes")
url: https://github.com/intel-lab-lkp/linux/commits/Ravi-Jonnalagadda/memory-tier-Introduce-sysfs-for-tier-interleave-weights/20230927-175208
base: https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git 612f769edd06a6e42f7cd72425488e68ddaeef0a
patch link: https://lore.kernel.org/all/20230927095002.10245-3-ravis.opensrc@micron.com/
patch subject: [PATCH 2/2] mm: mempolicy: Interleave policy for tiered memory nodes

in testcase: boot

compiler: gcc-12
test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202310210005.51e08d62-oliver.sang@intel.com


[    3.474923][    T0] WARNING: suspicious RCU usage
[    3.476322][    T0] 6.6.0-rc1-00038-g8536fe0dad4c #2 Not tainted
[    3.478100][    T0] -----------------------------
[    3.480319][    T0] mm/memory-tiers.c:258 suspicious rcu_dereference_check() usage!
[    3.482562][    T0]
[    3.482562][    T0] other info that might help us debug this:
[    3.482562][    T0]
[    3.484325][    T0]
[    3.484325][    T0] rcu_scheduler_active = 1, debug_locks = 1
[    3.488321][    T0] no locks held by swapper/0/0.
[    3.489732][    T0]
[    3.489732][    T0] stack backtrace:
[    3.491424][    T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.6.0-rc1-00038-g8536fe0dad4c #2
[    3.492304][    T0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[    3.492304][    T0] Call Trace:
[    3.492304][    T0]  <TASK>
[ 3.492304][ T0] dump_stack_lvl (kbuild/src/rand-x86_64-3/lib/dump_stack.c:107) 
[ 3.492304][ T0] lockdep_rcu_suspicious (kbuild/src/rand-x86_64-3/include/linux/context_tracking.h:153 kbuild/src/rand-x86_64-3/kernel/locking/lockdep.c:6712) 
[ 3.492304][ T0] node_get_memory_tier (kbuild/src/rand-x86_64-3/mm/memory-tiers.c:258 (discriminator 11)) 
[ 3.492304][ T0] node_interleave_weight (kbuild/src/rand-x86_64-3/mm/mempolicy.c:1921) 
[ 3.492304][ T0] ? __orc_find (kbuild/src/rand-x86_64-3/arch/x86/kernel/unwind_orc.c:110) 
[ 3.492304][ T0] ? secondary_startup_64_no_verify (kbuild/src/rand-x86_64-3/arch/x86/kernel/head_64.S:433) 
[ 3.492304][ T0] ? orc_find (kbuild/src/rand-x86_64-3/arch/x86/kernel/unwind_orc.c:242) 
[ 3.492304][ T0] ? secondary_startup_64_no_verify (kbuild/src/rand-x86_64-3/arch/x86/kernel/head_64.S:433) 
[ 3.492304][ T0] ? secondary_startup_64_no_verify (kbuild/src/rand-x86_64-3/arch/x86/kernel/head_64.S:433) 
[ 3.492304][ T0] ? unwind_next_frame (kbuild/src/rand-x86_64-3/arch/x86/kernel/unwind_orc.c:682) 
[ 3.492304][ T0] interleave_nodes (kbuild/src/rand-x86_64-3/mm/mempolicy.c:1969) 
[ 3.492304][ T0] mempolicy_slab_node (kbuild/src/rand-x86_64-3/mm/mempolicy.c:2007) 
[ 3.492304][ T0] __slab_alloc_node+0x195/0x320 
[ 3.492304][ T0] slab_alloc_node+0x67/0x160 
[ 3.492304][ T0] kmem_cache_alloc_node (kbuild/src/rand-x86_64-3/mm/slub.c:3525) 
[ 3.492304][ T0] dup_task_struct (kbuild/src/rand-x86_64-3/kernel/fork.c:173 kbuild/src/rand-x86_64-3/kernel/fork.c:1110) 
[ 3.492304][ T0] copy_process (kbuild/src/rand-x86_64-3/kernel/fork.c:2327) 
[ 3.492304][ T0] ? kvm_sched_clock_read (kbuild/src/rand-x86_64-3/arch/x86/kernel/kvmclock.c:91) 
[ 3.492304][ T0] kernel_clone (kbuild/src/rand-x86_64-3/include/linux/random.h:26 kbuild/src/rand-x86_64-3/kernel/fork.c:2910) 
[ 3.492304][ T0] ? rest_init (kbuild/src/rand-x86_64-3/init/main.c:1429) 
[ 3.492304][ T0] user_mode_thread (kbuild/src/rand-x86_64-3/kernel/fork.c:2988) 
[ 3.492304][ T0] ? rest_init (kbuild/src/rand-x86_64-3/init/main.c:1429) 
[ 3.492304][ T0] rest_init (kbuild/src/rand-x86_64-3/init/main.c:691) 
[ 3.492304][ T0] ? acpi_enable_subsystem (kbuild/src/rand-x86_64-3/drivers/acpi/acpica/utxfinit.c:192) 
[ 3.492304][ T0] arch_call_rest_init+0xf/0x20 
[ 3.492304][ T0] start_kernel (kbuild/src/rand-x86_64-3/init/main.c:1019 (discriminator 1)) 
[ 3.492304][ T0] x86_64_start_reservations (kbuild/src/rand-x86_64-3/arch/x86/kernel/head64.c:544) 
[ 3.492304][ T0] x86_64_start_kernel (kbuild/src/rand-x86_64-3/arch/x86/kernel/head64.c:486 (discriminator 17)) 
[ 3.492304][ T0] secondary_startup_64_no_verify (kbuild/src/rand-x86_64-3/arch/x86/kernel/head_64.S:433) 
[    3.492304][    T0]  </TASK>



The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20231021/202310210005.51e08d62-oliver.sang@intel.com

Patch

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index c62d286749d0..74be39cb56c4 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -2,6 +2,7 @@ 
 #ifndef _LINUX_MEMORY_TIERS_H
 #define _LINUX_MEMORY_TIERS_H
 
+#include <linux/device.h>
 #include <linux/types.h>
 #include <linux/nodemask.h>
 #include <linux/kref.h>
@@ -21,7 +22,27 @@ 
 
 #define MAX_TIER_INTERLEAVE_WEIGHT 100
 
-struct memory_tier;
+struct memory_tier {
+	/* hierarchy of memory tiers */
+	struct list_head list;
+	/* list of all memory types part of this tier */
+	struct list_head memory_types;
+	/*
+	 * By default all tiers will have weight as 1, which means they
+	 * follow default standard allocation.
+	 */
+	unsigned short interleave_weight;
+	/*
+	 * start value of abstract distance. memory tier maps
+	 * an abstract distance  range,
+	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
+	 */
+	int adistance_start;
+	struct device dev;
+	/* All the nodes that are part of all the lower memory tiers. */
+	nodemask_t lower_tier_mask;
+};
+
 struct memory_dev_type {
 	/* list of memory types that are part of same tier as this type */
 	struct list_head tier_sibiling;
@@ -38,6 +59,8 @@  struct memory_dev_type *alloc_memory_type(int adistance);
 void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+struct memory_tier *node_get_memory_tier(int node);
+nodemask_t get_memtier_nodemask(struct memory_tier *memtier);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..07ea837c3afb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1252,7 +1252,9 @@  struct task_struct {
 	/* Protected by alloc_lock: */
 	struct mempolicy		*mempolicy;
 	short				il_prev;
+	unsigned short			il_count;
 	short				pref_node_fork;
+	unsigned int			current_node;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	int				numa_scan_seq;
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 7e06c9e0fa41..5e2ddc9f994a 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -8,27 +8,6 @@ 
 
 #include "internal.h"
 
-struct memory_tier {
-	/* hierarchy of memory tiers */
-	struct list_head list;
-	/* list of all memory types part of this tier */
-	struct list_head memory_types;
-	/*
-	 * By default all tiers will have weight as 1, which means they
-	 * follow default standard allocation.
-	 */
-	unsigned short interleave_weight;
-	/*
-	 * start value of abstract distance. memory tier maps
-	 * an abstract distance  range,
-	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
-	 */
-	int adistance_start;
-	struct device dev;
-	/* All the nodes that are part of all the lower memory tiers. */
-	nodemask_t lower_tier_mask;
-};
-
 struct demotion_nodes {
 	nodemask_t preferred;
 };
@@ -115,7 +94,7 @@  static inline struct memory_tier *to_memory_tier(struct device *device)
 	return container_of(device, struct memory_tier, dev);
 }
 
-static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
+nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
 {
 	nodemask_t nodes = NODE_MASK_NONE;
 	struct memory_dev_type *memtype;
@@ -264,7 +243,7 @@  static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
 	return memtier;
 }
 
-static struct memory_tier *__node_get_memory_tier(int node)
+struct memory_tier *node_get_memory_tier(int node)
 {
 	pg_data_t *pgdat;
 
@@ -380,7 +359,7 @@  static void disable_all_demotion_targets(void)
 		 * We are holding memory_tier_lock, it is safe
 		 * to access pgda->memtier.
 		 */
-		memtier = __node_get_memory_tier(node);
+		memtier = node_get_memory_tier(node);
 		if (memtier)
 			memtier->lower_tier_mask = NODE_MASK_NONE;
 	}
@@ -417,7 +396,7 @@  static void establish_demotion_targets(void)
 		best_distance = -1;
 		nd = &node_demotion[node];
 
-		memtier = __node_get_memory_tier(node);
+		memtier = node_get_memory_tier(node);
 		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
 			continue;
 		/*
@@ -562,7 +541,7 @@  static bool clear_node_memory_tier(int node)
 	 * This also enables us to free the destroyed memory tier
 	 * with kfree instead of kfree_rcu
 	 */
-	memtier = __node_get_memory_tier(node);
+	memtier = node_get_memory_tier(node);
 	if (memtier) {
 		struct memory_dev_type *memtype;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 42b5567e3773..4f80c6ee1176 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -100,6 +100,8 @@ 
 #include <linux/ctype.h>
 #include <linux/mm_inline.h>
 #include <linux/mmu_notifier.h>
+#include <linux/memory-tiers.h>
+#include <linux/nodemask.h>
 #include <linux/printk.h>
 #include <linux/swapops.h>
 
@@ -882,8 +884,11 @@  static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 
 	old = current->mempolicy;
 	current->mempolicy = new;
-	if (new && new->mode == MPOL_INTERLEAVE)
+	if (new && new->mode == MPOL_INTERLEAVE) {
 		current->il_prev = MAX_NUMNODES-1;
+		current->il_count = 0;
+		current->current_node = MAX_NUMNODES;
+	}
 	task_unlock(current);
 	mpol_put(old);
 	ret = 0;
@@ -1899,13 +1904,76 @@  static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 	return nd;
 }
 
+/* Return interleave weight of node from tier's weight */
+static unsigned short node_interleave_weight(int nid, nodemask_t pol_nodemask)
+{
+	struct memory_tier *memtier;
+	nodemask_t tier_nodes, tier_and_pol;
+	unsigned short avrg_weight = 0;
+	int node, nnodes, reminder;
+
+	memtier = node_get_memory_tier(nid);
+
+	if (!memtier)
+		return 0;
+
+	tier_nodes = get_memtier_nodemask(memtier);
+	nodes_and(tier_and_pol, tier_nodes, pol_nodemask);
+	nnodes = nodes_weight(tier_and_pol);
+	if (!nnodes)
+		return 0;
+
+	avrg_weight = memtier->interleave_weight / nnodes;
+	/* Set minimum weight of node as 1 so that at least one page
+	 * is allocated.
+	 */
+	if (!avrg_weight)
+		return 1;
+
+	reminder = memtier->interleave_weight % nnodes;
+	if (reminder) {
+		for_each_node_mask(node, tier_and_pol) {
+			/* Increment target node's weight by 1, if it falls
+			 * within remaining weightage 'reminder'.
+			 */
+			if (node == nid) {
+				if (reminder > 0)
+					avrg_weight = avrg_weight + 1;
+				break;
+			}
+			reminder--;
+		}
+	}
+	return avrg_weight;
+}
+
 /* Do dynamic interleaving for a process */
 static unsigned interleave_nodes(struct mempolicy *policy)
 {
 	unsigned next;
 	struct task_struct *me = current;
+	unsigned short node_weight = 0;
 
-	next = next_node_in(me->il_prev, policy->nodes);
+	/* select current node or next node from nodelist based on
+	 * available tier interleave weight.
+	 */
+	if (me->current_node == MAX_NUMNODES)
+		next = next_node_in(me->il_prev, policy->nodes);
+	else
+		next = me->current_node;
+	node_weight = node_interleave_weight(next, policy->nodes);
+	if (!node_weight)
+		goto set_il_prev;
+	if (me->il_count < node_weight) {
+		me->il_count++;
+		me->current_node = next;
+		if (me->il_count == node_weight) {
+			me->current_node = MAX_NUMNODES;
+			me->il_count = 0;
+		}
+	}
+
+set_il_prev:
 	if (next < MAX_NUMNODES)
 		me->il_prev = next;
 	return next;
@@ -1966,9 +2034,10 @@  unsigned int mempolicy_slab_node(void)
 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 {
 	nodemask_t nodemask = pol->nodes;
-	unsigned int target, nnodes;
-	int i;
-	int nid;
+	unsigned int target, nnodes, vnnodes = 0;
+	unsigned short node_weight = 0;
+	int nid, vtarget, i;
+
 	/*
 	 * The barrier will stabilize the nodemask in a register or on
 	 * the stack so that it will stop changing under the code.
@@ -1981,7 +2050,33 @@  static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 	nnodes = nodes_weight(nodemask);
 	if (!nnodes)
 		return numa_node_id();
-	target = (unsigned int)n % nnodes;
+
+	/*
+	 * Calculate the virtual target for @n in a nodelist that is scaled
+	 * with interleave weights....
+	 */
+	for_each_node_mask(nid, nodemask) {
+		node_weight = node_interleave_weight(nid, nodemask);
+		if (!node_weight)
+			continue;
+		vnnodes += node_weight;
+	}
+	if (!vnnodes)
+		return numa_node_id();
+	vtarget = (int)((unsigned int)n % vnnodes);
+
+	/* ...then map it back to the physical nodelist */
+	target = 0;
+	for_each_node_mask(nid, nodemask) {
+		node_weight = node_interleave_weight(nid, nodemask);
+		if (!node_weight)
+			continue;
+		vtarget -= node_weight;
+		if (vtarget < 0)
+			break;
+		target++;
+	}
+
 	nid = first_node(nodemask);
 	for (i = 0; i < target; i++)
 		nid = next_node(nid, nodemask);