diff mbox series

[RFC,v2,2/3] mm/memory-tiers: Introduce sysfs for tier interleave weights

Message ID 20231009204259.875232-3-gregory.price@memverge.com
State New, archived
Headers show
Series mm: mempolicy: Multi-tier weighted interleaving | expand

Commit Message

Gregory Price Oct. 9, 2023, 8:42 p.m. UTC
Allocating pages across tiers is accomplished by provisioning
interleave weights for each tier, with the distribution based on
these weight values.

Each weight is relative to the node requesting the allocation (i.e.
the weight for tier2 from node0 may be different than the weight for
tier2 from node1).  This allows CPU-bound tasks to have more precise
control over the distribution of memory.

To represent this, tiers are captured as an array of weights,
where the index is the source node.

tier->interleave_weight[source_node] = weight;

Weights are set with the following sysfs mechanism:

Set tier4 weight from node 0 to 85
echo 0:85 > /sys/devices/virtual/memory_tiering/memory_tier4/interleave_weight

By default, all tiers will have a weight of 1 for all source nodes,
which maintains the default interleave behavior.

Weights are effectively aligned (up) to the number of nodes in the
operating nodemask (i.e. (policy_nodes & tier_nodes)) to simplify
the allocation logic and to avoid having to hold the tiering
semaphore for a long period of time during bulk allocation.

Weights apply to a tier, not each node in the tier.  The weight is
split between the nodes in that tier, similar to hardware interleaving.
However, when the task defines a nodemask that splits a tier's nodes,
the weight will be split between the remaining nodes - retaining the
overall weight of the tier.

Signed-off-by: Srinivasulu Thanneeru <sthanneeru@micron.com>
Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>
Co-developed-by: Gregory Price <gregory.price@memverge.com>
Signed-off-by: Gregory Price <gregory.price@memverge.com>
---
 include/linux/memory-tiers.h |  16 ++++
 mm/memory-tiers.c            | 140 ++++++++++++++++++++++++++++++++++-
 2 files changed, 155 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 437441cdf78f..a000b9745543 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -19,6 +19,8 @@ 
  */
 #define MEMTIER_ADISTANCE_DRAM	((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1))
 
+#define MAX_TIER_INTERLEAVE_WEIGHT 100
+
 struct memory_tier;
 struct memory_dev_type {
 	/* list of memory types that are part of same tier as this type */
@@ -36,6 +38,9 @@  struct memory_dev_type *alloc_memory_type(int adistance);
 void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+unsigned char memtier_get_node_weight(int from_node, int target_node,
+				      nodemask_t *pol_nodes);
+unsigned int memtier_get_total_weight(int from_node, nodemask_t *pol_nodes);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -97,5 +102,23 @@  static inline bool node_is_toptier(int node)
{
 	return true;
}
+
+/*
+ * !CONFIG_NUMA stubs.  They live in a header that many translation units
+ * include, so they must be static inline; a plain external definition here
+ * would be emitted once per includer and fail to link.
+ */
+static inline unsigned char memtier_get_node_weight(int from_node,
+						    int target_node,
+						    nodemask_t *pol_nodes)
+{
+	return 0;
+}
+
+static inline unsigned int memtier_get_total_weight(int from_node,
+						    nodemask_t *pol_nodes)
+{
+	return 0;
+}
#endif	/* CONFIG_NUMA */
#endif  /* _LINUX_MEMORY_TIERS_H */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 0a3241a2cadc..37fc4b3f69a4 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -14,6 +14,11 @@  struct memory_tier {
 	struct list_head list;
 	/* list of all memory types part of this tier */
 	struct list_head memory_types;
+	/*
+	 * Interleave weight of this tier, indexed by source node.  Every
+	 * entry defaults to 1, which keeps the default interleave behavior.
+	 */
+	unsigned char interleave_weight[MAX_NUMNODES];
 	/*
 	 * start value of abstract distance. memory tier maps
 	 * an abstract distance  range,
@@ -146,8 +151,90 @@  static ssize_t nodelist_show(struct device *dev,
}
 static DEVICE_ATTR_RO(nodelist);
 
+/*
+ * Show the tier's interleave weights as a comma-separated list of
+ * "<source node>:<weight>" pairs, one pair per online node.
+ */
+static ssize_t interleave_weight_show(struct device *dev,
+				      struct device_attribute *attr,
+				      char *buf)
+{
+	struct memory_tier *tier = to_memory_tier(dev);
+	int len = 0;
+	int node;
+
+	down_read(&memory_tier_sem);
+	for_each_online_node(node) {
+		len += sysfs_emit_at(buf, len, "%s%d:%d", len ? "," : "",
+				     node, tier->interleave_weight[node]);
+	}
+	up_read(&memory_tier_sem);
+
+	/* Only count the newline if it was actually written. */
+	len += sysfs_emit_at(buf, len, "\n");
+
+	return len;
+}
+
+/*
+ * Set this tier's interleave weight for one source node.
+ *
+ * Input format is "<source node>:<weight>", e.g. "0:85".  The source
+ * node must be online and the weight may not exceed
+ * MAX_TIER_INTERLEAVE_WEIGHT.
+ *
+ * Returns @size on success or a negative errno on failure.
+ */
+static ssize_t interleave_weight_store(struct device *dev,
+				       struct device_attribute *attr,
+				       const char *buf, size_t size)
+{
+	struct memory_tier *tier;
+	unsigned int from_node;
+	unsigned char weight;
+	char input[32];
+	char *delim;
+	int ret;
+
+	/* @buf is const: split the string in a private copy instead. */
+	if (strscpy(input, buf, sizeof(input)) < 0)
+		return -EINVAL;
+
+	delim = strchr(input, ':');
+	if (!delim)
+		return -EINVAL;
+	*delim = '\0';
+
+	ret = kstrtouint(input, 10, &from_node);
+	if (ret)
+		return ret;
+
+	if (from_node >= MAX_NUMNODES || !node_online(from_node))
+		return -EINVAL;
+
+	ret = kstrtou8(delim + 1, 0, &weight);
+	if (ret)
+		return ret;
+
+	if (weight > MAX_TIER_INTERLEAVE_WEIGHT)
+		return -EINVAL;
+
+	down_write(&memory_tier_sem);
+	tier = to_memory_tier(dev);
+	if (tier)
+		tier->interleave_weight[from_node] = weight;
+	else
+		ret = -ENODEV;
+	up_write(&memory_tier_sem);
+
+	/* Report a failed update instead of claiming the write succeeded. */
+	return ret ? ret : size;
+}
+static DEVICE_ATTR_RW(interleave_weight);
+
 static struct attribute *memtier_dev_attrs[] = {
 	&dev_attr_nodelist.attr,
+	&dev_attr_interleave_weight.attr,
 	NULL
};
 
@@ -239,6 +308,92 @@  static struct memory_tier *__node_get_memory_tier(int node)
 				     lockdep_is_held(&memory_tier_sem));
}
 
+/*
+ * Per-node share of @tier's interleave weight as seen from @from_node.
+ *
+ * The tier weight is divided, rounding up, among the tier's nodes that are
+ * also set in @pol_nodes.  If no tier node is in @pol_nodes there is nothing
+ * to divide by, so fall back to a weight of 1 rather than dividing by zero.
+ *
+ * Caller must hold memory_tier_sem.
+ */
+static unsigned int memtier_node_share(struct memory_tier *tier,
+				       int from_node, nodemask_t *pol_nodes)
+{
+	nodemask_t tier_nmask, tier_and_pol;
+	unsigned int tier_nodes, tier_weight;
+
+	tier_nmask = get_memtier_nodemask(tier);
+	nodes_and(tier_and_pol, tier_nmask, *pol_nodes);
+	tier_nodes = nodes_weight(tier_and_pol);
+	if (!tier_nodes)
+		return 1;
+
+	tier_weight = tier->interleave_weight[from_node];
+	return (tier_weight + tier_nodes - 1) / tier_nodes;
+}
+
+/*
+ * Interleave weight of @target_node for allocations made from @from_node,
+ * given the policy nodemask @pol_nodes.
+ *
+ * Returns 1 if @target_node has no memory tier, or if memory_tier_sem
+ * could not be taken for reading without blocking.
+ */
+unsigned char memtier_get_node_weight(int from_node, int target_node,
+				      nodemask_t *pol_nodes)
+{
+	struct memory_tier *tier;
+	unsigned char node_weight = 1;
+
+	/*
+	 * Do not block: if the semaphore cannot be read-locked right away
+	 * (e.g. weights are being updated), fall back to a weight of 1 so
+	 * that this call behaves like basic interleave.
+	 */
+	if (!down_read_trylock(&memory_tier_sem))
+		return 1;
+
+	tier = __node_get_memory_tier(target_node);
+	if (tier)
+		node_weight = memtier_node_share(tier, from_node, pol_nodes);
+	up_read(&memory_tier_sem);
+
+	return node_weight;
+}
+
+/*
+ * Sum of the per-node interleave weights of every node in @pol_nodes, as
+ * seen from @from_node.  Nodes without a memory tier count as weight 1.
+ *
+ * The result is never smaller than the number of nodes in @pol_nodes, which
+ * is also what is returned when memory_tier_sem cannot be read-locked
+ * without blocking.
+ */
+unsigned int memtier_get_total_weight(int from_node, nodemask_t *pol_nodes)
+{
+	unsigned int min = nodes_weight(*pol_nodes);
+	unsigned int weight = 0;
+	struct memory_tier *tier;
+	int node;
+
+	/* Same non-blocking fallback as memtier_get_node_weight(). */
+	if (!down_read_trylock(&memory_tier_sem))
+		return min;
+
+	for_each_node_mask(node, *pol_nodes) {
+		tier = __node_get_memory_tier(node);
+		if (tier)
+			weight += memtier_node_share(tier, from_node,
+						     pol_nodes);
+		else
+			weight += 1;
+	}
+	up_read(&memory_tier_sem);
+
+	return weight >= min ? weight : min;
+}
+
 #ifdef CONFIG_MIGRATION
 bool node_is_toptier(int node)
{
@@ -490,8 +625,11 @@  static struct memory_tier *set_node_memory_tier(int node)
 	memtype = node_memory_types[node].memtype;
 	node_set(node, memtype->nodes);
 	memtier = find_create_memory_tier(memtype);
-	if (!IS_ERR(memtier))
+	if (!IS_ERR(memtier)) {
 		rcu_assign_pointer(pgdat->memtier, memtier);
+		memset(memtier->interleave_weight, 1,
+		       sizeof(memtier->interleave_weight));
+	}
 	return memtier;
 }