From patchwork Mon Oct 9 20:42:57 2023
X-Patchwork-Submitter: Gregory Price
X-Patchwork-Id: 13417943
From: Gregory Price
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org, linux-cxl@vger.kernel.org,
    akpm@linux-foundation.org, sthanneeru@micron.com,
    ying.huang@intel.com, gregory.price@memverge.com
Subject: [RFC PATCH v2 1/3] mm/memory-tiers: change mutex to rw semaphore
Date: Mon, 9 Oct 2023 16:42:57 -0400
Message-Id: <20231009204259.875232-2-gregory.price@memverge.com>
In-Reply-To: <20231009204259.875232-1-gregory.price@memverge.com>
References: <20231009204259.875232-1-gregory.price@memverge.com>

Tiers will have externally readable information, such as weights,
which may change at runtime. This information is expected to be used
by task threads during memory allocation so it cannot be protected
by hard mutual exclusion.

To support this, change the tiering mutex to a rw semaphore.

Signed-off-by: Gregory Price
---
 mm/memory-tiers.c | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 37a4f59d9585..0a3241a2cadc 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include

 #include "internal.h"

@@ -33,7 +34,7 @@ struct node_memory_type_map {
        int map_count;
 };

-static DEFINE_MUTEX(memory_tier_lock);
+static DECLARE_RWSEM(memory_tier_sem);
 static LIST_HEAD(memory_tiers);
 static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
 static struct memory_dev_type *default_dram_type;
@@ -137,10 +138,10 @@ static ssize_t nodelist_show(struct device *dev,
        int ret;
        nodemask_t nmask;

-       mutex_lock(&memory_tier_lock);
+       down_read(&memory_tier_sem);
        nmask = get_memtier_nodemask(to_memory_tier(dev));
        ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
-       mutex_unlock(&memory_tier_lock);
+       up_read(&memory_tier_sem);
        return ret;
 }
 static DEVICE_ATTR_RO(nodelist);
@@ -167,7 +168,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
        int adistance = memtype->adistance;
        unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

-       lockdep_assert_held_once(&memory_tier_lock);
+       lockdep_assert_held_write(&memory_tier_sem);

        adistance = round_down(adistance, memtier_adistance_chunk_size);
        /*
@@ -230,12 +231,12 @@ static struct memory_tier *__node_get_memory_tier(int node)
        if (!pgdat)
                return NULL;
        /*
-        * Since we hold memory_tier_lock, we can avoid
+        * Since we hold memory_tier_sem, we can avoid
         * RCU read locks when accessing the details. No
         * parallel updates are possible here.
         */
        return rcu_dereference_check(pgdat->memtier,
-                                    lockdep_is_held(&memory_tier_lock));
+                                    lockdep_is_held(&memory_tier_sem));
 }

 #ifdef CONFIG_MIGRATION
@@ -335,7 +336,7 @@ static void disable_all_demotion_targets(void)
        for_each_node_state(node, N_MEMORY) {
                node_demotion[node].preferred = NODE_MASK_NONE;
                /*
-                * We are holding memory_tier_lock, it is safe
+                * We are holding memory_tier_sem, it is safe
                 * to access pgda->memtier.
                 */
                memtier = __node_get_memory_tier(node);
@@ -364,7 +365,7 @@ static void establish_demotion_targets(void)
        int distance, best_distance;
        nodemask_t tier_nodes, lower_tier;

-       lockdep_assert_held_once(&memory_tier_lock);
+       lockdep_assert_held_write(&memory_tier_sem);

        if (!node_demotion)
                return;
@@ -479,7 +480,7 @@ static struct memory_tier *set_node_memory_tier(int node)
        pg_data_t *pgdat = NODE_DATA(node);

-       lockdep_assert_held_once(&memory_tier_lock);
+       lockdep_assert_held_write(&memory_tier_sem);

        if (!node_state(node, N_MEMORY))
                return ERR_PTR(-EINVAL);
@@ -569,15 +570,15 @@ EXPORT_SYMBOL_GPL(put_memory_type);

 void init_node_memory_type(int node, struct memory_dev_type *memtype)
 {
-       mutex_lock(&memory_tier_lock);
+       down_write(&memory_tier_sem);
        __init_node_memory_type(node, memtype);
-       mutex_unlock(&memory_tier_lock);
+       up_write(&memory_tier_sem);
 }
 EXPORT_SYMBOL_GPL(init_node_memory_type);

 void clear_node_memory_type(int node, struct memory_dev_type *memtype)
 {
-       mutex_lock(&memory_tier_lock);
+       down_write(&memory_tier_sem);
        if (node_memory_types[node].memtype == memtype)
                node_memory_types[node].map_count--;
        /*
@@ -588,7 +589,7 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
                node_memory_types[node].memtype = NULL;
                put_memory_type(memtype);
        }
-       mutex_unlock(&memory_tier_lock);
+       up_write(&memory_tier_sem);
 }
 EXPORT_SYMBOL_GPL(clear_node_memory_type);

@@ -607,17 +608,17 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,

        switch (action) {
        case MEM_OFFLINE:
-               mutex_lock(&memory_tier_lock);
+               down_write(&memory_tier_sem);
                if (clear_node_memory_tier(arg->status_change_nid))
                        establish_demotion_targets();
-               mutex_unlock(&memory_tier_lock);
+               up_write(&memory_tier_sem);
                break;
        case MEM_ONLINE:
-               mutex_lock(&memory_tier_lock);
+               down_write(&memory_tier_sem);
                memtier = set_node_memory_tier(arg->status_change_nid);
                if (!IS_ERR(memtier))
                        establish_demotion_targets();
-               mutex_unlock(&memory_tier_lock);
+               up_write(&memory_tier_sem);
                break;
        }

@@ -638,7 +639,7 @@ static int __init memory_tier_init(void)
                                GFP_KERNEL);
        WARN_ON(!node_demotion);
 #endif
-       mutex_lock(&memory_tier_lock);
+       down_write(&memory_tier_sem);
        /*
         * For now we can have 4 faster memory tiers with smaller adistance
         * than default DRAM tier.
@@ -661,7 +662,7 @@ static int __init memory_tier_init(void)
                break;
        }
        establish_demotion_targets();
-       mutex_unlock(&memory_tier_lock);
+       up_write(&memory_tier_sem);

        hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
        return 0;
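As context for the locking change above: allocation paths only read tier data, so they can share the lock, while sysfs updates and hotplug callbacks take it exclusively. The sketch below is a minimal userspace analogue of that pattern, using POSIX pthread_rwlock_t as a stand-in for the kernel rw_semaphore; the names (tier_weight, read_weight, set_weight) and the array size are illustrative and not part of the patch.

    /* Readers (allocation paths) run concurrently; writers are exclusive. */
    #include <pthread.h>

    static pthread_rwlock_t tier_lock = PTHREAD_RWLOCK_INITIALIZER;
    static unsigned char tier_weight[128];      /* hypothetical per-node weights */

    unsigned char read_weight(int src)          /* hot path: shared access */
    {
            unsigned char w;

            pthread_rwlock_rdlock(&tier_lock);
            w = tier_weight[src];
            pthread_rwlock_unlock(&tier_lock);
            return w;
    }

    void set_weight(int src, unsigned char w)   /* sysfs-style update: exclusive */
    {
            pthread_rwlock_wrlock(&tier_lock);
            tier_weight[src] = w;
            pthread_rwlock_unlock(&tier_lock);
    }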
From patchwork Mon Oct 9 20:42:58 2023
X-Patchwork-Submitter: Gregory Price
X-Patchwork-Id: 13417945

From: Gregory Price
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org, linux-cxl@vger.kernel.org,
    akpm@linux-foundation.org, sthanneeru@micron.com,
    ying.huang@intel.com, gregory.price@memverge.com,
    Ravi Jonnalagadda
Subject: [RFC PATCH v2 2/3] mm/memory-tiers: Introduce sysfs for tier interleave weights
Date: Mon, 9 Oct 2023 16:42:58 -0400
Message-Id: <20231009204259.875232-3-gregory.price@memverge.com>
In-Reply-To: <20231009204259.875232-1-gregory.price@memverge.com>
References: <20231009204259.875232-1-gregory.price@memverge.com>

Allocating pages across tiers is accomplished by provisioning
interleave weights for each tier, with the distribution based on these
weight values.

Weights are relative to the requesting node (i.e. the weight for tier2
from node0 may differ from the weight for tier2 from node1). This
allows CPU-bound tasks to have more precise control over the
distribution of memory. To represent this, each tier carries an array
of weights indexed by the source node:

    tier->interleave_weight[source_node] = weight;

Weights are set with the following sysfs mechanism:

    Set tier4 weight from node 0 to 85:
    echo 0:85 > /sys/devices/virtual/memory_tiering/memory_tier4/interleave_weight

By default, all tiers have a weight of 1 for all source nodes, which
maintains the default interleave behavior.

Weights are effectively aligned (up) to the number of nodes in the
operating nodemask (i.e. (policy_nodes & tier_nodes)) to simplify the
allocation logic and to avoid holding the tiering semaphore for a long
period of time during bulk allocation.

Weights apply to a tier, not to each node in the tier. The weight is
split between the nodes in that tier, similar to hardware
interleaving. However, when the task defines a nodemask that splits a
tier's nodes, the weight is split between the remaining nodes,
retaining the overall weight of the tier.
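To make the splitting rule above concrete, here is a small standalone C sketch of the per-node weight calculation this message describes: a tier's weight divided by the number of its nodes present in the task's nodemask, rounded up. The helper name and example values are illustrative only.

    #include <stdio.h>

    /* ceil(tier_weight / nodes_in_mask), with a floor of 1 */
    static unsigned int per_node_weight(unsigned int tier_weight,
                                        unsigned int nodes_in_mask)
    {
            unsigned int w = tier_weight / nodes_in_mask +
                             ((tier_weight % nodes_in_mask) ? 1 : 0);
            return w ? w : 1;
    }

    int main(void)
    {
            printf("%u\n", per_node_weight(4, 2)); /* weight 4 over 2 nodes -> 2 each */
            printf("%u\n", per_node_weight(3, 2)); /* weight 3 over 2 nodes -> 2 each (effective 4) */
            return 0;
    }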
Signed-off-by: Srinivasulu Thanneeru
Co-developed-by: Ravi Jonnalagadda
Co-developed-by: Gregory Price
Signed-off-by: Gregory Price
---
 include/linux/memory-tiers.h |  16 ++++
 mm/memory-tiers.c            | 140 ++++++++++++++++++++++++++++++++++-
 2 files changed, 155 insertions(+), 1 deletion(-)

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 437441cdf78f..a000b9745543 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -19,6 +19,8 @@
  */
 #define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1))

+#define MAX_TIER_INTERLEAVE_WEIGHT 100
+
 struct memory_tier;
 struct memory_dev_type {
        /* list of memory types that are part of same tier as this type */
@@ -36,6 +38,9 @@ struct memory_dev_type *alloc_memory_type(int adistance);
 void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+unsigned char memtier_get_node_weight(int from_node, int target_node,
+                                      nodemask_t *pol_nodes);
+unsigned int memtier_get_total_weight(int from_node, nodemask_t *pol_nodes);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -97,5 +102,16 @@ static inline bool node_is_toptier(int node)
 {
        return true;
 }
+
+unsigned char memtier_get_node_weight(int from_node, int target_node,
+                                      nodemask_t *pol_nodes)
+{
+       return 0;
+}
+
+unsigned int memtier_get_total_weight(int from_node, nodemask_t *pol_nodes)
+{
+       return 0;
+}
 #endif /* CONFIG_NUMA */
 #endif /* _LINUX_MEMORY_TIERS_H */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 0a3241a2cadc..37fc4b3f69a4 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -14,6 +14,11 @@ struct memory_tier {
        struct list_head list;
        /* list of all memory types part of this tier */
        struct list_head memory_types;
+       /*
+        * By default all tiers will have weight as 1, which means they
+        * follow default standard allocation.
+        */
+       unsigned char interleave_weight[MAX_NUMNODES];
        /*
         * start value of abstract distance. memory tier maps
         * an abstract distance range,
@@ -146,8 +151,72 @@ static ssize_t nodelist_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(nodelist);

+static ssize_t interleave_weight_show(struct device *dev,
+                                     struct device_attribute *attr,
+                                     char *buf)
+{
+       int ret = 0;
+       struct memory_tier *tier = to_memory_tier(dev);
+       int node;
+       int count = 0;
+
+       down_read(&memory_tier_sem);
+       for_each_online_node(node) {
+               if (count > 0)
+                       ret += sysfs_emit_at(buf, ret, ",");
+               ret += sysfs_emit_at(buf, ret, "%d:%d", node, tier->interleave_weight[node]);
+               count++;
+       }
+       up_read(&memory_tier_sem);
+       sysfs_emit_at(buf, ret++, "\n");
+
+       return ret;
+}
+
+static ssize_t interleave_weight_store(struct device *dev,
+                                      struct device_attribute *attr,
+                                      const char *buf, size_t size)
+{
+       unsigned char weight;
+       int from_node;
+       char *delim;
+       int ret;
+       struct memory_tier *tier;
+
+       delim = strchr(buf, ':');
+       if (!delim)
+               return -EINVAL;
+       delim[0] = '\0';
+
+       ret = kstrtou32(buf, 10, &from_node);
+       if (ret)
+               return ret;
+
+       if (from_node >= MAX_NUMNODES || !node_online(from_node))
+               return -EINVAL;
+
+       ret = kstrtou8(delim+1, 0, &weight);
+       if (ret)
+               return ret;
+
+       if (weight > MAX_TIER_INTERLEAVE_WEIGHT)
+               return -EINVAL;
+
+       down_write(&memory_tier_sem);
+       tier = to_memory_tier(dev);
+       if (tier)
+               tier->interleave_weight[from_node] = weight;
+       else
+               ret = -ENODEV;
+       up_write(&memory_tier_sem);
+
+       return size;
+}
+static DEVICE_ATTR_RW(interleave_weight);
+
 static struct attribute *memtier_dev_attrs[] = {
        &dev_attr_nodelist.attr,
+       &dev_attr_interleave_weight.attr,
        NULL
 };

@@ -239,6 +308,72 @@ static struct memory_tier *__node_get_memory_tier(int node)
                                     lockdep_is_held(&memory_tier_sem));
 }

+unsigned char memtier_get_node_weight(int from_node, int target_node,
+                                     nodemask_t *pol_nodes)
+{
+       struct memory_tier *tier;
+       unsigned char tier_weight, node_weight = 1;
+       int tier_nodes;
+       nodemask_t tier_nmask, tier_and_pol;
+
+       /*
+        * If the lock is already held, revert to a low weight temporarily
+        * This should revert any interleave behavior to basic interleave
+        * this only happens if weights are being updated or during init
+        */
+       if (!down_read_trylock(&memory_tier_sem))
+               return 1;
+
+       tier = __node_get_memory_tier(target_node);
+       if (tier) {
+               tier_nmask = get_memtier_nodemask(tier);
+               nodes_and(tier_and_pol, tier_nmask, *pol_nodes);
+               tier_nodes = nodes_weight(tier_and_pol);
+               tier_weight = tier->interleave_weight[from_node];
+               node_weight = tier_weight / tier_nodes;
+               node_weight += (tier_weight % tier_nodes) ? 1 : 0;
1 : 0; + } + up_read(&memory_tier_sem); + return node_weight; +} + +unsigned int memtier_get_total_weight(int from_node, nodemask_t *pol_nodes) +{ + unsigned int weight = 0; + struct memory_tier *tier; + unsigned int min = nodes_weight(*pol_nodes); + int node; + nodemask_t tier_nmask, tier_and_pol; + int tier_nodes; + unsigned int tier_weight; + + /* + * If the lock is already held, revert to a low weight temporarily + * This should revert any interleave behavior to basic interleave + * this only happens if weights are being updated or during init + */ + if (!down_read_trylock(&memory_tier_sem)) + return nodes_weight(*pol_nodes); + + for_each_node_mask(node, *pol_nodes) { + tier = __node_get_memory_tier(node); + if (!tier) { + weight += 1; + continue; + } + tier_nmask = get_memtier_nodemask(tier); + nodes_and(tier_and_pol, tier_nmask, *pol_nodes); + tier_nodes = nodes_weight(tier_and_pol); + /* divide node weight by number of nodes, take ceil */ + tier_weight = tier->interleave_weight[from_node]; + weight += tier_weight / tier_nodes; + weight += (tier_weight % tier_nodes) ? 1 : 0; + } + up_read(&memory_tier_sem); + + return weight >= min ? weight : min; +} + #ifdef CONFIG_MIGRATION bool node_is_toptier(int node) { @@ -490,8 +625,11 @@ static struct memory_tier *set_node_memory_tier(int node) memtype = node_memory_types[node].memtype; node_set(node, memtype->nodes); memtier = find_create_memory_tier(memtype); - if (!IS_ERR(memtier)) + if (!IS_ERR(memtier)) { rcu_assign_pointer(pgdat->memtier, memtier); + memset(memtier->interleave_weight, 1, + sizeof(memtier->interleave_weight)); + } return memtier; } From patchwork Mon Oct 9 20:42:59 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gregory Price X-Patchwork-Id: 13417946 Received: from lindbergh.monkeyblade.net (lindbergh.monkeyblade.net [23.128.96.19]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0591D18E29 for ; Wed, 11 Oct 2023 20:44:09 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=gmail.com header.i=@gmail.com header.b="MdW62QyT" Received: from mail-yw1-x1143.google.com (mail-yw1-x1143.google.com [IPv6:2607:f8b0:4864:20::1143]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id B5C1CB6; Wed, 11 Oct 2023 13:44:07 -0700 (PDT) Received: by mail-yw1-x1143.google.com with SMTP id 00721157ae682-5a7c93507d5so3184017b3.2; Wed, 11 Oct 2023 13:44:07 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1697057047; x=1697661847; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=soG7YmuaB3BHU6f8QJ9tYSLPWTcCInx3P1b5GCGBm/o=; b=MdW62QyTCyXish/fgpWFyPZ0v4vgNkmLmTgyTehmm6edZL3gNXTboVm2CcS7nPKiJt HYIyeNXcnvaeL5HKg6vfJVly3fbb5vdEC99Jw9AkNOMpj9R41leSsP0rRD8J9U47KoT1 MscH7EUy3h1uMYYniPMhIrYzZSvOtMFr0T/tBMBl5aTzWgh1J0xRz+qL14ZoljuH3Ydm qNnIsP5OdXMG13U92jszz0oMrrUoD8kivGQWM0OvEEpMC5ax3CZCWUSxGzo2V1/MtwLc Iyr0VVJ+deHbY9tmdOSFeNaQ0A2COGKJnFe2QQCkgs88Rp/CVM3ieUetrYxV/lSkmEx6 9D2g== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1697057047; x=1697661847; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; 
From patchwork Mon Oct 9 20:42:59 2023
X-Patchwork-Submitter: Gregory Price
X-Patchwork-Id: 13417946

From: Gregory Price
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org, linux-cxl@vger.kernel.org,
    akpm@linux-foundation.org, sthanneeru@micron.com,
    ying.huang@intel.com, gregory.price@memverge.com
Subject: [RFC PATCH v2 3/3] mm/mempolicy: modify interleave mempolicy to use memtier weights
Date: Mon, 9 Oct 2023 16:42:59 -0400
Message-Id: <20231009204259.875232-4-gregory.price@memverge.com>
In-Reply-To: <20231009204259.875232-1-gregory.price@memverge.com>
References: <20231009204259.875232-1-gregory.price@memverge.com>

The memory-tier subsystem implements interleave weighting for tiers
for the purpose of bandwidth optimization. Each tier may contain
multiple NUMA nodes, and each tier may have different weights in
relation to each compute node ("from node").

The mempolicy MPOL_INTERLEAVE utilizes the memory-tier subsystem
functions to implement weighted tiering. Since all tiers default to a
weight of 1, the original interleave behavior is retained by default.

The mempolicy nodemask does not have to be inclusive of all nodes in
each respective memory tier, though this may lead to a more
complicated calculation of how memory is distributed.

Examples

Weight settings:
    echo 0:4 > memory_tier4/interleave_weight
    echo 1:3 > memory_tier4/interleave_weight

    echo 0:2 > memory_tier22/interleave_weight
    echo 1:1 > memory_tier22/interleave_weight

Results:
    Tier 1: Nodes(0,1), Weights(4,3) <- from nodes(0,1) respectively
    Tier 2: Nodes(2,3), Weights(2,1) <- from nodes(0,1) respectively

Task A:
    cpunode: 0
    nodemask: [0,1]
    weights: [4]
    allocation result: [0,0,1,1, repeat]
    Notice how the weight is split between the nodes.

Task B:
    cpunode: 0
    nodemask: [0,2]
    weights: [4,2]
    allocation result: [0,0,0,0,2,2, repeat]
    Notice how the weights are not split; each node receives the
    entire weight of its respective tier.

Task C:
    cpunode: 1
    nodemask: [1,3]
    weights: [3,1]
    allocation result: [1,1,1,3, repeat]
    Notice how the weights differ based on cpunode.

Task D:
    cpunode: 0
    nodemask: [0,1,2]
    weights: [4,2]
    allocation result: [0,0,1,1,2,2]
    Notice how tier 1 splits the weight between nodes 0 and 1, while
    tier 2 applies its entire weight to node 2.

Task E:
    cpunode: 1
    nodemask: [0,1]
    weights: [3]
    allocation result: [0,0,1,1]
    Notice how the weight is aligned up to an effective 4, because
    weights are aligned to the number of nodes in the tier.
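The round-robin pattern in these examples can be reproduced with a small standalone model: walk the nodemask and give each node its split share of the tier weight before advancing. This is an illustration of the intended behavior only, not the mempolicy code; the values mirror Task A.

    #include <stdio.h>

    int main(void)
    {
            /* Task A: nodemask [0,1], tier weight 4 split across both nodes */
            int nodes[] = { 0, 1 };
            int weights[] = { 2, 2 };       /* 4 / 2 nodes = 2 each */
            int nnodes = 2, pages = 8;
            int i = 0, w = weights[0];

            while (pages--) {
                    printf("%d ", nodes[i]);        /* prints 0 0 1 1 0 0 1 1 */
                    if (--w == 0) {
                            i = (i + 1) % nnodes;
                            w = weights[i];
                    }
            }
            printf("\n");
            return 0;
    }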
Signed-off-by: Gregory Price
---
 include/linux/mempolicy.h |   3 +
 mm/mempolicy.c            | 148 ++++++++++++++++++++++++++++++--------
 2 files changed, 122 insertions(+), 29 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..ad57fdfdb57a 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -48,6 +48,9 @@ struct mempolicy {
        nodemask_t nodes;       /* interleave/bind/perfer */
        int home_node;  /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */

+       /* weighted interleave settings */
+       unsigned char cur_weight;
+
        union {
                nodemask_t cpuset_mems_allowed; /* relative to these nodes */
                nodemask_t user_nodemask;       /* nodemask passed by user */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f1b00d6ac7ee..131e6e56b2de 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -102,6 +102,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -300,6 +301,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
        policy->mode = mode;
        policy->flags = flags;
        policy->home_node = NUMA_NO_NODE;
+       policy->cur_weight = 0;

        return policy;
 }
@@ -334,6 +336,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
                tmp = *nodes;

        pol->nodes = tmp;
+       pol->cur_weight = 0;
 }

 static void mpol_rebind_preferred(struct mempolicy *pol,
@@ -881,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,

        old = current->mempolicy;
        current->mempolicy = new;
-       if (new && new->mode == MPOL_INTERLEAVE)
+       if (new && new->mode == MPOL_INTERLEAVE) {
                current->il_prev = MAX_NUMNODES-1;
+               new->cur_weight = 0;
+       }
+
        task_unlock(current);
        mpol_put(old);
        ret = 0;
@@ -1901,12 +1907,23 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 /* Do dynamic interleaving for a process */
 static unsigned interleave_nodes(struct mempolicy *policy)
 {
-       unsigned next;
+       unsigned int next;
+       unsigned char next_weight;
        struct task_struct *me = current;

        next = next_node_in(me->il_prev, policy->nodes);
-       if (next < MAX_NUMNODES)
+       if (!policy->cur_weight) {
+               /* If the node is set, at least 1 allocation is required */
+               next_weight = memtier_get_node_weight(numa_node_id(), next,
+                                                     &policy->nodes);
+
+               policy->cur_weight = next_weight ? next_weight : 1;
+       }
+
+       policy->cur_weight--;
+       if (next < MAX_NUMNODES && !policy->cur_weight)
                me->il_prev = next;
+
        return next;
 }

@@ -1965,25 +1982,37 @@ unsigned int mempolicy_slab_node(void)
 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 {
        nodemask_t nodemask = pol->nodes;
-       unsigned int target, nnodes;
-       int i;
+       unsigned int target, nnodes, il_weight;
+       unsigned char weight;
        int nid;
+       int cur_node = numa_node_id();
+
        /*
         * The barrier will stabilize the nodemask in a register or on
         * the stack so that it will stop changing under the code.
         *
         * Between first_node() and next_node(), pol->nodes could be changed
         * by other threads. So we put pol->nodes in a local stack.
+        *
+        * Additionally, place the cur_node on the stack in case of a migration
         */
        barrier();

        nnodes = nodes_weight(nodemask);
        if (!nnodes)
-               return numa_node_id();
-       target = (unsigned int)n % nnodes;
+               return cur_node;
+
+       il_weight = memtier_get_total_weight(cur_node, &nodemask);
+       target = (unsigned int)n % il_weight;
        nid = first_node(nodemask);
-       for (i = 0; i < target; i++)
-               nid = next_node(nid, nodemask);
+       while (target) {
+               weight = memtier_get_node_weight(cur_node, nid, &nodemask);
+               if (target < weight)
+                       break;
+               target -= weight;
+               nid = next_node_in(nid, nodemask);
+       }
+
        return nid;
 }

@@ -2317,32 +2346,93 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
 {
-       int nodes;
-       unsigned long nr_pages_per_node;
-       int delta;
-       int i;
-       unsigned long nr_allocated;
+       struct task_struct *me = current;
        unsigned long total_allocated = 0;
+       unsigned long nr_allocated;
+       unsigned long rounds;
+       unsigned long node_pages, delta;
+       unsigned char weight;
+       unsigned long il_weight;
+       unsigned long req_pages = nr_pages;
+       int nnodes, node, prev_node;
+       int cur_node = numa_node_id();
+       int i;

-       nodes = nodes_weight(pol->nodes);
-       nr_pages_per_node = nr_pages / nodes;
-       delta = nr_pages - nodes * nr_pages_per_node;
-
-       for (i = 0; i < nodes; i++) {
-               if (delta) {
-                       nr_allocated = __alloc_pages_bulk(gfp,
-                                       interleave_nodes(pol), NULL,
-                                       nr_pages_per_node + 1, NULL,
-                                       page_array);
-                       delta--;
-               } else {
-                       nr_allocated = __alloc_pages_bulk(gfp,
-                                       interleave_nodes(pol), NULL,
-                                       nr_pages_per_node, NULL, page_array);
+       prev_node = me->il_prev;
+       nnodes = nodes_weight(pol->nodes);
+       /* Continue allocating from most recent node */
+       if (pol->cur_weight) {
+               node = next_node_in(prev_node, pol->nodes);
+               node_pages = pol->cur_weight;
+               if (node_pages > nr_pages)
+                       node_pages = nr_pages;
+               nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+                                                 NULL, page_array);
+               page_array += nr_allocated;
+               total_allocated += nr_allocated;
+               /* if that's all the pages, no need to interleave */
+               if (req_pages <= pol->cur_weight) {
+                       pol->cur_weight -= req_pages;
+                       return total_allocated;
                }
+               /* Otherwise we adjust req_pages down, and continue from there */
+               req_pages -= pol->cur_weight;
+               pol->cur_weight = 0;
+               prev_node = node;
+       }
+
+       /*
+        * The memtier lock is not held during allocation, if weights change
+        * there may be edge-cases (over/under-allocation) to handle.
+        */
+try_again:
+       il_weight = memtier_get_total_weight(cur_node, &pol->nodes);
+       rounds = req_pages / il_weight;
+       delta = req_pages % il_weight;
+       for (i = 0; i < nnodes; i++) {
+               node = next_node_in(prev_node, pol->nodes);
+               weight = memtier_get_node_weight(cur_node, node, &pol->nodes);
+               node_pages = weight * rounds;
+               if (delta > weight) {
+                       node_pages += weight;
+                       delta -= weight;
+               } else if (delta) {
+                       node_pages += delta;
+                       delta = 0;
+               }
+               /* The number of requested pages may not hit every node */
+               if (!node_pages)
+                       break;
+               /* If an over-allocation would occur, floor it */
+               if (node_pages + total_allocated > nr_pages) {
+                       node_pages = nr_pages - total_allocated;
+                       delta = 0;
+               }
+               nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
+                                                 NULL, page_array);
                page_array += nr_allocated;
                total_allocated += nr_allocated;
+               prev_node = node;
+       }
+
+       /* If an under-allocation would occur, apply interleave again */
+       if (total_allocated != nr_pages)
+               goto try_again;
+
+       /*
+        * Finally, we need to update me->il_prev and pol->cur_weight
+        * if there were overflow pages, but not equivalent to the node
+        * weight, set the cur_weight to node_weight - delta and the
+        * me->il_prev to the previous node. Otherwise if it was perfect
+        * we can simply set il_prev to node and cur_weight to 0
+        */
+       if (node_pages) {
+               me->il_prev = prev_node;
+               node_pages %= weight;
+               pol->cur_weight = weight - node_pages;
+       } else {
+               me->il_prev = node;
+               pol->cur_weight = 0;
        }

        return total_allocated;
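To summarize the bulk path above: each node receives weight * rounds pages plus a share of the remainder, where rounds = nr_pages / total_weight. The standalone sketch below models that arithmetic for a Task B-style request (nodes 0 and 2, weights 4 and 2); it is illustrative only and not the kernel implementation.

    #include <stdio.h>

    int main(void)
    {
            int node[] = { 0, 2 };
            unsigned long weight[] = { 4, 2 };
            unsigned long total_weight = 6, nr_pages = 12;
            unsigned long rounds = nr_pages / total_weight;     /* 2 */
            unsigned long delta  = nr_pages % total_weight;     /* 0 */

            for (int i = 0; i < 2; i++) {
                    unsigned long node_pages = weight[i] * rounds;

                    /* remainder handling mirrors the loop above */
                    if (delta > weight[i]) {
                            node_pages += weight[i];
                            delta -= weight[i];
                    } else if (delta) {
                            node_pages += delta;
                            delta = 0;
                    }
                    /* prints: node 0 gets 8 pages, node 2 gets 4 pages */
                    printf("node %d gets %lu pages\n", node[i], node_pages);
            }
            return 0;
    }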