From patchwork Tue Oct 3 00:21:53 2023
X-Patchwork-Submitter: Gregory Price
X-Patchwork-Id: 13406637
Received: from fedora.mshome.net (pool-173-79-56-208.washdc.fios.verizon.net.
[173.79.56.208]) by smtp.gmail.com with ESMTPSA id a2-20020a056870618200b001e135f4f849sm24725oah.9.2023.10.02.17.22.01 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 02 Oct 2023 17:22:02 -0700 (PDT) From: Gregory Price X-Google-Original-From: Gregory Price To: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org, linux-arch@vger.kernel.org, linux-api@vger.kernel.org, linux-cxl@vger.kernel.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, hpa@zytor.com, arnd@arndb.de, akpm@linux-foundation.org, x86@kernel.org, Gregory Price Subject: [RFC PATCH v2 1/4] mm/mempolicy: refactor do_set_mempolicy for code re-use Date: Mon, 2 Oct 2023 20:21:53 -0400 Message-Id: <20231003002156.740595-2-gregory.price@memverge.com> X-Mailer: git-send-email 2.39.1 In-Reply-To: <20231003002156.740595-1-gregory.price@memverge.com> References: <20231003002156.740595-1-gregory.price@memverge.com> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: linux-cxl@vger.kernel.org Refactors do_set_mempolicy into swap_mempolicy and do_set_mempolicy so that replace_mempolicy can be re-used with set_mempolicy2. Signed-off-by: Gregory Price --- mm/mempolicy.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f1b00d6ac7ee..ad26f41b91de 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -854,28 +854,20 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, return vma_replace_policy(vma, new_pol); } -/* Set the process memory policy */ -static long do_set_mempolicy(unsigned short mode, unsigned short flags, - nodemask_t *nodes) +/* Attempt to replace mempolicy, release the old one if successful */ +static long replace_mempolicy(struct mempolicy *new, nodemask_t *nodes) { - struct mempolicy *new, *old; + struct mempolicy *old = NULL; NODEMASK_SCRATCH(scratch); int ret; if (!scratch) return -ENOMEM; - new = mpol_new(mode, flags, nodes); - if (IS_ERR(new)) { - ret = PTR_ERR(new); - goto out; - } - task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); - mpol_put(new); goto out; } @@ -883,14 +875,32 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, current->mempolicy = new; if (new && new->mode == MPOL_INTERLEAVE) current->il_prev = MAX_NUMNODES-1; +out: task_unlock(current); mpol_put(old); - ret = 0; -out: + NODEMASK_SCRATCH_FREE(scratch); return ret; } +/* Set the process memory policy */ +static long do_set_mempolicy(unsigned short mode, unsigned short flags, + nodemask_t *nodes) +{ + struct mempolicy *new; + int ret; + + new = mpol_new(mode, flags, nodes); + if (IS_ERR(new)) + return PTR_ERR(new); + + ret = replace_mempolicy(new, nodes); + if (ret) + mpol_put(new); + + return ret; +} + /* * Return nodemask for policy for get_mempolicy() query * From patchwork Tue Oct 3 00:21:54 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gregory Price X-Patchwork-Id: 13406639 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 25B29E776E5 for ; Tue, 3 Oct 2023 00:22:23 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S238807AbjJCAWT (ORCPT ); Mon, 2 Oct 2023 20:22:19 -0400 Received: from 
From: Gregory Price
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org, linux-arch@vger.kernel.org,
    linux-api@vger.kernel.org, linux-cxl@vger.kernel.org, luto@kernel.org,
    tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
    dave.hansen@linux.intel.com, hpa@zytor.com, arnd@arndb.de,
    akpm@linux-foundation.org, x86@kernel.org, Gregory Price
Subject: [RFC PATCH v2 2/4] mm/mempolicy: Implement set_mempolicy2 and
Date: Mon, 2 Oct 2023 20:21:54 -0400
Message-Id: <20231003002156.740595-3-gregory.price@memverge.com>
In-Reply-To: <20231003002156.740595-1-gregory.price@memverge.com>
References: <20231003002156.740595-1-gregory.price@memverge.com>

sys_set_mempolicy is limited by its current argument structure (mode, nodes,
flags) to implementing policies that can be described in that manner.
Implement set/get_mempolicy2 with a new mempolicy_args structure which encapsulates the old behavior, and allows for new mempolicies which may require additional information. Signed-off-by: Gregory Price --- arch/x86/entry/syscalls/syscall_32.tbl | 2 + arch/x86/entry/syscalls/syscall_64.tbl | 2 + include/linux/syscalls.h | 4 + include/uapi/asm-generic/unistd.h | 10 +- include/uapi/linux/mempolicy.h | 29 ++++ mm/mempolicy.c | 196 ++++++++++++++++++++++++- 6 files changed, 241 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 2d0b1bd866ea..a72ef588a704 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -457,3 +457,5 @@ 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node 451 i386 cachestat sys_cachestat 452 i386 fchmodat2 sys_fchmodat2 +454 i386 set_mempolicy2 sys_set_mempolicy2 +455 i386 get_mempolicy2 sys_get_mempolicy2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 1d6eee30eceb..ec54064de8b3 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -375,6 +375,8 @@ 451 common cachestat sys_cachestat 452 common fchmodat2 sys_fchmodat2 453 64 map_shadow_stack sys_map_shadow_stack +454 common set_mempolicy2 sys_set_mempolicy2 +455 common get_mempolicy2 sys_get_mempolicy2 # # Due to a historical design error, certain syscalls are numbered differently diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 22bc6bc147f8..0c4a71177df9 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -813,6 +813,10 @@ asmlinkage long sys_get_mempolicy(int __user *policy, unsigned long addr, unsigned long flags); asmlinkage long sys_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode); +asmlinkage long sys_get_mempolicy2(struct mempolicy_args __user *args, + size_t size); +asmlinkage long sys_set_mempolicy2(struct mempolicy_args __user *args, + size_t size); asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *from, const unsigned long __user *to); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index abe087c53b4b..397dcf804941 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -823,8 +823,16 @@ __SYSCALL(__NR_cachestat, sys_cachestat) #define __NR_fchmodat2 452 __SYSCALL(__NR_fchmodat2, sys_fchmodat2) +/* CONFIG_MMU only */ +#ifndef __ARCH_NOMMU +#define __NR_set_mempolicy 454 +__SYSCALL(__NR_set_mempolicy2, sys_set_mempolicy2) +#define __NR_set_mempolicy 455 +__SYSCALL(__NR_get_mempolicy2, sys_get_mempolicy2) +#endif + #undef __NR_syscalls -#define __NR_syscalls 453 +#define __NR_syscalls 456 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 046d0ccba4cd..ea386872094b 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -23,9 +23,38 @@ enum { MPOL_INTERLEAVE, MPOL_LOCAL, MPOL_PREFERRED_MANY, + MPOL_LEGACY, /* set_mempolicy limited to above modes */ MPOL_MAX, /* always last member of enum */ }; +struct mempolicy_args { + unsigned short mode; + unsigned long *nodemask; + unsigned long maxnode; + unsigned short flags; + struct { + /* Memory allowed */ + struct { + unsigned long maxnode; + unsigned long *nodemask; + } allowed; + /* Address information */ + struct { + unsigned long addr; + unsigned 
long node; + unsigned short mode; + unsigned short flags; + } addr; + /* Interleave */ + } get; + /* Mode specific settings */ + union { + struct { + unsigned long next_node; /* get only */ + } interleave; + }; +}; + /* Flags for set_mempolicy */ #define MPOL_F_STATIC_NODES (1 << 15) #define MPOL_F_RELATIVE_NODES (1 << 14) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ad26f41b91de..936c641f554e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1478,7 +1478,7 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; - if ((unsigned int)(*mode) >= MPOL_MAX) + if ((unsigned int)(*mode) >= MPOL_LEGACY) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; @@ -1609,6 +1609,200 @@ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, return kernel_set_mempolicy(mode, nmask, maxnode); } +static long do_set_mempolicy2(struct mempolicy_args *args) +{ + struct mempolicy *new = NULL; + nodemask_t nodes; + int err; + + if (args->mode <= MPOL_LEGACY) + return -EINVAL; + + if (args->mode >= MPOL_MAX) + return -EINVAL; + + err = get_nodes(&nodes, args->nodemask, args->maxnode); + if (err) + return err; + + new = mpol_new(args->mode, args->flags, &nodes); + if (IS_ERR(new)) + return PTR_ERR(new); + + switch (args->mode) { + default: + BUG(); + } + + if (err) + goto out; + + err = replace_mempolicy(new, &nodes); +out: + if (err) + mpol_put(new); + return err; +}; + +static bool mempolicy2_args_valid(struct mempolicy_args *kargs) +{ + /* Legacy modes are routed through the legacy interface */ + return kargs->mode > MPOL_LEGACY && kargs->mode < MPOL_MAX; +} + +static long kernel_set_mempolicy2(const struct mempolicy_args __user *uargs, + size_t usize) +{ + struct mempolicy_args kargs; + int err; + + if (usize < sizeof(kargs)) + return -EINVAL; + + err = copy_struct_from_user(&kargs, sizeof(kargs), uargs, usize); + if (err) + return err; + + /* If the mode is legacy, use the legacy path */ + if (kargs.mode < MPOL_LEGACY) { + int legacy_mode = kargs.mode | kargs.flags; + const unsigned long __user *lnmask = kargs.nodemask; + unsigned long maxnode = kargs.maxnode; + + return kernel_set_mempolicy(legacy_mode, lnmask, maxnode); + } + + if (!mempolicy2_args_valid(&kargs)) + return -EINVAL; + + return do_set_mempolicy2(&kargs); +} + +SYSCALL_DEFINE2(set_mempolicy2, const struct mempolicy_args __user *, args, + size_t, size) +{ + return kernel_set_mempolicy2(args, size); +} + +/* Gets extended mempolicy information */ +static long do_get_mempolicy2(struct mempolicy_args *kargs) +{ + struct mempolicy *pol = current->mempolicy; + nodemask_t knodes; + int rc = 0; + + kargs->mode = pol->mode; + /* Mask off internal flags */ + kargs->flags = pol->flags & MPOL_MODE_FLAGS; + + if (kargs->nodemask) { + if (mpol_store_user_nodemask(pol)) { + knodes = pol->w.user_nodemask; + } else { + task_lock(current); + get_policy_nodemask(pol, &knodes); + task_unlock(current); + } + rc = copy_nodes_to_user(kargs->nodemask, kargs->maxnode, + &knodes); + if (rc) + return rc; + } + + + if (kargs->get.allowed.nodemask) { + task_lock(current); + knodes = cpuset_current_mems_allowed; + task_unlock(current); + rc = copy_nodes_to_user(kargs->get.allowed.nodemask, + kargs->get.allowed.maxnode, + &knodes); + if (rc) + return rc; + } + + if (kargs->get.addr.addr) { + struct mempolicy *addr_pol; + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = 
kargs->get.addr.addr; + + /* + * Do NOT fall back to task policy if the vma/shared policy + * at addr is NULL. Return MPOL_DEFAULT in this case. + */ + mmap_read_lock(mm); + vma = vma_lookup(mm, addr); + if (!vma) { + mmap_read_unlock(mm); + return -EFAULT; + } + if (vma->vm_ops && vma->vm_ops->get_policy) + addr_pol = vma->vm_ops->get_policy(vma, addr); + else + addr_pol = vma->vm_policy; + + kargs->get.addr.mode = addr_pol->mode; + /* Mask off internal flags */ + kargs->get.addr.flags = (pol->flags & MPOL_MODE_FLAGS); + + /* + * Take a refcount on the mpol, because we are about to + * drop the mmap_lock, after which only "pol" remains + * valid, "vma" is stale. + */ + vma = NULL; + mpol_get(addr_pol); + mmap_read_unlock(mm); + rc = lookup_node(mm, addr); + mpol_put(addr_pol); + if (rc < 0) + return rc; + kargs->get.addr.node = rc; + } + + switch (kargs->mode) { + case MPOL_INTERLEAVE: + kargs->interleave.next_node = next_node_in(current->il_prev, + pol->nodes); + rc = 0; + break; + default: + BUG(); + } + + return rc; +} + +static long kernel_get_mempolicy2(struct mempolicy_args __user *uargs, + size_t usize) +{ + struct mempolicy_args kargs; + int err; + + if (usize < sizeof(kargs)) + return -EINVAL; + + err = copy_struct_from_user(&kargs, sizeof(kargs), uargs, usize); + if (err) + return err; + + /* Get the extended memory policy information (kargs.ext) */ + err = do_get_mempolicy2(&kargs); + if (err) + return err; + + err = copy_to_user(uargs, &kargs, sizeof(kargs)); + + return err; +} + +SYSCALL_DEFINE2(get_mempolicy2, struct mempolicy_args __user *, policy, + size_t, size) +{ + return kernel_get_mempolicy2(policy, size); +} + static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) From patchwork Tue Oct 3 00:21:55 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gregory Price X-Patchwork-Id: 13406638 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id E0815E776DF for ; Tue, 3 Oct 2023 00:22:19 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S238908AbjJCAWU (ORCPT ); Mon, 2 Oct 2023 20:22:20 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:59966 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S237475AbjJCAWL (ORCPT ); Mon, 2 Oct 2023 20:22:11 -0400 Received: from mail-oa1-x43.google.com (mail-oa1-x43.google.com [IPv6:2001:4860:4864:20::43]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id D3998CE; Mon, 2 Oct 2023 17:22:07 -0700 (PDT) Received: by mail-oa1-x43.google.com with SMTP id 586e51a60fabf-1e10ba12fd3so208593fac.1; Mon, 02 Oct 2023 17:22:07 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1696292527; x=1696897327; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=M+oV1ZgyPCblrOh65x12ZqFYHoDSqMkCfYV5/Np3CLc=; b=iGSn5mlZ9uN3glRU+nqG/iL5dxscvtmbjxqGcGlkJ5bpoEhj6J7YK32DjiU+YDpTHF JwHBR3hWl4rX4aziRVgpcehOK2oBbbZ5Yft9lCQpVAVZYOQvdQrxUn3CI7Nrcwmzb8C/ q2bPwfQqpUVUCgR1znVzdHePViE+1Ok3aozUy6dFrP1IP1Hz/W/N+8yCdhjeERleHGQn 2irrnvlotJ2EK1840pqfQD79wXfqaklLX/T0ynZSGxz4cHIv3DsBygbSmcHeTh9ZiUAU 
From: Gregory Price
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org, linux-arch@vger.kernel.org,
    linux-api@vger.kernel.org, linux-cxl@vger.kernel.org, luto@kernel.org,
    tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
    dave.hansen@linux.intel.com, hpa@zytor.com, arnd@arndb.de,
    akpm@linux-foundation.org, x86@kernel.org, Gregory Price
Subject: [RFC PATCH v2 3/4] mm/mempolicy: implement a preferred-interleave
Date: Mon, 2 Oct 2023 20:21:55 -0400
Message-Id: <20231003002156.740595-4-gregory.price@memverge.com>
In-Reply-To: <20231003002156.740595-1-gregory.price@memverge.com>
References: <20231003002156.740595-1-gregory.price@memverge.com>

The preferred-interleave mempolicy implements a single-weight interleave
mechanism where the preferred node is the local node. If the local node is
not set in the nodemask, the first node in the nodemask is the preferred
node. When set, N (weight) pages will be allocated on the preferred node
before an interleave pass occurs.

For example:
  nodes=0,1,2
  interval=3
  cpunode=0

Over 10 consecutive allocations, the following nodes will be selected:
  [0,0,0,1,2,0,0,0,1,2]

In this example, there is a 60%/20%/20% distribution of memory.

Using this mechanism, it becomes possible to define an approximate
distribution percentage of memory across a set of nodes:

  local_node% : interval/(interval + (nr_nodes-1))
  other_node% : (1 - local_node%)/(nr_nodes-1)

(With interval=3 and 3 nodes this gives 3/5 = 60% local and 20% for each
other node, matching the example above.)

This behavior can be preferable to a fully-weighted interleave (where each
node has a separate weight) when migrations or multiple sockets may be in
use. If a task migrates, the weight applies to the new local node without a
need for the task to "rebalance" its weights. Similarly, if nodes are
removed from the nodemask, no weights need to be recalculated. The exception
to this is when the local node is removed from the nodemask, which is a rare
situation.

Similarly, consider a task executing on a 2-socket system which creates a
new thread.
If the first thread is scheduled to execute on socket 0 and the second thread is scheduled to execute on socket 1, weightings set by thread 1 (which are inherited by thread 2) would very likely be a poor interleave strategy for the new thread. In this scheme, thread 2 would inherit the same weight, but it would apply to the local node of thread 2, leading to more predictable behavior for new allocations. Signed-off-by: Gregory Price --- include/linux/mempolicy.h | 8 ++ include/uapi/linux/mempolicy.h | 6 + mm/mempolicy.c | 203 ++++++++++++++++++++++++++++++++- 3 files changed, 212 insertions(+), 5 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index d232de7cdc56..8f918488c61c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -48,6 +48,14 @@ struct mempolicy { nodemask_t nodes; /* interleave/bind/perfer */ int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ + union { + /* Preferred Interleave: Weight local, then interleave */ + struct { + int weight; + int count; + } pil; + }; + union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ nodemask_t user_nodemask; /* nodemask passed by user */ diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index ea386872094b..41c35f404c5e 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -24,6 +24,7 @@ enum { MPOL_LOCAL, MPOL_PREFERRED_MANY, MPOL_LEGACY, /* set_mempolicy limited to above modes */ + MPOL_PREFERRED_INTERLEAVE, MPOL_MAX, /* always last member of enum */ }; @@ -52,6 +53,11 @@ struct mempolicy_args { struct { unsigned long next_node; /* get only */ } interleave; + /* Partial interleave */ + struct { + unsigned long weight; /* get and set */ + unsigned long next_node; /* get only */ + } pil; }; }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 936c641f554e..6374312cef5f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -399,6 +399,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, + [MPOL_PREFERRED_INTERLEAVE] = { + .create = mpol_new_nodemask, + .rebind = mpol_rebind_nodemask, + }, [MPOL_PREFERRED] = { .create = mpol_new_preferred, .rebind = mpol_rebind_preferred, @@ -873,7 +877,8 @@ static long replace_mempolicy(struct mempolicy *new, nodemask_t *nodes) old = current->mempolicy; current->mempolicy = new; - if (new && new->mode == MPOL_INTERLEAVE) + if (new && (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_PREFERRED_INTERLEAVE)) current->il_prev = MAX_NUMNODES-1; out: task_unlock(current); @@ -915,6 +920,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) switch (p->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_PREFERRED_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: *nodes = p->nodes; @@ -1609,6 +1615,23 @@ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, return kernel_set_mempolicy(mode, nmask, maxnode); } +static long do_set_preferred_interleave(struct mempolicy_args *args, + struct mempolicy *new, + nodemask_t *nodes) +{ + /* Preferred interleave cannot be done with no nodemask */ + if (nodes_empty(*nodes)) + return -EINVAL; + + /* Preferred interleave weight cannot be <= 0 */ + if (args->pil.weight <= 0) + return -EINVAL; + + new->pil.weight = args->pil.weight; + new->pil.count = 0; + return 0; +} + static long do_set_mempolicy2(struct mempolicy_args *args) { struct mempolicy *new = NULL; @@ -1630,6 +1653,9 @@ static 
long do_set_mempolicy2(struct mempolicy_args *args) return PTR_ERR(new); switch (args->mode) { + case MPOL_PREFERRED_INTERLEAVE: + err = do_set_preferred_interleave(args, new, &nodes); + break; default: BUG(); } @@ -1767,6 +1793,12 @@ static long do_get_mempolicy2(struct mempolicy_args *kargs) pol->nodes); rc = 0; break; + case MPOL_PREFERRED_INTERLEAVE: + kargs->pil.next_node = next_node_in(current->il_prev, + pol->nodes); + kargs->pil.weight = pol->pil.weight; + rc = 0; + break; default: BUG(); } @@ -2102,12 +2134,41 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) return nd; } +static unsigned int preferred_interleave_nodes(struct mempolicy *policy) +{ + int mynode = numa_node_id(); + struct task_struct *me = current; + int next; + + /* + * If the local node is not in the node mask, we treat the + * lowest node as the preferred node. This can happen if the + * cpu is bound to a node that is not present in the mempolicy + */ + if (!node_isset(mynode, policy->nodes)) + mynode = first_node(policy->nodes); + + next = next_node_in(me->il_prev, policy->nodes); + if (next == mynode) { + if (++policy->pil.count >= policy->pil.weight) { + policy->pil.count = 0; + me->il_prev = next; + } + } else if (next < MAX_NUMNODES) { + me->il_prev = next; + } + return next; +} + /* Do dynamic interleaving for a process */ static unsigned interleave_nodes(struct mempolicy *policy) { unsigned next; struct task_struct *me = current; + if (policy->mode == MPOL_PREFERRED_INTERLEAVE) + return preferred_interleave_nodes(policy); + next = next_node_in(me->il_prev, policy->nodes); if (next < MAX_NUMNODES) me->il_prev = next; @@ -2135,6 +2196,7 @@ unsigned int mempolicy_slab_node(void) return first_node(policy->nodes); case MPOL_INTERLEAVE: + case MPOL_PREFERRED_INTERLEAVE: return interleave_nodes(policy); case MPOL_BIND: @@ -2161,6 +2223,56 @@ unsigned int mempolicy_slab_node(void) } } +static unsigned int offset_pil_node(struct mempolicy *pol, unsigned long n) +{ + nodemask_t nodemask = pol->nodes; + unsigned int target, nnodes; + int i; + int nid = MAX_NUMNODES; + int weight = pol->pil.weight; + + /* + * The barrier will stabilize the nodemask in a register or on + * the stack so that it will stop changing under the code. + * + * Between first_node() and next_node(), pol->nodes could be changed + * by other threads. So we put pol->nodes in a local stack. + */ + barrier(); + + nnodes = nodes_weight(nodemask); + + /* + * If the local node ID is not set (cpu is bound to a node + * but that node is not set in the memory nodemask), interleave + * based on the lowest set node. + */ + nid = numa_node_id(); + if (!node_isset(nid, nodemask)) + nid = first_node(nodemask); + /* + * Mode or weight can change so default to basic interleave + * if the weight has become invalid. Basic interleave is + * equivalent to weight=1. Don't double-count the base node + */ + if (weight == 0) + weight = 1; + weight -= 1; + + /* If target <= the weight, no need to call next_node */ + target = ((unsigned int)n % (nnodes + weight)); + target -= (target > weight) ? weight : target; + target %= MAX_NUMNODES; + + /* Target may not be the first node, so use next_node_in to wrap */ + for (i = 0; i < target; i++) { + nid = next_node_in(nid, nodemask); + if (nid == MAX_NUMNODES) + nid = first_node(nodemask); + } + return nid; +} + /* * Do static interleaving for a VMA with known offset @n. 
Returns the n'th * node in pol->nodes (starting from n=0), wrapping around if n exceeds the @@ -2168,10 +2280,16 @@ unsigned int mempolicy_slab_node(void) */ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) { - nodemask_t nodemask = pol->nodes; + nodemask_t nodemask; unsigned int target, nnodes; int i; int nid; + + if (pol->mode == MPOL_PREFERRED_INTERLEAVE) + return offset_pil_node(pol, n); + + nodemask = pol->nodes; + /* * The barrier will stabilize the nodemask in a register or on * the stack so that it will stop changing under the code. @@ -2239,7 +2357,8 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, *nodemask = NULL; mode = (*mpol)->mode; - if (unlikely(mode == MPOL_INTERLEAVE)) { + if (unlikely(mode == MPOL_INTERLEAVE) || + unlikely(mode == MPOL_PREFERRED_INTERLEAVE)) { nid = interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))); } else { @@ -2280,6 +2399,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_PREFERRED_INTERLEAVE: *mask = mempolicy->nodes; break; @@ -2390,7 +2510,8 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, pol = get_vma_policy(vma, addr); - if (pol->mode == MPOL_INTERLEAVE) { + if (pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_PREFERRED_INTERLEAVE) { struct page *page; unsigned nid; @@ -2492,7 +2613,8 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) * No reference counting needed for current->mempolicy * nor system default_policy */ - if (pol->mode == MPOL_INTERLEAVE) + if (pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_PREFERRED_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else if (pol->mode == MPOL_PREFERRED_MANY) page = alloc_pages_preferred_many(gfp, order, @@ -2552,6 +2674,69 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, return total_allocated; } +static unsigned long alloc_pages_bulk_array_pil(gfp_t gfp, + struct mempolicy *pol, + unsigned long nr_pages, + struct page **page_array) +{ + nodemask_t nodemask = pol->nodes; + unsigned long nr_pages_main; + unsigned long nr_pages_other; + unsigned long total_cycle; + unsigned long delta; + unsigned long weight; + int allocated = 0; + int start_nid; + int nnodes; + int prev, next; + int i; + + /* This stabilizes nodes on the stack incase pol->nodes changes */ + barrier(); + + nnodes = nodes_weight(nodemask); + start_nid = numa_node_id(); + + if (!node_isset(start_nid, nodemask)) + start_nid = first_node(nodemask); + + if (nnodes == 1) { + allocated = __alloc_pages_bulk(gfp, start_nid, + NULL, nr_pages_main, + NULL, page_array); + return allocated; + } + /* We don't want to double-count the main node in calculations */ + nnodes--; + + weight = pol->pil.weight; + total_cycle = (weight + nnodes); + /* Number of pages on main node: (cycles*weight + up to weight) */ + nr_pages_main = ((nr_pages / total_cycle) * weight); + nr_pages_main += (nr_pages % total_cycle % (weight + 1)); + /* Number of pages on others: (remaining/nodes) + 1 page if delta */ + nr_pages_other = (nr_pages - nr_pages_main) / nnodes; + nr_pages_other /= nnodes; + /* Delta is number of pages beyond weight up to full cycle */ + delta = nr_pages - (nr_pages_main + (nr_pages_other * nnodes)); + + /* start by allocating for the main node, then interleave rest */ + prev = start_nid; + allocated = __alloc_pages_bulk(gfp, start_nid, NULL, nr_pages_main, + NULL, page_array); + for (i = 0; i < nnodes; i++) 
{ + int pages = nr_pages_other + (delta-- ? 1 : 0); + + next = next_node_in(prev, nodemask); + if (next < MAX_NUMNODES) + prev = next; + allocated += __alloc_pages_bulk(gfp, next, NULL, pages, + NULL, page_array); + } + + return allocated; +} + static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) @@ -2590,6 +2775,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, return alloc_pages_bulk_array_interleave(gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_PREFERRED_INTERLEAVE) + return alloc_pages_bulk_array_pil(gfp, pol, nr_pages, + page_array); + if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); @@ -2662,6 +2851,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) switch (a->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_PREFERRED_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: return !!nodes_equal(a->nodes, b->nodes); @@ -2798,6 +2988,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long switch (pol->mode) { case MPOL_INTERLEAVE: + case MPOL_PREFERRED_INTERLEAVE: pgoff = vma->vm_pgoff; pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; polnid = offset_il_node(pol, pgoff); @@ -3185,6 +3376,7 @@ static const char * const policy_modes[] = [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", + [MPOL_PREFERRED_INTERLEAVE] = "preferred interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; @@ -3355,6 +3547,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_PREFERRED_INTERLEAVE: nodes = pol->nodes; break; default: From patchwork Tue Oct 3 00:21:56 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Gregory Price X-Patchwork-Id: 13406640 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id D5F07E776D4 for ; Tue, 3 Oct 2023 00:22:22 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S238959AbjJCAWW (ORCPT ); Mon, 2 Oct 2023 20:22:22 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:48752 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S238487AbjJCAWT (ORCPT ); Mon, 2 Oct 2023 20:22:19 -0400 Received: from mail-oa1-x44.google.com (mail-oa1-x44.google.com [IPv6:2001:4860:4864:20::44]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 0112BE1; Mon, 2 Oct 2023 17:22:09 -0700 (PDT) Received: by mail-oa1-x44.google.com with SMTP id 586e51a60fabf-1e10ba12fd3so208605fac.1; Mon, 02 Oct 2023 17:22:09 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1696292529; x=1696897329; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=qMDruOuLBm4Gji0WOsN7NL1T5EMDFv1mGIaADmwpZLQ=; b=jgyB//XTpIumAx1eopl01dscE+Ri9QRfEYp40wGKollDMZHgb92BiKXcNJ4MxP0Xe4 jrW3jiRhk7y5+miCCLq8PUa9eVgcDTTpsg81Y+cpzSwr6oauwTg1Udr36l7Uln9YtJtF reDuGwJ/73LlWe+85G4zFQo/0yx5rniGwenNv6xUryOW1KdGuuKa4yAlyliGNyCC4zQG 
From: Gregory Price
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org, linux-arch@vger.kernel.org,
    linux-api@vger.kernel.org, linux-cxl@vger.kernel.org, luto@kernel.org,
    tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
    dave.hansen@linux.intel.com, hpa@zytor.com, arnd@arndb.de,
    akpm@linux-foundation.org, x86@kernel.org, Gregory Price
Subject: [RFC PATCH v2 4/4] mm/mempolicy: implement a weighted-interleave
Date: Mon, 2 Oct 2023 20:21:56 -0400
Message-Id: <20231003002156.740595-5-gregory.price@memverge.com>
In-Reply-To: <20231003002156.740595-1-gregory.price@memverge.com>
References: <20231003002156.740595-1-gregory.price@memverge.com>

The weighted-interleave mempolicy implements per-node weights which are
used to distribute memory while interleaving.

For example:
  nodes:   0,1,2
  weights: 5,3,2

Over 10 consecutive allocations, the following nodes will be selected:
  [0,0,0,0,0,1,1,1,2,2]

In this example there is a 50%/30%/20% distribution of memory across the
enabled nodes.

If a node is enabled, its weight is expected to be at least 1. If an enabled
node ends up with a weight of 0 (as can happen if weights are being
recalculated due to a cgroup mask update), a minimum of 1 is applied during
the interleave mechanism.
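A rough userspace sketch of how a caller might select this policy through
the set_mempolicy2() syscall added in patch 2 (illustration only, not part
of this series' code; it assumes the uapi struct layout and the x86_64
syscall number 454 proposed in this RFC, and a kernel MAX_NUMNODES of at
most 1024 for the weight buffer):

	/*
	 * Hedged sketch: requires a kernel built with this RFC whose patched
	 * uapi <linux/mempolicy.h> provides struct mempolicy_args and
	 * MPOL_WEIGHTED_INTERLEAVE.  The kernel copies MAX_NUMNODES weight
	 * bytes, so the 1024-byte buffer assumes MAX_NUMNODES <= 1024.
	 */
	#include <linux/mempolicy.h>	/* patched uapi header from this series */
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_set_mempolicy2
	#define __NR_set_mempolicy2 454	/* x86_64 number proposed in patch 2 */
	#endif

	int main(void)
	{
		unsigned long nodemask = 0x7;		/* nodes 0, 1 and 2 */
		unsigned char weights[1024] = { 0 };	/* indexed by node id */
		struct mempolicy_args args;

		weights[0] = 5;		/* ~50% of interleaved pages */
		weights[1] = 3;		/* ~30% */
		weights[2] = 2;		/* ~20% */

		memset(&args, 0, sizeof(args));
		args.mode = MPOL_WEIGHTED_INTERLEAVE;
		args.nodemask = &nodemask;
		args.maxnode = 64;		/* bits in the single-word mask */
		args.wil.weights = weights;

		if (syscall(__NR_set_mempolicy2, &args, sizeof(args)))
			perror("set_mempolicy2");

		return 0;
	}

In the same proposal, get_mempolicy2() (number 455) fills the structure back
in, returning the effective weights and the next interleave target via the
wil member.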
Signed-off-by: Gregory Price --- include/linux/mempolicy.h | 6 + include/uapi/linux/mempolicy.h | 6 + mm/mempolicy.c | 261 ++++++++++++++++++++++++++++++++- 3 files changed, 269 insertions(+), 4 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 8f918488c61c..8763e536d4a2 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -54,6 +54,12 @@ struct mempolicy { int weight; int count; } pil; + /* weighted interleave */ + struct { + unsigned int il_weight; + unsigned char cur_weight; + unsigned char weights[MAX_NUMNODES]; + } wil; }; union { diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 41c35f404c5e..913ca9bf9af7 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -25,6 +25,7 @@ enum { MPOL_PREFERRED_MANY, MPOL_LEGACY, /* set_mempolicy limited to above modes */ MPOL_PREFERRED_INTERLEAVE, + MPOL_WEIGHTED_INTERLEAVE, MPOL_MAX, /* always last member of enum */ }; @@ -58,6 +59,11 @@ struct mempolicy_args { unsigned long weight; /* get and set */ unsigned long next_node; /* get only */ } pil; + /* Weighted interleave */ + struct { + unsigned long next_node; /* get only */ + unsigned char *weights; /* get and set */ + } wil; }; }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6374312cef5f..92be74d4c431 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -195,11 +195,43 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, nodes_onto(*ret, tmp, *rel); } +static void mpol_recalculate_weights(struct mempolicy *pol) +{ + unsigned int il_weight = 0; + int node; + + /* Recalculate weights to ensure minimum node weight */ + for (node = 0; node < MAX_NUMNODES; node++) { + if (!node_isset(node, pol->nodes) && pol->wil.weights[node]) { + /* If node is not set, weight should be 0 */ + pol->wil.weights[node] = 0; + } else if (!pol->wil.weights[node]) { + /* If node is set, weight should be minimum of 1 */ + pol->wil.weights[node] = 1; + pol->wil.il_weight += 1; + il_weight += 1; + } else { + /* Otherwise, keep the existing weight */ + il_weight += pol->wil.weights[node]; + } + } + pol->wil.il_weight = il_weight; + /* + * It's possible an allocation has been occurring at this point + * force it to go to the next node, since we just changed weights + */ + pol->wil.cur_weight = 0; +} + static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; pol->nodes = *nodes; + + if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) + mpol_recalculate_weights(pol); + return 0; } @@ -334,6 +366,10 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) tmp = *nodes; pol->nodes = tmp; + + /* After a change to the nodemask, weights must be recalculated */ + if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) + mpol_recalculate_weights(pol); } static void mpol_rebind_preferred(struct mempolicy *pol, @@ -403,6 +439,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, + [MPOL_WEIGHTED_INTERLEAVE] = { + .create = mpol_new_nodemask, + .rebind = mpol_rebind_nodemask, + }, [MPOL_PREFERRED] = { .create = mpol_new_preferred, .rebind = mpol_rebind_preferred, @@ -878,8 +918,10 @@ static long replace_mempolicy(struct mempolicy *new, nodemask_t *nodes) old = current->mempolicy; current->mempolicy = new; if (new && (new->mode == MPOL_INTERLEAVE || - new->mode == MPOL_PREFERRED_INTERLEAVE)) + new->mode == MPOL_PREFERRED_INTERLEAVE || + new->mode 
== MPOL_WEIGHTED_INTERLEAVE)) current->il_prev = MAX_NUMNODES-1; + out: task_unlock(current); mpol_put(old); @@ -921,6 +963,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: *nodes = p->nodes; @@ -1632,6 +1675,56 @@ static long do_set_preferred_interleave(struct mempolicy_args *args, return 0; } +static long do_set_weighted_interleave(struct mempolicy_args *args, + struct mempolicy *new, + nodemask_t *nodes) +{ + unsigned char weight; + unsigned char *weights; + int node; + int ret = 0; + + /* Weighted interleave cannot be done with no nodemask */ + if (nodes_empty(*nodes)) + return -EINVAL; + + /* Weighted interleave requires a set of weights */ + if (!args->wil.weights) + return -EINVAL; + + weights = kmalloc(MAX_NUMNODES, GFP_KERNEL); + if (!weights) + return -ENOMEM; + + ret = copy_from_user(weights, args->wil.weights, MAX_NUMNODES); + if (ret) { + ret = -EFAULT; + goto weights_out; + } + + new->wil.cur_weight = 0; + new->wil.il_weight = 0; + memset(new->wil.weights, 0, sizeof(new->wil.weights)); + + /* Weights for set nodes cannot be 0 */ + node = first_node(*nodes); + while (node != MAX_NUMNODES) { + weight = weights[node]; + if (!weight) { + ret = -EINVAL; + goto weights_out; + } + /* policy creation initializes total to nr_nodes, adjust it */ + new->wil.il_weight += weight; + new->wil.weights[node] = weight; + node = next_node(node, *nodes); + } + +weights_out: + kfree(weights); + return ret; +} + static long do_set_mempolicy2(struct mempolicy_args *args) { struct mempolicy *new = NULL; @@ -1656,6 +1749,9 @@ static long do_set_mempolicy2(struct mempolicy_args *args) case MPOL_PREFERRED_INTERLEAVE: err = do_set_preferred_interleave(args, new, &nodes); break; + case MPOL_WEIGHTED_INTERLEAVE: + err = do_set_weighted_interleave(args, new, &nodes); + break; default: BUG(); } @@ -1799,6 +1895,12 @@ static long do_get_mempolicy2(struct mempolicy_args *kargs) kargs->pil.weight = pol->pil.weight; rc = 0; break; + case MPOL_WEIGHTED_INTERLEAVE: + kargs->wil.next_node = next_node_in(current->il_prev, + pol->nodes); + rc = copy_to_user(kargs->wil.weights, pol->wil.weights, + MAX_NUMNODES); + break; default: BUG(); } @@ -2160,6 +2262,27 @@ static unsigned int preferred_interleave_nodes(struct mempolicy *policy) return next; } +static unsigned int weighted_interleave_nodes(struct mempolicy *policy) +{ + unsigned int next; + unsigned char next_weight; + struct task_struct *me = current; + + /* When weight reaches 0, we're on a new node, reset the weight */ + next = next_node_in(me->il_prev, policy->nodes); + if (!policy->wil.cur_weight) { + /* If the node is set, at least 1 allocation is required */ + next_weight = policy->wil.weights[next]; + policy->wil.cur_weight = next_weight ? 
next_weight : 1; + } + + policy->wil.cur_weight--; + if (next < MAX_NUMNODES && !policy->wil.cur_weight) + me->il_prev = next; + + return next; +} + /* Do dynamic interleaving for a process */ static unsigned interleave_nodes(struct mempolicy *policy) { @@ -2168,6 +2291,8 @@ static unsigned interleave_nodes(struct mempolicy *policy) if (policy->mode == MPOL_PREFERRED_INTERLEAVE) return preferred_interleave_nodes(policy); + else if (policy->mode == MPOL_WEIGHTED_INTERLEAVE) + return weighted_interleave_nodes(policy); next = next_node_in(me->il_prev, policy->nodes); if (next < MAX_NUMNODES) @@ -2197,6 +2322,7 @@ unsigned int mempolicy_slab_node(void) case MPOL_INTERLEAVE: case MPOL_PREFERRED_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: return interleave_nodes(policy); case MPOL_BIND: @@ -2273,6 +2399,40 @@ static unsigned int offset_pil_node(struct mempolicy *pol, unsigned long n) return nid; } +static unsigned int offset_wil_node(struct mempolicy *pol, unsigned long n) +{ + nodemask_t nodemask = pol->nodes; + unsigned int target, nnodes; + unsigned char weight; + int nid; + + /* + * The barrier will stabilize the nodemask in a register or on + * the stack so that it will stop changing under the code. + * + * Between first_node() and next_node(), pol->nodes could be changed + * by other threads. So we put pol->nodes in a local stack. + */ + barrier(); + + nnodes = nodes_weight(nodemask); + if (!nnodes) + return numa_node_id(); + target = (unsigned int)n % pol->wil.il_weight; + nid = first_node(nodemask); + while (target) { + weight = pol->wil.weights[nid]; + /* If weights are being recaculated, revert to interleave */ + if (!weight) + weight = 1; + if (target < weight) + break; + target -= weight; + nid = next_node_in(nid, nodemask); + } + return nid; +} + /* * Do static interleaving for a VMA with known offset @n. 
Returns the n'th * node in pol->nodes (starting from n=0), wrapping around if n exceeds the @@ -2287,6 +2447,8 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) if (pol->mode == MPOL_PREFERRED_INTERLEAVE) return offset_pil_node(pol, n); + else if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) + return offset_wil_node(pol, n); nodemask = pol->nodes; @@ -2358,7 +2520,8 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, mode = (*mpol)->mode; if (unlikely(mode == MPOL_INTERLEAVE) || - unlikely(mode == MPOL_PREFERRED_INTERLEAVE)) { + unlikely(mode == MPOL_PREFERRED_INTERLEAVE) || + unlikely(mode == MPOL_WEIGHTED_INTERLEAVE)) { nid = interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))); } else { @@ -2400,6 +2563,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: *mask = mempolicy->nodes; break; @@ -2511,7 +2675,8 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, pol = get_vma_policy(vma, addr); if (pol->mode == MPOL_INTERLEAVE || - pol->mode == MPOL_PREFERRED_INTERLEAVE) { + pol->mode == MPOL_PREFERRED_INTERLEAVE || + pol->mode == MPOL_WEIGHTED_INTERLEAVE) { struct page *page; unsigned nid; @@ -2614,7 +2779,8 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) * nor system default_policy */ if (pol->mode == MPOL_INTERLEAVE || - pol->mode == MPOL_PREFERRED_INTERLEAVE) + pol->mode == MPOL_PREFERRED_INTERLEAVE || + pol->mode == MPOL_WEIGHTED_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else if (pol->mode == MPOL_PREFERRED_MANY) page = alloc_pages_preferred_many(gfp, order, @@ -2737,6 +2903,84 @@ static unsigned long alloc_pages_bulk_array_pil(gfp_t gfp, return allocated; } +static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + struct task_struct *me = current; + unsigned long total_allocated = 0; + unsigned long nr_allocated; + unsigned long rounds; + unsigned long node_pages, delta; + unsigned char weight; + int nnodes, node, prev_node; + int i; + + nnodes = nodes_weight(pol->nodes); + /* Continue allocating from most recent node and adjust the nr_pages */ + if (pol->wil.cur_weight) { + node = next_node_in(me->il_prev, pol->nodes); + node_pages = pol->wil.cur_weight; + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + NULL, page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + /* if that's all the pages, no need to interleave */ + if (nr_pages <= pol->wil.cur_weight) { + pol->wil.cur_weight -= nr_pages; + return total_allocated; + } + /* Otherwise we adjust nr_pages down, and continue from there */ + nr_pages -= pol->wil.cur_weight; + pol->wil.cur_weight = 0; + prev_node = node; + } + + /* Now we can continue allocating from this point */ + rounds = nr_pages / pol->wil.il_weight; + delta = nr_pages % pol->wil.il_weight; + for (i = 0; i < nnodes; i++) { + node = next_node_in(prev_node, pol->nodes); + weight = pol->wil.weights[node]; + node_pages = weight * rounds; + if (delta) { + if (delta > weight) { + node_pages += weight; + delta -= weight; + } else { + node_pages += delta; + delta = 0; + } + } + /* We may not make it all the way around */ + if (!node_pages) + break; + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, + NULL, page_array); + page_array += nr_allocated; + total_allocated += nr_allocated; + 
prev_node = node; + } + + /* + * Finally, we need to update me->il_prev and pol->wil.cur_weight + * if there were overflow pages, but not equivalent to the node + * weight, set the cur_weight to node_weight - delta and the + * me->il_prev to the previous node. Otherwise if it was perfect + * we can simply set il_prev to node and cur_weight to 0 + */ + delta %= weight; + if (node_pages) { + me->il_prev = prev_node; + pol->wil.cur_weight = pol->wil.weights[node] - node_pages; + } else { + me->il_prev = node; + pol->wil.cur_weight = 0; + } + + return total_allocated; +} + static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) @@ -2779,6 +3023,11 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, return alloc_pages_bulk_array_pil(gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) + return alloc_pages_bulk_array_weighted_interleave(gfp, pol, + nr_pages, + page_array); + if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); @@ -2852,6 +3101,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: return !!nodes_equal(a->nodes, b->nodes); @@ -2989,6 +3239,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long switch (pol->mode) { case MPOL_INTERLEAVE: case MPOL_PREFERRED_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: pgoff = vma->vm_pgoff; pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; polnid = offset_il_node(pol, pgoff); @@ -3377,6 +3628,7 @@ static const char * const policy_modes[] = [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_PREFERRED_INTERLEAVE] = "preferred interleave", + [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; @@ -3548,6 +3800,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED_INTERLEAVE: + case MPOL_WEIGHTED_INTERLEAVE: nodes = pol->nodes; break; default:
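
The static, offset-based selection used for VMA policies (offset_wil_node()
above) amounts to bucketing the page offset by cumulative node weight. A
small standalone sketch of that arithmetic, using the 5/3/2 example weights
from the commit message of patch 4 (illustration only, written against no
kernel APIs; it mirrors the intent of offset_wil_node() rather than its
exact code), reproduces the distribution shown there:

	#include <stdio.h>

	#define NR_NODES 3

	/* Map the nth page to a node by walking cumulative weight buckets. */
	static int weighted_node(unsigned long n, const unsigned char *weights)
	{
		unsigned int total = 0, target, nid;

		for (nid = 0; nid < NR_NODES; nid++)
			total += weights[nid];

		target = n % total;
		for (nid = 0; nid < NR_NODES; nid++) {
			if (target < weights[nid])
				return nid;
			target -= weights[nid];
		}
		return 0;	/* unreachable while total > 0 */
	}

	int main(void)
	{
		const unsigned char weights[NR_NODES] = { 5, 3, 2 };
		unsigned long n;

		/* Prints: 0 0 0 0 0 1 1 1 2 2 (the 50%/30%/20% split) */
		for (n = 0; n < 10; n++)
			printf("%d ", weighted_node(n, weights));
		printf("\n");
		return 0;
	}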