[RFC,v2,3/4] mm/mempolicy: implement a preferred-interleave

Message ID	20231003002156.740595-4-gregory.price@memverge.com (mailing list archive)
State	New
Headers	show Return-Path: <owner-linux-mm@kvack.org> From: Gregory Price <gourry.memverge@gmail.com> To: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org, linux-arch@vger.kernel.org, linux-api@vger.kernel.org, linux-cxl@vger.kernel.org, luto@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, hpa@zytor.com, arnd@arndb.de, akpm@linux-foundation.org, x86@kernel.org, Gregory Price <gregory.price@memverge.com> Subject: [RFC PATCH v2 3/4] mm/mempolicy: implement a preferred-interleave Date: Mon, 2 Oct 2023 20:21:55 -0400 Message-Id: <20231003002156.740595-4-gregory.price@memverge.com> In-Reply-To: <20231003002156.740595-1-gregory.price@memverge.com> References: <20231003002156.740595-1-gregory.price@memverge.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	mm/mempolicy: get/set_mempolicy2 syscalls \| expand [RFC,v2,0/4] mm/mempolicy: get/set_mempolicy2 syscalls [RFC,v2,3/4] mm/mempolicy: implement a preferred-interleave [RFC,v2,4/4] mm/mempolicy: implement a weighted-interleave

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index d232de7cdc56..8f918488c61c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -48,6 +48,14 @@ struct mempolicy { nodemask_t nodes; /* interleave/bind/perfer */ int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ + union { + /* Preferred Interleave: Weight local, then interleave */ + struct { + int weight; + int count; + } pil; + }; + union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ nodemask_t user_nodemask; /* nodemask passed by user */ diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index ea386872094b..41c35f404c5e 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -24,6 +24,7 @@ enum { MPOL_LOCAL, MPOL_PREFERRED_MANY, MPOL_LEGACY, /* set_mempolicy limited to above modes */ + MPOL_PREFERRED_INTERLEAVE, MPOL_MAX, /* always last member of enum */ }; @@ -52,6 +53,11 @@ struct mempolicy_args { struct { unsigned long next_node; /* get only */ } interleave; + /* Partial interleave */ + struct { + unsigned long weight; /* get and set */ + unsigned long next_node; /* get only */ + } pil; }; }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 936c641f554e..6374312cef5f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -399,6 +399,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, + [MPOL_PREFERRED_INTERLEAVE] = { + .create = mpol_new_nodemask, + .rebind = mpol_rebind_nodemask, + }, [MPOL_PREFERRED] = { .create = mpol_new_preferred, .rebind = mpol_rebind_preferred, @@ -873,7 +877,8 @@ static long replace_mempolicy(struct mempolicy *new, nodemask_t *nodes) old = current->mempolicy; current->mempolicy = new; - if (new && new->mode == MPOL_INTERLEAVE) + if (new && (new->mode == MPOL_INTERLEAVE || + new->mode == MPOL_PREFERRED_INTERLEAVE)) current->il_prev = MAX_NUMNODES-1; out: task_unlock(current); @@ -915,6 +920,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) switch (p->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_PREFERRED_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: *nodes = p->nodes; @@ -1609,6 +1615,23 @@ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, return kernel_set_mempolicy(mode, nmask, maxnode); } +static long do_set_preferred_interleave(struct mempolicy_args *args, + struct mempolicy *new, + nodemask_t *nodes) +{ + /* Preferred interleave cannot be done with no nodemask */ + if (nodes_empty(*nodes)) + return -EINVAL; + + /* Preferred interleave weight cannot be <= 0 */ + if (args->pil.weight <= 0) + return -EINVAL; + + new->pil.weight = args->pil.weight; + new->pil.count = 0; + return 0; +} + static long do_set_mempolicy2(struct mempolicy_args *args) { struct mempolicy *new = NULL; @@ -1630,6 +1653,9 @@ static long do_set_mempolicy2(struct mempolicy_args *args) return PTR_ERR(new); switch (args->mode) { + case MPOL_PREFERRED_INTERLEAVE: + err = do_set_preferred_interleave(args, new, &nodes); + break; default: BUG(); } @@ -1767,6 +1793,12 @@ static long do_get_mempolicy2(struct mempolicy_args *kargs) pol->nodes); rc = 0; break; + case MPOL_PREFERRED_INTERLEAVE: + kargs->pil.next_node = next_node_in(current->il_prev, + pol->nodes); + kargs->pil.weight = pol->pil.weight; + rc = 0; + break; default: BUG(); } @@ -2102,12 +2134,41 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) return nd; } +static unsigned int preferred_interleave_nodes(struct mempolicy *policy) +{ + int mynode = numa_node_id(); + struct task_struct *me = current; + int next; + + /* + * If the local node is not in the node mask, we treat the + * lowest node as the preferred node. This can happen if the + * cpu is bound to a node that is not present in the mempolicy + */ + if (!node_isset(mynode, policy->nodes)) + mynode = first_node(policy->nodes); + + next = next_node_in(me->il_prev, policy->nodes); + if (next == mynode) { + if (++policy->pil.count >= policy->pil.weight) { + policy->pil.count = 0; + me->il_prev = next; + } + } else if (next < MAX_NUMNODES) { + me->il_prev = next; + } + return next; +} + /* Do dynamic interleaving for a process */ static unsigned interleave_nodes(struct mempolicy *policy) { unsigned next; struct task_struct *me = current; + if (policy->mode == MPOL_PREFERRED_INTERLEAVE) + return preferred_interleave_nodes(policy); + next = next_node_in(me->il_prev, policy->nodes); if (next < MAX_NUMNODES) me->il_prev = next; @@ -2135,6 +2196,7 @@ unsigned int mempolicy_slab_node(void) return first_node(policy->nodes); case MPOL_INTERLEAVE: + case MPOL_PREFERRED_INTERLEAVE: return interleave_nodes(policy); case MPOL_BIND: @@ -2161,6 +2223,56 @@ unsigned int mempolicy_slab_node(void) } } +static unsigned int offset_pil_node(struct mempolicy *pol, unsigned long n) +{ + nodemask_t nodemask = pol->nodes; + unsigned int target, nnodes; + int i; + int nid = MAX_NUMNODES; + int weight = pol->pil.weight; + + /* + * The barrier will stabilize the nodemask in a register or on + * the stack so that it will stop changing under the code. + * + * Between first_node() and next_node(), pol->nodes could be changed + * by other threads. So we put pol->nodes in a local stack. + */ + barrier(); + + nnodes = nodes_weight(nodemask); + + /* + * If the local node ID is not set (cpu is bound to a node + * but that node is not set in the memory nodemask), interleave + * based on the lowest set node. + */ + nid = numa_node_id(); + if (!node_isset(nid, nodemask)) + nid = first_node(nodemask); + /* + * Mode or weight can change so default to basic interleave + * if the weight has become invalid. Basic interleave is + * equivalent to weight=1. Don't double-count the base node + */ + if (weight == 0) + weight = 1; + weight -= 1; + + /* If target <= the weight, no need to call next_node */ + target = ((unsigned int)n % (nnodes + weight)); + target -= (target > weight) ? weight : target; + target %= MAX_NUMNODES; + + /* Target may not be the first node, so use next_node_in to wrap */ + for (i = 0; i < target; i++) { + nid = next_node_in(nid, nodemask); + if (nid == MAX_NUMNODES) + nid = first_node(nodemask); + } + return nid; +} + /* * Do static interleaving for a VMA with known offset @n. Returns the n'th * node in pol->nodes (starting from n=0), wrapping around if n exceeds the @@ -2168,10 +2280,16 @@ unsigned int mempolicy_slab_node(void) */ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) { - nodemask_t nodemask = pol->nodes; + nodemask_t nodemask; unsigned int target, nnodes; int i; int nid; + + if (pol->mode == MPOL_PREFERRED_INTERLEAVE) + return offset_pil_node(pol, n); + + nodemask = pol->nodes; + /* * The barrier will stabilize the nodemask in a register or on * the stack so that it will stop changing under the code. @@ -2239,7 +2357,8 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, *nodemask = NULL; mode = (*mpol)->mode; - if (unlikely(mode == MPOL_INTERLEAVE)) { + if (unlikely(mode == MPOL_INTERLEAVE) || + unlikely(mode == MPOL_PREFERRED_INTERLEAVE)) { nid = interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))); } else { @@ -2280,6 +2399,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_PREFERRED_INTERLEAVE: *mask = mempolicy->nodes; break; @@ -2390,7 +2510,8 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, pol = get_vma_policy(vma, addr); - if (pol->mode == MPOL_INTERLEAVE) { + if (pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_PREFERRED_INTERLEAVE) { struct page *page; unsigned nid; @@ -2492,7 +2613,8 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) * No reference counting needed for current->mempolicy * nor system default_policy */ - if (pol->mode == MPOL_INTERLEAVE) + if (pol->mode == MPOL_INTERLEAVE || + pol->mode == MPOL_PREFERRED_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else if (pol->mode == MPOL_PREFERRED_MANY) page = alloc_pages_preferred_many(gfp, order, @@ -2552,6 +2674,69 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, return total_allocated; } +static unsigned long alloc_pages_bulk_array_pil(gfp_t gfp, + struct mempolicy *pol, + unsigned long nr_pages, + struct page **page_array) +{ + nodemask_t nodemask = pol->nodes; + unsigned long nr_pages_main; + unsigned long nr_pages_other; + unsigned long total_cycle; + unsigned long delta; + unsigned long weight; + int allocated = 0; + int start_nid; + int nnodes; + int prev, next; + int i; + + /* This stabilizes nodes on the stack incase pol->nodes changes */ + barrier(); + + nnodes = nodes_weight(nodemask); + start_nid = numa_node_id(); + + if (!node_isset(start_nid, nodemask)) + start_nid = first_node(nodemask); + + if (nnodes == 1) { + allocated = __alloc_pages_bulk(gfp, start_nid, + NULL, nr_pages_main, + NULL, page_array); + return allocated; + } + /* We don't want to double-count the main node in calculations */ + nnodes--; + + weight = pol->pil.weight; + total_cycle = (weight + nnodes); + /* Number of pages on main node: (cycles*weight + up to weight) */ + nr_pages_main = ((nr_pages / total_cycle) * weight); + nr_pages_main += (nr_pages % total_cycle % (weight + 1)); + /* Number of pages on others: (remaining/nodes) + 1 page if delta */ + nr_pages_other = (nr_pages - nr_pages_main) / nnodes; + nr_pages_other /= nnodes; + /* Delta is number of pages beyond weight up to full cycle */ + delta = nr_pages - (nr_pages_main + (nr_pages_other * nnodes)); + + /* start by allocating for the main node, then interleave rest */ + prev = start_nid; + allocated = __alloc_pages_bulk(gfp, start_nid, NULL, nr_pages_main, + NULL, page_array); + for (i = 0; i < nnodes; i++) { + int pages = nr_pages_other + (delta-- ? 1 : 0); + + next = next_node_in(prev, nodemask); + if (next < MAX_NUMNODES) + prev = next; + allocated += __alloc_pages_bulk(gfp, next, NULL, pages, + NULL, page_array); + } + + return allocated; +} + static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) @@ -2590,6 +2775,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, return alloc_pages_bulk_array_interleave(gfp, pol, nr_pages, page_array); + if (pol->mode == MPOL_PREFERRED_INTERLEAVE) + return alloc_pages_bulk_array_pil(gfp, pol, nr_pages, + page_array); + if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); @@ -2662,6 +2851,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) switch (a->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_PREFERRED_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: return !!nodes_equal(a->nodes, b->nodes); @@ -2798,6 +2988,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long switch (pol->mode) { case MPOL_INTERLEAVE: + case MPOL_PREFERRED_INTERLEAVE: pgoff = vma->vm_pgoff; pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; polnid = offset_il_node(pol, pgoff); @@ -3185,6 +3376,7 @@ static const char * const policy_modes[] = [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", + [MPOL_PREFERRED_INTERLEAVE] = "preferred interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; @@ -3355,6 +3547,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: + case MPOL_PREFERRED_INTERLEAVE: nodes = pol->nodes; break; default:

[RFC,v2,3/4] mm/mempolicy: implement a preferred-interleave

Commit Message

Patch