@@ -474,6 +474,9 @@ struct vm_area_struct {
#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ atomic_long_t numafault_ahead_info; /* speculative numa fault window */
#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
@@ -74,6 +74,8 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
+#include <linux/pagewalk.h>
+#include <linux/page_idle.h>
#include <trace/events/kmem.h>
@@ -4315,16 +4317,156 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
return mpol_misplaced(page, vma, addr);
}
+static bool try_next_numa_page(struct vm_fault *vmf, unsigned int win_pages,
+ unsigned long *fault_addr)
+{
+ unsigned long next_fault_addr = *fault_addr + PAGE_SIZE;
+ unsigned long numa_fault_end = vmf->address + (win_pages + 1) * PAGE_SIZE;
+
+ if (next_fault_addr >= numa_fault_end)
+ return false;
+
+ *fault_addr = next_fault_addr;
+ vmf->pte = pte_offset_map(vmf->pmd, next_fault_addr);
+ vmf->orig_pte = *vmf->pte;
+ if (pte_protnone(vmf->orig_pte))
+ return true;
+ pte_unmap(vmf->pte);
+ return false;
+}
+
+#define NUMA_FAULT_AHEAD_DEFAULT 2
+#define NUMA_FAULT_EXPAND_STEP 1
+#define NUMA_FAULT_REDUCE_STEP 2
+#define GET_NUMA_FAULT_INFO(vma) \
+ (atomic_long_read(&(vma)->numafault_ahead_info))
+#define NUMA_FAULT_WINDOW_START(v) ((v) & PAGE_MASK)
+#define NUMA_FAULT_WINDOW_SIZE_MASK ((1UL << PAGE_SHIFT) - 1)
+#define NUMA_FAULT_WINDOW_SIZE(v) ((v) & NUMA_FAULT_WINDOW_SIZE_MASK)
+#define NUMA_FAULT_INFO(addr, win) \
+ (((addr) & PAGE_MASK) | \
+ ((win) & NUMA_FAULT_WINDOW_SIZE_MASK))
+
+static inline unsigned int numa_fault_max_pages(struct vm_area_struct *vma,
+ unsigned long fault_address)
+{
+ unsigned long pmd_end_addr = (fault_address & PMD_MASK) + PMD_SIZE;
+ unsigned long max_fault_addr = min_t(unsigned long, pmd_end_addr,
+ vma->vm_end);
+
+ return (max_fault_addr - fault_address - 1) >> PAGE_SHIFT;
+}
+
+static unsigned int adjust_numa_fault_window(struct vm_area_struct *vma,
+ unsigned long fault_address)
+{
+ unsigned long numafault_ahead = GET_NUMA_FAULT_INFO(vma);
+ unsigned long prev_start = NUMA_FAULT_WINDOW_START(numafault_ahead);
+ unsigned int prev_pages = NUMA_FAULT_WINDOW_SIZE(numafault_ahead);
+ unsigned long win_start;
+ unsigned int win_pages, max_fault_pages;
+
+ win_start = fault_address + PAGE_SIZE;
+
+ /*
+ * First numa fault in this VMA, so just open a small window to try.
+ */
+ if (!numafault_ahead) {
+ win_pages = NUMA_FAULT_AHEAD_DEFAULT;
+ goto out;
+ }
+
+ /*
+ * If the last numa fault window was closed, check whether the current
+ * fault address is contiguous with the previous fault address before
+ * opening a new numa fault window.
+ */
+ if (!prev_pages) {
+ if (fault_address == prev_start ||
+ fault_address == prev_start + PAGE_SIZE)
+ win_pages = NUMA_FAULT_AHEAD_DEFAULT;
+ else
+ win_pages = 0;
+
+ goto out;
+ }
+
+ /*
+ * TODO: also handle the case where the fault address falls before the
+ * last numa fault window.
+ */
+ if (fault_address >= prev_start) {
+ unsigned long prev_end = prev_start + prev_pages * PAGE_SIZE;
+
+ /*
+ * The fault continues right after the previous numa fault
+ * window, so assume a sequential access pattern and expand
+ * the numa fault window.
+ */
+ if (fault_address == prev_end ||
+ fault_address == prev_end + PAGE_SIZE) {
+ win_pages = prev_pages + NUMA_FAULT_EXPAND_STEP;
+ goto validate_out;
+ } else if (fault_address < prev_end) {
+ /*
+ * The current fault address is within the last numa
+ * fault window, which means the pages in that window
+ * were not all migrated successfully. Just keep the
+ * current window size and try again, since the last
+ * numa fault window speculation may still be on the
+ * right track.
+ */
+ win_pages = prev_pages;
+ goto validate_out;
+ }
+ }
+
+ /*
+ * Otherwise assume a random access pattern and reduce the numa
+ * fault window by one step.
+ */
+ if (prev_pages <= NUMA_FAULT_REDUCE_STEP) {
+ win_pages = 0;
+ goto out;
+ }
+
+ win_pages = prev_pages - NUMA_FAULT_REDUCE_STEP;
+
+validate_out:
+ /*
+ * Make sure the numa fault window does not extend beyond the end of
+ * the current VMA or PMD.
+ */
+ max_fault_pages = numa_fault_max_pages(vma, fault_address);
+ if (win_pages > max_fault_pages)
+ win_pages = max_fault_pages;
+
+out:
+ atomic_long_set(&vma->numafault_ahead_info,
+ NUMA_FAULT_INFO(win_start, win_pages));
+ return win_pages;
+}
+
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
- int page_nid = NUMA_NO_NODE;
+ int page_nid;
int last_cpupid;
int target_nid;
pte_t pte, old_pte;
- bool was_writable = pte_savedwrite(vmf->orig_pte);
- int flags = 0;
+ bool was_writable;
+ int flags;
+ unsigned long fault_address = vmf->address;
+ unsigned int win_pages;
+
+ /* Try to speculate the numa fault window for the current VMA. */
+ win_pages = adjust_numa_fault_window(vma, fault_address);
+
+try_next:
+ was_writable = pte_savedwrite(vmf->orig_pte);
+ flags = 0;
+ page_nid = NUMA_NO_NODE;
/*
* The "pte" at this point cannot be used safely without
@@ -4342,7 +4484,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
old_pte = ptep_get(vmf->pte);
pte = pte_modify(old_pte, vma->vm_page_prot);
- page = vm_normal_page(vma, vmf->address, pte);
+ page = vm_normal_page(vma, fault_address, pte);
if (!page)
goto out_map;
@@ -4378,7 +4520,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
last_cpupid = (-1 & LAST_CPUPID_MASK);
else
last_cpupid = page_cpupid_last(page);
- target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
+ target_nid = numa_migrate_prep(page, vma, fault_address, page_nid,
&flags);
if (target_nid == NUMA_NO_NODE) {
put_page(page);
@@ -4392,7 +4534,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
flags |= TNF_MIGRATED;
} else {
flags |= TNF_MIGRATE_FAIL;
- vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ vmf->pte = pte_offset_map(vmf->pmd, fault_address);
spin_lock(vmf->ptl);
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4404,19 +4546,24 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
out:
if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, 1, flags);
+
+ if ((flags & TNF_MIGRATED) && (win_pages > 0) &&
+ try_next_numa_page(vmf, win_pages, &fault_address))
+ goto try_next;
+
return 0;
out_map:
/*
* Make it present again, depending on how arch implements
* non-accessible ptes, some can allow access by kernel mode.
*/
- old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+ old_pte = ptep_modify_prot_start(vma, fault_address, vmf->pte);
pte = pte_modify(old_pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
- ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ ptep_modify_prot_commit(vma, fault_address, vmf->pte, old_pte, pte);
+ update_mmu_cache(vma, fault_address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
Some workloads that access a set of data entities follow the principle of
data locality (locality of reference): after some data has been accessed,
nearby data is likely to be accessed soon. Systems with different memory
types rely on numa balancing to promote hot pages from slow memory to fast
memory to improve performance. For such workloads we can therefore promote
several sequential pages on slow memory in advance, following the data
locality, to improve performance. This patch adds a speculative numa fault
mechanism that migrates suitable pages in advance.

The basic concept of the speculative numa fault is: add a new member to
each VMA that records the numa fault window, i.e. the last numa fault
address and the number of pages to be migrated to the target node. When a
numa fault occurs, check the last numa fault window of the current VMA to
see whether this is a sequential access stream. If it is, expand the numa
fault window; if not, reduce the numa fault window or close the
speculative numa fault to avoid unnecessary migration.

Testing with mysql shows about a 6% performance improvement, as below.

Machine: 16 CPUs, 64G DRAM, 256G AEP

sysbench /usr/share/sysbench/tests/include/oltp_legacy/oltp.lua
--mysql-user=root --mysql-password=root --oltp-test-mode=complex
--oltp-tables-count=80 --oltp-table-size=5000000 --threads=20
--time=600 --report-interval=10 prepare/run

No speculative numa fault:
    queries performed:
        read:            33039860
        write:           9439960
        other:           4719980
        total:           47199800
    transactions:        2359990 (3933.28 per sec.)
    queries:             47199800 (78665.50 per sec.)

Speculative numa fault:
    queries performed:
        read:            34896862
        write:           9970532
        other:           4985266
        total:           49852660
    transactions:        2492633 (4154.35 per sec.)
    queries:             49852660 (83086.94 per sec.)

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 include/linux/mm_types.h |   3 +
 mm/memory.c              | 165 ++++++++++++++++++++++++++++++++++++---
 2 files changed, 159 insertions(+), 9 deletions(-)
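
A minimal, stand-alone user-space sketch of the fault-ahead heuristic is
attached below for illustration. It mirrors the patch's window encoding
(window start in the page-aligned bits, window size in the low bits) and the
expand/keep/reduce decisions, but it is a simplified assumption rather than
the kernel code itself: the 4K page size, the macro and function names and
the toy fault trace are made up for the example, and the VMA/PMD clamping,
the atomics and the actual page migration are left out.

/*
 * numa_window_sim.c: toy model of the speculative numa fault window.
 * Build with: gcc -Wall -o numa_window_sim numa_window_sim.c
 */
#include <stdio.h>

#define PAGE_SHIFT	12			/* assume 4K pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Window start lives in the page-aligned bits, window size in the low bits. */
#define WINDOW_SIZE_MASK	(PAGE_SIZE - 1)
#define WINDOW_INFO(addr, win)	(((addr) & PAGE_MASK) | ((win) & WINDOW_SIZE_MASK))
#define WINDOW_START(v)		((v) & PAGE_MASK)
#define WINDOW_SIZE(v)		((v) & WINDOW_SIZE_MASK)

#define AHEAD_DEFAULT	2
#define EXPAND_STEP	1
#define REDUCE_STEP	2

static unsigned long info;	/* stands in for vma->numafault_ahead_info */

/* Simplified counterpart of adjust_numa_fault_window(): no VMA/PMD clamping. */
static unsigned int adjust_window(unsigned long fault_address)
{
	unsigned long prev_start = WINDOW_START(info);
	unsigned int prev_pages = WINDOW_SIZE(info);
	unsigned long win_start = fault_address + PAGE_SIZE;
	unsigned int win_pages;

	if (!info) {
		/* First fault in the VMA: open a small trial window. */
		win_pages = AHEAD_DEFAULT;
	} else if (!prev_pages) {
		/* Window was closed: reopen it only for a contiguous access. */
		win_pages = (fault_address == prev_start ||
			     fault_address == prev_start + PAGE_SIZE) ?
			    AHEAD_DEFAULT : 0;
	} else if (fault_address >= prev_start &&
		   fault_address <= prev_start + (prev_pages + 1) * PAGE_SIZE) {
		unsigned long prev_end = prev_start + prev_pages * PAGE_SIZE;

		/* Sequential: grow the window; still inside it: keep the size. */
		win_pages = (fault_address >= prev_end) ?
			    prev_pages + EXPAND_STEP : prev_pages;
	} else {
		/* Looks random: shrink the window, closing it once it is small. */
		win_pages = (prev_pages <= REDUCE_STEP) ?
			    0 : prev_pages - REDUCE_STEP;
	}

	info = WINDOW_INFO(win_start, win_pages);
	return win_pages;
}

int main(void)
{
	/* A short sequential stream followed by a random jump. */
	unsigned long faults[] = { 0x1000, 0x4000, 0x8000, 0x100000 };
	unsigned int i;

	for (i = 0; i < sizeof(faults) / sizeof(faults[0]); i++)
		printf("fault at %#lx -> window of %u page(s)\n",
		       faults[i], adjust_window(faults[i]));
	return 0;
}

Running it prints windows of 2, 3, 4 and then 2 pages: the window grows while
the faults stay sequential and shrinks again after the random jump, which is
the behaviour the heuristic is designed for.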