--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -76,6 +76,8 @@
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
+#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+
/* compatibility flags */
#define MAP_FILE 0
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -103,6 +103,8 @@
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
+#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+
/* compatibility flags */
#define MAP_FILE 0
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -70,6 +70,8 @@
#define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */
#define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */
+#define MADV_COLLAPSE 73 /* Synchronous hugepage collapse */
+
#define MADV_HWPOISON 100 /* poison a page for testing */
#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -111,6 +111,8 @@
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
+#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+
/* compatibility flags */
#define MAP_FILE 0
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -236,6 +236,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
int advice);
+int madvise_collapse(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end);
void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
unsigned long end, long adjust_next);
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
@@ -392,6 +395,15 @@ static inline int hugepage_madvise(struct vm_area_struct *vma,
BUG();
return 0;
}
+
+static inline int madvise_collapse(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ BUG();
+ return 0;
+}
+
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -77,6 +77,8 @@
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
+#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
+
/* compatibility flags */
#define MAP_FILE 0
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -847,6 +847,23 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
}
+static struct page *alloc_hpage(struct collapse_control *cc, gfp_t gfp,
+ int node)
+{
+ VM_BUG_ON_PAGE(cc->hpage, cc->hpage);
+
+ cc->hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
+ if (unlikely(!cc->hpage)) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ cc->hpage = ERR_PTR(-ENOMEM);
+ return NULL;
+ }
+
+ prep_transhuge_page(cc->hpage);
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ return cc->hpage;
+}
+
#ifdef CONFIG_NUMA
static int khugepaged_find_target_node(struct collapse_control *cc)
{
@@ -893,18 +910,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
static struct page *khugepaged_alloc_page(struct collapse_control *cc,
gfp_t gfp, int node)
{
- VM_BUG_ON_PAGE(cc->hpage, cc->hpage);
-
- cc->hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
- if (unlikely(!cc->hpage)) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- cc->hpage = ERR_PTR(-ENOMEM);
- return NULL;
- }
-
- prep_transhuge_page(cc->hpage);
- count_vm_event(THP_COLLAPSE_ALLOC);
- return cc->hpage;
+ return alloc_hpage(cc, gfp, node);
}
#else
static int khugepaged_find_target_node(struct collapse_control *cc)
@@ -2471,3 +2477,132 @@ void khugepaged_min_free_kbytes_update(void)
set_recommended_min_free_kbytes();
mutex_unlock(&khugepaged_mutex);
}
+
+static inline gfp_t alloc_hugepage_madvise_gfpmask(void)
+{
+ return GFP_TRANSHUGE;
+}
+
+static void madvise_collapse_cleanup_page(struct page **hpage)
+{
+ if (!IS_ERR(*hpage) && *hpage)
+ put_page(*hpage);
+ *hpage = NULL;
+}
+
+static int madvise_collapse_errno(enum scan_result r)
+{
+ switch (r) {
+ case SCAN_PMD_NULL:
+ case SCAN_ADDRESS_RANGE:
+ case SCAN_VMA_NULL:
+ case SCAN_PTE_NON_PRESENT:
+ case SCAN_PAGE_NULL:
+ /*
+ * Addresses in the specified range are not currently mapped,
+ * or are outside the AS of the process.
+ */
+ return -ENOMEM;
+ case SCAN_ALLOC_HUGE_PAGE_FAIL:
+ case SCAN_CGROUP_CHARGE_FAIL:
+ /* A kernel resource was temporarily unavailable. */
+ return -EAGAIN;
+ default:
+ return -EINVAL;
+ }
+}
+
+int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ struct collapse_control cc = {
+ .last_target_node = NUMA_NO_NODE,
+ .hpage = NULL,
+ .gfp = &alloc_hugepage_madvise_gfpmask,
+ .alloc_hpage = &alloc_hpage,
+ };
+ struct mm_struct *mm = vma->vm_mm;
+ struct collapse_result cr;
+ unsigned long hstart, hend, addr;
+ int thps = 0, nr_hpages = 0;
+
+ BUG_ON(vma->vm_start > start);
+ BUG_ON(vma->vm_end < end);
+
+ *prev = vma;
+
+ if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file)
+ return -EINVAL;
+
+ hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = end & HPAGE_PMD_MASK;
+ nr_hpages = (hend - hstart) >> HPAGE_PMD_SHIFT;
+
+ if (hstart >= hend || !transparent_hugepage_active(vma))
+ return -EINVAL;
+
+ mmgrab(mm);
+ lru_add_drain();
+
+ for (addr = hstart; ; ) {
+ mmap_assert_locked(mm);
+ cond_resched();
+ memset(&cr, 0, sizeof(cr));
+
+ if (unlikely(khugepaged_test_exit(mm))) {
+ cr.result = SCAN_ANY_PROCESS;
+ break;
+ }
+
+ memset(cc.node_load, 0, sizeof(cc.node_load));
+ khugepaged_scan_pmd(mm, vma, addr, &cc, &cr);
+ if (cr.dropped_mmap_lock)
+ *prev = NULL; /* tell madvise we dropped mmap_lock */
+
+ switch (cr.result) {
+ /* Whitelisted set of results where continuing OK */
+ case SCAN_SUCCEED:
+ case SCAN_PMD_MAPPED:
+			++thps;
+			fallthrough;
+ case SCAN_PMD_NULL:
+ case SCAN_PTE_NON_PRESENT:
+ case SCAN_PTE_UFFD_WP:
+ case SCAN_PAGE_RO:
+ case SCAN_LACK_REFERENCED_PAGE:
+ case SCAN_PAGE_NULL:
+ case SCAN_PAGE_COUNT:
+ case SCAN_PAGE_LOCK:
+ case SCAN_PAGE_COMPOUND:
+ break;
+ case SCAN_PAGE_LRU:
+ lru_add_drain_all();
+ goto retry;
+ default:
+ /* Other error, exit */
+ goto break_loop;
+ }
+ addr += HPAGE_PMD_SIZE;
+ if (addr >= hend)
+ break;
+retry:
+ if (cr.dropped_mmap_lock) {
+ mmap_read_lock(mm);
+ cr.result = hugepage_vma_revalidate(mm, addr, &vma);
+ if (cr.result)
+ goto out;
+ }
+ madvise_collapse_cleanup_page(&cc.hpage);
+ }
+
+break_loop:
+ /* madvise_walk_vmas() expects us to hold mmap_lock on return */
+ if (cr.dropped_mmap_lock)
+ mmap_read_lock(mm);
+out:
+ mmap_assert_locked(mm);
+ madvise_collapse_cleanup_page(&cc.hpage);
+ mmdrop(mm);
+
+ return thps == nr_hpages ? 0 : madvise_collapse_errno(cr.result);
+}
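Note: the hstart/hend computation in madvise_collapse() rounds the
user-supplied range inward to PMD boundaries, so only fully-covered,
aligned PMDs are collapsed. A minimal userspace sketch of the same
arithmetic (illustration only, not part of the patch; assumes 2MiB
PMD-sized hugepages):

#include <stdio.h>

#define HPAGE_PMD_SIZE	(2UL << 20)
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long start = 0x201000, end = 0x600000;
	/* Round start up and end down, as madvise_collapse() does. */
	unsigned long hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	unsigned long hend = end & HPAGE_PMD_MASK;

	/* Prints hstart=0x400000 hend=0x600000: one collapsible PMD. */
	printf("hstart=%#lx hend=%#lx\n", hstart, hend);
	return 0;
}

A request like [0x201000, 0x600000) therefore collapses only the single
PMD at 0x400000; callers should align their ranges accordingly.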
@@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_FREE:
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
+ case MADV_COLLAPSE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -1054,6 +1055,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
if (error)
goto out;
break;
+ case MADV_COLLAPSE:
+ return madvise_collapse(vma, prev, start, end);
}
anon_name = anon_vma_name(vma);
@@ -1147,6 +1150,7 @@ madvise_behavior_valid(int behavior)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
+ case MADV_COLLAPSE:
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
@@ -1336,6 +1340,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
* MADV_NOHUGEPAGE - mark the given range as not worth being backed by
* transparent huge pages so the existing pages will not be
* coalesced into THP and new pages will not be allocated as THP.
+ * MADV_COLLAPSE - synchronously coalesce pages into new THP.
* MADV_DONTDUMP - the application wants to prevent pages in the given range
* from being included in its core dump.
 * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.

This idea was introduced by David Rientjes[1].

Introduce a new madvise mode, MADV_COLLAPSE, that allows users to
request a synchronous collapse of memory at their own expense. The
benefits of this approach are:

* CPU is charged to the process that wants to spend the cycles for
  the THP
* The unpredictable timing of khugepaged collapse is avoided

Immediate users of this new functionality are malloc() implementations
that manage memory in hugepage-sized chunks, but sometimes subrelease
memory back to the system in native-sized chunks via MADV_DONTNEED,
zapping the pmd. Later, when the memory is hot, the implementation can
madvise(MADV_COLLAPSE) to re-back the memory by THPs to regain hugepage
coverage and dTLB performance. TCMalloc is such an implementation that
could benefit from this[2].

Only privately-mapped anon memory is supported for now, but it is
expected that file and shmem support will be added later to support the
use-case of backing executable text by THPs. The current support
provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large
system, which might impair services from serving at their full rated
load after (re)starting. Tricks like mremap(2)'ing text onto anonymous
memory to immediately realize iTLB performance prevent page sharing and
demand paging, both of which increase steady-state memory footprint.
With MADV_COLLAPSE, we get the best of both worlds: peak upfront
performance and lower RAM footprints.

This call respects THP eligibility as determined by the system-wide
sysfs settings and the VMA flags for the memory range being collapsed.

[1] https://lore.kernel.org/linux-mm/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com/
[2] https://github.com/google/tcmalloc/tree/master/tcmalloc

Suggested-by: David Rientjes <rientjes@google.com>
Signed-off-by: Zach O'Keefe <zokeefe@google.com>
Reported-by: kernel test robot <lkp@intel.com>
---
 arch/alpha/include/uapi/asm/mman.h     |   2 +
 arch/mips/include/uapi/asm/mman.h      |   2 +
 arch/parisc/include/uapi/asm/mman.h    |   2 +
 arch/xtensa/include/uapi/asm/mman.h    |   2 +
 include/linux/huge_mm.h                |  12 ++
 include/uapi/asm-generic/mman-common.h |   2 +
 mm/khugepaged.c                        | 158 +++++++++++++++++++++++--
 mm/madvise.c                           |   5 +
 8 files changed, 173 insertions(+), 12 deletions(-)
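As a usage illustration of the malloc() use-case above, here is a
hypothetical standalone program (not part of the patch; the
MADV_COLLAPSE fallback value and the 2MiB hugepage size are assumptions
taken from the definitions added above):

/*
 * Hypothetical example: a hugepage-sized chunk is faulted in, partially
 * subreleased with MADV_DONTNEED, and later re-backed by a THP with
 * MADV_COLLAPSE once the memory is hot again.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* value added by this patch (73 on parisc) */
#endif

#define HPAGE_SIZE (2UL << 20)	/* assumes 2MiB PMD-sized hugepages */

int main(void)
{
	char *map, *buf;

	map = mmap(NULL, 2 * HPAGE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED)
		return 1;

	/* Round up: the collapsed range must cover whole, aligned PMDs. */
	buf = (char *)(((unsigned long)map + HPAGE_SIZE - 1) &
		       ~(HPAGE_SIZE - 1));

	memset(buf, 1, HPAGE_SIZE);		/* fault in the region */
	madvise(buf, 4096, MADV_DONTNEED);	/* subrelease zaps the pmd */

	/* The memory is hot again: request a synchronous collapse. */
	if (madvise(buf, HPAGE_SIZE, MADV_COLLAPSE))
		perror("MADV_COLLAPSE");	/* EINVAL, ENOMEM or EAGAIN */

	return 0;
}

Per madvise_collapse_errno() above, a failed collapse reports -ENOMEM
when the range is not suitably mapped, -EAGAIN when hugepage allocation
or cgroup charging transiently failed (the caller may retry), and
-EINVAL otherwise.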
This idea was introduced by David Rientjes[1]. Introduce a new madvise mode, MADV_COLLAPSE, that allows users to request a synchronous collapse of memory at their own expense. The benefits of this approach are: * CPU is charged to the process that wants to spend the cycles for the THP * Avoid unpredictable timing of khugepaged collapse An immediate user of this new functionality are malloc() implementations that manage memory in hugepage-sized chunks, but sometimes subrelease memory back to the system in native-sized chunks via MADV_DONTNEED; zapping the pmd. Later, when the memory is hot, the implementation could madvise(MADV_COLLAPSE) to re-back the memory by THPs to regain hugepage coverage and dTLB performance. TCMalloc is such an implementation that could benefit from this[2]. Only privately-mapped anon memory is supported for now, but it is expected that file and shmem support will be added later to support the use-case of backing executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevents page sharing and demand paging, both of which increase steady state memory footprint. With MADV_COLLAPSE, we get the best of both worlds: Peak upfront performance and lower RAM footprints. This call respects THP eligibility as determined by the system-wide sysfs settings and the VMA flags for the memory range being collapsed. [1] https://lore.kernel.org/linux-mm/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com/ [2] https://github.com/google/tcmalloc/tree/master/tcmalloc Suggested-by: David Rientjes <rientjes@google.com> Signed-off-by: Zach O'Keefe <zokeefe@google.com> Reported-by: kernel test robot <lkp@intel.com> --- arch/alpha/include/uapi/asm/mman.h | 2 + arch/mips/include/uapi/asm/mman.h | 2 + arch/parisc/include/uapi/asm/mman.h | 2 + arch/xtensa/include/uapi/asm/mman.h | 2 + include/linux/huge_mm.h | 12 ++ include/uapi/asm-generic/mman-common.h | 2 + mm/khugepaged.c | 158 +++++++++++++++++++++++-- mm/madvise.c | 5 + 8 files changed, 173 insertions(+), 12 deletions(-)