[RFC,13/14] mm/madvise: add __madvise_collapse_*_batch() actions.

Message ID 20220308213417.1407042-14-zokeefe@google.com (mailing list archive)
State New
Series mm: userspace hugepage collapse

Commit Message

Zach O'Keefe March 8, 2022, 9:34 p.m. UTC
Add implementations for the following batch actions:

scan_pmd:
	Iterate over the batch and scan each pmd for collapse
	eligibility.  Note that this function is called with mmap_lock
	held in read, and does not drop it before returning.

	If a batch entry fails the scan, the ->continue_collapse field of its
	madvise_collapse_data is set to 'false' so that later _batch
	actions know to ignore it.

	Return the number of THPs already in the batch, which is needed
	by _madvise_collapse() to determine overall "success" criteria
	(all pmds either collapsed successfully, or already THP-backed).
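
For reference, a minimal sketch of the per-pmd batch entry these
helpers operate on.  The real struct is introduced earlier in the
series; the layout below is only inferred from how the fields are used
here, and the scan-result type name is an assumption:

	/* Sketch only -- the actual definition lives in an earlier patch. */
	struct madvise_collapse_data {
		struct page *hpage;	/* preallocated hugepage, or NULL */
		pmd_t *pmd;		/* pmd to collapse */
		struct scan_pmd_result scan_result;	/* filled by scan_pmd() */
		bool continue_collapse;	/* cleared when any _batch step fails */
	};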

prealloc_hpages:
	Iterate over the batch and allocate / charge hugepages.  Before
	allocating a new page, check the local free hugepage list.  If
	charging the memcg fails after allocating a new hugepage, free
	the page and stop preallocating for the rest of the batch, since
	further charges against the same memcg are likely to fail too.
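
The local free hugepage list referenced above is assumed to be the
per-node cc->free_hpages[node] list used in the hunks below: hugepages
parked there by failed entries can be reused by later batches instead
of being freed and reallocated.  A purely hypothetical caller-side
drain, shown only to illustrate the lifetime of those pages (the
helper name is made up):

	/* Hypothetical: release any hugepages left over after all batches. */
	static void madvise_collapse_drain_free_hpages(struct collapse_control *cc)
	{
		struct page *hpage, *tmp;
		int node;

		for (node = 0; node < MAX_NUMNODES; ++node) {
			list_for_each_entry_safe(hpage, tmp,
						 &cc->free_hpages[node], lru) {
				list_del(&hpage->lru);
				put_page(hpage);
			}
		}
	}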

swapin_pmd:
	Iterate over batch and attempt to swap-in pages that are
	currently swapped out.  Called with mmap_lock in read, and
	returns with it held; however, it might drop and reacquire the
	lock internally.

	Specifically, __collapse_huge_page_swapin() might drop +
	reacquire the mmap_lock.  When it does so, it only revalidates the
	vma/address for a single pmd.  Since we need to revalidate the
	vma for the entire region covered in the batch, we need to be
	notified when the lock is dropped so that we can perform the
	required revalidation. As such, add an argument to
	__collapse_huge_page_swapin() to notify the caller when mmap_lock is
	dropped.
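
A sketch of what that notification looks like from the callee's side
(the signature change is made elsewhere in the series; the parameter
name and the surrounding fragment here are illustrative only):

	static bool __collapse_huge_page_swapin(struct mm_struct *mm,
						struct vm_area_struct *vma,
						unsigned long haddr, pmd_t *pmd,
						unsigned long vm_flags,
						int referenced,
						bool *mmap_lock_dropped)
	{
		vm_fault_t ret;

		/* ... set up a struct vm_fault for each swapped-out pte ... */
		ret = do_swap_page(&vmf);
		if (ret & VM_FAULT_RETRY) {
			/* do_swap_page() released mmap_lock: tell the caller. */
			if (mmap_lock_dropped)
				*mmap_lock_dropped = true;
			/* ... retake mmap_read_lock() and revalidate the vma ... */
		}
		/* ... */
		return true;
	}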

collapse_pmd:
	Iterate over the batch and perform the actual collapse for each
	pmd.  Note that this is done while holding the mmap_lock in write for
	the entire batch action.

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 mm/khugepaged.c | 153 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 145 insertions(+), 8 deletions(-)

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ea53c706602e..e8156f15a3da 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2572,8 +2572,23 @@  __madvise_collapse_scan_pmd_batch(struct mm_struct *mm,
 				  int batch_size,
 				  struct collapse_control *cc)
 {
-	/* Implemented in later patch */
-	return 0;
+	unsigned long addr, i;
+	int thps = 0;
+
+	mmap_assert_locked(mm);
+
+	for (addr = batch_start, i = 0; i < batch_size;
+	     addr += HPAGE_PMD_SIZE, ++i) {
+		struct madvise_collapse_data *data = batch_data + i;
+
+		scan_pmd(mm, vma, addr, cc, &data->scan_result);
+		data->continue_collapse =
+				data->scan_result.result == SCAN_SUCCEED;
+		if (data->scan_result.result == SCAN_PAGE_COMPOUND)
+			++thps;
+	}
+	mmap_assert_locked(mm);
+	return thps;
 }
 
 /*
@@ -2590,8 +2605,39 @@  __madvise_collapse_prealloc_hpages_batch(struct mm_struct *mm,
 					 int batch_size,
 					 struct collapse_control *cc)
 {
-	/* Implemented in later patch */
-	return 0;
+	int nr_hpages = 0;
+	int i;
+
+	for (i = 0; i < batch_size; ++i) {
+		struct madvise_collapse_data *data = batch_data + i;
+
+		if (!data->continue_collapse)
+			continue;
+
+		if (!list_empty(&cc->free_hpages[node])) {
+			data->hpage  = list_first_entry(&cc->free_hpages[node],
+							struct page, lru);
+			list_del(&data->hpage->lru);
+		} else {
+			data->hpage = __alloc_pages_node(node, gfp,
+							 HPAGE_PMD_ORDER);
+			if (unlikely(!data->hpage))
+				break;
+
+			prep_transhuge_page(data->hpage);
+
+			if (unlikely(mem_cgroup_charge(page_folio(data->hpage),
+						       mm, gfp))) {
+				/* No use reusing page, so give it back */
+				put_page(data->hpage);
+				data->hpage = NULL;
+				data->continue_collapse = false;
+				break;
+			}
+		}
+		++nr_hpages;
+	}
+	return nr_hpages;
 }
 
 /*
@@ -2612,8 +2658,67 @@  __madvise_collapse_swapin_pmd_batch(struct mm_struct *mm,
 				    struct collapse_control *cc)
 
 {
-	/* Implemented in later patch */
-	return true;
+	unsigned long addr;
+	int i;
+	bool ret = true;
+
+	/*
+	 * This function is called with mmap_lock held, and returns with it
+	 * held. However, __collapse_huge_page_swapin() may internally drop and
+	 * reacquire the lock. When it does, it only revalidates the single pmd
+	 * provided to it. We need to know when it drops the lock so that we can
+	 * revalidate the batch of pmds we are operating on.
+	 *
+	 * Initially setting this to 'true' because the caller just locked
+	 * mmap_lock and so we need to revalidate before doing anything else.
+	 */
+	bool need_revalidate_pmd_count = true;
+
+	for (addr = batch_start, i = 0;
+	     i < batch_size;
+	     addr += HPAGE_PMD_SIZE, ++i) {
+		struct vm_area_struct *vma;
+		struct madvise_collapse_data *data = batch_data + i;
+
+		mmap_assert_locked(mm);
+
+		/*
+		 * We might have dropped the lock during previous iteration.
+		 * It's acceptable to exit this function without revalidating
+		 * the vma since the caller immediately unlocks mmap_lock
+		 * anyway.
+		 */
+		if (!data->continue_collapse)
+			continue;
+
+		if (need_revalidate_pmd_count) {
+			if (madvise_collapse_vma_revalidate_pmd_count(mm,
+								      batch_start,
+								      batch_size,
+								      &vma)) {
+				ret = false;
+				break;
+			}
+			need_revalidate_pmd_count = false;
+		}
+
+		data->pmd = mm_find_pmd(mm, addr);
+
+		if (!data->pmd ||
+		    (data->scan_result.unmapped &&
+		     !__collapse_huge_page_swapin(mm, vma, addr, data->pmd,
+						  VM_NOHUGEPAGE,
+						  data->scan_result.referenced,
+						  &need_revalidate_pmd_count))) {
+			/* Hold on to the THP until we know we don't need it. */
+			data->continue_collapse = false;
+			list_add_tail(&data->hpage->lru,
+				      &cc->free_hpages[node]);
+			data->hpage = NULL;
+		}
+	}
+	mmap_assert_locked(mm);
+	return ret;
 }
 
 /*
@@ -2630,8 +2735,40 @@  __madvise_collapse_pmd_batch(struct mm_struct *mm,
 			     int node,
 			     struct collapse_control *cc)
 {
-	/* Implemented in later patch */
-	return 0;
+	unsigned long addr;
+	struct vm_area_struct *vma;
+	int i, ret = 0;
+
+	mmap_assert_write_locked(mm);
+
+	if (madvise_collapse_vma_revalidate_pmd_count(mm, batch_start,
+						      batch_size, &vma))
+		goto out;
+
+	for (addr = batch_start, i = 0;
+	     i < batch_size;
+	     addr += HPAGE_PMD_SIZE, ++i) {
+		int result;
+		struct madvise_collapse_data *data = batch_data + i;
+
+		if (!data->continue_collapse ||
+		    (mm_find_pmd(mm, addr) != data->pmd))
+			continue;
+
+		result = __do_collapse_huge_page(mm, vma, addr, data->pmd,
+						 data->hpage,
+						 cc->enforce_pte_scan_limits,
+						 NULL);
+
+		if (result == SCAN_SUCCEED)
+			++ret;
+		else
+			list_add_tail(&data->hpage->lru,
+				      &cc->free_hpages[node]);
+		data->hpage = NULL;
+	}
+out:
+	return ret;
 }
 
 static bool continue_collapse(struct madvise_collapse_data *batch_data,