
[RFC,v1,2/7] hugetlb: create PTE level mapping when possible

Message ID 20230428004139.2899856-3-jiaqiyan@google.com (mailing list archive)
State New
Series PAGE_SIZE Unmapping in Memory Failure Recovery for HugeTLB Pages

Commit Message

Jiaqi Yan April 28, 2023, 12:41 a.m. UTC
In memory_failure handling, for each VMA that the HWPOISON HugeTLB
page is mapped into, enable HGM if the VMA is eligible, then split
the P*D-mapped hugepage into smaller PTEs. try_to_unmap still unmaps
the entire hugetlb page, but now PTE by PTE, at levels smaller than
the original P*D. For example, a hugepage originally mapped at PUD
size is split into PMDs and PTEs, and all of these PMDs and PTEs are
then unmapped. The next commit will unmap only the raw HWPOISON PTE.

For a VMA that is not HGM eligible, or for which enabling HGM or
splitting the hugepage mapping fails, the hugepage remains mapped
by its original P*D and is unmapped at that level.

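In outline, the per-VMA flow added by this patch is roughly the
following (a simplified sketch of the code below; error reporting
and the rmap locking details are omitted):

  /* For each VMA mapping the poisoned hugepage, i_mmap write-locked: */
  if (hugetlb_hgm_eligible(vma) && !hugetlb_enable_hgm_vma(vma)) {
          /* Find the existing P*D-level hugetlb PTE of the hugepage. */
          if (!hugetlb_full_walk(&hpte, vma, head_addr))
                  /*
                   * Split it so that the poisoned address ends up covered
                   * by a PAGE_SIZE-level PTE. On failure the original P*D
                   * mapping is kept and unmapped as before.
                   */
                  hugetlb_split_to_shift(vma->vm_mm, vma, &hpte,
                                         poisoned_addr, PAGE_SHIFT);
  }
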
Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
---
 include/linux/hugetlb.h |  5 +++
 mm/hugetlb.c            | 27 ++++++++++++++++
 mm/memory-failure.c     | 68 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 100 insertions(+)

Patch

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d44bf6a794e5..03074b23c396 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1266,6 +1266,7 @@  int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
 			      unsigned long end);
 int hugetlb_collapse(struct mm_struct *mm, unsigned long start,
 		     unsigned long end);
+int hugetlb_enable_hgm_vma(struct vm_area_struct *vma);
 int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
 			   struct hugetlb_pte *hpte, unsigned long addr,
 			   unsigned int desired_shift);
@@ -1295,6 +1296,10 @@  int hugetlb_collapse(struct mm_struct *mm, unsigned long start,
 {
 	return -EINVAL;
 }
+int hugetlb_enable_hgm_vma(struct vm_area_struct *vma)
+{
+	return -EINVAL;
+}
 int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
 			   const struct hugetlb_pte *hpte, unsigned long addr,
 			   unsigned int desired_shift)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d3f3f1c2d293..1419176b7e51 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -8203,6 +8203,33 @@  int hugetlb_collapse(struct mm_struct *mm, unsigned long start,
 	return ret;
 }
 
+int hugetlb_enable_hgm_vma(struct vm_area_struct *vma)
+{
+	if (hugetlb_hgm_enabled(vma))
+		return 0;
+
+	if (!is_vm_hugetlb_page(vma)) {
+		pr_warn("VMA=[%#lx, %#lx) is not HugeTLB\n",
+			vma->vm_start, vma->vm_end);
+		return -EINVAL;
+	}
+
+	if (!hugetlb_hgm_eligible(vma)) {
+		pr_warn("VMA=[%#lx, %#lx) is not HGM eligible\n",
+			vma->vm_start, vma->vm_end);
+		return -EINVAL;
+	}
+
+	hugetlb_unshare_all_pmds(vma);
+
+	/*
+	 * TODO: add the ability to tell if HGM is enabled by kernel
+	 * (for HWPOISON unmapping) or by userspace (via MADV_SPLIT).
+	 */
+	vm_flags_set(vma, VM_HUGETLB_HGM);
+	return 0;
+}
+
 /*
  * Find the optimal HugeTLB PTE shift that @desired_addr could be mapped at.
  */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 0b37cbc6e8ae..eb5579b6787e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1479,6 +1479,73 @@  static int get_hwpoison_page(struct page *p, unsigned long flags)
 	return ret;
 }
 
+#ifdef CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING
+/*
+ * For each HGM-eligible VMA that the poisoned page is mapped into, create a
+ * new HGM mapping for hugepage @folio and make sure @poisoned_page is mapped
+ * by a PAGE_SIZE-level PTE. The caller (hwpoison_user_mappings) must ensure:
+ * 1. folio's address space (mapping) is locked in write mode.
+ * 2. folio is locked.
+ */
+static void try_to_split_huge_mapping(struct folio *folio,
+				      struct page *poisoned_page)
+{
+	struct address_space *mapping = folio_mapping(folio);
+	pgoff_t pgoff_start;
+	pgoff_t pgoff_end;
+	struct vm_area_struct *vma;
+	unsigned long poisoned_addr;
+	unsigned long head_addr;
+	struct hugetlb_pte hpte;
+
+	if (WARN_ON(!mapping))
+		return;
+
+	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+	pgoff_start = folio_pgoff(folio);
+	pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
+
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) {
+		/* Enable HGM on HGM-eligible VMAs. */
+		if (!hugetlb_hgm_eligible(vma))
+			continue;
+
+		i_mmap_assert_locked(vma->vm_file->f_mapping);
+		if (hugetlb_enable_hgm_vma(vma)) {
+			pr_err("Failed to enable HGM on eligible VMA=[%#lx, %#lx)\n",
+				vma->vm_start, vma->vm_end);
+			continue;
+		}
+
+		poisoned_addr = vma_address(poisoned_page, vma);
+		head_addr = vma_address(folio_page(folio, 0), vma);
+		/*
+		 * Get the hugetlb_pte of the PUD-mapped hugepage first,
+		 * then split the PUD entry into PMD + PTE entries.
+		 *
+		 * Both getting the original huge PTE and splitting it require
+		 * the write lock on vma->vm_file->f_mapping, which the caller
+		 * (e.g. hwpoison_user_mappings) should have already acquired.
+		 */
+		if (hugetlb_full_walk(&hpte, vma, head_addr))
+			continue;
+
+		if (hugetlb_split_to_shift(vma->vm_mm, vma, &hpte,
+					   poisoned_addr, PAGE_SHIFT)) {
+			pr_err("Failed to split huge mapping: pfn=%#lx, vaddr=%#lx in VMA=[%#lx, %#lx)\n",
+				page_to_pfn(poisoned_page), poisoned_addr,
+				vma->vm_start, vma->vm_end);
+		}
+	}
+}
+#else
+static void try_to_split_huge_mapping(struct folio *folio,
+				      struct page *poisoned_page)
+{
+}
+#endif /* CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING */
+
 /*
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
@@ -1555,6 +1622,7 @@  static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 		 */
 		mapping = hugetlb_page_mapping_lock_write(hpage);
 		if (mapping) {
+			try_to_split_huge_mapping(folio, p);
 			try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
 			i_mmap_unlock_write(mapping);
 		} else
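
Condensed from the hunk above, the resulting ordering in the hugetlb
branch of hwpoison_user_mappings is (a sketch, not the literal code;
the i_mmap write lock taken by hugetlb_page_mapping_lock_write is what
both try_to_split_huge_mapping and TTU_RMAP_LOCKED rely on):

  mapping = hugetlb_page_mapping_lock_write(hpage);  /* takes i_mmap write lock */
  if (mapping) {
          try_to_split_huge_mapping(folio, p);         /* P*D -> PAGE_SIZE PTEs */
          try_to_unmap(folio, ttu | TTU_RMAP_LOCKED);  /* rmap lock already held */
          i_mmap_unlock_write(mapping);
  }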