diff mbox series

[v2,5/9] mm: pte_refcount infrastructure

Message ID 20210819031858.98043-6-zhengqi.arch@bytedance.com (mailing list archive)
State New
Headers show
Series Free user PTE page table pages | expand

Commit Message

Qi Zheng Aug. 19, 2021, 3:18 a.m. UTC
In this patch, we add two fields to the struct page of PTE page
table, the pte_refcount is used to track how many users of PTE
page table, the pmd is used to record the pmd entry which is
populated by the PTE. And we introduce some APIs that will be
used in subsequent patches:

The following are some APIs related to ->pte_refcount:

	pte_get{,_many}()
	pte_put{,_many,_vmf}()
	pte_try_get()

And we add pte_alloc_get{,_map,_map_lock}(), the semantics of
these function is to hold a pte_refcount count when it returns 0,
and its caller is responsible for decrease the pte_refcount by
calling pte_put(), like the following pattern:

	--> pte_alloc_get()

	    do something about pte

	    pte_put()

This patch contains no functional changes, just creates some
infrastructure, and we will use these in the next patch.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 include/linux/mm_types.h |   8 +-
 include/linux/pgtable.h  |   3 +-
 include/linux/pte_ref.h  | 208 ++++++++++++++++++++++++++++++++++++++++++++++-
 mm/Kconfig               |   4 +
 mm/memory.c              |   8 +-
 mm/pte_ref.c             |  59 ++++++++++++--
 mm/sparse-vmemmap.c      |   2 +-
 7 files changed, 279 insertions(+), 13 deletions(-)
diff mbox series

Patch

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b9aa9aa99924..85c5cc4b95b3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -161,11 +161,17 @@  struct page {
 		};
 		struct {	/* Page table pages */
 			unsigned long _pt_pad_1;	/* compound_head */
-			pgtable_t pmd_huge_pte; /* protected by page->ptl */
+			union {
+				pgtable_t pmd_huge_pte; /* protected by page->ptl */
+				pmd_t *pmd;             /* PTE page only */
+			};
 			unsigned long _pt_pad_2;	/* mapping */
 			union {
 				struct mm_struct *pt_mm; /* x86 pgds only */
 				atomic_t pt_frag_refcount; /* powerpc */
+#ifdef CONFIG_FREE_USER_PTE
+				atomic_t pte_refcount;  /* PTE page only */
+#endif
 			};
 #if USE_SPLIT_PTE_PTLOCKS
 #if ALLOC_SPLIT_PTLOCKS
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e24d2c992b11..c799475bc5c9 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -331,7 +331,6 @@  static inline pte_t ptep_get_lockless(pte_t *ptep)
 }
 #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long address,
@@ -342,6 +341,8 @@  static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 	return pmd;
 }
 #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
 static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long address,
diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
index 60b752dd7846..47aaeac7507e 100644
--- a/include/linux/pte_ref.h
+++ b/include/linux/pte_ref.h
@@ -15,6 +15,8 @@ 
 
 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
+void pmd_install_get(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
+int __pte_alloc_get(struct mm_struct *mm, pmd_t *pmd);
 
 #define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))
 
@@ -25,5 +27,209 @@  int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
 	(pte_alloc(mm, pmd) ?				\
 		 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
 
-#endif
+#ifdef CONFIG_FREE_USER_PTE
+
+void free_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr);
+
+static inline void pte_ref_init(pgtable_t pte, pmd_t *pmd, int count)
+{
+	pte->pmd = pmd;
+	atomic_set(&pte->pte_refcount, count);
+}
+
+static inline pmd_t *pte_to_pmd(pte_t *pte)
+{
+	return virt_to_page(pte)->pmd;
+}
+
+static inline void pte_migrate_pmd(pmd_t old_pmd, pmd_t *new_pmd)
+{
+	pmd_pgtable(old_pmd)->pmd = new_pmd;
+}
+
+static inline void pte_get_many(pmd_t *pmdp, unsigned int nr)
+{
+	pgtable_t pte = pmd_pgtable(*pmdp);
+
+	VM_BUG_ON(!PageTable(pte));
+	atomic_add(nr, &pte->pte_refcount);
+}
+
+/*
+ * pte_get - Increment ->pte_refcount for the PTE page table.
+ * @pmdp: a pointer to the pmd entry corresponding to the PTE page table.
+ *
+ * Similar to the mechanism of page refcount, the user of PTE page table
+ * should hold a refcount to it before accessing.
+ */
+static inline void pte_get(pmd_t *pmdp)
+{
+	pte_get_many(pmdp, 1);
+}
+
+/*
+ * pte_get_unless_zero - Increment ->pte_refcount for the PTE page table
+ *			 unless it is zero.
+ * @pmdp: a pointer to the pmd entry corresponding to the PTE page table.
+ */
+static inline bool pte_get_unless_zero(pmd_t *pmdp)
+{
+	pgtable_t pte = pmd_pgtable(*pmdp);
 
+	VM_BUG_ON(!PageTable(pte));
+	return atomic_inc_not_zero(&pte->pte_refcount);
+}
+
+/*
+ * pte_try_get - Try to increment ->pte_refcount for the PTE page table.
+ * @mm: the mm_struct of the target address space.
+ * @pmdp: a pointer to the pmd entry corresponding to the PTE page table.
+ *
+ * Return true if the increment succeeded. Otherwise return false.
+ *
+ * Before Operating the PTE page table, we need to hold a ->pte_refcount
+ * to protect against the concurrent release of the PTE page table.
+ */
+static inline bool pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+{
+	bool retval = true;
+	spinlock_t *ptl;
+
+	ptl = pmd_lock(mm, pmdp);
+	if (!pmd_present(*pmdp) || !pte_get_unless_zero(pmdp))
+		retval = false;
+	spin_unlock(ptl);
+
+	return retval;
+}
+
+static inline void pte_put_many(struct mm_struct *mm, pmd_t *pmdp,
+				unsigned long addr, unsigned int nr)
+{
+	pgtable_t pte = pmd_pgtable(*pmdp);
+
+	VM_BUG_ON(!PageTable(pte));
+	if (atomic_sub_and_test(nr, &pte->pte_refcount))
+		free_pte_table(mm, pmdp, addr & PMD_MASK);
+}
+
+/*
+ * pte_put - Decrement ->pte_refcount for the PTE page table.
+ * @mm: the mm_struct of the target address space.
+ * @pmdp: a pointer to the pmd entry corresponding to the PTE page table.
+ * @addr: the start address of the tlb range to be flushed.
+ *
+ * The PTE page table page will be freed when the last refcount is dropped.
+ */
+static inline void pte_put(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
+{
+	pte_put_many(mm, pmdp, addr, 1);
+}
+
+/*
+ * pte_put_vmf - Decrement ->pte_refcount for the PTE page table.
+ * @vmf: fault information
+ *
+ * While we access ->pte_refcount of a PTE page table, any of the following
+ * ensures the pmd entry corresponding to the PTE page table stability:
+ * 	- mmap_lock
+ * 	- anon_lock
+ * 	- i_mmap_lock
+ * 	- parallel threads are excluded by other means which
+ * 	  can make ->pmd stable(e.g. gup case)
+ * But the mmap_lock maybe unlocked in advance in some cases in
+ * handle_pte_fault(), so we should ensure the pte_put() is performed
+ * in the critical section of the mmap_lock.
+ */
+static inline void pte_put_vmf(struct vm_fault *vmf)
+{
+	if (!(vmf->flags & FAULT_FLAG_PTE_GET))
+		return;
+	vmf->flags &= ~FAULT_FLAG_PTE_GET;
+
+	pte_put(vmf->vma->vm_mm, vmf->pmd, vmf->address);
+}
+
+/*
+ * pte_alloc_get - allocate a PTE page table if the pmd entry is none, and
+ *		   hold a ->pte_refcount when returning
+ * @mm: the mm_struct of the target address space.
+ * @pmdp: a pointer to the pmd entry corresponding to the PTE page table.
+ */
+static inline int pte_alloc_get(struct mm_struct *mm, pmd_t *pmdp)
+{
+	spinlock_t *ptl;
+
+	ptl = pmd_lock(mm, pmdp);
+	if (pmd_none(*pmdp) || !pte_get_unless_zero(pmdp)) {
+		spin_unlock(ptl);
+		return __pte_alloc_get(mm, pmdp);
+	}
+	spin_unlock(ptl);
+	return 0;
+}
+
+#define pte_alloc_get_map(mm, pmd, address)			\
+	(pte_alloc_get(mm, pmd) ? NULL : pte_offset_map(pmd, address))
+
+#define pte_alloc_get_map_lock(mm, pmd, address, ptlp)		\
+	(pte_alloc_get(mm, pmd) ?				\
+		 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
+#else
+static inline void pte_ref_init(pgtable_t pte, pmd_t *pmd, int count)
+{
+}
+
+static inline pmd_t *pte_to_pmd(pte_t *pte)
+{
+	return NULL;
+}
+
+static inline void pte_migrate_pmd(pmd_t old_pmd, pmd_t *new_pmd)
+{
+}
+
+static inline void pte_get_many(pmd_t *pmdp, unsigned int nr)
+{
+}
+
+static inline void pte_get(pmd_t *pmdp)
+{
+}
+
+static inline bool pte_get_unless_zero(pmd_t *pmdp)
+{
+	return true;
+}
+
+static inline bool pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+{
+	return true;
+}
+
+static inline void pte_put_many(struct mm_struct *mm, pmd_t *pmdp,
+				unsigned long addr, unsigned int value)
+{
+}
+
+static inline void pte_put(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
+{
+}
+
+static inline void pte_put_vmf(struct vm_fault *vmf)
+{
+}
+
+static inline int pte_alloc_get(struct mm_struct *mm, pmd_t *pmdp)
+{
+	return pte_alloc(mm, pmdp);
+}
+
+#define pte_alloc_get_map(mm, pmd, address)			\
+	pte_alloc_map(mm, pmd, address)
+
+#define pte_alloc_get_map_lock(mm, pmd, address, ptlp)		\
+	pte_alloc_map_lock(mm, pmd, address, ptlp)
+#endif /* CONFIG_FREE_USER_PTE */
+
+#endif
diff --git a/mm/Kconfig b/mm/Kconfig
index 50ca602edeb6..fb16ecebcb80 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -894,6 +894,10 @@  config IO_MAPPING
 config SECRETMEM
 	def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
 
+config FREE_USER_PTE
+	def_bool y
+	depends on X86_64 && !TRANSPARENT_HUGEPAGE
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/memory.c b/mm/memory.c
index 265b841cc7f9..d1efb868e682 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -441,7 +441,7 @@  int __pte_alloc_kernel(pmd_t *pmd)
 
 	spin_lock(&init_mm.page_table_lock);
 	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
-		smp_wmb(); /* See comment in pmd_install() */
+		smp_wmb(); /* See comment in pmd_install_get() */
 		pmd_populate_kernel(&init_mm, pmd, new);
 		new = NULL;
 	}
@@ -4780,7 +4780,7 @@  int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 	if (pgd_present(*pgd))		/* Another has populated it */
 		p4d_free(mm, new);
 	else {
-		smp_wmb(); /* See comment in pmd_install() */
+		smp_wmb(); /* See comment in pmd_install_get() */
 		pgd_populate(mm, pgd, new);
 	}
 	spin_unlock(&mm->page_table_lock);
@@ -4802,7 +4802,7 @@  int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
 	spin_lock(&mm->page_table_lock);
 	if (!p4d_present(*p4d)) {
 		mm_inc_nr_puds(mm);
-		smp_wmb(); /* See comment in pmd_install() */
+		smp_wmb(); /* See comment in pmd_install_get() */
 		p4d_populate(mm, p4d, new);
 	} else	/* Another has populated it */
 		pud_free(mm, new);
@@ -4826,7 +4826,7 @@  int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	ptl = pud_lock(mm, pud);
 	if (!pud_present(*pud)) {
 		mm_inc_nr_pmds(mm);
-		smp_wmb(); /* See comment in pmd_install() */
+		smp_wmb(); /* See comment in pmd_install_get() */
 		pud_populate(mm, pud, new);
 	} else	/* Another has populated it */
 		pmd_free(mm, new);
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
index 07a73b5521cc..630704ae26db 100644
--- a/mm/pte_ref.c
+++ b/mm/pte_ref.c
@@ -8,12 +8,47 @@ 
  */
 
 #include <linux/pte_ref.h>
+#include <linux/hugetlb.h>
+#include <asm/tlbflush.h>
 
-void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
+#ifdef CONFIG_DEBUG_VM
+static void pte_free_debug(pmd_t pmd)
+{
+	pte_t *ptep = (pte_t *)pmd_page_vaddr(pmd);
+	int i = 0;
+
+	for (i = 0; i < PTRS_PER_PTE; i++)
+		BUG_ON(!pte_none(*ptep++));
+}
+#else
+static inline void pte_free_debug(pmd_t pmd)
+{
+}
+#endif
+
+void free_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
+{
+	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+	spinlock_t *ptl;
+	pmd_t pmd;
+
+	ptl = pmd_lock(mm, pmdp);
+	pmd = pmdp_huge_get_and_clear(mm, addr, pmdp);
+	spin_unlock(ptl);
+
+	pte_free_debug(pmd);
+	flush_tlb_range(&vma, addr, addr + PMD_SIZE);
+	mm_dec_nr_ptes(mm);
+	pte_free(mm, pmd_pgtable(pmd));
+}
+
+void pmd_install_get(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
 {
-	spinlock_t *ptl = pmd_lock(mm, pmd);
+	spinlock_t *ptl;
 
-	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
+retry:
+	ptl = pmd_lock(mm, pmd);
+	if (likely(pmd_none(*pmd))) {
 		mm_inc_nr_ptes(mm);
 		/*
 		 * Ensure all pte setup (eg. pte page lock and page clearing) are
@@ -30,19 +65,33 @@  void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
 		 */
 		smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 		pmd_populate(mm, pmd, *pte);
+		pte_ref_init(*pte, pmd, 1);
 		*pte = NULL;
+	} else if (!pte_get_unless_zero(pmd)) {
+		spin_unlock(ptl);
+		goto retry;
 	}
 	spin_unlock(ptl);
 }
 
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
+{
+	pmd_install_get(mm, pmd, pte);
+}
+
+int __pte_alloc_get(struct mm_struct *mm, pmd_t *pmd)
 {
 	pgtable_t new = pte_alloc_one(mm);
 	if (!new)
 		return -ENOMEM;
 
-	pmd_install(mm, pmd, &new);
+	pmd_install_get(mm, pmd, &new);
 	if (new)
 		pte_free(mm, new);
 	return 0;
 }
+
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+{
+	return __pte_alloc_get(mm, pmd);
+}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index db6df27c852a..818c82ee3c6e 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -76,7 +76,7 @@  static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
 		set_pte_at(&init_mm, addr, pte, entry);
 	}
 
-	/* Make pte visible before pmd. See comment in pmd_install(). */
+	/* Make pte visible before pmd. See comment in pmd_install_get(). */
 	smp_wmb();
 	pmd_populate_kernel(&init_mm, pmd, pgtable);