diff mbox series

[RFC,v2,03/30] mm: thp: use single linked list for THP page table page deposit.

Message ID 20200928175428.4110504-4-zi.yan@sent.com (mailing list archive)
State New, archived
Headers show
Series 1GB PUD THP support on x86_64 | expand

Commit Message

Zi Yan Sept. 28, 2020, 5:54 p.m. UTC
From: Zi Yan <ziy@nvidia.com>

The old design uses the double linked list page->lru to chain all
deposited page table pages when creating a THP and page->pmd_huge_pte
to point to the first page of the list. As the second pointer in
page->lru overlaps with page->pmd_huge_pte, the design prevents
multi-level page table page deposit, which is useful for PUD and higher
level THPs.

The new design uses single linked list, where deposit_head points to
a single linked list of deposited pages and deposit_node can be used to
deposit the page itself to another list. For example, this allows us to
have one PUD page points to a list of PMD pages, each of which points
a list of PTE pages to support PUD level THP.

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 include/linux/mm.h       |  9 +++++----
 include/linux/mm_types.h |  8 +++++---
 kernel/fork.c            |  4 ++--
 mm/pgtable-generic.c     | 15 +++++----------
 4 files changed, 17 insertions(+), 19 deletions(-)

Comments

Matthew Wilcox Sept. 28, 2020, 7:34 p.m. UTC | #1
On Mon, Sep 28, 2020 at 01:54:01PM -0400, Zi Yan wrote:
>  		struct {	/* Page table pages */
> -			unsigned long _pt_pad_1;	/* compound_head */
> -			pgtable_t pmd_huge_pte; /* protected by page->ptl */
> +			struct llist_head deposit_head; /* pgtable deposit list head */
> +			struct llist_node deposit_node; /* pgtable deposit list node */

If you're going to use two pointers anyway, you might as well use a
list_head.  But I don't think you need to; you could either use a union
of these or you could use the page_address() of the page to store as
much information as you like!
Zi Yan Sept. 28, 2020, 8:34 p.m. UTC | #2
On 28 Sep 2020, at 15:34, Matthew Wilcox wrote:

> On Mon, Sep 28, 2020 at 01:54:01PM -0400, Zi Yan wrote:
>>  		struct {	/* Page table pages */
>> -			unsigned long _pt_pad_1;	/* compound_head */
>> -			pgtable_t pmd_huge_pte; /* protected by page->ptl */
>> +			struct llist_head deposit_head; /* pgtable deposit list head */
>> +			struct llist_node deposit_node; /* pgtable deposit list node */
>
> If you're going to use two pointers anyway, you might as well use a
> list_head.  But I don't think you need to; you could either use a union
> of these or you could use the page_address() of the page to store as
> much information as you like!

This is intended for depositing pgtable pages hierarchically. PUD THP
pgtable page deposit uses it. For a PUD THP, we need to deposit 1 PMD
pgtable page and 512 PTE pgtable pages, totally 513 pages.

One way is to deposit all of them on a list, but when we split the PUD
THP, we need to pull them all out and use one for PMD pgtable page
and deposit the rest 512 PTE pgtable pages to PMD page’s pmd_huge_pte.
But this mixes PMD pgtable pages and PTE pgtable pages in one list,
which can be error prone and also requires extra pgtable page deposit
operations during page split.

This approach, at the high level, makes a pgtable page’s deposit_head
point to a list of lower level pgtable pages, which are linked using
deposit_node. For example, we link all 512 PTE pgtable pages using
deposit_node and use PMD pgtable page’s deposit_head to point to the
PTE page list. In addition, when we deposit the PMD pgtable page,
we just point a struct_llist_head to the PMD pgtable page’s deposit_node.
When it comes to PUD THP split, we can simply withdraw and use the PMD
pgtable page without additional operations, since PTE pgtable pages
have already been deposited at the beginning.

Let me know if it makes sense to you. I will add the paragraphs above
to the commit message. Swapping patch 4 and 5 might also make the change
easier to understand since patch 5 use this patch.


—
Best Regards,
Yan Zi
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 17e712207d74..01b62da34794 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -10,6 +10,7 @@ 
 #include <linux/gfp.h>
 #include <linux/bug.h>
 #include <linux/list.h>
+#include <linux/llist.h>
 #include <linux/mmzone.h>
 #include <linux/rbtree.h>
 #include <linux/atomic.h>
@@ -2249,7 +2250,7 @@  static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
 static inline bool pmd_ptlock_init(struct page *page)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	page->pmd_huge_pte = NULL;
+	init_llist_head(&page->deposit_head);
 #endif
 	return ptlock_init(page);
 }
@@ -2257,12 +2258,12 @@  static inline bool pmd_ptlock_init(struct page *page)
 static inline void pmd_ptlock_free(struct page *page)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
+	VM_BUG_ON_PAGE(!llist_empty(&page->deposit_head), page);
 #endif
 	ptlock_free(page);
 }
 
-#define pmd_huge_pte(mm, pmd) (pmd_to_page(pmd)->pmd_huge_pte)
+#define huge_pmd_deposit_head(mm, pmd) (pmd_to_page(pmd)->deposit_head)
 
 #else
 
@@ -2274,7 +2275,7 @@  static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
 static inline bool pmd_ptlock_init(struct page *page) { return true; }
 static inline void pmd_ptlock_free(struct page *page) {}
 
-#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
+#define huge_pmd_deposit_head(mm, pmd) ((mm)->deposit_head_pmd)
 
 #endif
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 496c3ff97cce..be842926577a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -6,6 +6,7 @@ 
 
 #include <linux/auxvec.h>
 #include <linux/list.h>
+#include <linux/llist.h>
 #include <linux/spinlock.h>
 #include <linux/rbtree.h>
 #include <linux/rwsem.h>
@@ -143,8 +144,8 @@  struct page {
 			struct list_head deferred_list;
 		};
 		struct {	/* Page table pages */
-			unsigned long _pt_pad_1;	/* compound_head */
-			pgtable_t pmd_huge_pte; /* protected by page->ptl */
+			struct llist_head deposit_head; /* pgtable deposit list head */
+			struct llist_node deposit_node; /* pgtable deposit list node */
 			unsigned long _pt_pad_2;	/* mapping */
 			union {
 				struct mm_struct *pt_mm; /* x86 pgds only */
@@ -511,7 +512,8 @@  struct mm_struct {
 		struct mmu_notifier_subscriptions *notifier_subscriptions;
 #endif
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
-		pgtable_t pmd_huge_pte; /* protected by page_table_lock */
+		/* pgtable deposit list head, protected by page_table_lock */
+		struct llist_head deposit_head_pmd;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 		/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 138cd6ca50da..9c8e880538de 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -661,7 +661,7 @@  static void check_mm(struct mm_struct *mm)
 				mm_pgtables_bytes(mm));
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
-	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
+	VM_BUG_ON_MM(!llist_empty(&mm->deposit_head_pmd), mm);
 #endif
 }
 
@@ -1022,7 +1022,7 @@  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mmu_notifier_subscriptions_init(mm);
 	init_tlb_flush_pending(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
-	mm->pmd_huge_pte = NULL;
+	init_llist_head(&mm->deposit_head_pmd);
 #endif
 	mm_init_uprobes_state(mm);
 
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 9578db83e312..dbb0154165f1 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -164,11 +164,7 @@  void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 	assert_spin_locked(pmd_lockptr(mm, pmdp));
 
 	/* FIFO */
-	if (!pmd_huge_pte(mm, pmdp))
-		INIT_LIST_HEAD(&pgtable->lru);
-	else
-		list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
-	pmd_huge_pte(mm, pmdp) = pgtable;
+	llist_add(&pgtable->deposit_node, &huge_pmd_deposit_head(mm, pmdp));
 }
 #endif
 
@@ -180,12 +176,11 @@  pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 
 	assert_spin_locked(pmd_lockptr(mm, pmdp));
 
+	/* only withdraw from a non empty list */
+	VM_BUG_ON(llist_empty(&huge_pmd_deposit_head(mm, pmdp)));
 	/* FIFO */
-	pgtable = pmd_huge_pte(mm, pmdp);
-	pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru,
-							  struct page, lru);
-	if (pmd_huge_pte(mm, pmdp))
-		list_del(&pgtable->lru);
+	pgtable = llist_entry(llist_del_first(&huge_pmd_deposit_head(mm, pmdp)),
+			struct page, deposit_node);
 	return pgtable;
 }
 #endif