[uprobe,thp,1/4] mm, thp: allow preallocate pgtable for split_huge_pmd_address()

Message ID 20190529212049.2413886-2-songliubraving@fb.com (mailing list archive)
State New, archived
Series THP aware uprobe

Commit Message

Song Liu May 29, 2019, 9:20 p.m. UTC
Currently, __split_huge_pmd_locked() uses a page fault to handle file-backed
THP. This is required because splitting the pmd requires allocating a new
pgtable.

This patch allows the callers of __split_huge_pmd_locked() and
split_huge_pmd_address() to preallocate the pgtable, so that a refault is
not required.

This is useful when the caller of split_huge_pmd_address() wants to use
small pages immediately, without waiting for a refault.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/huge_mm.h |  5 +++--
 mm/huge_memory.c        | 33 +++++++++++++++++++++++----------
 mm/rmap.c               |  2 +-
 3 files changed, 27 insertions(+), 13 deletions(-)
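
For illustration only (not part of this patch): a caller of the new interface
might preallocate the page table before taking any locks and hand it to
split_huge_pmd_address(). The helper name below and its error handling are
hypothetical; pte_alloc_one()/pte_free() are the stock kernel primitives for
page table allocation.

/*
 * Hypothetical caller sketch: preallocate a pgtable so that splitting a
 * file-backed PMD does not rely on a later refault to map the file with
 * small pages.  Cleanup of an unused pgtable is only indicated, not shown.
 */
static int split_file_backed_pmd(struct vm_area_struct *vma,
				 unsigned long address)
{
	struct mm_struct *mm = vma->vm_mm;
	pgtable_t pgtable;

	/* May sleep, so allocate before taking the page table lock. */
	pgtable = pte_alloc_one(mm);
	if (!pgtable)
		return -ENOMEM;

	/* New fifth argument added by this patch. */
	split_huge_pmd_address(vma, address, false, NULL, pgtable);

	/*
	 * If no huge PMD was present, the pgtable was not deposited and
	 * would have to be released with pte_free(mm, pgtable); detecting
	 * that case is omitted here.
	 */
	return 0;
}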

Comments

Kirill A. Shutemov May 30, 2019, 11:10 a.m. UTC | #1
On Wed, May 29, 2019 at 02:20:46PM -0700, Song Liu wrote:
> @@ -2133,10 +2133,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
>  	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
>  				&& !pmd_devmap(*pmd));
> +	/* only file-backed vmas need a preallocated pgtable */
> +	VM_BUG_ON(vma_is_anonymous(vma) && prealloc_pgtable);
>  
>  	count_vm_event(THP_SPLIT_PMD);
>  
> -	if (!vma_is_anonymous(vma)) {
> +	if (prealloc_pgtable) {
> +		pgtable_trans_huge_deposit(mm, pmd, prealloc_pgtable);
> +		mm_inc_nr_pmds(mm);
> +	} else if (!vma_is_anonymous(vma)) {
>  		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
>  		/*
>  		 * We are going to unmap this huge page. So

Nope. This is going to leak a page table for architectures where
arch_needs_pgtable_deposit() is true.
Kirill A. Shutemov May 30, 2019, 11:14 a.m. UTC | #2
On Thu, May 30, 2019 at 02:10:15PM +0300, Kirill A. Shutemov wrote:
> On Wed, May 29, 2019 at 02:20:46PM -0700, Song Liu wrote:
> > @@ -2133,10 +2133,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
> >  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
> >  	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
> >  				&& !pmd_devmap(*pmd));
> > +	/* only file-backed vmas need a preallocated pgtable */
> > +	VM_BUG_ON(vma_is_anonymous(vma) && prealloc_pgtable);
> >  
> >  	count_vm_event(THP_SPLIT_PMD);
> >  
> > -	if (!vma_is_anonymous(vma)) {
> > +	if (prealloc_pgtable) {
> > +		pgtable_trans_huge_deposit(mm, pmd, prealloc_pgtable);
> > +		mm_inc_nr_pmds(mm);
> > +	} else if (!vma_is_anonymous(vma)) {
> >  		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
> >  		/*
> >  		 * We are going to unmap this huge page. So
> 
> Nope. This is going to leak a page table for architectures where
> arch_needs_pgtable_deposit() is true.

And I don't think there's correct handling of the dirty bit.

And what about DAX? Will it blow up? I think so.
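
For reference, the file-backed branch that the new prealloc_pgtable path
bypasses looks roughly like this in mainline around v5.2; it covers exactly
the three concerns raised above (deposited page table, dirty bit, DAX). The
trailing comments are annotations added for this excerpt.

	if (!vma_is_anonymous(vma)) {
		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
		/*
		 * We are going to unmap this huge page. So
		 * just go ahead and zap it
		 */
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(mm, pmd);	/* frees the deposited pgtable */
		if (vma_is_dax(vma))
			return;				/* DAX skips the page accounting below */
		page = pmd_page(_pmd);
		if (!PageDirty(page) && pmd_dirty(_pmd))
			set_page_dirty(page);		/* dirty bit carried over to the page */
		if (!PageReferenced(page) && pmd_young(_pmd))
			SetPageReferenced(page);
		page_remove_rmap(page, true);
		put_page(page);
		add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
		return;
	}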
Song Liu May 30, 2019, 5:23 p.m. UTC | #3
> On May 30, 2019, at 4:14 AM, Kirill A. Shutemov <kirill@shutemov.name> wrote:
> 
> On Thu, May 30, 2019 at 02:10:15PM +0300, Kirill A. Shutemov wrote:
>> On Wed, May 29, 2019 at 02:20:46PM -0700, Song Liu wrote:
>>> @@ -2133,10 +2133,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>> 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
>>> 	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
>>> 				&& !pmd_devmap(*pmd));
>>> +	/* only file-backed vmas need a preallocated pgtable */
>>> +	VM_BUG_ON(vma_is_anonymous(vma) && prealloc_pgtable);
>>> 
>>> 	count_vm_event(THP_SPLIT_PMD);
>>> 
>>> -	if (!vma_is_anonymous(vma)) {
>>> +	if (prealloc_pgtable) {
>>> +		pgtable_trans_huge_deposit(mm, pmd, prealloc_pgtable);
>>> +		mm_inc_nr_pmds(mm);
>>> +	} else if (!vma_is_anonymous(vma)) {
>>> 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
>>> 		/*
>>> 		 * We are going to unmap this huge page. So
>> 
>> Nope. This is going to leak a page table for architectures where
>> arch_needs_pgtable_deposit() is true.
> 
> And I don't think there's correct handling of the dirty bit.
> 
> And what about DAX? Will it blow up? I think so.
> 

Let me look into these cases. Thanks for the feedback!

Song

> -- 
> Kirill A. Shutemov

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7cd5c150c21d..2d8a40fd06e4 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -161,7 +161,7 @@  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 
 
 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
-		bool freeze, struct page *page);
+		bool freeze, struct page *page, pgtable_t prealloc_pgtable);
 
 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
 		unsigned long address);
@@ -299,7 +299,8 @@  static inline void deferred_split_huge_page(struct page *page) {}
 static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address, bool freeze, struct page *page) {}
 static inline void split_huge_pmd_address(struct vm_area_struct *vma,
-		unsigned long address, bool freeze, struct page *page) {}
+		unsigned long address, bool freeze, struct page *page,
+		pgtable_t prealloc_pgtable) {}
 
 #define split_huge_pud(__vma, __pmd, __address)	\
 	do { } while (0)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9f8bce9a6b32..dcb0e30213af 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2118,7 +2118,7 @@  static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 }
 
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long haddr, bool freeze)
+		unsigned long haddr, bool freeze, pgtable_t prealloc_pgtable)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct page *page;
@@ -2133,10 +2133,15 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
 	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
 				&& !pmd_devmap(*pmd));
+	/* only file-backed vmas need a preallocated pgtable */
+	VM_BUG_ON(vma_is_anonymous(vma) && prealloc_pgtable);
 
 	count_vm_event(THP_SPLIT_PMD);
 
-	if (!vma_is_anonymous(vma)) {
+	if (prealloc_pgtable) {
+		pgtable_trans_huge_deposit(mm, pmd, prealloc_pgtable);
+		mm_inc_nr_pmds(mm);
+	} else if (!vma_is_anonymous(vma)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 		/*
 		 * We are going to unmap this huge page. So
@@ -2277,8 +2282,9 @@  static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	}
 }
 
-void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long address, bool freeze, struct page *page)
+static void ____split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long address, bool freeze, struct page *page,
+		pgtable_t prealloc_pgtable)
 {
 	spinlock_t *ptl;
 	struct mmu_notifier_range range;
@@ -2303,7 +2309,8 @@  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			clear_page_mlock(page);
 	} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
 		goto out;
-	__split_huge_pmd_locked(vma, pmd, range.start, freeze);
+	__split_huge_pmd_locked(vma, pmd, range.start, freeze,
+				prealloc_pgtable);
 out:
 	spin_unlock(ptl);
 	/*
@@ -2322,8 +2329,14 @@  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	mmu_notifier_invalidate_range_only_end(&range);
 }
 
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long address, bool freeze, struct page *page)
+{
+	____split_huge_pmd(vma, pmd, address, freeze, page, NULL);
+}
+
 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
-		bool freeze, struct page *page)
+		bool freeze, struct page *page, pgtable_t prealloc_pgtable)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -2344,7 +2357,7 @@  void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
 
 	pmd = pmd_offset(pud, address);
 
-	__split_huge_pmd(vma, pmd, address, freeze, page);
+	____split_huge_pmd(vma, pmd, address, freeze, page, prealloc_pgtable);
 }
 
 void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -2360,7 +2373,7 @@  void vma_adjust_trans_huge(struct vm_area_struct *vma,
 	if (start & ~HPAGE_PMD_MASK &&
 	    (start & HPAGE_PMD_MASK) >= vma->vm_start &&
 	    (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-		split_huge_pmd_address(vma, start, false, NULL);
+		split_huge_pmd_address(vma, start, false, NULL, NULL);
 
 	/*
 	 * If the new end address isn't hpage aligned and it could
@@ -2370,7 +2383,7 @@  void vma_adjust_trans_huge(struct vm_area_struct *vma,
 	if (end & ~HPAGE_PMD_MASK &&
 	    (end & HPAGE_PMD_MASK) >= vma->vm_start &&
 	    (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-		split_huge_pmd_address(vma, end, false, NULL);
+		split_huge_pmd_address(vma, end, false, NULL, NULL);
 
 	/*
 	 * If we're also updating the vma->vm_next->vm_start, if the new
@@ -2384,7 +2397,7 @@  void vma_adjust_trans_huge(struct vm_area_struct *vma,
 		if (nstart & ~HPAGE_PMD_MASK &&
 		    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
 		    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
-			split_huge_pmd_address(next, nstart, false, NULL);
+			split_huge_pmd_address(next, nstart, false, NULL, NULL);
 	}
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
index e5dfe2ae6b0d..6970d732507c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1361,7 +1361,7 @@  static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 	if (flags & TTU_SPLIT_HUGE_PMD) {
 		split_huge_pmd_address(vma, address,
-				flags & TTU_SPLIT_FREEZE, page);
+				flags & TTU_SPLIT_FREEZE, page, NULL);
 	}
 
 	/*