diff mbox series

[v6,16/26] huge_memory: Add vmf_insert_folio_pmd()

Message ID 02216c30a733ecc84951f9aeb1130cef7497125d.1736488799.git-series.apopple@nvidia.com
State New
Headers show
Series fs/dax: Fix ZONE_DEVICE page reference counts | expand

Commit Message

Alistair Popple Jan. 10, 2025, 6 a.m. UTC
Currently DAX folio/page reference counts are managed differently to
normal pages. To allow these to be managed the same as normal pages
introduce vmf_insert_folio_pmd. This will map the entire PMD-sized folio
and take references as it would for a normally mapped page.

This is distinct from the current mechanism, vmf_insert_pfn_pmd, which
simply inserts a special devmap PMD entry into the page table without
holding a reference to the page for the mapping.

Signed-off-by: Alistair Popple <apopple@nvidia.com>

---

Changes for v5:
 - Minor code cleanup suggested by David
---
 include/linux/huge_mm.h |  1 +-
 mm/huge_memory.c        | 54 ++++++++++++++++++++++++++++++++++--------
 2 files changed, 45 insertions(+), 10 deletions(-)

Comments

Dan Williams Jan. 14, 2025, 2:04 a.m. UTC | #1
Alistair Popple wrote:
> Currently DAX folio/page reference counts are managed differently to
> normal pages. To allow these to be managed the same as normal pages
> introduce vmf_insert_folio_pmd. This will map the entire PMD-sized folio
> and take references as it would for a normally mapped page.
> 
> This is distinct from the current mechanism, vmf_insert_pfn_pmd, which
> simply inserts a special devmap PMD entry into the page table without
> holding a reference to the page for the mapping.
> 
> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> 
> ---
> 
> Changes for v5:
>  - Minor code cleanup suggested by David
> ---
>  include/linux/huge_mm.h |  1 +-
>  mm/huge_memory.c        | 54 ++++++++++++++++++++++++++++++++++--------
>  2 files changed, 45 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 5bd1ff7..3633bd3 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -39,6 +39,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  
>  vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
>  vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
> +vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, bool write);
>  vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, bool write);
>  
>  enum transparent_hugepage_flag {
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 256adc3..d1ea76e 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1381,14 +1381,12 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	pmd_t entry;
> -	spinlock_t *ptl;
>  
> -	ptl = pmd_lock(mm, pmd);

Apply this comment to the previous patch too, but I think this would be
more self-documenting as:

   lockdep_assert_held(pmd_lock(mm, pmd));

...to make it clear in this diff and into the future what the locking
constraints of this function are.

After that you can add:

Reviewed-by: Dan Williams <dan.j.williams@intel.com>
David Hildenbrand Jan. 14, 2025, 4:40 p.m. UTC | #2
> +vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, bool write)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	unsigned long addr = vmf->address & PMD_MASK;
> +	struct mm_struct *mm = vma->vm_mm;
> +	spinlock_t *ptl;
> +	pgtable_t pgtable = NULL;
> +
> +	if (addr < vma->vm_start || addr >= vma->vm_end)
> +		return VM_FAULT_SIGBUS;
> +
> +	if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
> +		return VM_FAULT_SIGBUS;
> +
> +	if (arch_needs_pgtable_deposit()) {
> +		pgtable = pte_alloc_one(vma->vm_mm);
> +		if (!pgtable)
> +			return VM_FAULT_OOM;
> +	}

This is interesting and nasty at the same time (only to make ppc64 boo3s 
with has tables happy). But it seems to be the right thing to do.

> +
> +	ptl = pmd_lock(mm, vmf->pmd);
> +	if (pmd_none(*vmf->pmd)) {
> +		folio_get(folio);
> +		folio_add_file_rmap_pmd(folio, &folio->page, vma);
> +		add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
> +	}
> +	insert_pfn_pmd(vma, addr, vmf->pmd, pfn_to_pfn_t(folio_pfn(folio)),
> +		       vma->vm_page_prot, write, pgtable);
> +	spin_unlock(ptl);
> +	if (pgtable)
> +		pte_free(mm, pgtable);

Ehm, are you unconditionally freeing the pgtable, even if consumed by 
insert_pfn_pmd() ?

Note that setting pgtable to NULL in insert_pfn_pmd() when consumed will 
not be visible here.

You'd have to pass a pointer to the ... pointer (&pgtable).

... unless I am missing something, staring at the diff.
Dan Williams Jan. 14, 2025, 5:22 p.m. UTC | #3
David Hildenbrand wrote:
> > +vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, bool write)
> > +{
> > +	struct vm_area_struct *vma = vmf->vma;
> > +	unsigned long addr = vmf->address & PMD_MASK;
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	spinlock_t *ptl;
> > +	pgtable_t pgtable = NULL;
> > +
> > +	if (addr < vma->vm_start || addr >= vma->vm_end)
> > +		return VM_FAULT_SIGBUS;
> > +
> > +	if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
> > +		return VM_FAULT_SIGBUS;
> > +
> > +	if (arch_needs_pgtable_deposit()) {
> > +		pgtable = pte_alloc_one(vma->vm_mm);
> > +		if (!pgtable)
> > +			return VM_FAULT_OOM;
> > +	}
> 
> This is interesting and nasty at the same time (only to make ppc64 boo3s 
> with has tables happy). But it seems to be the right thing to do.
> 
> > +
> > +	ptl = pmd_lock(mm, vmf->pmd);
> > +	if (pmd_none(*vmf->pmd)) {
> > +		folio_get(folio);
> > +		folio_add_file_rmap_pmd(folio, &folio->page, vma);
> > +		add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
> > +	}
> > +	insert_pfn_pmd(vma, addr, vmf->pmd, pfn_to_pfn_t(folio_pfn(folio)),
> > +		       vma->vm_page_prot, write, pgtable);
> > +	spin_unlock(ptl);
> > +	if (pgtable)
> > +		pte_free(mm, pgtable);
> 
> Ehm, are you unconditionally freeing the pgtable, even if consumed by 
> insert_pfn_pmd() ?
> 
> Note that setting pgtable to NULL in insert_pfn_pmd() when consumed will 
> not be visible here.
> 
> You'd have to pass a pointer to the ... pointer (&pgtable).
> 
> ... unless I am missing something, staring at the diff.

In fact I glazed over the fact that this has been commented on before
and assumed it was fixed:

http://lore.kernel.org/66f61ce4da80_964f2294fb@dwillia2-xfh.jf.intel.com.notmuch

So, yes, insert_pfn_pmd needs to take &pgtable to report back if the
allocation got consumed.

Good catch.
Alistair Popple Jan. 15, 2025, 7:05 a.m. UTC | #4
On Tue, Jan 14, 2025 at 09:22:00AM -0800, Dan Williams wrote:
> David Hildenbrand wrote:
> > > +vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, bool write)
> > > +{
> > > +	struct vm_area_struct *vma = vmf->vma;
> > > +	unsigned long addr = vmf->address & PMD_MASK;
> > > +	struct mm_struct *mm = vma->vm_mm;
> > > +	spinlock_t *ptl;
> > > +	pgtable_t pgtable = NULL;
> > > +
> > > +	if (addr < vma->vm_start || addr >= vma->vm_end)
> > > +		return VM_FAULT_SIGBUS;
> > > +
> > > +	if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
> > > +		return VM_FAULT_SIGBUS;
> > > +
> > > +	if (arch_needs_pgtable_deposit()) {
> > > +		pgtable = pte_alloc_one(vma->vm_mm);
> > > +		if (!pgtable)
> > > +			return VM_FAULT_OOM;
> > > +	}
> > 
> > This is interesting and nasty at the same time (only to make ppc64 boo3s 
> > with has tables happy). But it seems to be the right thing to do.
> > 
> > > +
> > > +	ptl = pmd_lock(mm, vmf->pmd);
> > > +	if (pmd_none(*vmf->pmd)) {
> > > +		folio_get(folio);
> > > +		folio_add_file_rmap_pmd(folio, &folio->page, vma);
> > > +		add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
> > > +	}
> > > +	insert_pfn_pmd(vma, addr, vmf->pmd, pfn_to_pfn_t(folio_pfn(folio)),
> > > +		       vma->vm_page_prot, write, pgtable);
> > > +	spin_unlock(ptl);
> > > +	if (pgtable)
> > > +		pte_free(mm, pgtable);
> > 
> > Ehm, are you unconditionally freeing the pgtable, even if consumed by 
> > insert_pfn_pmd() ?
> > 
> > Note that setting pgtable to NULL in insert_pfn_pmd() when consumed will 
> > not be visible here.
> > 
> > You'd have to pass a pointer to the ... pointer (&pgtable).
> > 
> > ... unless I am missing something, staring at the diff.
> 
> In fact I glazed over the fact that this has been commented on before
> and assumed it was fixed:
> 
> http://lore.kernel.org/66f61ce4da80_964f2294fb@dwillia2-xfh.jf.intel.com.notmuch
> 
> So, yes, insert_pfn_pmd needs to take &pgtable to report back if the
> allocation got consumed.
> 
> Good catch.

Yes, thanks Dave and Dan and apologies for missing that originally. Looking
at the thread I suspect I went down the rabbit hole of trying to implement
vmf_insert_folio() and when that wasn't possible forgot to come back and fix
this up. I have added a return code to insert_pfn_pmd() to indicate whether
or not the pgtable was consumed. I have also added a comment in the commit log
explaining why a vmf_insert_folio() isn't useful.

 - Alistair
diff mbox series

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 5bd1ff7..3633bd3 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -39,6 +39,7 @@  int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
+vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, bool write);
 vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, bool write);
 
 enum transparent_hugepage_flag {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 256adc3..d1ea76e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1381,14 +1381,12 @@  static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pmd_t entry;
-	spinlock_t *ptl;
 
-	ptl = pmd_lock(mm, pmd);
 	if (!pmd_none(*pmd)) {
 		if (write) {
 			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
 				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
-				goto out_unlock;
+				return;
 			}
 			entry = pmd_mkyoung(*pmd);
 			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1396,7 +1394,7 @@  static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 				update_mmu_cache_pmd(vma, addr, pmd);
 		}
 
-		goto out_unlock;
+		return;
 	}
 
 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
@@ -1417,11 +1415,6 @@  static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 
 	set_pmd_at(mm, addr, pmd, entry);
 	update_mmu_cache_pmd(vma, addr, pmd);
-
-out_unlock:
-	spin_unlock(ptl);
-	if (pgtable)
-		pte_free(mm, pgtable);
 }
 
 /**
@@ -1440,6 +1433,7 @@  vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
 	struct vm_area_struct *vma = vmf->vma;
 	pgprot_t pgprot = vma->vm_page_prot;
 	pgtable_t pgtable = NULL;
+	spinlock_t *ptl;
 
 	/*
 	 * If we had pmd_special, we could avoid all these restrictions,
@@ -1462,12 +1456,52 @@  vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
 	}
 
 	track_pfn_insert(vma, &pgprot, pfn);
-
+	ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
+	spin_unlock(ptl);
+	if (pgtable)
+		pte_free(vma->vm_mm, pgtable);
+
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 
+vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, bool write)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long addr = vmf->address & PMD_MASK;
+	struct mm_struct *mm = vma->vm_mm;
+	spinlock_t *ptl;
+	pgtable_t pgtable = NULL;
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return VM_FAULT_SIGBUS;
+
+	if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
+		return VM_FAULT_SIGBUS;
+
+	if (arch_needs_pgtable_deposit()) {
+		pgtable = pte_alloc_one(vma->vm_mm);
+		if (!pgtable)
+			return VM_FAULT_OOM;
+	}
+
+	ptl = pmd_lock(mm, vmf->pmd);
+	if (pmd_none(*vmf->pmd)) {
+		folio_get(folio);
+		folio_add_file_rmap_pmd(folio, &folio->page, vma);
+		add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
+	}
+	insert_pfn_pmd(vma, addr, vmf->pmd, pfn_to_pfn_t(folio_pfn(folio)),
+		       vma->vm_page_prot, write, pgtable);
+	spin_unlock(ptl);
+	if (pgtable)
+		pte_free(mm, pgtable);
+
+	return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
+
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
 {