[v5,1/2] mm: make dev_pagemap_mapping_shift() externally visible

Message ID 20191212182238.46535-2-brho@google.com (mailing list archive)
State New, archived
Series kvm: Use huge pages for DAX-backed files

Commit Message

Barret Rhoden Dec. 12, 2019, 6:22 p.m. UTC
KVM has a use case for determining the size of a dax mapping.

The KVM code has easy access to the address and the mm, and
dev_pagemap_mapping_shift() needs only those parameters.  It was
deriving them from page and vma.  This commit changes those parameters
from (page, vma) to (address, mm).

Signed-off-by: Barret Rhoden <brho@google.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/mm.h  |  3 +++
 mm/memory-failure.c | 38 +++-----------------------------------
 mm/util.c           | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 35 deletions(-)
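
For illustration, a caller that already has a host virtual address and an
mm, which is KVM's situation, could use the new signature like this
(hypothetical code, not part of this series):

	/*
	 * Hypothetical call site: ask how large a mapping backs hva in
	 * the current process.  The walk returns PUD_SHIFT, PMD_SHIFT,
	 * or PAGE_SHIFT for devmap mappings, or 0 if none is found.
	 */
	unsigned long shift = dev_pagemap_mapping_shift(hva, current->mm);

	if (shift == PMD_SHIFT)
		/* hva is backed by a 2M devmap mapping, e.g. a DAX PMD */;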

Comments

Sean Christopherson Dec. 13, 2019, 5:47 p.m. UTC | #1
On Thu, Dec 12, 2019 at 01:22:37PM -0500, Barret Rhoden wrote:
> KVM has a use case for determining the size of a dax mapping.
> 
> The KVM code has easy access to the address and the mm, and
> dev_pagemap_mapping_shift() needs only those parameters.  It was
> deriving them from page and vma.  This commit changes those parameters
> from (page, vma) to (address, mm).
> 
> Signed-off-by: Barret Rhoden <brho@google.com>
> Reviewed-by: David Hildenbrand <david@redhat.com>
> Acked-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  include/linux/mm.h  |  3 +++
>  mm/memory-failure.c | 38 +++-----------------------------------
>  mm/util.c           | 34 ++++++++++++++++++++++++++++++++++
>  3 files changed, 40 insertions(+), 35 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a2adf95b3f9c..bfd1882dd5c6 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1013,6 +1013,9 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
>  #define page_ref_zero_or_close_to_overflow(page) \
>  	((unsigned int) page_ref_count(page) + 127u <= 127u)
>  
> +unsigned long dev_pagemap_mapping_shift(unsigned long address,
> +					struct mm_struct *mm);
> +
>  static inline void get_page(struct page *page)
>  {
>  	page = compound_head(page);
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 3151c87dff73..bafa464c8290 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -261,40 +261,6 @@ void shake_page(struct page *p, int access)
>  }
>  EXPORT_SYMBOL_GPL(shake_page);
>  
> -static unsigned long dev_pagemap_mapping_shift(struct page *page,
> -		struct vm_area_struct *vma)
> -{
> -	unsigned long address = vma_address(page, vma);
> -	pgd_t *pgd;
> -	p4d_t *p4d;
> -	pud_t *pud;
> -	pmd_t *pmd;
> -	pte_t *pte;
> -
> -	pgd = pgd_offset(vma->vm_mm, address);
> -	if (!pgd_present(*pgd))
> -		return 0;
> -	p4d = p4d_offset(pgd, address);
> -	if (!p4d_present(*p4d))
> -		return 0;
> -	pud = pud_offset(p4d, address);
> -	if (!pud_present(*pud))
> -		return 0;
> -	if (pud_devmap(*pud))
> -		return PUD_SHIFT;
> -	pmd = pmd_offset(pud, address);
> -	if (!pmd_present(*pmd))
> -		return 0;
> -	if (pmd_devmap(*pmd))
> -		return PMD_SHIFT;
> -	pte = pte_offset_map(pmd, address);
> -	if (!pte_present(*pte))
> -		return 0;
> -	if (pte_devmap(*pte))
> -		return PAGE_SHIFT;
> -	return 0;
> -}
> -
>  /*
>   * Failure handling: if we can't find or can't kill a process there's
>   * not much we can do.	We just print a message and ignore otherwise.
> @@ -324,7 +290,9 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
>  	}
>  	tk->addr = page_address_in_vma(p, vma);
>  	if (is_zone_device_page(p))
> -		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
> +		tk->size_shift =
> +			dev_pagemap_mapping_shift(vma_address(page, vma),
> +						  vma->vm_mm);
>  	else
>  		tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
>  
> diff --git a/mm/util.c b/mm/util.c
> index 3ad6db9a722e..59984e6b40ab 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -901,3 +901,37 @@ int memcmp_pages(struct page *page1, struct page *page2)
>  	kunmap_atomic(addr1);
>  	return ret;
>  }
> +
> +unsigned long dev_pagemap_mapping_shift(unsigned long address,
> +					struct mm_struct *mm)
> +{
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte;
> +
> +	pgd = pgd_offset(mm, address);
> +	if (!pgd_present(*pgd))
> +		return 0;
> +	p4d = p4d_offset(pgd, address);
> +	if (!p4d_present(*p4d))
> +		return 0;
> +	pud = pud_offset(p4d, address);
> +	if (!pud_present(*pud))
> +		return 0;
> +	if (pud_devmap(*pud))
> +		return PUD_SHIFT;
> +	pmd = pmd_offset(pud, address);
> +	if (!pmd_present(*pmd))
> +		return 0;
> +	if (pmd_devmap(*pmd))
> +		return PMD_SHIFT;
> +	pte = pte_offset_map(pmd, address);
> +	if (!pte_present(*pte))
> +		return 0;
> +	if (pte_devmap(*pte))
> +		return PAGE_SHIFT;
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(dev_pagemap_mapping_shift);

This is basically a rehash of lookup_address_in_pgd(), and doesn't provide
exactly what KVM needs.  E.g. KVM works with levels instead of shifts, and
it would be nice to provide the pte so that KVM can sanity check that the
pfn from this walk matches the pfn it plans on mapping.

Instead of exporting dev_pagemap_mapping_shift(), what about replacing it
with a patch to introduce lookup_address_mm() and export that?

dev_pagemap_mapping_shift() could then wrap the new helper (if you want),
and KVM could do lookup_address_mm() for querying the size of ZONE_DEVICE
pages.
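
A minimal sketch of what the proposed helper might look like, assuming the
existing lookup_address_in_pgd() in arch/x86/mm/pageattr.c (the name and
exact signature here are only a suggestion, not code from this series):

	/*
	 * Hypothetical helper: like lookup_address(), but walks the page
	 * tables of the given mm rather than init_mm.  Returns the pte
	 * and fills in *level with a PG_LEVEL_* value, so callers can
	 * both size the mapping and sanity check the pfn.
	 */
	pte_t *lookup_address_mm(struct mm_struct *mm, unsigned long address,
				 unsigned int *level)
	{
		return lookup_address_in_pgd(pgd_offset(mm, address), address,
					     level);
	}
	EXPORT_SYMBOL_GPL(lookup_address_mm);

dev_pagemap_mapping_shift() would then keep its devmap checks but delegate
the walk itself, and the level-to-shift translation, to the helper.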
Dan Williams Dec. 13, 2019, 6:13 p.m. UTC | #2
On Fri, Dec 13, 2019 at 9:47 AM Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>
> On Thu, Dec 12, 2019 at 01:22:37PM -0500, Barret Rhoden wrote:
> > KVM has a use case for determining the size of a dax mapping.
> > [... rest of the patch snipped; quoted in full in Sean's reply above ...]
>
> This is basically a rehash of lookup_address_in_pgd(), and doesn't provide
> exactly what KVM needs.  E.g. KVM works with levels instead of shifts, and
> it would be nice to provide the pte so that KVM can sanity check that the
> pfn from this walk matches the pfn it plans on mapping.
>
> Instead of exporting dev_pagemap_mapping_shift(), what about replacing it
> with a patch to introduce lookup_address_mm() and export that?
>
> dev_pagemap_mapping_shift() could then wrap the new helper (if you want),
> and KVM could do lookup_address_mm() for querying the size of ZONE_DEVICE
> pages.

All of the above sounds great to me. Should have looked that much
harder when implementing dev_pagemap_mapping_shift() originally.
Barret Rhoden Dec. 16, 2019, 5:59 p.m. UTC | #3
On 12/13/19 12:47 PM, Sean Christopherson wrote:
>> +unsigned long dev_pagemap_mapping_shift(unsigned long address,
>> +					struct mm_struct *mm)
>> +{
>> +	pgd_t *pgd;
>> +	p4d_t *p4d;
>> +	pud_t *pud;
>> +	pmd_t *pmd;
>> +	pte_t *pte;
>> +
>> +	pgd = pgd_offset(mm, address);
>> +	if (!pgd_present(*pgd))
>> +		return 0;
>> +	p4d = p4d_offset(pgd, address);
>> +	if (!p4d_present(*p4d))
>> +		return 0;
>> +	pud = pud_offset(p4d, address);
>> +	if (!pud_present(*pud))
>> +		return 0;
>> +	if (pud_devmap(*pud))
>> +		return PUD_SHIFT;
>> +	pmd = pmd_offset(pud, address);
>> +	if (!pmd_present(*pmd))
>> +		return 0;
>> +	if (pmd_devmap(*pmd))
>> +		return PMD_SHIFT;
>> +	pte = pte_offset_map(pmd, address);
>> +	if (!pte_present(*pte))
>> +		return 0;
>> +	if (pte_devmap(*pte))
>> +		return PAGE_SHIFT;
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(dev_pagemap_mapping_shift);
> 
> This is basically a rehash of lookup_address_in_pgd(), and doesn't provide
> exactly what KVM needs.  E.g. KVM works with levels instead of shifts, and
> it would be nice to provide the pte so that KVM can sanity check that the
> pfn from this walk matches the pfn it plans on mapping.

One minor issue is that the levels for lookup_address_in_pgd() and for
KVM differ in name, although not in value: lookup_address_in_pgd() uses
PG_LEVEL_4K = 1, while KVM uses PT_PAGE_TABLE_LEVEL = 1.  The enums differ
a little too: x86 has a name for a 512G page, etc.  It's all in arch/x86.

Does KVM-x86 need its own names for the levels?  If not, I could convert 
the PT_PAGE_TABLE_* stuff to PG_LEVEL_* stuff.
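
For reference, the two sets of names at the time were roughly as follows
(paraphrased from arch/x86; the values match, the names do not):

	/* arch/x86/include/asm/pgtable_types.h */
	enum pg_level {
		PG_LEVEL_NONE,
		PG_LEVEL_4K,
		PG_LEVEL_2M,
		PG_LEVEL_1G,
		PG_LEVEL_512G,
		PG_LEVEL_NUM
	};

	/* arch/x86/include/asm/kvm_host.h */
	enum {
		PT_PAGE_TABLE_LEVEL   = 1,
		PT_DIRECTORY_LEVEL    = 2,
		PT_PDPE_LEVEL         = 3,
		/* set max level to the biggest one */
		PT_MAX_HUGEPAGE_LEVEL = PT_PDPE_LEVEL,
	};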

> 
> Instead of exporting dev_pagemap_mapping_shift(), what about replacing it
> with a patch to introduce lookup_address_mm() and export that?
> 
> dev_pagemap_mapping_shift() could then wrap the new helper (if you want),

I might hold off on that for now, since that helper is used outside of 
x86, and I don't know if 'level' makes sense outside of x86.

Thanks,

Barret
Sean Christopherson Dec. 18, 2019, 12:18 a.m. UTC | #4
On Mon, Dec 16, 2019 at 12:59:53PM -0500, Barret Rhoden wrote:
> On 12/13/19 12:47 PM, Sean Christopherson wrote:
> >>+EXPORT_SYMBOL_GPL(dev_pagemap_mapping_shift);
> >
> >This is basically a rehash of lookup_address_in_pgd(), and doesn't provide
> >exactly what KVM needs.  E.g. KVM works with levels instead of shifts, and
> >it would be nice to provide the pte so that KVM can sanity check that the
> >pfn from this walk matches the pfn it plans on mapping.
> 
> One minor issue is that the levels for lookup_address_in_pgd() and for KVM
> differ in name, although not in value.  lookup uses PG_LEVEL_4K = 1.  KVM
> uses PT_PAGE_TABLE_LEVEL = 1.  The enums differ a little too: x86 has a name
> for a 512G page, etc.  It's all in arch/x86.
> 
> Does KVM-x86 need its own names for the levels?  If not, I could convert the
> PT_PAGE_TABLE_* stuff to PG_LEVEL_* stuff.

Not really.  I suspect the whole reason KVM has different enums is to
handle PSE/Mode-B paging, where PG_LEVEL_2M would be inaccurate, e.g.:

	if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
		gfn += pse36_gfn_delta(pte);

That being said, I absolutely loathe PT_PAGE_TABLE_LEVEL; I can never
remember that it means 4k pages.  I would be in favor of using the kernel's
enums with some KVM-specific abstraction of PG_LEVEL_2M, e.g.:

/* KVM Hugepage definitions for x86 */
enum {
	PG_LEVEL_2M_OR_4M      = PG_LEVEL_2M,
	/* set max level to the biggest one */
	KVM_MAX_HUGEPAGE_LEVEL = PG_LEVEL_1G,
};

And ideally restrict usage of the ugly PG_LEVEL_2M_OR_4M to flows that can
actually encounter 4M pages, i.e. when walking guest page tables.  On the
host side, KVM always uses PAE or 64-bit paging.

Personally, I'd pursue that in a separate patch/series, it'll touch a
massive amount of code and will probably get bikeshedded a fair amount.
Paolo Bonzini Jan. 15, 2020, 6:33 p.m. UTC | #5
On 16/12/19 18:59, Barret Rhoden wrote:
> Does KVM-x86 need its own names for the levels?  If not, I could convert
> the PT_PAGE_TABLE_* stuff to PG_LEVEL_* stuff.

Yes, please do.  For the 2M/4M case, the only place where using 2M would
be incorrect is here:

        if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
                gfn += pse36_gfn_delta(pte);

And for that you can even use "> PG_LEVEL_4K" if you think it's nicer.
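
That is, hypothetically:

	if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
		gfn += pse36_gfn_delta(pte);

(32-bit paging has only two levels, so the two forms are equivalent.)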

Paolo

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2adf95b3f9c..bfd1882dd5c6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1013,6 +1013,9 @@  static inline bool is_pci_p2pdma_page(const struct page *page)
 #define page_ref_zero_or_close_to_overflow(page) \
 	((unsigned int) page_ref_count(page) + 127u <= 127u)
 
+unsigned long dev_pagemap_mapping_shift(unsigned long address,
+					struct mm_struct *mm);
+
 static inline void get_page(struct page *page)
 {
 	page = compound_head(page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3151c87dff73..bafa464c8290 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -261,40 +261,6 @@  void shake_page(struct page *p, int access)
 }
 EXPORT_SYMBOL_GPL(shake_page);
 
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
-		struct vm_area_struct *vma)
-{
-	unsigned long address = vma_address(page, vma);
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pgd = pgd_offset(vma->vm_mm, address);
-	if (!pgd_present(*pgd))
-		return 0;
-	p4d = p4d_offset(pgd, address);
-	if (!p4d_present(*p4d))
-		return 0;
-	pud = pud_offset(p4d, address);
-	if (!pud_present(*pud))
-		return 0;
-	if (pud_devmap(*pud))
-		return PUD_SHIFT;
-	pmd = pmd_offset(pud, address);
-	if (!pmd_present(*pmd))
-		return 0;
-	if (pmd_devmap(*pmd))
-		return PMD_SHIFT;
-	pte = pte_offset_map(pmd, address);
-	if (!pte_present(*pte))
-		return 0;
-	if (pte_devmap(*pte))
-		return PAGE_SHIFT;
-	return 0;
-}
-
 /*
  * Failure handling: if we can't find or can't kill a process there's
  * not much we can do.	We just print a message and ignore otherwise.
@@ -324,7 +290,9 @@  static void add_to_kill(struct task_struct *tsk, struct page *p,
 	}
 	tk->addr = page_address_in_vma(p, vma);
 	if (is_zone_device_page(p))
-		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
+		tk->size_shift =
+			dev_pagemap_mapping_shift(vma_address(page, vma),
+						  vma->vm_mm);
 	else
 		tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
 
diff --git a/mm/util.c b/mm/util.c
index 3ad6db9a722e..59984e6b40ab 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -901,3 +901,37 @@  int memcmp_pages(struct page *page1, struct page *page2)
 	kunmap_atomic(addr1);
 	return ret;
 }
+
+unsigned long dev_pagemap_mapping_shift(unsigned long address,
+					struct mm_struct *mm)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		return 0;
+	p4d = p4d_offset(pgd, address);
+	if (!p4d_present(*p4d))
+		return 0;
+	pud = pud_offset(p4d, address);
+	if (!pud_present(*pud))
+		return 0;
+	if (pud_devmap(*pud))
+		return PUD_SHIFT;
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		return 0;
+	if (pmd_devmap(*pmd))
+		return PMD_SHIFT;
+	pte = pte_offset_map(pmd, address);
+	if (!pte_present(*pte))
+		return 0;
+	if (pte_devmap(*pte))
+		return PAGE_SHIFT;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dev_pagemap_mapping_shift);