
[v4,07/14] device-dax: compound devmap support

Message ID 20210827145819.16471-8-joao.m.martins@oracle.com (mailing list archive)
State New
Series mm, sparse-vmemmap: Introduce compound devmaps for device-dax

Commit Message

Joao Martins Aug. 27, 2021, 2:58 p.m. UTC
Use the newly added compound devmap facility, which maps the assigned dax
ranges as compound pages at a page size of @align. Currently this means
that region/namespace bootstrap takes considerably less time, since
considerably fewer pages are initialized.

On setups with 128G NVDIMMs, initialization with DRAM-stored struct
pages improves from ~268-358 ms to ~78-100 ms with 2M pages, and to
less than 1 ms with 1G pages.

dax devices are created with a fixed @align (huge page size), which is
enforced as well at mmap() of the device. Faults consequently happen at
the @align specified at creation time, and that does not change
throughout the dax device's lifetime. MCEs poison a whole dax huge page,
and splits likewise occur at the configured page size.
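
As a rough illustration (userspace sketch, not part of the patch) of what
the compound path saves at fault time: set_page_mapping() below writes
page->mapping/index once per base page covered by the fault, while
set_compound_mapping() touches only the head page:

	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	int main(void)
	{
		/* fault sizes matching the supported dax @align values */
		unsigned long aligns[] = { PAGE_SIZE, 2UL << 20, 1UL << 30 };

		for (int i = 0; i < 3; i++) {
			unsigned long fault_size = aligns[i];

			/* set_page_mapping(): one write per base page */
			unsigned long per_page = fault_size / PAGE_SIZE;
			/* set_compound_mapping(): head page only */
			unsigned long compound = 1;

			printf("align %10lu: %7lu writes vs %lu write\n",
			       fault_size, per_page, compound);
		}
		return 0;
	}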

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 drivers/dax/device.c | 56 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 13 deletions(-)

Comments

Dan Williams Nov. 5, 2021, 12:38 a.m. UTC | #1
On Fri, Aug 27, 2021 at 7:59 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> Use the newly added compound devmap facility which maps the assigned dax
> ranges as compound pages at a page size of @align. Currently, this means,
> that region/namespace bootstrap would take considerably less, given that
> you would initialize considerably less pages.
>
> On setups with 128G NVDIMMs the initialization with DRAM stored struct
> pages improves from ~268-358 ms to ~78-100 ms with 2M pages, and to less
> than a 1msec with 1G pages.
>
> dax devices are created with a fixed @align (huge page size) which is
> enforced through as well at mmap() of the device. Faults, consequently
> happen too at the specified @align specified at the creation, and those
> don't change through out dax device lifetime.

s/through out/throughout/

> MCEs poisons a whole dax huge page, as well as splits occurring at the configured page size.

A clarification here, MCEs trigger memory_failure() to *unmap* a whole
dax huge page, the poison stays limited to a single cacheline.
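
To put rough numbers on that distinction, a hypothetical sketch (the
64-byte cacheline size is an x86 assumption):

	#include <stdio.h>

	int main(void)
	{
		unsigned long huge = 2UL << 20;	/* a 2M dax @align */
		unsigned long cacheline = 64;	/* poison granularity */

		printf("memory_failure() unmaps: %lu bytes (whole huge page)\n",
		       huge);
		printf("actually poisoned: %lu bytes (1 of %lu cachelines)\n",
		       cacheline, huge / cacheline);
		return 0;
	}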

Otherwise the patch looks good to me.

>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>  drivers/dax/device.c | 56 ++++++++++++++++++++++++++++++++++----------
>  1 file changed, 43 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> index 6e348b5f9d45..5d23128f9a60 100644
> --- a/drivers/dax/device.c
> +++ b/drivers/dax/device.c
> @@ -192,6 +192,42 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
>  }
>  #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>
> +static void set_page_mapping(struct vm_fault *vmf, pfn_t pfn,
> +                            unsigned long fault_size,
> +                            struct address_space *f_mapping)
> +{
> +       unsigned long i;
> +       pgoff_t pgoff;
> +
> +       pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size));
> +
> +       for (i = 0; i < fault_size / PAGE_SIZE; i++) {
> +               struct page *page;
> +
> +               page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
> +               if (page->mapping)
> +                       continue;
> +               page->mapping = f_mapping;
> +               page->index = pgoff + i;
> +       }
> +}
> +
> +static void set_compound_mapping(struct vm_fault *vmf, pfn_t pfn,
> +                                unsigned long fault_size,
> +                                struct address_space *f_mapping)
> +{
> +       struct page *head;
> +
> +       head = pfn_to_page(pfn_t_to_pfn(pfn));
> +       head = compound_head(head);
> +       if (head->mapping)
> +               return;
> +
> +       head->mapping = f_mapping;
> +       head->index = linear_page_index(vmf->vma,
> +                       ALIGN(vmf->address, fault_size));
> +}
> +
>  static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
>                 enum page_entry_size pe_size)
>  {
> @@ -225,8 +261,7 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
>         }
>
>         if (rc == VM_FAULT_NOPAGE) {
> -               unsigned long i;
> -               pgoff_t pgoff;
> +               struct dev_pagemap *pgmap = dev_dax->pgmap;
>
>                 /*
>                  * In the device-dax case the only possibility for a
> @@ -234,17 +269,10 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
>                  * mapped. No need to consider the zero page, or racing
>                  * conflicting mappings.
>                  */
> -               pgoff = linear_page_index(vmf->vma,
> -                               ALIGN(vmf->address, fault_size));
> -               for (i = 0; i < fault_size / PAGE_SIZE; i++) {
> -                       struct page *page;
> -
> -                       page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
> -                       if (page->mapping)
> -                               continue;
> -                       page->mapping = filp->f_mapping;
> -                       page->index = pgoff + i;
> -               }
> +               if (pgmap_geometry(pgmap) > 1)
> +                       set_compound_mapping(vmf, pfn, fault_size, filp->f_mapping);
> +               else
> +                       set_page_mapping(vmf, pfn, fault_size, filp->f_mapping);
>         }
>         dax_read_unlock(id);
>
> @@ -426,6 +454,8 @@ int dev_dax_probe(struct dev_dax *dev_dax)
>         }
>
>         pgmap->type = MEMORY_DEVICE_GENERIC;
> +       if (dev_dax->align > PAGE_SIZE)
> +               pgmap->geometry = dev_dax->align >> PAGE_SHIFT;
>         dev_dax->pgmap = pgmap;
>
>         addr = devm_memremap_pages(dev, pgmap);
> --
> 2.17.1
>
Joao Martins Nov. 5, 2021, 2:10 p.m. UTC | #2
On 11/5/21 00:38, Dan Williams wrote:
> On Fri, Aug 27, 2021 at 7:59 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>
>> Use the newly added compound devmap facility which maps the assigned dax
>> ranges as compound pages at a page size of @align. Currently, this means,
>> that region/namespace bootstrap would take considerably less, given that
>> you would initialize considerably less pages.
>>
>> On setups with 128G NVDIMMs the initialization with DRAM stored struct
>> pages improves from ~268-358 ms to ~78-100 ms with 2M pages, and to less
>> than a 1msec with 1G pages.
>>
>> dax devices are created with a fixed @align (huge page size) which is
>> enforced through as well at mmap() of the device. Faults, consequently
>> happen too at the specified @align specified at the creation, and those
>> don't change through out dax device lifetime.
> 
> s/through out/throughout/
> 
>> MCEs poisons a whole dax huge page, as well as splits occurring at the configured page size.
> 
> A clarification here, MCEs trigger memory_failure() to *unmap* a whole
> dax huge page, the poison stays limited to a single cacheline.
> 
Ah, yes. I'll fix it for v5.

> Otherwise the patch looks good to me.
> 
Thanks!

Btw, does 'looks good' == Reviewed-by (with the commit message clarification above) or is
it that 'should be good with the amendment above and you get the tag in the next round'?

Asking as IIRC you mentioned this too some other time(s) (in the simpler sparse-vmemmap
patches) hence just clarifying to understand your expected 'process' better.

Also, I will be splitting this series as mentioned in the other discussion ...

https://lore.kernel.org/linux-mm/20211019160136.GH3686969@ziepe.ca/

... So this patch and the previous one should be the last two patches of the series
and the rest (gup, sparse-vmemmap) will go in parallel.
Dan Williams Nov. 5, 2021, 4:41 p.m. UTC | #3
On Fri, Nov 5, 2021 at 7:10 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> On 11/5/21 00:38, Dan Williams wrote:
> > On Fri, Aug 27, 2021 at 7:59 AM Joao Martins <joao.m.martins@oracle.com> wrote:
> >>
> >> Use the newly added compound devmap facility which maps the assigned dax
> >> ranges as compound pages at a page size of @align. Currently, this means,
> >> that region/namespace bootstrap would take considerably less, given that
> >> you would initialize considerably less pages.
> >>
> >> On setups with 128G NVDIMMs the initialization with DRAM stored struct
> >> pages improves from ~268-358 ms to ~78-100 ms with 2M pages, and to less
> >> than a 1msec with 1G pages.
> >>
> >> dax devices are created with a fixed @align (huge page size) which is
> >> enforced through as well at mmap() of the device. Faults, consequently
> >> happen too at the specified @align specified at the creation, and those
> >> don't change through out dax device lifetime.
> >
> > s/through out/throughout/
> >
> >> MCEs poisons a whole dax huge page, as well as splits occurring at the configured page size.
> >
> > A clarification here, MCEs trigger memory_failure() to *unmap* a whole
> > dax huge page, the poison stays limited to a single cacheline.
> >
> Ah, yes. I'll fix it for v5.
>
> > Otherwise the patch looks good to me.
> >
> Thanks!
>
> Btw, does 'looks good' == Reviewed-by (with the commit message clarification above) or is
> it that 'should be good with the amendment above and you get the tag in the next round'?

Reviewed-by: Dan Williams <dan.j.williams@intel.com>

> Asking as IIRC you mentioned this too some other time(s) (in the simpler sparse-vmemmap
> patches) hence just clarifying to understand your expected 'process' better.
>
> Also, I will be splitting this series as mentioned in the other discussion ...
>
> https://lore.kernel.org/linux-mm/20211019160136.GH3686969@ziepe.ca/
>
> ... So this patch and the previous one should be the last two patches of the series
> and the rest (gup, sparse-vmemmap) will go in parallel.

Sounds good.
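
For reference, a minimal sketch (userspace, illustration only) of the
@align-to-geometry conversion the dev_dax_probe() hunk below performs;
geometry stays at the default of 1 for PAGE_SIZE devices, which is why
the fault path checks pgmap_geometry(pgmap) > 1:

	#include <stdio.h>

	#define PAGE_SHIFT 12
	#define PAGE_SIZE  (1UL << PAGE_SHIFT)

	/* mirrors the dev_dax_probe() hunk: @align in bytes -> base pages */
	static unsigned long align_to_geometry(unsigned long align)
	{
		return align > PAGE_SIZE ? align >> PAGE_SHIFT : 1;
	}

	int main(void)
	{
		unsigned long aligns[] = { PAGE_SIZE, 2UL << 20, 1UL << 30 };

		for (int i = 0; i < 3; i++)
			printf("align %10lu -> geometry %6lu\n",
			       aligns[i], align_to_geometry(aligns[i]));
		return 0;
	}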

Patch

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 6e348b5f9d45..5d23128f9a60 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -192,6 +192,42 @@  static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
 }
 #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
+static void set_page_mapping(struct vm_fault *vmf, pfn_t pfn,
+			     unsigned long fault_size,
+			     struct address_space *f_mapping)
+{
+	unsigned long i;
+	pgoff_t pgoff;
+
+	pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size));
+
+	for (i = 0; i < fault_size / PAGE_SIZE; i++) {
+		struct page *page;
+
+		page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
+		if (page->mapping)
+			continue;
+		page->mapping = f_mapping;
+		page->index = pgoff + i;
+	}
+}
+
+static void set_compound_mapping(struct vm_fault *vmf, pfn_t pfn,
+				 unsigned long fault_size,
+				 struct address_space *f_mapping)
+{
+	struct page *head;
+
+	head = pfn_to_page(pfn_t_to_pfn(pfn));
+	head = compound_head(head);
+	if (head->mapping)
+		return;
+
+	head->mapping = f_mapping;
+	head->index = linear_page_index(vmf->vma,
+			ALIGN(vmf->address, fault_size));
+}
+
 static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
 		enum page_entry_size pe_size)
 {
@@ -225,8 +261,7 @@  static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
 	}
 
 	if (rc == VM_FAULT_NOPAGE) {
-		unsigned long i;
-		pgoff_t pgoff;
+		struct dev_pagemap *pgmap = dev_dax->pgmap;
 
 		/*
 		 * In the device-dax case the only possibility for a
@@ -234,17 +269,10 @@  static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
 		 * mapped. No need to consider the zero page, or racing
 		 * conflicting mappings.
 		 */
-		pgoff = linear_page_index(vmf->vma,
-				ALIGN(vmf->address, fault_size));
-		for (i = 0; i < fault_size / PAGE_SIZE; i++) {
-			struct page *page;
-
-			page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
-			if (page->mapping)
-				continue;
-			page->mapping = filp->f_mapping;
-			page->index = pgoff + i;
-		}
+		if (pgmap_geometry(pgmap) > 1)
+			set_compound_mapping(vmf, pfn, fault_size, filp->f_mapping);
+		else
+			set_page_mapping(vmf, pfn, fault_size, filp->f_mapping);
 	}
 	dax_read_unlock(id);
 
@@ -426,6 +454,8 @@  int dev_dax_probe(struct dev_dax *dev_dax)
 	}
 
 	pgmap->type = MEMORY_DEVICE_GENERIC;
+	if (dev_dax->align > PAGE_SIZE)
+		pgmap->geometry = dev_dax->align >> PAGE_SHIFT;
 	dev_dax->pgmap = pgmap;
 
 	addr = devm_memremap_pages(dev, pgmap);
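
For completeness, a hedged userspace sketch of consuming such a device
(/dev/dax0.0 is a placeholder path, and the device's @align is assumed
to be 2M): device-dax rejects mmap() lengths and offsets that are not
multiples of @align, and each first touch faults in a whole huge page:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		size_t len = 2UL << 20;	/* one 2M huge page, == @align */
		int fd = open("/dev/dax0.0", O_RDWR);	/* placeholder */
		void *addr;

		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* dax enforces @align here: unaligned len/offset fails */
		addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			    MAP_SHARED, fd, 0);
		if (addr == MAP_FAILED) {
			perror("mmap");
			close(fd);
			return 1;
		}

		memset(addr, 0, len);	/* first touch: one 2M fault */

		munmap(addr, len);
		close(fd);
		return 0;
	}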