
[v3,12/14] device-dax: compound pagemap support

Message ID 20210714193542.21857-13-joao.m.martins@oracle.com (mailing list archive)
State New, archived
Series mm, sparse-vmemmap: Introduce compound pagemaps

Commit Message

Joao Martins July 14, 2021, 7:35 p.m. UTC
Use the newly added compound pagemap facility which maps the assigned dax
ranges as compound pages at a page size of @align. This means that
region/namespace bootstrap takes considerably less time, given that
considerably fewer pages are initialized.

On setups with 128G NVDIMMs, the initialization with DRAM-stored struct
pages improves from ~268-358 ms to ~78-100 ms with 2M pages, and to less
than 1 ms with 1G pages.

dax devices are created with a fixed @align (huge page size), which is
also enforced at mmap() of the device. Faults consequently happen at the
@align specified at creation time, and that granularity does not change
throughout the dax device's lifetime. MCEs poison a whole dax huge page,
and splits likewise occur at the configured page size.
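
As an aside, a minimal userspace sketch of that contract (an illustration only;
it assumes a device-dax instance at /dev/dax0.0 created with a 2M @align, where
mappings must be MAP_SHARED and sized/aligned to the device @align, and the
first touch faults in one @align-sized compound page):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t align = 2UL << 20;	/* the device's @align: 2M */
	int fd = open("/dev/dax0.0", O_RDWR);
	void *p;

	if (fd < 0)
		return 1;

	/* length must be a multiple of @align; private mappings are rejected */
	p = mmap(NULL, align, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		close(fd);
		return 1;
	}

	((volatile char *)p)[0] = 1;	/* first touch: one fault at @align granularity */

	munmap(p, align);
	close(fd);
	return 0;
}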

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 drivers/dax/device.c | 56 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 13 deletions(-)

Comments

Dan Williams July 14, 2021, 11:36 p.m. UTC | #1
On Wed, Jul 14, 2021 at 12:36 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> Use the newly added compound pagemap facility which maps the assigned dax
> ranges as compound pages at a page size of @align. Currently, this means,
> that region/namespace bootstrap would take considerably less, given that
> you would initialize considerably less pages.
>
> On setups with 128G NVDIMMs the initialization with DRAM stored struct
> pages improves from ~268-358 ms to ~78-100 ms with 2M pages, and to less
> than a 1msec with 1G pages.
>
> dax devices are created with a fixed @align (huge page size) which is
> enforced through as well at mmap() of the device. Faults, consequently
> happen too at the specified @align specified at the creation, and those
> don't change through out dax device lifetime. MCEs poisons a whole dax
> huge page, as well as splits occurring at the configured page size.
>

Hi Joao,

With this patch I'm hitting the following with the 'device-dax' test [1].

kernel BUG at include/linux/mm.h:748!
invalid opcode: 0000 [#1] SMP NOPTI
CPU: 29 PID: 1509 Comm: device-dax Tainted: G        W  OE     5.14.0-rc1+ #720
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
RIP: 0010:memunmap_pages+0x2f5/0x390
Code: 00 00 00 31 d2 48 8d 70 01 48 29 fe 48 c1 ef 0c 48 c1 ee 0c e8
1c 30 fa ff e9 c5 fe ff ff 48 c7 c6 00 4a 58 87 e8 eb d1 f6 ff <0f> 0b
48 8b 7b 30 31 f6 e8 7e aa 2b 00 e9 2d fd ff ff 48 8d 7b 48
RSP: 0018:ffff9d33c240bbf0 EFLAGS: 00010246
RAX: 000000000000003e RBX: ffff8a44446eb700 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff8a46b3b58af0 RDI: ffff8a46b3b58af0
RBP: 0000000000000000 R08: 0000000000000001 R09: 00000000ffffbfff
R10: ffff8a46b32a0000 R11: ffff8a46b32a0000 R12: 0000000000104201
R13: ffff8a44446eb700 R14: 0000000000000004 R15: ffff8a44474954d8
FS:  00007fd048a81fc0(0000) GS:ffff8a46b3b40000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000561ee7399000 CR3: 0000000206c70004 CR4: 0000000000170ee0
Call Trace:
 devres_release_all+0xb8/0x100
 __device_release_driver+0x190/0x240
 device_release_driver+0x26/0x40
 bus_remove_device+0xef/0x160
 device_del+0x18c/0x3e0
 unregister_dev_dax+0x62/0x90
 devres_release_all+0xb8/0x100
 __device_release_driver+0x190/0x240
 device_driver_detach+0x3e/0xa0
 unbind_store+0x113/0x130

[1]: https://github.com/pmem/ndctl/blob/master/test/device-dax.c
Joao Martins July 15, 2021, noon UTC | #2
On 7/15/21 12:36 AM, Dan Williams wrote:
> On Wed, Jul 14, 2021 at 12:36 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>
>> Use the newly added compound pagemap facility which maps the assigned dax
>> ranges as compound pages at a page size of @align. Currently, this means,
>> that region/namespace bootstrap would take considerably less, given that
>> you would initialize considerably less pages.
>>
>> On setups with 128G NVDIMMs the initialization with DRAM stored struct
>> pages improves from ~268-358 ms to ~78-100 ms with 2M pages, and to less
>> than a 1msec with 1G pages.
>>
>> dax devices are created with a fixed @align (huge page size) which is
>> enforced through as well at mmap() of the device. Faults, consequently
>> happen too at the specified @align specified at the creation, and those
>> don't change through out dax device lifetime. MCEs poisons a whole dax
>> huge page, as well as splits occurring at the configured page size.
>>
> 
> Hi Joao,
> 
> With this patch I'm hitting the following with the 'device-dax' test [1].
> 
Ugh, I can reproduce it too -- apologies for the oversight.

This patch is not the culprit; the flaw is earlier in the series, specifically in the fourth patch.

The fourth patch needs the change below because of the existing elevated page ref
count at zone device memmap init. The put_page() called here in memunmap_pages():

for (i = 0; i < pgmap->nr_ranges; i++)
	for_each_device_pfn(pfn, pgmap, i)
		put_page(pfn_to_page(pfn));

... on a zone_device compound memmap would otherwise always decrease the head page refcount
by the number of pfns in a @geometry-sized page (e.g. 512 put_page() calls for a single 2M
compound page), leading to the splat you reported.

diff --git a/mm/memremap.c b/mm/memremap.c
index b0e7b8cf3047..79a883af788e 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -102,15 +102,15 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
        return (range->start + range_len(range)) >> PAGE_SHIFT;
 }

-static unsigned long pfn_next(unsigned long pfn)
+static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
 {
        if (pfn % 1024 == 0)
                cond_resched();
-       return pfn + 1;
+       return pfn + pgmap_pfn_geometry(pgmap);
 }

 #define for_each_device_pfn(pfn, map, i) \
-       for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
+       for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(map, pfn))

 static void dev_pagemap_kill(struct dev_pagemap *pgmap)
 {

It could also get the hunk below, but it is sort of redundant provided we don't touch the
tail page refcount throughout the devmap pages' lifetime. This setting of the tail pages'
refcount to zero was in the pre-v5.14 series, but it got removed under the assumption that
it comes from the page allocator (where tail pages already have a zeroed refcount).

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 96975edac0a8..469a7aa5cf38 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6623,6 +6623,7 @@ static void __ref memmap_init_compound(struct page *page, unsigned
long pfn,
                __init_zone_device_page(page + i, pfn + i, zone_idx,
                                        nid, pgmap);
                prep_compound_tail(page, i);
+               set_page_count(page + i, 0);

                /*
                 * The first and second tail pages need to
Dan Williams July 27, 2021, 11:51 p.m. UTC | #3
On Thu, Jul 15, 2021 at 5:01 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> On 7/15/21 12:36 AM, Dan Williams wrote:
> > On Wed, Jul 14, 2021 at 12:36 PM Joao Martins <joao.m.martins@oracle.com> wrote:
> >>
> >> Use the newly added compound pagemap facility which maps the assigned dax
> >> ranges as compound pages at a page size of @align. Currently, this means,
> >> that region/namespace bootstrap would take considerably less, given that
> >> you would initialize considerably less pages.
> >>
> >> On setups with 128G NVDIMMs the initialization with DRAM stored struct
> >> pages improves from ~268-358 ms to ~78-100 ms with 2M pages, and to less
> >> than a 1msec with 1G pages.
> >>
> >> dax devices are created with a fixed @align (huge page size) which is
> >> enforced through as well at mmap() of the device. Faults, consequently
> >> happen too at the specified @align specified at the creation, and those
> >> don't change through out dax device lifetime. MCEs poisons a whole dax
> >> huge page, as well as splits occurring at the configured page size.
> >>
> >
> > Hi Joao,
> >
> > With this patch I'm hitting the following with the 'device-dax' test [1].
> >
> Ugh, I can reproduce it too -- apologies for the oversight.

No worries.

>
> This patch is not the culprit, the flaw is early in the series, specifically the fourth patch.
>
> It needs this chunk below change on the fourth patch due to the existing elevated page ref
> count at zone device memmap init. put_page() called here in memunmap_pages():
>
> for (i = 0; i < pgmap->nr_ranges; i++)
>         for_each_device_pfn(pfn, pgmap, i)
>                 put_page(pfn_to_page(pfn));
>
> ... on a zone_device compound memmap would otherwise always decrease head page refcount by
> @geometry pfn amount (leading to the aforementioned splat you reported).
>
> diff --git a/mm/memremap.c b/mm/memremap.c
> index b0e7b8cf3047..79a883af788e 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -102,15 +102,15 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
>         return (range->start + range_len(range)) >> PAGE_SHIFT;
>  }
>
> -static unsigned long pfn_next(unsigned long pfn)
> +static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
>  {
>         if (pfn % 1024 == 0)
>                 cond_resched();
> -       return pfn + 1;
> +       return pfn + pgmap_pfn_geometry(pgmap);

The cond_resched() would need to be fixed up too to something like:

if (pfn % (1024 << pgmap_geometry_order(pgmap)) == 0)
    cond_resched();

...because the goal is to take a break every 1024 iterations, not
every 1024 pfns.
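
Putting the two adjustments together, a sketch of the fixed-up helpers (relying
on the pgmap_pfn_geometry() / pgmap_geometry_order() helpers referenced above,
from earlier in the series) could read:

static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
{
	/* take a break roughly every 1024 iterations, i.e. every 1024 compound pages */
	if (pfn % (1024 << pgmap_geometry_order(pgmap)) == 0)
		cond_resched();
	/* advance by one compound page worth of pfns */
	return pfn + pgmap_pfn_geometry(pgmap);
}

#define for_each_device_pfn(pfn, map, i) \
	for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(map, pfn))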

>  }
>
>  #define for_each_device_pfn(pfn, map, i) \
> -       for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
> +       for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(map, pfn))
>
>  static void dev_pagemap_kill(struct dev_pagemap *pgmap)
>  {
>
> It could also get this hunk below, but it is sort of redundant provided we won't touch
> tail page refcount through out the devmap pages lifetime. This setting of tail pages
> refcount to zero was in pre-v5.14 series, but it got removed under the assumption it comes
> from the page allocator (where tail pages are already zeroed in refcount).

Wait, devmap pages never see the page allocator?

>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 96975edac0a8..469a7aa5cf38 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6623,6 +6623,7 @@ static void __ref memmap_init_compound(struct page *page, unsigned
> long pfn,
>                 __init_zone_device_page(page + i, pfn + i, zone_idx,
>                                         nid, pgmap);
>                 prep_compound_tail(page, i);
> +               set_page_count(page + i, 0);

Looks good to me, and perhaps add a check for an elevated tail page refcount at
teardown as a sanity check that the tail pages were never pinned
directly?

>
>                 /*
>                  * The first and second tail pages need to
Joao Martins July 28, 2021, 9:36 a.m. UTC | #4
On 7/28/21 12:51 AM, Dan Williams wrote:
> On Thu, Jul 15, 2021 at 5:01 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 7/15/21 12:36 AM, Dan Williams wrote:
>>> On Wed, Jul 14, 2021 at 12:36 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>> This patch is not the culprit, the flaw is early in the series, specifically the fourth patch.
>>
>> It needs this chunk below change on the fourth patch due to the existing elevated page ref
>> count at zone device memmap init. put_page() called here in memunmap_pages():
>>
>> for (i = 0; i < pgmap->nr_ranges; i++)
>>         for_each_device_pfn(pfn, pgmap, i)
>>                 put_page(pfn_to_page(pfn));
>>
>> ... on a zone_device compound memmap would otherwise always decrease head page refcount by
>> @geometry pfn amount (leading to the aforementioned splat you reported).
>>
>> diff --git a/mm/memremap.c b/mm/memremap.c
>> index b0e7b8cf3047..79a883af788e 100644
>> --- a/mm/memremap.c
>> +++ b/mm/memremap.c
>> @@ -102,15 +102,15 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
>>         return (range->start + range_len(range)) >> PAGE_SHIFT;
>>  }
>>
>> -static unsigned long pfn_next(unsigned long pfn)
>> +static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
>>  {
>>         if (pfn % 1024 == 0)
>>                 cond_resched();
>> -       return pfn + 1;
>> +       return pfn + pgmap_pfn_geometry(pgmap);
> 
> The cond_resched() would need to be fixed up too to something like:
> 
> if (pfn % (1024 << pgmap_geometry_order(pgmap)))
>     cond_resched();
> 
> ...because the goal is to take a break every 1024 iterations, not
> every 1024 pfns.
> 

Ah, good point.

>>  }
>>
>>  #define for_each_device_pfn(pfn, map, i) \
>> -       for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
>> +       for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(map, pfn))
>>
>>  static void dev_pagemap_kill(struct dev_pagemap *pgmap)
>>  {
>>
>> It could also get this hunk below, but it is sort of redundant provided we won't touch
>> tail page refcount through out the devmap pages lifetime. This setting of tail pages
>> refcount to zero was in pre-v5.14 series, but it got removed under the assumption it comes
>> from the page allocator (where tail pages are already zeroed in refcount).
> 
> Wait, devmap pages never see the page allocator?
> 
"where tail pages are already zeroed in refcount" this actually meant 'freshly allocated
pages' and I was referring to commit 7118fc2906e2 ("hugetlb: address ref count racing in
prep_compound_gigantic_page") that removed set_page_count() because the setting of page
ref count to zero was redundant.

Right, devmap pages don't come from the page allocator: they sit in a separate zone and
aren't part of the regular page pools (e.g. accessible via alloc_pages()), as you are
aware. Unless, of course, we reassign them via dax_kmem, but then the struct pages would be
mapped the regular way, without any devmap involvement.

>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 96975edac0a8..469a7aa5cf38 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -6623,6 +6623,7 @@ static void __ref memmap_init_compound(struct page *page, unsigned
>> long pfn,
>>                 __init_zone_device_page(page + i, pfn + i, zone_idx,
>>                                         nid, pgmap);
>>                 prep_compound_tail(page, i);
>> +               set_page_count(page + i, 0);
> 
> Looks good to me and perhaps a for elevated tail page refcount at
> teardown as a sanity check that the tail pages was never pinned
> directly?
> 
Sorry, I didn't completely follow.

Did you mean to set the tail page refcount back to 1 at teardown if it was kept at 0 (e.g. in
memunmap_pages() after put_page()), or to check that the refcount is indeed kept at zero after
the put_page() in memunmap_pages()?
Dan Williams July 28, 2021, 6:51 p.m. UTC | #5
On Wed, Jul 28, 2021 at 2:36 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> On 7/28/21 12:51 AM, Dan Williams wrote:
> > On Thu, Jul 15, 2021 at 5:01 AM Joao Martins <joao.m.martins@oracle.com> wrote:
> >> On 7/15/21 12:36 AM, Dan Williams wrote:
> >>> On Wed, Jul 14, 2021 at 12:36 PM Joao Martins <joao.m.martins@oracle.com> wrote:
> >> This patch is not the culprit, the flaw is early in the series, specifically the fourth patch.
> >>
> >> It needs this chunk below change on the fourth patch due to the existing elevated page ref
> >> count at zone device memmap init. put_page() called here in memunmap_pages():
> >>
> >> for (i = 0; i < pgmap->nr_ranges; i++)
> >>         for_each_device_pfn(pfn, pgmap, i)
> >>                 put_page(pfn_to_page(pfn));
> >>
> >> ... on a zone_device compound memmap would otherwise always decrease head page refcount by
> >> @geometry pfn amount (leading to the aforementioned splat you reported).
> >>
> >> diff --git a/mm/memremap.c b/mm/memremap.c
> >> index b0e7b8cf3047..79a883af788e 100644
> >> --- a/mm/memremap.c
> >> +++ b/mm/memremap.c
> >> @@ -102,15 +102,15 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
> >>         return (range->start + range_len(range)) >> PAGE_SHIFT;
> >>  }
> >>
> >> -static unsigned long pfn_next(unsigned long pfn)
> >> +static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
> >>  {
> >>         if (pfn % 1024 == 0)
> >>                 cond_resched();
> >> -       return pfn + 1;
> >> +       return pfn + pgmap_pfn_geometry(pgmap);
> >
> > The cond_resched() would need to be fixed up too to something like:
> >
> > if (pfn % (1024 << pgmap_geometry_order(pgmap)))
> >     cond_resched();
> >
> > ...because the goal is to take a break every 1024 iterations, not
> > every 1024 pfns.
> >
>
> Ah, good point.
>
> >>  }
> >>
> >>  #define for_each_device_pfn(pfn, map, i) \
> >> -       for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
> >> +       for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(map, pfn))
> >>
> >>  static void dev_pagemap_kill(struct dev_pagemap *pgmap)
> >>  {
> >>
> >> It could also get this hunk below, but it is sort of redundant provided we won't touch
> >> tail page refcount through out the devmap pages lifetime. This setting of tail pages
> >> refcount to zero was in pre-v5.14 series, but it got removed under the assumption it comes
> >> from the page allocator (where tail pages are already zeroed in refcount).
> >
> > Wait, devmap pages never see the page allocator?
> >
> "where tail pages are already zeroed in refcount" this actually meant 'freshly allocated
> pages' and I was referring to commit 7118fc2906e2 ("hugetlb: address ref count racing in
> prep_compound_gigantic_page") that removed set_page_count() because the setting of page
> ref count to zero was redundant.

Ah, maybe include that reference in the changelog?

>
> Albeit devmap pages don't come from page allocator, you know separate zone and these pages
> aren't part of the regular page pools (e.g. accessible via alloc_pages()), as you are
> aware. Unless of course, we reassign them via dax_kmem, but then the way we map the struct
> pages would be regular without any devmap stuff.

Got it. I think with the back reference to that commit (7118fc2906e2)
it resolves my confusion.

>
> >>
> >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >> index 96975edac0a8..469a7aa5cf38 100644
> >> --- a/mm/page_alloc.c
> >> +++ b/mm/page_alloc.c
> >> @@ -6623,6 +6623,7 @@ static void __ref memmap_init_compound(struct page *page, unsigned
> >> long pfn,
> >>                 __init_zone_device_page(page + i, pfn + i, zone_idx,
> >>                                         nid, pgmap);
> >>                 prep_compound_tail(page, i);
> >> +               set_page_count(page + i, 0);
> >
> > Looks good to me and perhaps a for elevated tail page refcount at
> > teardown as a sanity check that the tail pages was never pinned
> > directly?
> >
> Sorry didn't follow completely.
>
> You meant to set tail page refcount back to 1 at teardown if it was kept to 0 (e.g.
> memunmap_pages() after put_page()) or that the refcount is indeed kept to zero after the
> put_page() in memunmap_pages() ?

The latter, i.e. would it be worth it to check that a tail page did
not get accidentally pinned instead of a head page? I'm also ok to
leave out that sanity checking for now.
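
For reference, a rough sketch of such a teardown check (hypothetical, not something posted
in this series; it assumes the pgmap_pfn_geometry() helper and the three-argument
for_each_device_pfn() from the fix above) could be:

/*
 * Warn if a tail page of a compound devmap page still holds a reference at
 * teardown, which would mean it was pinned directly rather than via its head.
 */
for (i = 0; i < pgmap->nr_ranges; i++)
	for_each_device_pfn(pfn, pgmap, i) {
		struct page *head = pfn_to_page(pfn);
		unsigned long j;

		for (j = 1; j < pgmap_pfn_geometry(pgmap); j++)
			WARN_ON_ONCE(page_ref_count(head + j));
	}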
Joao Martins July 28, 2021, 6:59 p.m. UTC | #6
On 7/28/21 7:51 PM, Dan Williams wrote:
> On Wed, Jul 28, 2021 at 2:36 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>
>> On 7/28/21 12:51 AM, Dan Williams wrote:
>>> On Thu, Jul 15, 2021 at 5:01 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>> On 7/15/21 12:36 AM, Dan Williams wrote:
>>>>> On Wed, Jul 14, 2021 at 12:36 PM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>> This patch is not the culprit, the flaw is early in the series, specifically the fourth patch.
>>>>
>>>> It needs this chunk below change on the fourth patch due to the existing elevated page ref
>>>> count at zone device memmap init. put_page() called here in memunmap_pages():
>>>>
>>>> for (i = 0; i < pgmap->nr_ranges; i++)
>>>>         for_each_device_pfn(pfn, pgmap, i)
>>>>                 put_page(pfn_to_page(pfn));
>>>>
>>>> ... on a zone_device compound memmap would otherwise always decrease head page refcount by
>>>> @geometry pfn amount (leading to the aforementioned splat you reported).
>>>>
>>>> diff --git a/mm/memremap.c b/mm/memremap.c
>>>> index b0e7b8cf3047..79a883af788e 100644
>>>> --- a/mm/memremap.c
>>>> +++ b/mm/memremap.c
>>>> @@ -102,15 +102,15 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
>>>>         return (range->start + range_len(range)) >> PAGE_SHIFT;
>>>>  }
>>>>
>>>> -static unsigned long pfn_next(unsigned long pfn)
>>>> +static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
>>>>  {
>>>>         if (pfn % 1024 == 0)
>>>>                 cond_resched();
>>>> -       return pfn + 1;
>>>> +       return pfn + pgmap_pfn_geometry(pgmap);
>>>
>>> The cond_resched() would need to be fixed up too to something like:
>>>
>>> if (pfn % (1024 << pgmap_geometry_order(pgmap)))
>>>     cond_resched();
>>>
>>> ...because the goal is to take a break every 1024 iterations, not
>>> every 1024 pfns.
>>>
>>
>> Ah, good point.
>>
>>>>  }
>>>>
>>>>  #define for_each_device_pfn(pfn, map, i) \
>>>> -       for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
>>>> +       for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(map, pfn))
>>>>
>>>>  static void dev_pagemap_kill(struct dev_pagemap *pgmap)
>>>>  {
>>>>
>>>> It could also get this hunk below, but it is sort of redundant provided we won't touch
>>>> tail page refcount through out the devmap pages lifetime. This setting of tail pages
>>>> refcount to zero was in pre-v5.14 series, but it got removed under the assumption it comes
>>>> from the page allocator (where tail pages are already zeroed in refcount).
>>>
>>> Wait, devmap pages never see the page allocator?
>>>
>> "where tail pages are already zeroed in refcount" this actually meant 'freshly allocated
>> pages' and I was referring to commit 7118fc2906e2 ("hugetlb: address ref count racing in
>> prep_compound_gigantic_page") that removed set_page_count() because the setting of page
>> ref count to zero was redundant.
> 
> Ah, maybe include that reference in the changelog?
> 
Yeap, will do.

>>
>> Albeit devmap pages don't come from page allocator, you know separate zone and these pages
>> aren't part of the regular page pools (e.g. accessible via alloc_pages()), as you are
>> aware. Unless of course, we reassign them via dax_kmem, but then the way we map the struct
>> pages would be regular without any devmap stuff.
> 
> Got it. I think with the back reference to that commit (7118fc2906e2)
> it resolves my confusion.
> 
>>
>>>>
>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>> index 96975edac0a8..469a7aa5cf38 100644
>>>> --- a/mm/page_alloc.c
>>>> +++ b/mm/page_alloc.c
>>>> @@ -6623,6 +6623,7 @@ static void __ref memmap_init_compound(struct page *page, unsigned
>>>> long pfn,
>>>>                 __init_zone_device_page(page + i, pfn + i, zone_idx,
>>>>                                         nid, pgmap);
>>>>                 prep_compound_tail(page, i);
>>>> +               set_page_count(page + i, 0);
>>>
>>> Looks good to me and perhaps a for elevated tail page refcount at
>>> teardown as a sanity check that the tail pages was never pinned
>>> directly?
>>>
>> Sorry didn't follow completely.
>>
>> You meant to set tail page refcount back to 1 at teardown if it was kept to 0 (e.g.
>> memunmap_pages() after put_page()) or that the refcount is indeed kept to zero after the
>> put_page() in memunmap_pages() ?
> 
> The latter, i.e. would it be worth it to check that a tail page did
> not get accidentally pinned instead of a head page? I'm also ok to
> leave out that sanity checking for now.
> 
What makes me not worry too much about the sanity checking is that this put_page() is
supposed to disappear with:

https://lore.kernel.org/linux-mm/20210717192135.9030-3-alex.sierra@amd.com/

... in fact, none of the hunks here:

https://lore.kernel.org/linux-mm/f7217b61-c845-eaed-501e-c9e7067a6b87@oracle.com/

None of them would matter, as there would no longer be an elevated page refcount to
deal with.
Dan Williams July 28, 2021, 7:03 p.m. UTC | #7
On Wed, Jul 28, 2021 at 11:59 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
>
>
> On 7/28/21 7:51 PM, Dan Williams wrote:
> > On Wed, Jul 28, 2021 at 2:36 AM Joao Martins <joao.m.martins@oracle.com> wrote:
> >>
> >> On 7/28/21 12:51 AM, Dan Williams wrote:
> >>> On Thu, Jul 15, 2021 at 5:01 AM Joao Martins <joao.m.martins@oracle.com> wrote:
> >>>> On 7/15/21 12:36 AM, Dan Williams wrote:
> >>>>> On Wed, Jul 14, 2021 at 12:36 PM Joao Martins <joao.m.martins@oracle.com> wrote:
> >>>> This patch is not the culprit, the flaw is early in the series, specifically the fourth patch.
> >>>>
> >>>> It needs this chunk below change on the fourth patch due to the existing elevated page ref
> >>>> count at zone device memmap init. put_page() called here in memunmap_pages():
> >>>>
> >>>> for (i = 0; i < pgmap->nr_ranges; i++)
> >>>>         for_each_device_pfn(pfn, pgmap, i)
> >>>>                 put_page(pfn_to_page(pfn));
> >>>>
> >>>> ... on a zone_device compound memmap would otherwise always decrease head page refcount by
> >>>> @geometry pfn amount (leading to the aforementioned splat you reported).
> >>>>
> >>>> diff --git a/mm/memremap.c b/mm/memremap.c
> >>>> index b0e7b8cf3047..79a883af788e 100644
> >>>> --- a/mm/memremap.c
> >>>> +++ b/mm/memremap.c
> >>>> @@ -102,15 +102,15 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
> >>>>         return (range->start + range_len(range)) >> PAGE_SHIFT;
> >>>>  }
> >>>>
> >>>> -static unsigned long pfn_next(unsigned long pfn)
> >>>> +static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
> >>>>  {
> >>>>         if (pfn % 1024 == 0)
> >>>>                 cond_resched();
> >>>> -       return pfn + 1;
> >>>> +       return pfn + pgmap_pfn_geometry(pgmap);
> >>>
> >>> The cond_resched() would need to be fixed up too to something like:
> >>>
> >>> if (pfn % (1024 << pgmap_geometry_order(pgmap)))
> >>>     cond_resched();
> >>>
> >>> ...because the goal is to take a break every 1024 iterations, not
> >>> every 1024 pfns.
> >>>
> >>
> >> Ah, good point.
> >>
> >>>>  }
> >>>>
> >>>>  #define for_each_device_pfn(pfn, map, i) \
> >>>> -       for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
> >>>> +       for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(map, pfn))
> >>>>
> >>>>  static void dev_pagemap_kill(struct dev_pagemap *pgmap)
> >>>>  {
> >>>>
> >>>> It could also get this hunk below, but it is sort of redundant provided we won't touch
> >>>> tail page refcount through out the devmap pages lifetime. This setting of tail pages
> >>>> refcount to zero was in pre-v5.14 series, but it got removed under the assumption it comes
> >>>> from the page allocator (where tail pages are already zeroed in refcount).
> >>>
> >>> Wait, devmap pages never see the page allocator?
> >>>
> >> "where tail pages are already zeroed in refcount" this actually meant 'freshly allocated
> >> pages' and I was referring to commit 7118fc2906e2 ("hugetlb: address ref count racing in
> >> prep_compound_gigantic_page") that removed set_page_count() because the setting of page
> >> ref count to zero was redundant.
> >
> > Ah, maybe include that reference in the changelog?
> >
> Yeap, will do.
>
> >>
> >> Albeit devmap pages don't come from page allocator, you know separate zone and these pages
> >> aren't part of the regular page pools (e.g. accessible via alloc_pages()), as you are
> >> aware. Unless of course, we reassign them via dax_kmem, but then the way we map the struct
> >> pages would be regular without any devmap stuff.
> >
> > Got it. I think with the back reference to that commit (7118fc2906e2)
> > it resolves my confusion.
> >
> >>
> >>>>
> >>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >>>> index 96975edac0a8..469a7aa5cf38 100644
> >>>> --- a/mm/page_alloc.c
> >>>> +++ b/mm/page_alloc.c
> >>>> @@ -6623,6 +6623,7 @@ static void __ref memmap_init_compound(struct page *page, unsigned
> >>>> long pfn,
> >>>>                 __init_zone_device_page(page + i, pfn + i, zone_idx,
> >>>>                                         nid, pgmap);
> >>>>                 prep_compound_tail(page, i);
> >>>> +               set_page_count(page + i, 0);
> >>>
> >>> Looks good to me and perhaps a for elevated tail page refcount at
> >>> teardown as a sanity check that the tail pages was never pinned
> >>> directly?
> >>>
> >> Sorry didn't follow completely.
> >>
> >> You meant to set tail page refcount back to 1 at teardown if it was kept to 0 (e.g.
> >> memunmap_pages() after put_page()) or that the refcount is indeed kept to zero after the
> >> put_page() in memunmap_pages() ?
> >
> > The latter, i.e. would it be worth it to check that a tail page did
> > not get accidentally pinned instead of a head page? I'm also ok to
> > leave out that sanity checking for now.
> >
> What makes me not worry too much about the sanity checking is that this put_page is
> supposed to disappear here:
>
> https://lore.kernel.org/linux-mm/20210717192135.9030-3-alex.sierra@amd.com/
>
> .. in fact none the hunks here:
>
> https://lore.kernel.org/linux-mm/f7217b61-c845-eaed-501e-c9e7067a6b87@oracle.com/
>
> None of them would matter, as there would no longer exist an elevated page refcount to
> deal with.

Ah, good point. It's past time to take care of that... if only that
patch kit had been Cc'd to the DAX maintainer...

Patch

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 6e348b5f9d45..149627c922cc 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -192,6 +192,42 @@  static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
 }
 #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
+static void set_page_mapping(struct vm_fault *vmf, pfn_t pfn,
+			     unsigned long fault_size,
+			     struct address_space *f_mapping)
+{
+	unsigned long i;
+	pgoff_t pgoff;
+
+	pgoff = linear_page_index(vmf->vma, ALIGN(vmf->address, fault_size));
+
+	for (i = 0; i < fault_size / PAGE_SIZE; i++) {
+		struct page *page;
+
+		page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
+		if (page->mapping)
+			continue;
+		page->mapping = f_mapping;
+		page->index = pgoff + i;
+	}
+}
+
+static void set_compound_mapping(struct vm_fault *vmf, pfn_t pfn,
+				 unsigned long fault_size,
+				 struct address_space *f_mapping)
+{
+	struct page *head;
+
+	head = pfn_to_page(pfn_t_to_pfn(pfn));
+	head = compound_head(head);
+	if (head->mapping)
+		return;
+
+	head->mapping = f_mapping;
+	head->index = linear_page_index(vmf->vma,
+			ALIGN(vmf->address, fault_size));
+}
+
 static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
 		enum page_entry_size pe_size)
 {
@@ -225,8 +261,7 @@  static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
 	}
 
 	if (rc == VM_FAULT_NOPAGE) {
-		unsigned long i;
-		pgoff_t pgoff;
+		struct dev_pagemap *pgmap = dev_dax->pgmap;
 
 		/*
 		 * In the device-dax case the only possibility for a
@@ -234,17 +269,10 @@  static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
 		 * mapped. No need to consider the zero page, or racing
 		 * conflicting mappings.
 		 */
-		pgoff = linear_page_index(vmf->vma,
-				ALIGN(vmf->address, fault_size));
-		for (i = 0; i < fault_size / PAGE_SIZE; i++) {
-			struct page *page;
-
-			page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
-			if (page->mapping)
-				continue;
-			page->mapping = filp->f_mapping;
-			page->index = pgoff + i;
-		}
+		if (pgmap_geometry(pgmap) > PAGE_SIZE)
+			set_compound_mapping(vmf, pfn, fault_size, filp->f_mapping);
+		else
+			set_page_mapping(vmf, pfn, fault_size, filp->f_mapping);
 	}
 	dax_read_unlock(id);
 
@@ -426,6 +454,8 @@  int dev_dax_probe(struct dev_dax *dev_dax)
 	}
 
 	pgmap->type = MEMORY_DEVICE_GENERIC;
+	if (dev_dax->align > PAGE_SIZE)
+		pgmap->geometry = dev_dax->align;
 	dev_dax->pgmap = pgmap;
 
 	addr = devm_memremap_pages(dev, pgmap);
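
For context, the pgmap_geometry() / pgmap_pfn_geometry() helpers used above are introduced
earlier in this series; a rough sketch of the semantics this patch assumes from them (not
the authoritative definitions) is:

/* sketch only: the compound page size backing the pagemap, in bytes;
 * PAGE_SIZE means "no compound pages"
 */
static inline unsigned long pgmap_geometry(struct dev_pagemap *pgmap)
{
	if (!pgmap || !pgmap->geometry)
		return PAGE_SIZE;
	return pgmap->geometry;
}

/* sketch only: the same quantity expressed in base pfns */
static inline unsigned long pgmap_pfn_geometry(struct dev_pagemap *pgmap)
{
	return pgmap_geometry(pgmap) >> PAGE_SHIFT;
}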