
[2/3] mm: Introduce subsection_dev_map

Message ID 20191108000855.25209-3-t-fukasawa@vx.jp.nec.com (mailing list archive)
State New, archived
Series: make pfn walker support ZONE_DEVICE

Commit Message

Toshiki Fukasawa Nov. 8, 2019, 12:08 a.m. UTC
Currently, there is no way to identify whether a pfn is on
ZONE_DEVICE. Identifying a pfn on system memory can be done by
using a section-level flag. On the other hand, identifying a pfn
on ZONE_DEVICE requires a subsection-level flag, since ZONE_DEVICE
can be created in units of subsections.

This patch introduces a new bitmap, subsection_dev_map, so that
we can identify pfns on ZONE_DEVICE.

Also, subsection_dev_map can be used to tell that the struct pages
in a subsection have been initialized, since it is only set after
memmap_init_zone_device(). By checking subsection_dev_map we can
avoid accessing pages that are still being initialized.

Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
---
 include/linux/mmzone.h | 19 +++++++++++++++++++
 mm/memremap.c          |  2 ++
 mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)
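
As a rough illustration of how a generic pfn walker could use this
interface (sketch only: walk_pfn_range() and handle_page() are
hypothetical names, pfn_zone_device() is the helper added by this
patch):

/*
 * Sketch, not part of the patch: walk_pfn_range() and handle_page()
 * are made-up names used only for illustration.
 */
static void walk_pfn_range(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn;

        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                struct page *page = pfn_to_online_page(pfn);

                /*
                 * pfn_zone_device() returns true only after the
                 * subsection bit was set, i.e. after
                 * memmap_init_zone_device() completed, so the
                 * struct page is safe to dereference.
                 */
                if (!page && pfn_zone_device(pfn))
                        page = pfn_to_page(pfn);
                if (!page)
                        continue;
                handle_page(page);
        }
}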

Comments

Dan Williams Nov. 8, 2019, 7:13 p.m. UTC | #1
On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
<t-fukasawa@vx.jp.nec.com> wrote:
>
> Currently, there is no way to identify pfn on ZONE_DEVICE.
> Identifying pfn on system memory can be done by using a
> section-level flag. On the other hand, identifying pfn on
> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
> can be created in units of subsections.
>
> This patch introduces a new bitmap subsection_dev_map so that
> we can identify pfn on ZONE_DEVICE.
>
> Also, subsection_dev_map is used to prove that struct pages
> included in the subsection have been initialized since it is
> set after memmap_init_zone_device(). We can avoid accessing
> pages currently being initialized by checking subsection_dev_map.
>
> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
> ---
>  include/linux/mmzone.h | 19 +++++++++++++++++++
>  mm/memremap.c          |  2 ++
>  mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
>  3 files changed, 53 insertions(+)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index bda2028..11376c4 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
>
>  struct mem_section_usage {
>         DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
> +#ifdef CONFIG_ZONE_DEVICE
> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
> +#endif

Hi Toshiki,

There is currently an effort to remove the PageReserved() flag as some
code is using that to detect ZONE_DEVICE. In reviewing those patches
we realized that what many code paths want is to detect online memory.
So instead of a subsection_dev_map, add a subsection_online_map. That
way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
otherwise question the use case for pfn walkers to return pages for
ZONE_DEVICE ranges; I think the skip behavior when pfn_to_online_page()
== false is the right behavior.
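
A rough sketch of that direction, purely for illustration
(subsection_online_map is a proposed field and does not exist yet;
today's pfn_to_online_page() works at section granularity):

/* Sketch only: subsection_online_map is a proposal, not existing code. */
static inline struct page *pfn_to_online_page(unsigned long pfn)
{
        struct mem_section *ms;

        if (!pfn_valid(pfn))
                return NULL;
        ms = __pfn_to_section(pfn);
        if (!ms->usage ||
            !test_bit(subsection_map_index(pfn), ms->usage->subsection_online_map))
                return NULL;
        return pfn_to_page(pfn);
}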


>         /* See declaration of similar field in struct zone */
>         unsigned long pageblock_flags[0];
>  };
>
>  void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
> +#ifdef CONFIG_ZONE_DEVICE
> +void subsections_mark_device(unsigned long start_pfn, unsigned long size);
> +#endif
>
>  struct page;
>  struct page_ext;
> @@ -1367,6 +1373,19 @@ static inline int pfn_present(unsigned long pfn)
>         return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
>  }
>
> +static inline int pfn_zone_device(unsigned long pfn)
> +{
> +#ifdef CONFIG_ZONE_DEVICE
> +       if (pfn_valid(pfn)) {
> +               struct mem_section *ms = __pfn_to_section(pfn);
> +               int idx = subsection_map_index(pfn);
> +
> +               return test_bit(idx, ms->usage->subsection_dev_map);
> +       }
> +#endif
> +       return 0;
> +}
> +
>  /*
>   * These are _only_ used during initialisation, therefore they
>   * can use __initdata ...  They could have names to indicate
> diff --git a/mm/memremap.c b/mm/memremap.c
> index 03ccbdf..8a97fd4 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -303,6 +303,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
>         memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
>                                 PHYS_PFN(res->start),
>                                 PHYS_PFN(resource_size(res)), pgmap);
> +       subsections_mark_device(PHYS_PFN(res->start),
> +                               PHYS_PFN(resource_size(res)));
>         percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
>         return __va(res->start);
>
> diff --git a/mm/sparse.c b/mm/sparse.c
> index f6891c1..a3fc9e0a 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -603,6 +603,31 @@ void __init sparse_init(void)
>         vmemmap_populate_print_last();
>  }
>
> +#ifdef CONFIG_ZONE_DEVICE
> +void subsections_mark_device(unsigned long start_pfn, unsigned long size)
> +{
> +       struct mem_section *ms;
> +       unsigned long *dev_map;
> +       unsigned long sec, start_sec, end_sec, pfns;
> +
> +       start_sec = pfn_to_section_nr(start_pfn);
> +       end_sec = pfn_to_section_nr(start_pfn + size - 1);
> +       for (sec = start_sec; sec <= end_sec;
> +            sec++, start_pfn += pfns, size -= pfns) {
> +               pfns = min(size, PAGES_PER_SECTION
> +                       - (start_pfn & ~PAGE_SECTION_MASK));
> +               if (WARN_ON(!valid_section_nr(sec)))
> +                       continue;
> +               ms = __pfn_to_section(start_pfn);
> +               if (!ms->usage)
> +                       continue;
> +
> +               dev_map = &ms->usage->subsection_dev_map[0];
> +               subsection_mask_set(dev_map, start_pfn, pfns);
> +       }
> +}
> +#endif
> +
>  #ifdef CONFIG_MEMORY_HOTPLUG
>
>  /* Mark all memory sections within the pfn range as online */
> @@ -782,7 +807,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
>                 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
>                 ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
>         }
> +#ifdef CONFIG_ZONE_DEVICE
> +       /* deactivation of a partial section on ZONE_DEVICE */
> +       if (ms->usage) {
> +               unsigned long *dev_map = &ms->usage->subsection_dev_map[0];
>
> +               bitmap_andnot(dev_map, dev_map, map, SUBSECTIONS_PER_SECTION);
> +       }
> +#endif
>         if (section_is_early && memmap)
>                 free_map_bootmem(memmap);
>         else
> --
> 1.8.3.1
>
David Hildenbrand Nov. 13, 2019, 6:51 p.m. UTC | #2
On 08.11.19 20:13, Dan Williams wrote:
> On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
> <t-fukasawa@vx.jp.nec.com> wrote:
>>
>> Currently, there is no way to identify pfn on ZONE_DEVICE.
>> Identifying pfn on system memory can be done by using a
>> section-level flag. On the other hand, identifying pfn on
>> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
>> can be created in units of subsections.
>>
>> This patch introduces a new bitmap subsection_dev_map so that
>> we can identify pfn on ZONE_DEVICE.
>>
>> Also, subsection_dev_map is used to prove that struct pages
>> included in the subsection have been initialized since it is
>> set after memmap_init_zone_device(). We can avoid accessing
>> pages currently being initialized by checking subsection_dev_map.
>>
>> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
>> ---
>>   include/linux/mmzone.h | 19 +++++++++++++++++++
>>   mm/memremap.c          |  2 ++
>>   mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
>>   3 files changed, 53 insertions(+)
>>
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index bda2028..11376c4 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
>>
>>   struct mem_section_usage {
>>          DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
>> +#ifdef CONFIG_ZONE_DEVICE
>> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
>> +#endif
> 
> Hi Toshiki,
> 
> There is currently an effort to remove the PageReserved() flag as some
> code is using that to detect ZONE_DEVICE. In reviewing those patches
> we realized that what many code paths want is to detect online memory.
> So instead of a subsection_dev_map add a subsection_online_map. That
> way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
> otherwise question the use case for pfn_walkers to return pages for
> ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
> == false is the right behavior.

To be more precise, I recommended a subsection_active_map to indicate 
which memmaps were fully initialized and can safely be touched (e.g., to 
read the zone/nid). This map would also be set when the devmem memmaps 
were initialized (race between adding memory/growing the section and 
initializing the memmap).

See

https://lkml.org/lkml/2019/10/10/87

and

https://www.spinics.net/lists/linux-driver-devel/msg130012.html

I dislike a map that is specific to ZONE_DEVICE or (currently) 
!ZONE_DEVICE. I'd rather have an indication "this memmap is safe to 
touch". As discussed along the mentioned threads, we can combine this 
later with RCU to handle some races that are currently possible.
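
A rough sketch of that idea (subsection_active_map is a proposal, not
existing code; the bit would be set once the memmap of a subsection is
fully initialized, whether the memory is online or ZONE_DEVICE):

/* Sketch only: subsection_active_map is a proposal, not existing code. */
static inline bool pfn_active(unsigned long pfn)
{
        struct mem_section *ms;

        if (!pfn_valid(pfn))
                return false;
        ms = __pfn_to_section(pfn);
        return ms->usage &&
               test_bit(subsection_map_index(pfn), ms->usage->subsection_active_map);
}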
Dan Williams Nov. 13, 2019, 7:06 p.m. UTC | #3
On Wed, Nov 13, 2019 at 10:51 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 08.11.19 20:13, Dan Williams wrote:
> > On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
> > <t-fukasawa@vx.jp.nec.com> wrote:
> >>
> >> Currently, there is no way to identify pfn on ZONE_DEVICE.
> >> Identifying pfn on system memory can be done by using a
> >> section-level flag. On the other hand, identifying pfn on
> >> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
> >> can be created in units of subsections.
> >>
> >> This patch introduces a new bitmap subsection_dev_map so that
> >> we can identify pfn on ZONE_DEVICE.
> >>
> >> Also, subsection_dev_map is used to prove that struct pages
> >> included in the subsection have been initialized since it is
> >> set after memmap_init_zone_device(). We can avoid accessing
> >> pages currently being initialized by checking subsection_dev_map.
> >>
> >> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
> >> ---
> >>   include/linux/mmzone.h | 19 +++++++++++++++++++
> >>   mm/memremap.c          |  2 ++
> >>   mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
> >>   3 files changed, 53 insertions(+)
> >>
> >> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> >> index bda2028..11376c4 100644
> >> --- a/include/linux/mmzone.h
> >> +++ b/include/linux/mmzone.h
> >> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
> >>
> >>   struct mem_section_usage {
> >>          DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
> >> +#ifdef CONFIG_ZONE_DEVICE
> >> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
> >> +#endif
> >
> > Hi Toshiki,
> >
> > There is currently an effort to remove the PageReserved() flag as some
> > code is using that to detect ZONE_DEVICE. In reviewing those patches
> > we realized that what many code paths want is to detect online memory.
> > So instead of a subsection_dev_map add a subsection_online_map. That
> > way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
> > otherwise question the use case for pfn_walkers to return pages for
> > ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
> > == false is the right behavior.
>
> To be more precise, I recommended an subsection_active_map, to indicate
> which memmaps were fully initialized and can safely be touched (e.g., to
> read the zone/nid). This map would also be set when the devmem memmaps
> were initialized (race between adding memory/growing the section and
> initializing the memmap).
>
> See
>
> https://lkml.org/lkml/2019/10/10/87
>
> and
>
> https://www.spinics.net/lists/linux-driver-devel/msg130012.html

I'm still struggling to understand the motivation of distinguishing
"active" as something distinct from "online". As long as the "online"
granularity is improved from sections down to subsections then most
code paths are good to go. The others can use get_dev_pagemap() to
check for ZONE_DEVICE in a race free manner as they currently do.
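
The race-free check referred to here is get_dev_pagemap(), which pins
the dev_pagemap while the caller inspects the pages; a rough sketch,
with pfn_is_zone_device() as a made-up name for illustration:

/* Sketch only: pfn_is_zone_device() is a made-up helper for illustration. */
static bool pfn_is_zone_device(unsigned long pfn)
{
        struct dev_pagemap *pgmap;

        pgmap = get_dev_pagemap(pfn, NULL);
        if (!pgmap)
                return false;
        /* pfn_to_page(pfn) can be inspected while the reference is held */
        put_dev_pagemap(pgmap);
        return true;
}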

> I dislike a map that is specific to ZONE_DEVICE or (currently)
> !ZONE_DEVICE. I rather want an indication "this memmap is safe to
> touch". As discussed along the mentioned threads, we can combine this
> later with RCU to handle some races that are currently possible.

The rcu protection is independent of the pfn_active vs pfn_online
distinction afaics.
David Hildenbrand Nov. 13, 2019, 7:53 p.m. UTC | #4
> Am 13.11.2019 um 20:06 schrieb Dan Williams <dan.j.williams@intel.com>:
> 
> On Wed, Nov 13, 2019 at 10:51 AM David Hildenbrand <david@redhat.com> wrote:
>> 
>>> On 08.11.19 20:13, Dan Williams wrote:
>>> On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
>>> <t-fukasawa@vx.jp.nec.com> wrote:
>>>> 
>>>> Currently, there is no way to identify pfn on ZONE_DEVICE.
>>>> Identifying pfn on system memory can be done by using a
>>>> section-level flag. On the other hand, identifying pfn on
>>>> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
>>>> can be created in units of subsections.
>>>> 
>>>> This patch introduces a new bitmap subsection_dev_map so that
>>>> we can identify pfn on ZONE_DEVICE.
>>>> 
>>>> Also, subsection_dev_map is used to prove that struct pages
>>>> included in the subsection have been initialized since it is
>>>> set after memmap_init_zone_device(). We can avoid accessing
>>>> pages currently being initialized by checking subsection_dev_map.
>>>> 
>>>> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
>>>> ---
>>>>  include/linux/mmzone.h | 19 +++++++++++++++++++
>>>>  mm/memremap.c          |  2 ++
>>>>  mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
>>>>  3 files changed, 53 insertions(+)
>>>> 
>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>> index bda2028..11376c4 100644
>>>> --- a/include/linux/mmzone.h
>>>> +++ b/include/linux/mmzone.h
>>>> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
>>>> 
>>>>  struct mem_section_usage {
>>>>         DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
>>>> +#ifdef CONFIG_ZONE_DEVICE
>>>> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
>>>> +#endif
>>> 
>>> Hi Toshiki,
>>> 
>>> There is currently an effort to remove the PageReserved() flag as some
>>> code is using that to detect ZONE_DEVICE. In reviewing those patches
>>> we realized that what many code paths want is to detect online memory.
>>> So instead of a subsection_dev_map add a subsection_online_map. That
>>> way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
>>> otherwise question the use case for pfn_walkers to return pages for
>>> ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
>>> == false is the right behavior.
>> 
>> To be more precise, I recommended an subsection_active_map, to indicate
>> which memmaps were fully initialized and can safely be touched (e.g., to
>> read the zone/nid). This map would also be set when the devmem memmaps
>> were initialized (race between adding memory/growing the section and
>> initializing the memmap).
>> 
>> See
>> 
>> https://lkml.org/lkml/2019/10/10/87
>> 
>> and
>> 
>> https://www.spinics.net/lists/linux-driver-devel/msg130012.html
> 
> I'm still struggling to understand the motivation of distinguishing
> "active" as something distinct from "online". As long as the "online"
> granularity is improved from sections down to subsections then most
> code paths are good to go. The others can use get_devpagemap() to
> check for ZONE_DEVICE in a race free manner as they currently do.

I thought we wanted to unify access if we don’t really care about the zone, as in most pfn walkers, e.g. for zone shrinking. Anyhow, a subsection online map would be a good start; we can reuse that later for ZONE_DEVICE as well.

> 
>> I dislike a map that is specific to ZONE_DEVICE or (currently)
>> !ZONE_DEVICE. I rather want an indication "this memmap is safe to
>> touch". As discussed along the mentioned threads, we can combine this
>> later with RCU to handle some races that are currently possible.
> 
> The rcu protection is independent of the pfn_active vs pfn_online
> distinction afaics.

It’s one part of the bigger picture IMHO.

>
Dan Williams Nov. 13, 2019, 8:10 p.m. UTC | #5
On Wed, Nov 13, 2019 at 11:53 AM David Hildenbrand <david@redhat.com> wrote:
>
>
>
> > Am 13.11.2019 um 20:06 schrieb Dan Williams <dan.j.williams@intel.com>:
> >
> > On Wed, Nov 13, 2019 at 10:51 AM David Hildenbrand <david@redhat.com> wrote:
> >>
> >>> On 08.11.19 20:13, Dan Williams wrote:
> >>> On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
> >>> <t-fukasawa@vx.jp.nec.com> wrote:
> >>>>
> >>>> Currently, there is no way to identify pfn on ZONE_DEVICE.
> >>>> Identifying pfn on system memory can be done by using a
> >>>> section-level flag. On the other hand, identifying pfn on
> >>>> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
> >>>> can be created in units of subsections.
> >>>>
> >>>> This patch introduces a new bitmap subsection_dev_map so that
> >>>> we can identify pfn on ZONE_DEVICE.
> >>>>
> >>>> Also, subsection_dev_map is used to prove that struct pages
> >>>> included in the subsection have been initialized since it is
> >>>> set after memmap_init_zone_device(). We can avoid accessing
> >>>> pages currently being initialized by checking subsection_dev_map.
> >>>>
> >>>> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
> >>>> ---
> >>>>  include/linux/mmzone.h | 19 +++++++++++++++++++
> >>>>  mm/memremap.c          |  2 ++
> >>>>  mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
> >>>>  3 files changed, 53 insertions(+)
> >>>>
> >>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> >>>> index bda2028..11376c4 100644
> >>>> --- a/include/linux/mmzone.h
> >>>> +++ b/include/linux/mmzone.h
> >>>> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
> >>>>
> >>>>  struct mem_section_usage {
> >>>>         DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
> >>>> +#ifdef CONFIG_ZONE_DEVICE
> >>>> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
> >>>> +#endif
> >>>
> >>> Hi Toshiki,
> >>>
> >>> There is currently an effort to remove the PageReserved() flag as some
> >>> code is using that to detect ZONE_DEVICE. In reviewing those patches
> >>> we realized that what many code paths want is to detect online memory.
> >>> So instead of a subsection_dev_map add a subsection_online_map. That
> >>> way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
> >>> otherwise question the use case for pfn_walkers to return pages for
> >>> ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
> >>> == false is the right behavior.
> >>
> >> To be more precise, I recommended an subsection_active_map, to indicate
> >> which memmaps were fully initialized and can safely be touched (e.g., to
> >> read the zone/nid). This map would also be set when the devmem memmaps
> >> were initialized (race between adding memory/growing the section and
> >> initializing the memmap).
> >>
> >> See
> >>
> >> https://lkml.org/lkml/2019/10/10/87
> >>
> >> and
> >>
> >> https://www.spinics.net/lists/linux-driver-devel/msg130012.html
> >
> > I'm still struggling to understand the motivation of distinguishing
> > "active" as something distinct from "online". As long as the "online"
> > granularity is improved from sections down to subsections then most
> > code paths are good to go. The others can use get_devpagemap() to
> > check for ZONE_DEVICE in a race free manner as they currently do.
>
> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.

Agree, when the zone does not matter, which is most cases, then
pfn_online() and pfn_valid() are sufficient.

> Anyhow, a subsection online map would be a good start, we can reuse that later for ZONE_DEVICE as well.

Cool, good to go with me sending a patch to introduce pfn_online() and
a corresponding subsection_map for the same?
David Hildenbrand Nov. 13, 2019, 8:23 p.m. UTC | #6
> Am 13.11.2019 um 21:10 schrieb Dan Williams <dan.j.williams@intel.com>:
> 
> On Wed, Nov 13, 2019 at 11:53 AM David Hildenbrand <david@redhat.com> wrote:
>> 
>> 
>> 
>>>> Am 13.11.2019 um 20:06 schrieb Dan Williams <dan.j.williams@intel.com>:
>>> 
>>> On Wed, Nov 13, 2019 at 10:51 AM David Hildenbrand <david@redhat.com> wrote:
>>>> 
>>>>> On 08.11.19 20:13, Dan Williams wrote:
>>>>> On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
>>>>> <t-fukasawa@vx.jp.nec.com> wrote:
>>>>>> 
>>>>>> Currently, there is no way to identify pfn on ZONE_DEVICE.
>>>>>> Identifying pfn on system memory can be done by using a
>>>>>> section-level flag. On the other hand, identifying pfn on
>>>>>> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
>>>>>> can be created in units of subsections.
>>>>>> 
>>>>>> This patch introduces a new bitmap subsection_dev_map so that
>>>>>> we can identify pfn on ZONE_DEVICE.
>>>>>> 
>>>>>> Also, subsection_dev_map is used to prove that struct pages
>>>>>> included in the subsection have been initialized since it is
>>>>>> set after memmap_init_zone_device(). We can avoid accessing
>>>>>> pages currently being initialized by checking subsection_dev_map.
>>>>>> 
>>>>>> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
>>>>>> ---
>>>>>> include/linux/mmzone.h | 19 +++++++++++++++++++
>>>>>> mm/memremap.c          |  2 ++
>>>>>> mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
>>>>>> 3 files changed, 53 insertions(+)
>>>>>> 
>>>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>>>> index bda2028..11376c4 100644
>>>>>> --- a/include/linux/mmzone.h
>>>>>> +++ b/include/linux/mmzone.h
>>>>>> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
>>>>>> 
>>>>>> struct mem_section_usage {
>>>>>>        DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
>>>>>> +#ifdef CONFIG_ZONE_DEVICE
>>>>>> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
>>>>>> +#endif
>>>>> 
>>>>> Hi Toshiki,
>>>>> 
>>>>> There is currently an effort to remove the PageReserved() flag as some
>>>>> code is using that to detect ZONE_DEVICE. In reviewing those patches
>>>>> we realized that what many code paths want is to detect online memory.
>>>>> So instead of a subsection_dev_map add a subsection_online_map. That
>>>>> way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
>>>>> otherwise question the use case for pfn_walkers to return pages for
>>>>> ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
>>>>> == false is the right behavior.
>>>> 
>>>> To be more precise, I recommended an subsection_active_map, to indicate
>>>> which memmaps were fully initialized and can safely be touched (e.g., to
>>>> read the zone/nid). This map would also be set when the devmem memmaps
>>>> were initialized (race between adding memory/growing the section and
>>>> initializing the memmap).
>>>> 
>>>> See
>>>> 
>>>> https://lkml.org/lkml/2019/10/10/87
>>>> 
>>>> and
>>>> 
>>>> https://www.spinics.net/lists/linux-driver-devel/msg130012.html
>>> 
>>> I'm still struggling to understand the motivation of distinguishing
>>> "active" as something distinct from "online". As long as the "online"
>>> granularity is improved from sections down to subsections then most
>>> code paths are good to go. The others can use get_devpagemap() to
>>> check for ZONE_DEVICE in a race free manner as they currently do.
>> 
>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
> 
> Agree, when the zone does not matter, which is most cases, then
> pfn_online() and pfn_valid() are sufficient.
> 
>> Anyhow, a subsection online map would be a good start, we can reuse that later for ZONE_DEVICE as well.
> 
> Cool, good to go with me sending a patch to introduce pfn_online() and
> a corresponding subsection_map for the same?

Yeah, let's see how this turns out and if we're on the same page. Thanks!

>
David Hildenbrand Nov. 13, 2019, 8:40 p.m. UTC | #7
> Am 13.11.2019 um 21:23 schrieb David Hildenbrand <david@redhat.com>:
> 
> 
> 
>>> Am 13.11.2019 um 21:10 schrieb Dan Williams <dan.j.williams@intel.com>:
>>> 
>>> On Wed, Nov 13, 2019 at 11:53 AM David Hildenbrand <david@redhat.com> wrote:
>>> 
>>> 
>>> 
>>>>> Am 13.11.2019 um 20:06 schrieb Dan Williams <dan.j.williams@intel.com>:
>>>> 
>>>> On Wed, Nov 13, 2019 at 10:51 AM David Hildenbrand <david@redhat.com> wrote:
>>>>> 
>>>>>> On 08.11.19 20:13, Dan Williams wrote:
>>>>>> On Thu, Nov 7, 2019 at 4:15 PM Toshiki Fukasawa
>>>>>> <t-fukasawa@vx.jp.nec.com> wrote:
>>>>>>> 
>>>>>>> Currently, there is no way to identify pfn on ZONE_DEVICE.
>>>>>>> Identifying pfn on system memory can be done by using a
>>>>>>> section-level flag. On the other hand, identifying pfn on
>>>>>>> ZONE_DEVICE requires a subsection-level flag since ZONE_DEVICE
>>>>>>> can be created in units of subsections.
>>>>>>> 
>>>>>>> This patch introduces a new bitmap subsection_dev_map so that
>>>>>>> we can identify pfn on ZONE_DEVICE.
>>>>>>> 
>>>>>>> Also, subsection_dev_map is used to prove that struct pages
>>>>>>> included in the subsection have been initialized since it is
>>>>>>> set after memmap_init_zone_device(). We can avoid accessing
>>>>>>> pages currently being initialized by checking subsection_dev_map.
>>>>>>> 
>>>>>>> Signed-off-by: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
>>>>>>> ---
>>>>>>> include/linux/mmzone.h | 19 +++++++++++++++++++
>>>>>>> mm/memremap.c          |  2 ++
>>>>>>> mm/sparse.c            | 32 ++++++++++++++++++++++++++++++++
>>>>>>> 3 files changed, 53 insertions(+)
>>>>>>> 
>>>>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>>>>> index bda2028..11376c4 100644
>>>>>>> --- a/include/linux/mmzone.h
>>>>>>> +++ b/include/linux/mmzone.h
>>>>>>> @@ -1174,11 +1174,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
>>>>>>> 
>>>>>>> struct mem_section_usage {
>>>>>>>       DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
>>>>>>> +#ifdef CONFIG_ZONE_DEVICE
>>>>>>> +       DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
>>>>>>> +#endif
>>>>>> 
>>>>>> Hi Toshiki,
>>>>>> 
>>>>>> There is currently an effort to remove the PageReserved() flag as some
>>>>>> code is using that to detect ZONE_DEVICE. In reviewing those patches
>>>>>> we realized that what many code paths want is to detect online memory.
>>>>>> So instead of a subsection_dev_map add a subsection_online_map. That
>>>>>> way pfn_to_online_page() can reliably avoid ZONE_DEVICE ranges. I
>>>>>> otherwise question the use case for pfn_walkers to return pages for
>>>>>> ZONE_DEVICE pages, I think the skip behavior when pfn_to_online_page()
>>>>>> == false is the right behavior.
>>>>> 
>>>>> To be more precise, I recommended an subsection_active_map, to indicate
>>>>> which memmaps were fully initialized and can safely be touched (e.g., to
>>>>> read the zone/nid). This map would also be set when the devmem memmaps
>>>>> were initialized (race between adding memory/growing the section and
>>>>> initializing the memmap).
>>>>> 
>>>>> See
>>>>> 
>>>>> https://lkml.org/lkml/2019/10/10/87
>>>>> 
>>>>> and
>>>>> 
>>>>> https://www.spinics.net/lists/linux-driver-devel/msg130012.html
>>>> 
>>>> I'm still struggling to understand the motivation of distinguishing
>>>> "active" as something distinct from "online". As long as the "online"
>>>> granularity is improved from sections down to subsections then most
>>>> code paths are good to go. The others can use get_devpagemap() to
>>>> check for ZONE_DEVICE in a race free manner as they currently do.
>>> 
>>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
>> 
>> Agree, when the zone does not matter, which is most cases, then
>> pfn_online() and pfn_valid() are sufficient.

Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That's why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That's important for generic PFN walkers.

>> 
>>> Anyhow, a subsection online map would be a good start, we can reuse that later for ZONE_DEVICE as well.
>> 
>> Cool, good to go with me sending a patch to introduce pfn_online() and
>> a corresponding subsection_map for the same?
> 
> Yeah, let‘s see how this turns out and if we‘re on the same page. Thanks!
> 
>> 
>
Dan Williams Nov. 13, 2019, 9:11 p.m. UTC | #8
On Wed, Nov 13, 2019 at 12:40 PM David Hildenbrand <david@redhat.com> wrote:
[..]
> >>>> I'm still struggling to understand the motivation of distinguishing
> >>>> "active" as something distinct from "online". As long as the "online"
> >>>> granularity is improved from sections down to subsections then most
> >>>> code paths are good to go. The others can use get_devpagemap() to
> >>>> check for ZONE_DEVICE in a race free manner as they currently do.
> >>>
> >>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
> >>
> >> Agree, when the zone does not matter, which is most cases, then
> >> pfn_online() and pfn_valid() are sufficient.
>
> Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That‘s why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That‘s important for generic PFN walkers.

That's what I was debating with Toshiki [1], whether there is a real
example of needing to distinguish ZONE_DEVICE from offline memory in a
pfn walker. The proposed use case in this patch set of being able to
set hwpoison on ZONE_DEVICE pages does not seem like a good idea to
me. My suspicion is that this is a common theme and others are looking
to do these types page manipulations that only make sense for online
memory. If that is the case then treating ZONE_DEVICE as offline seems
the right direction.

[1]: https://lore.kernel.org/linux-mm/CAPcyv4joVDwiL21PPyJ7E_mMFR2SL3QTi09VMtfxb_W+-1p8vQ@mail.gmail.com/
David Hildenbrand Nov. 13, 2019, 9:22 p.m. UTC | #9
> Am 13.11.2019 um 22:12 schrieb Dan Williams <dan.j.williams@intel.com>:
> 
> On Wed, Nov 13, 2019 at 12:40 PM David Hildenbrand <david@redhat.com> wrote:
> [..]
>>>>>> I'm still struggling to understand the motivation of distinguishing
>>>>>> "active" as something distinct from "online". As long as the "online"
>>>>>> granularity is improved from sections down to subsections then most
>>>>>> code paths are good to go. The others can use get_devpagemap() to
>>>>>> check for ZONE_DEVICE in a race free manner as they currently do.
>>>>> 
>>>>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
>>>> 
>>>> Agree, when the zone does not matter, which is most cases, then
>>>> pfn_online() and pfn_valid() are sufficient.
>> 
>> Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That‘s why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That‘s important for generic PFN walkers.
> 
> That's what I was debating with Toshiki [1], whether there is a real
> example of needing to distinguish ZONE_DEVICE from offline memory in a
> pfn walker. The proposed use case in this patch set of being able to
> set hwpoison on ZONE_DEVICE pages does not seem like a good idea to
> me. My suspicion is that this is a common theme and others are looking
> to do these types page manipulations that only make sense for online
> memory. If that is the case then treating ZONE_DEVICE as offline seems
> the right direction.

Right. At least it would be nice to have for zone shrinking - not sure about the other walkers. We would have to special-case ZONE_DEVICE handling there.

But as I said, a subsection map for online memory is a good start, especially to fix pfn_to_online_page(). Also, I think this might be a very good thing to have for Oscar's memmap-on-memory work (I have a plan in my head I can discuss with Oscar once he has time to work on that again).


> 
> [1]: https://lore.kernel.org/linux-mm/CAPcyv4joVDwiL21PPyJ7E_mMFR2SL3QTi09VMtfxb_W+-1p8vQ@mail.gmail.com/
>
Dan Williams Nov. 13, 2019, 9:26 p.m. UTC | #10
On Wed, Nov 13, 2019 at 1:22 PM David Hildenbrand <david@redhat.com> wrote:
>
>
>
> > Am 13.11.2019 um 22:12 schrieb Dan Williams <dan.j.williams@intel.com>:
> >
> > On Wed, Nov 13, 2019 at 12:40 PM David Hildenbrand <david@redhat.com> wrote:
> > [..]
> >>>>>> I'm still struggling to understand the motivation of distinguishing
> >>>>>> "active" as something distinct from "online". As long as the "online"
> >>>>>> granularity is improved from sections down to subsections then most
> >>>>>> code paths are good to go. The others can use get_devpagemap() to
> >>>>>> check for ZONE_DEVICE in a race free manner as they currently do.
> >>>>>
> >>>>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
> >>>>
> >>>> Agree, when the zone does not matter, which is most cases, then
> >>>> pfn_online() and pfn_valid() are sufficient.
> >>
> >> Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That‘s why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That‘s important for generic PFN walkers.
> >
> > That's what I was debating with Toshiki [1], whether there is a real
> > example of needing to distinguish ZONE_DEVICE from offline memory in a
> > pfn walker. The proposed use case in this patch set of being able to
> > set hwpoison on ZONE_DEVICE pages does not seem like a good idea to
> > me. My suspicion is that this is a common theme and others are looking
> > to do these types page manipulations that only make sense for online
> > memory. If that is the case then treating ZONE_DEVICE as offline seems
> > the right direction.
>
> Right. At least it would be nice to have for zone shrinking - not sure about the other walkers. We would have to special-case ZONE_DEVICE handling there.
>

I think that's ok... It's already zone aware code whereas pfn walkers
are zone unaware and should stay that way if at all possible.

> But as I said, a subsection map for online memory is a good start, especially to fix pfn_to_online_page(). Also, I think this might be a very good thing to have for Oscars memmap-on-memory work (I have a plan in my head I can discuss with Oscar once he has time to work on that again).

Ok, I'll keep an eye out.
Toshiki Fukasawa Nov. 14, 2019, 11:36 p.m. UTC | #11
On 2019/11/14 6:26, Dan Williams wrote:
> On Wed, Nov 13, 2019 at 1:22 PM David Hildenbrand <david@redhat.com> wrote:
>>
>>
>>
>>> Am 13.11.2019 um 22:12 schrieb Dan Williams <dan.j.williams@intel.com>:
>>>
>>> On Wed, Nov 13, 2019 at 12:40 PM David Hildenbrand <david@redhat.com> wrote:
>>> [..]
>>>>>>>> I'm still struggling to understand the motivation of distinguishing
>>>>>>>> "active" as something distinct from "online". As long as the "online"
>>>>>>>> granularity is improved from sections down to subsections then most
>>>>>>>> code paths are good to go. The others can use get_devpagemap() to
>>>>>>>> check for ZONE_DEVICE in a race free manner as they currently do.
>>>>>>>
>>>>>>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
>>>>>>
>>>>>> Agree, when the zone does not matter, which is most cases, then
>>>>>> pfn_online() and pfn_valid() are sufficient.
>>>>
>>>> Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That‘s why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That‘s important for generic PFN walkers.
>>>
>>> That's what I was debating with Toshiki [1], whether there is a real
>>> example of needing to distinguish ZONE_DEVICE from offline memory in a
>>> pfn walker. The proposed use case in this patch set of being able to
>>> set hwpoison on ZONE_DEVICE pages does not seem like a good idea to
>>> me. My suspicion is that this is a common theme and others are looking
>>> to do these types page manipulations that only make sense for online
>>> memory. If that is the case then treating ZONE_DEVICE as offline seems
>>> the right direction.
>>
>> Right. At least it would be nice to have for zone shrinking - not sure about the other walkers. We would have to special-case ZONE_DEVICE handling there.
>>
> 
> I think that's ok... It's already zone aware code whereas pfn walkers
> are zone unaware and should stay that way if at all possible.
> 
>> But as I said, a subsection map for online memory is a good start, especially to fix pfn_to_online_page(). Also, I think this might be a very good thing to have for Oscars memmap-on-memory work (I have a plan in my head I can discuss with Oscar once he has time to work on that again).
> 
> Ok, I'll keep an eye out.

I understand your point. Thanks!

By the way, I found another problem with ZONE_DEVICE, which
is a race between memmap initialization and zone shrinking.

Iterating namespace creation and destruction causes the panic below:

[   41.207694] kernel BUG at mm/page_alloc.c:535!
[   41.208109] invalid opcode: 0000 [#1] SMP PTI
[   41.208508] CPU: 7 PID: 2766 Comm: ndctl Not tainted 5.4.0-rc4 #6
[   41.209064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014
[   41.210175] RIP: 0010:set_pfnblock_flags_mask+0x95/0xf0
[   41.210643] Code: 04 41 83 e2 3c 48 8d 04 a8 48 c1 e0 07 48 03 04 dd e0 59 55 bb 48 8b 58 68 48 39 da 73 0e 48 c7 c6 70 ac 11 bb e8 1b b2 fd ff <0f> 0b 48 03 58 78 48 39 da 73 e9 49 01 ca b9 3f 00 00 00 4f 8d 0c
[   41.212354] RSP: 0018:ffffac0d41557c80 EFLAGS: 00010246
[   41.212821] RAX: 000000000000004a RBX: 0000000000244a00 RCX: 0000000000000000
[   41.213459] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffbb1197dc
[   41.214100] RBP: 000000000000000c R08: 0000000000000439 R09: 0000000000000059
[   41.214736] R10: 0000000000000000 R11: ffffac0d41557b08 R12: ffff8be475ea72b0
[   41.215376] R13: 000000000000fa00 R14: 0000000000250000 R15: 00000000fffc0bb5
[   41.216008] FS:  00007f30862ab600(0000) GS:ffff8be57bc40000(0000) knlGS:0000000000000000
[   41.216771] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   41.217299] CR2: 000055e824d0d508 CR3: 0000000231dac000 CR4: 00000000000006e0
[   41.217934] Call Trace:
[   41.218225]  memmap_init_zone_device+0x165/0x17c
[   41.218642]  memremap_pages+0x4c1/0x540
[   41.218989]  devm_memremap_pages+0x1d/0x60
[   41.219367]  pmem_attach_disk+0x16b/0x600 [nd_pmem]
[   41.219804]  ? devm_nsio_enable+0xb8/0xe0
[   41.220172]  nvdimm_bus_probe+0x69/0x1c0
[   41.220526]  really_probe+0x1c2/0x3e0
[   41.220856]  driver_probe_device+0xb4/0x100
[   41.221238]  device_driver_attach+0x4f/0x60
[   41.221611]  bind_store+0xc9/0x110
[   41.221919]  kernfs_fop_write+0x116/0x190
[   41.222326]  vfs_write+0xa5/0x1a0
[   41.222626]  ksys_write+0x59/0xd0
[   41.222927]  do_syscall_64+0x5b/0x180
[   41.223264]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   41.223714] RIP: 0033:0x7f30865d0ed8
[   41.224037] Code: 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 45 78 0d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 49 89 d4 55
[   41.225920] RSP: 002b:00007fffe5d30a78 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[   41.226608] RAX: ffffffffffffffda RBX: 000055e824d07f40 RCX: 00007f30865d0ed8
[   41.227242] RDX: 0000000000000007 RSI: 000055e824d07f40 RDI: 0000000000000004
[   41.227870] RBP: 0000000000000007 R08: 0000000000000007 R09: 0000000000000006
[   41.228753] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000004
[   41.229419] R13: 00007f30862ab528 R14: 0000000000000001 R15: 000055e824d07f40

While a namespace is being created and its memmap initialized, destroying
a namespace shrinks the zone; the memmap is then initialized outside the
zone and triggers VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page)
in set_pfnblock_flags_mask().

Thanks,
Toshiki Fukasawa
David Hildenbrand Nov. 15, 2019, 12:46 a.m. UTC | #12
> Am 15.11.2019 um 00:42 schrieb Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>:
> 
> On 2019/11/14 6:26, Dan Williams wrote:
>>> On Wed, Nov 13, 2019 at 1:22 PM David Hildenbrand <david@redhat.com> wrote:
>>> 
>>> 
>>> 
>>>> Am 13.11.2019 um 22:12 schrieb Dan Williams <dan.j.williams@intel.com>:
>>>> 
>>>> On Wed, Nov 13, 2019 at 12:40 PM David Hildenbrand <david@redhat.com> wrote:
>>>> [..]
>>>>>>>>> I'm still struggling to understand the motivation of distinguishing
>>>>>>>>> "active" as something distinct from "online". As long as the "online"
>>>>>>>>> granularity is improved from sections down to subsections then most
>>>>>>>>> code paths are good to go. The others can use get_devpagemap() to
>>>>>>>>> check for ZONE_DEVICE in a race free manner as they currently do.
>>>>>>>> 
>>>>>>>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
>>>>>>> 
>>>>>>> Agree, when the zone does not matter, which is most cases, then
>>>>>>> pfn_online() and pfn_valid() are sufficient.
>>>>> 
>>>>> Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That‘s why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That‘s important for generic PFN walkers.
>>>> 
>>>> That's what I was debating with Toshiki [1], whether there is a real
>>>> example of needing to distinguish ZONE_DEVICE from offline memory in a
>>>> pfn walker. The proposed use case in this patch set of being able to
>>>> set hwpoison on ZONE_DEVICE pages does not seem like a good idea to
>>>> me. My suspicion is that this is a common theme and others are looking
>>>> to do these types page manipulations that only make sense for online
>>>> memory. If that is the case then treating ZONE_DEVICE as offline seems
>>>> the right direction.
>>> 
>>> Right. At least it would be nice to have for zone shrinking - not sure about the other walkers. We would have to special-case ZONE_DEVICE handling there.
>>> 
>> 
>> I think that's ok... It's already zone aware code whereas pfn walkers
>> are zone unaware and should stay that way if at all possible.
>> 
>>> But as I said, a subsection map for online memory is a good start, especially to fix pfn_to_online_page(). Also, I think this might be a very good thing to have for Oscars memmap-on-memory work (I have a plan in my head I can discuss with Oscar once he has time to work on that again).
>> 
>> Ok, I'll keep an eye out.
> 
> I understand your point. Thanks!
> 
> By the way, I found another problem about ZONE_DEVICE, which
> is race between memmap initialization and zone shrinking.
> 
> Iteration of create and destroy namespace causes the panic as below:
> 
> [   41.207694] kernel BUG at mm/page_alloc.c:535!
> [   41.208109] invalid opcode: 0000 [#1] SMP PTI
> [   41.208508] CPU: 7 PID: 2766 Comm: ndctl Not tainted 5.4.0-rc4 #6
> [   41.209064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014
> [   41.210175] RIP: 0010:set_pfnblock_flags_mask+0x95/0xf0
> [   41.210643] Code: 04 41 83 e2 3c 48 8d 04 a8 48 c1 e0 07 48 03 04 dd e0 59 55 bb 48 8b 58 68 48 39 da 73 0e 48 c7 c6 70 ac 11 bb e8 1b b2 fd ff <0f> 0b 48 03 58 78 48 39 da 73 e9 49 01 ca b9 3f 00 00 00 4f 8d 0c
> [   41.212354] RSP: 0018:ffffac0d41557c80 EFLAGS: 00010246
> [   41.212821] RAX: 000000000000004a RBX: 0000000000244a00 RCX: 0000000000000000
> [   41.213459] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffbb1197dc
> [   41.214100] RBP: 000000000000000c R08: 0000000000000439 R09: 0000000000000059
> [   41.214736] R10: 0000000000000000 R11: ffffac0d41557b08 R12: ffff8be475ea72b0
> [   41.215376] R13: 000000000000fa00 R14: 0000000000250000 R15: 00000000fffc0bb5
> [   41.216008] FS:  00007f30862ab600(0000) GS:ffff8be57bc40000(0000) knlGS:0000000000000000
> [   41.216771] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   41.217299] CR2: 000055e824d0d508 CR3: 0000000231dac000 CR4: 00000000000006e0
> [   41.217934] Call Trace:
> [   41.218225]  memmap_init_zone_device+0x165/0x17c
> [   41.218642]  memremap_pages+0x4c1/0x540
> [   41.218989]  devm_memremap_pages+0x1d/0x60
> [   41.219367]  pmem_attach_disk+0x16b/0x600 [nd_pmem]
> [   41.219804]  ? devm_nsio_enable+0xb8/0xe0
> [   41.220172]  nvdimm_bus_probe+0x69/0x1c0
> [   41.220526]  really_probe+0x1c2/0x3e0
> [   41.220856]  driver_probe_device+0xb4/0x100
> [   41.221238]  device_driver_attach+0x4f/0x60
> [   41.221611]  bind_store+0xc9/0x110
> [   41.221919]  kernfs_fop_write+0x116/0x190
> [   41.222326]  vfs_write+0xa5/0x1a0
> [   41.222626]  ksys_write+0x59/0xd0
> [   41.222927]  do_syscall_64+0x5b/0x180
> [   41.223264]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   41.223714] RIP: 0033:0x7f30865d0ed8
> [   41.224037] Code: 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 45 78 0d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 49 89 d4 55
> [   41.225920] RSP: 002b:00007fffe5d30a78 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
> [   41.226608] RAX: ffffffffffffffda RBX: 000055e824d07f40 RCX: 00007f30865d0ed8
> [   41.227242] RDX: 0000000000000007 RSI: 000055e824d07f40 RDI: 0000000000000004
> [   41.227870] RBP: 0000000000000007 R08: 0000000000000007 R09: 0000000000000006
> [   41.228753] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000004
> [   41.229419] R13: 00007f30862ab528 R14: 0000000000000001 R15: 000055e824d07f40
> 
> While creating a namespace and initializing memmap, if you destroy the namespace
> and shrink the zone, it will initialize the memmap outside the zone and
> trigger VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page) in
> set_pfnblock_flags_mask().

Does that happen with -next as well? There, we currently don't shrink the ZONE_DEVICE zone anymore.

> 
> Thanks,
> Toshiki Fukasawa
Toshiki Fukasawa Nov. 15, 2019, 2:57 a.m. UTC | #13
On 2019/11/15 9:46, David Hildenbrand wrote:
> 
> 
>> Am 15.11.2019 um 00:42 schrieb Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>:
>>
>> On 2019/11/14 6:26, Dan Williams wrote:
>>>> On Wed, Nov 13, 2019 at 1:22 PM David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>>
>>>>
>>>>> Am 13.11.2019 um 22:12 schrieb Dan Williams <dan.j.williams@intel.com>:
>>>>>
>>>>> On Wed, Nov 13, 2019 at 12:40 PM David Hildenbrand <david@redhat.com> wrote:
>>>>> [..]
>>>>>>>>>> I'm still struggling to understand the motivation of distinguishing
>>>>>>>>>> "active" as something distinct from "online". As long as the "online"
>>>>>>>>>> granularity is improved from sections down to subsections then most
>>>>>>>>>> code paths are good to go. The others can use get_devpagemap() to
>>>>>>>>>> check for ZONE_DEVICE in a race free manner as they currently do.
>>>>>>>>>
>>>>>>>>> I thought we wanted to unify access if we don’t really care about the zone as in most pfn walkers - E.g., for zone shrinking.
>>>>>>>>
>>>>>>>> Agree, when the zone does not matter, which is most cases, then
>>>>>>>> pfn_online() and pfn_valid() are sufficient.
>>>>>>
>>>>>> Oh, and just to clarify why I proposed pfn_active(): The issue right now is that a PFN that is valid but not online could be offline memory (memmap not initialized) or ZONE_DEVICE. That‘s why I wanted to have a way to detect if a memmap was initialized, independent of the zone. That‘s important for generic PFN walkers.
>>>>>
>>>>> That's what I was debating with Toshiki [1], whether there is a real
>>>>> example of needing to distinguish ZONE_DEVICE from offline memory in a
>>>>> pfn walker. The proposed use case in this patch set of being able to
>>>>> set hwpoison on ZONE_DEVICE pages does not seem like a good idea to
>>>>> me. My suspicion is that this is a common theme and others are looking
>>>>> to do these types page manipulations that only make sense for online
>>>>> memory. If that is the case then treating ZONE_DEVICE as offline seems
>>>>> the right direction.
>>>>
>>>> Right. At least it would be nice to have for zone shrinking - not sure about the other walkers. We would have to special-case ZONE_DEVICE handling there.
>>>>
>>>
>>> I think that's ok... It's already zone aware code whereas pfn walkers
>>> are zone unaware and should stay that way if at all possible.
>>>
>>>> But as I said, a subsection map for online memory is a good start, especially to fix pfn_to_online_page(). Also, I think this might be a very good thing to have for Oscars memmap-on-memory work (I have a plan in my head I can discuss with Oscar once he has time to work on that again).
>>>
>>> Ok, I'll keep an eye out.
>>
>> I understand your point. Thanks!
>>
>> By the way, I found another problem about ZONE_DEVICE, which
>> is race between memmap initialization and zone shrinking.
>>
>> Iteration of create and destroy namespace causes the panic as below:
>>
>> [   41.207694] kernel BUG at mm/page_alloc.c:535!
>> [   41.208109] invalid opcode: 0000 [#1] SMP PTI
>> [   41.208508] CPU: 7 PID: 2766 Comm: ndctl Not tainted 5.4.0-rc4 #6
>> [   41.209064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014
>> [   41.210175] RIP: 0010:set_pfnblock_flags_mask+0x95/0xf0
>> [   41.210643] Code: 04 41 83 e2 3c 48 8d 04 a8 48 c1 e0 07 48 03 04 dd e0 59 55 bb 48 8b 58 68 48 39 da 73 0e 48 c7 c6 70 ac 11 bb e8 1b b2 fd ff <0f> 0b 48 03 58 78 48 39 da 73 e9 49 01 ca b9 3f 00 00 00 4f 8d 0c
>> [   41.212354] RSP: 0018:ffffac0d41557c80 EFLAGS: 00010246
>> [   41.212821] RAX: 000000000000004a RBX: 0000000000244a00 RCX: 0000000000000000
>> [   41.213459] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffbb1197dc
>> [   41.214100] RBP: 000000000000000c R08: 0000000000000439 R09: 0000000000000059
>> [   41.214736] R10: 0000000000000000 R11: ffffac0d41557b08 R12: ffff8be475ea72b0
>> [   41.215376] R13: 000000000000fa00 R14: 0000000000250000 R15: 00000000fffc0bb5
>> [   41.216008] FS:  00007f30862ab600(0000) GS:ffff8be57bc40000(0000) knlGS:0000000000000000
>> [   41.216771] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [   41.217299] CR2: 000055e824d0d508 CR3: 0000000231dac000 CR4: 00000000000006e0
>> [   41.217934] Call Trace:
>> [   41.218225]  memmap_init_zone_device+0x165/0x17c
>> [   41.218642]  memremap_pages+0x4c1/0x540
>> [   41.218989]  devm_memremap_pages+0x1d/0x60
>> [   41.219367]  pmem_attach_disk+0x16b/0x600 [nd_pmem]
>> [   41.219804]  ? devm_nsio_enable+0xb8/0xe0
>> [   41.220172]  nvdimm_bus_probe+0x69/0x1c0
>> [   41.220526]  really_probe+0x1c2/0x3e0
>> [   41.220856]  driver_probe_device+0xb4/0x100
>> [   41.221238]  device_driver_attach+0x4f/0x60
>> [   41.221611]  bind_store+0xc9/0x110
>> [   41.221919]  kernfs_fop_write+0x116/0x190
>> [   41.222326]  vfs_write+0xa5/0x1a0
>> [   41.222626]  ksys_write+0x59/0xd0
>> [   41.222927]  do_syscall_64+0x5b/0x180
>> [   41.223264]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>> [   41.223714] RIP: 0033:0x7f30865d0ed8
>> [   41.224037] Code: 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 45 78 0d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 49 89 d4 55
>> [   41.225920] RSP: 002b:00007fffe5d30a78 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
>> [   41.226608] RAX: ffffffffffffffda RBX: 000055e824d07f40 RCX: 00007f30865d0ed8
>> [   41.227242] RDX: 0000000000000007 RSI: 000055e824d07f40 RDI: 0000000000000004
>> [   41.227870] RBP: 0000000000000007 R08: 0000000000000007 R09: 0000000000000006
>> [   41.228753] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000004
>> [   41.229419] R13: 00007f30862ab528 R14: 0000000000000001 R15: 000055e824d07f40
>>
>> While creating a namespace and initializing memmap, if you destroy the namespace
>> and shrink the zone, it will initialize the memmap outside the zone and
>> trigger VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page) in
>> set_pfnblock_flags_mask().
> 
> Does that happen with -next as well? There, we currently don‘t shrink the ZONE_DEVICE zone anymore.

I confirmed the patch; the panic doesn't occur with the linux-next kernel.
https://lore.kernel.org/linux-mm/20191006085646.5768-6-david@redhat.com/

Thank you for your information.

Thanks,
Toshiki Fukasawa

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bda2028..11376c4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1174,11 +1174,17 @@  static inline unsigned long section_nr_to_pfn(unsigned long sec)
 
 struct mem_section_usage {
 	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
+#ifdef CONFIG_ZONE_DEVICE
+	DECLARE_BITMAP(subsection_dev_map, SUBSECTIONS_PER_SECTION);
+#endif
 	/* See declaration of similar field in struct zone */
 	unsigned long pageblock_flags[0];
 };
 
 void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
+#ifdef CONFIG_ZONE_DEVICE
+void subsections_mark_device(unsigned long start_pfn, unsigned long size);
+#endif
 
 struct page;
 struct page_ext;
@@ -1367,6 +1373,19 @@  static inline int pfn_present(unsigned long pfn)
 	return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
 }
 
+static inline int pfn_zone_device(unsigned long pfn)
+{
+#ifdef CONFIG_ZONE_DEVICE
+	if (pfn_valid(pfn)) {
+		struct mem_section *ms = __pfn_to_section(pfn);
+		int idx = subsection_map_index(pfn);
+
+		return test_bit(idx, ms->usage->subsection_dev_map);
+	}
+#endif
+	return 0;
+}
+
 /*
  * These are _only_ used during initialisation, therefore they
  * can use __initdata ...  They could have names to indicate
diff --git a/mm/memremap.c b/mm/memremap.c
index 03ccbdf..8a97fd4 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -303,6 +303,8 @@  void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
 				PHYS_PFN(res->start),
 				PHYS_PFN(resource_size(res)), pgmap);
+	subsections_mark_device(PHYS_PFN(res->start),
+				PHYS_PFN(resource_size(res)));
 	percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
 	return __va(res->start);
 
diff --git a/mm/sparse.c b/mm/sparse.c
index f6891c1..a3fc9e0a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -603,6 +603,31 @@  void __init sparse_init(void)
 	vmemmap_populate_print_last();
 }
 
+#ifdef CONFIG_ZONE_DEVICE
+void subsections_mark_device(unsigned long start_pfn, unsigned long size)
+{
+	struct mem_section *ms;
+	unsigned long *dev_map;
+	unsigned long sec, start_sec, end_sec, pfns;
+
+	start_sec = pfn_to_section_nr(start_pfn);
+	end_sec = pfn_to_section_nr(start_pfn + size - 1);
+	for (sec = start_sec; sec <= end_sec;
+	     sec++, start_pfn += pfns, size -= pfns) {
+		pfns = min(size, PAGES_PER_SECTION
+			- (start_pfn & ~PAGE_SECTION_MASK));
+		if (WARN_ON(!valid_section_nr(sec)))
+			continue;
+		ms = __pfn_to_section(start_pfn);
+		if (!ms->usage)
+			continue;
+
+		dev_map = &ms->usage->subsection_dev_map[0];
+		subsection_mask_set(dev_map, start_pfn, pfns);
+	}
+}
+#endif
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 
 /* Mark all memory sections within the pfn range as online */
@@ -782,7 +807,14 @@  static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
 		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
 		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
 	}
+#ifdef CONFIG_ZONE_DEVICE
+	/* deactivation of a partial section on ZONE_DEVICE */
+	if (ms->usage) {
+		unsigned long *dev_map = &ms->usage->subsection_dev_map[0];
 
+		bitmap_andnot(dev_map, dev_map, map, SUBSECTIONS_PER_SECTION);
+	}
+#endif
 	if (section_is_early && memmap)
 		free_map_bootmem(memmap);
 	else