diff mbox series

[v1,2/2] mm: factor out next_present_section_nr()

Message ID 20200113144035.10848-3-david@redhat.com (mailing list archive)
State New, archived
Headers show
Series mm/page_alloc: memmap_init_zone() cleanups | expand

Commit Message

David Hildenbrand Jan. 13, 2020, 2:40 p.m. UTC
Let's move it to the header and use the shorter variant from
mm/page_alloc.c (the original one will also check
"__highest_present_section_nr + 1", which is not necessary). While at it,
make the section_nr in next_pfn() const.

In next_pfn(), we now return section_nr_to_pfn(-1) instead of -1 once
we exceed __highest_present_section_nr, which doesn't make a difference in
the caller as it is big enough (>= all sane end_pfn).

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 include/linux/mmzone.h | 10 ++++++++++
 mm/page_alloc.c        | 11 ++---------
 mm/sparse.c            | 10 ----------
 3 files changed, 12 insertions(+), 19 deletions(-)

Comments

Kirill A . Shutemov Jan. 13, 2020, 10:41 p.m. UTC | #1
On Mon, Jan 13, 2020 at 03:40:35PM +0100, David Hildenbrand wrote:
> Let's move it to the header and use the shorter variant from
> mm/page_alloc.c (the original one will also check
> "__highest_present_section_nr + 1", which is not necessary). While at it,
> make the section_nr in next_pfn() const.
> 
> In next_pfn(), we now return section_nr_to_pfn(-1) instead of -1 once
> we exceed __highest_present_section_nr, which doesn't make a difference in
> the caller as it is big enough (>= all sane end_pfn).
> 
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Kirill A. Shutemov <kirill@shutemov.name>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  include/linux/mmzone.h | 10 ++++++++++
>  mm/page_alloc.c        | 11 ++---------
>  mm/sparse.c            | 10 ----------
>  3 files changed, 12 insertions(+), 19 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index c2bc309d1634..462f6873905a 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1379,6 +1379,16 @@ static inline int pfn_present(unsigned long pfn)
>  	return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
>  }
>  
> +static inline unsigned long next_present_section_nr(unsigned long section_nr)
> +{
> +	while (++section_nr <= __highest_present_section_nr) {
> +		if (present_section_nr(section_nr))
> +			return section_nr;
> +	}
> +
> +	return -1;
> +}
> +
>  /*
>   * These are _only_ used during initialisation, therefore they
>   * can use __initdata ...  They could have names to indicate
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index a92791512077..26e8044e9848 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5852,18 +5852,11 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
>  /* Skip PFNs that belong to non-present sections */
>  static inline __meminit unsigned long next_pfn(unsigned long pfn)
>  {
> -	unsigned long section_nr;
> +	const unsigned long section_nr = pfn_to_section_nr(++pfn);
>  
> -	section_nr = pfn_to_section_nr(++pfn);
>  	if (present_section_nr(section_nr))
>  		return pfn;
> -
> -	while (++section_nr <= __highest_present_section_nr) {
> -		if (present_section_nr(section_nr))
> -			return section_nr_to_pfn(section_nr);
> -	}
> -
> -	return -1;
> +	return section_nr_to_pfn(next_present_section_nr(section_nr));

This changes behaviour in the corner case: if next_present_section_nr()
returns -1, we call section_nr_to_pfn() for it. It's unlikely that it would
give any valid pfn, but I can't say for sure for all archs. I guess the worst
case scenario would be an endless loop over the same sections/pfns.

Have you considered the case?
David Hildenbrand Jan. 13, 2020, 10:57 p.m. UTC | #2
> Am 13.01.2020 um 23:41 schrieb Kirill A. Shutemov <kirill@shutemov.name>:
> 
> On Mon, Jan 13, 2020 at 03:40:35PM +0100, David Hildenbrand wrote:
>> Let's move it to the header and use the shorter variant from
>> mm/page_alloc.c (the original one will also check
>> "__highest_present_section_nr + 1", which is not necessary). While at it,
>> make the section_nr in next_pfn() const.
>> 
>> In next_pfn(), we now return section_nr_to_pfn(-1) instead of -1 once
>> we exceed __highest_present_section_nr, which doesn't make a difference in
>> the caller as it is big enough (>= all sane end_pfn).
>> 
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Michal Hocko <mhocko@kernel.org>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Cc: Kirill A. Shutemov <kirill@shutemov.name>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>> include/linux/mmzone.h | 10 ++++++++++
>> mm/page_alloc.c        | 11 ++---------
>> mm/sparse.c            | 10 ----------
>> 3 files changed, 12 insertions(+), 19 deletions(-)
>> 
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index c2bc309d1634..462f6873905a 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -1379,6 +1379,16 @@ static inline int pfn_present(unsigned long pfn)
>>    return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
>> }
>> 
>> +static inline unsigned long next_present_section_nr(unsigned long section_nr)
>> +{
>> +    while (++section_nr <= __highest_present_section_nr) {
>> +        if (present_section_nr(section_nr))
>> +            return section_nr;
>> +    }
>> +
>> +    return -1;
>> +}
>> +
>> /*
>>  * These are _only_ used during initialisation, therefore they
>>  * can use __initdata ...  They could have names to indicate
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index a92791512077..26e8044e9848 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -5852,18 +5852,11 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
>> /* Skip PFNs that belong to non-present sections */
>> static inline __meminit unsigned long next_pfn(unsigned long pfn)
>> {
>> -    unsigned long section_nr;
>> +    const unsigned long section_nr = pfn_to_section_nr(++pfn);
>> 
>> -    section_nr = pfn_to_section_nr(++pfn);
>>    if (present_section_nr(section_nr))
>>        return pfn;
>> -
>> -    while (++section_nr <= __highest_present_section_nr) {
>> -        if (present_section_nr(section_nr))
>> -            return section_nr_to_pfn(section_nr);
>> -    }
>> -
>> -    return -1;
>> +    return section_nr_to_pfn(next_present_section_nr(section_nr));
> 
> This changes behaviour in the corner case: if next_present_section_nr()
> returns -1, we call section_nr_to_pfn() for it. It's unlikely would give
> any valid pfn, but I can't say for sure for all archs. I guess the worst
> case scenrio would be endless loop over the same secitons/pfns.
> 
> Have you considered the case?

Yes, see the patch description. We return -1 << PFN_SECTION_SHIFT, so a number close to the end of the address space (0xfff...000). (Will double check tomorrow if any 32bit arch could be problematic here)

Thanks!

> 
> -- 
> Kirill A. Shutemov
>
David Hildenbrand Jan. 13, 2020, 11:02 p.m. UTC | #3
> Am 13.01.2020 um 23:57 schrieb David Hildenbrand <dhildenb@redhat.com>:
> 
> 
> 
>>> Am 13.01.2020 um 23:41 schrieb Kirill A. Shutemov <kirill@shutemov.name>:
>>> 
>>> On Mon, Jan 13, 2020 at 03:40:35PM +0100, David Hildenbrand wrote:
>>> Let's move it to the header and use the shorter variant from
>>> mm/page_alloc.c (the original one will also check
>>> "__highest_present_section_nr + 1", which is not necessary). While at it,
>>> make the section_nr in next_pfn() const.
>>> 
>>> In next_pfn(), we now return section_nr_to_pfn(-1) instead of -1 once
>>> we exceed __highest_present_section_nr, which doesn't make a difference in
>>> the caller as it is big enough (>= all sane end_pfn).
>>> 
>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>> Cc: Michal Hocko <mhocko@kernel.org>
>>> Cc: Oscar Salvador <osalvador@suse.de>
>>> Cc: Kirill A. Shutemov <kirill@shutemov.name>
>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>> ---
>>> include/linux/mmzone.h | 10 ++++++++++
>>> mm/page_alloc.c        | 11 ++---------
>>> mm/sparse.c            | 10 ----------
>>> 3 files changed, 12 insertions(+), 19 deletions(-)
>>> 
>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>> index c2bc309d1634..462f6873905a 100644
>>> --- a/include/linux/mmzone.h
>>> +++ b/include/linux/mmzone.h
>>> @@ -1379,6 +1379,16 @@ static inline int pfn_present(unsigned long pfn)
>>>   return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
>>> }
>>> 
>>> +static inline unsigned long next_present_section_nr(unsigned long section_nr)
>>> +{
>>> +    while (++section_nr <= __highest_present_section_nr) {
>>> +        if (present_section_nr(section_nr))
>>> +            return section_nr;
>>> +    }
>>> +
>>> +    return -1;
>>> +}
>>> +
>>> /*
>>> * These are _only_ used during initialisation, therefore they
>>> * can use __initdata ...  They could have names to indicate
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index a92791512077..26e8044e9848 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -5852,18 +5852,11 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
>>> /* Skip PFNs that belong to non-present sections */
>>> static inline __meminit unsigned long next_pfn(unsigned long pfn)
>>> {
>>> -    unsigned long section_nr;
>>> +    const unsigned long section_nr = pfn_to_section_nr(++pfn);
>>> 
>>> -    section_nr = pfn_to_section_nr(++pfn);
>>>   if (present_section_nr(section_nr))
>>>       return pfn;
>>> -
>>> -    while (++section_nr <= __highest_present_section_nr) {
>>> -        if (present_section_nr(section_nr))
>>> -            return section_nr_to_pfn(section_nr);
>>> -    }
>>> -
>>> -    return -1;
>>> +    return section_nr_to_pfn(next_present_section_nr(section_nr));
>> 
>> This changes behaviour in the corner case: if next_present_section_nr()
>> returns -1, we call section_nr_to_pfn() for it. It's unlikely would give
>> any valid pfn, but I can't say for sure for all archs. I guess the worst
>> case scenrio would be endless loop over the same secitons/pfns.
>> 
>> Have you considered the case?
> 
> Yes, see the patch description. We return -1 << PFN_SECTION_SHIFT, so a number close to the end of the address space (0xfff...000). (Will double check tomorrow if any 32bit arch could be problematic here)

... but thinking again, 0xfff... is certainly an invalid PFN, so this should work just fine.

(biggest possible pfn is -1 >> PFN_SHIFT)

But it's late in Germany, will double check tomorrow :)
Kirill A . Shutemov Jan. 14, 2020, 10:41 a.m. UTC | #4
On Tue, Jan 14, 2020 at 12:02:00AM +0100, David Hildenbrand wrote:
> 
> 
> > Am 13.01.2020 um 23:57 schrieb David Hildenbrand <dhildenb@redhat.com>:
> > 
> > 
> > 
> >>> Am 13.01.2020 um 23:41 schrieb Kirill A. Shutemov <kirill@shutemov.name>:
> >>> 
> >>> On Mon, Jan 13, 2020 at 03:40:35PM +0100, David Hildenbrand wrote:
> >>> Let's move it to the header and use the shorter variant from
> >>> mm/page_alloc.c (the original one will also check
> >>> "__highest_present_section_nr + 1", which is not necessary). While at it,
> >>> make the section_nr in next_pfn() const.
> >>> 
> >>> In next_pfn(), we now return section_nr_to_pfn(-1) instead of -1 once
> >>> we exceed __highest_present_section_nr, which doesn't make a difference in
> >>> the caller as it is big enough (>= all sane end_pfn).
> >>> 
> >>> Cc: Andrew Morton <akpm@linux-foundation.org>
> >>> Cc: Michal Hocko <mhocko@kernel.org>
> >>> Cc: Oscar Salvador <osalvador@suse.de>
> >>> Cc: Kirill A. Shutemov <kirill@shutemov.name>
> >>> Signed-off-by: David Hildenbrand <david@redhat.com>
> >>> ---
> >>> include/linux/mmzone.h | 10 ++++++++++
> >>> mm/page_alloc.c        | 11 ++---------
> >>> mm/sparse.c            | 10 ----------
> >>> 3 files changed, 12 insertions(+), 19 deletions(-)
> >>> 
> >>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> >>> index c2bc309d1634..462f6873905a 100644
> >>> --- a/include/linux/mmzone.h
> >>> +++ b/include/linux/mmzone.h
> >>> @@ -1379,6 +1379,16 @@ static inline int pfn_present(unsigned long pfn)
> >>>   return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
> >>> }
> >>> 
> >>> +static inline unsigned long next_present_section_nr(unsigned long section_nr)
> >>> +{
> >>> +    while (++section_nr <= __highest_present_section_nr) {
> >>> +        if (present_section_nr(section_nr))
> >>> +            return section_nr;
> >>> +    }
> >>> +
> >>> +    return -1;
> >>> +}
> >>> +
> >>> /*
> >>> * These are _only_ used during initialisation, therefore they
> >>> * can use __initdata ...  They could have names to indicate
> >>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >>> index a92791512077..26e8044e9848 100644
> >>> --- a/mm/page_alloc.c
> >>> +++ b/mm/page_alloc.c
> >>> @@ -5852,18 +5852,11 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
> >>> /* Skip PFNs that belong to non-present sections */
> >>> static inline __meminit unsigned long next_pfn(unsigned long pfn)
> >>> {
> >>> -    unsigned long section_nr;
> >>> +    const unsigned long section_nr = pfn_to_section_nr(++pfn);
> >>> 
> >>> -    section_nr = pfn_to_section_nr(++pfn);
> >>>   if (present_section_nr(section_nr))
> >>>       return pfn;
> >>> -
> >>> -    while (++section_nr <= __highest_present_section_nr) {
> >>> -        if (present_section_nr(section_nr))
> >>> -            return section_nr_to_pfn(section_nr);
> >>> -    }
> >>> -
> >>> -    return -1;
> >>> +    return section_nr_to_pfn(next_present_section_nr(section_nr));
> >> 
> >> This changes behaviour in the corner case: if next_present_section_nr()
> >> returns -1, we call section_nr_to_pfn() for it. It's unlikely would give
> >> any valid pfn, but I can't say for sure for all archs. I guess the worst
> >> case scenrio would be endless loop over the same secitons/pfns.
> >> 
> >> Have you considered the case?
> > 
> > Yes, see the patch description. We return -1 << PFN_SECTION_SHIFT, so a number close to the end of the address space (0xfff...000). (Will double check tomorrow if any 32bit arch could be problematic here)
> 
> ... but thinking again, 0xfff... is certainly an invalid PFN, so this should work just fine.
> 
> (biggest possible pfn is -1 >> PFN_SHIFT)
> 
> But it‘s late in Germany, will double check tomorrow :)

If the end_pfn happens to be more than -1UL << PFN_SECTION_SHIFT we are
screwed: the pfn is invalid, next_present_section_nr() returns -1, the
next iteration is on the same pfn and we have an endless loop.

The question is whether we can prove end_pfn is always less than
-1UL << PFN_SECTION_SHIFT in any configuration of any arch.

It is not obvious to me.
David Hildenbrand Jan. 14, 2020, 10:49 a.m. UTC | #5
On 14.01.20 11:41, Kirill A. Shutemov wrote:
> On Tue, Jan 14, 2020 at 12:02:00AM +0100, David Hildenbrand wrote:
>>
>>
>>> Am 13.01.2020 um 23:57 schrieb David Hildenbrand <dhildenb@redhat.com>:
>>>
>>> 
>>>
>>>>> Am 13.01.2020 um 23:41 schrieb Kirill A. Shutemov <kirill@shutemov.name>:
>>>>>
>>>>> On Mon, Jan 13, 2020 at 03:40:35PM +0100, David Hildenbrand wrote:
>>>>> Let's move it to the header and use the shorter variant from
>>>>> mm/page_alloc.c (the original one will also check
>>>>> "__highest_present_section_nr + 1", which is not necessary). While at it,
>>>>> make the section_nr in next_pfn() const.
>>>>>
>>>>> In next_pfn(), we now return section_nr_to_pfn(-1) instead of -1 once
>>>>> we exceed __highest_present_section_nr, which doesn't make a difference in
>>>>> the caller as it is big enough (>= all sane end_pfn).
>>>>>
>>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>>>> Cc: Michal Hocko <mhocko@kernel.org>
>>>>> Cc: Oscar Salvador <osalvador@suse.de>
>>>>> Cc: Kirill A. Shutemov <kirill@shutemov.name>
>>>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>>>> ---
>>>>> include/linux/mmzone.h | 10 ++++++++++
>>>>> mm/page_alloc.c        | 11 ++---------
>>>>> mm/sparse.c            | 10 ----------
>>>>> 3 files changed, 12 insertions(+), 19 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>>> index c2bc309d1634..462f6873905a 100644
>>>>> --- a/include/linux/mmzone.h
>>>>> +++ b/include/linux/mmzone.h
>>>>> @@ -1379,6 +1379,16 @@ static inline int pfn_present(unsigned long pfn)
>>>>>   return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
>>>>> }
>>>>>
>>>>> +static inline unsigned long next_present_section_nr(unsigned long section_nr)
>>>>> +{
>>>>> +    while (++section_nr <= __highest_present_section_nr) {
>>>>> +        if (present_section_nr(section_nr))
>>>>> +            return section_nr;
>>>>> +    }
>>>>> +
>>>>> +    return -1;
>>>>> +}
>>>>> +
>>>>> /*
>>>>> * These are _only_ used during initialisation, therefore they
>>>>> * can use __initdata ...  They could have names to indicate
>>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>>> index a92791512077..26e8044e9848 100644
>>>>> --- a/mm/page_alloc.c
>>>>> +++ b/mm/page_alloc.c
>>>>> @@ -5852,18 +5852,11 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
>>>>> /* Skip PFNs that belong to non-present sections */
>>>>> static inline __meminit unsigned long next_pfn(unsigned long pfn)
>>>>> {
>>>>> -    unsigned long section_nr;
>>>>> +    const unsigned long section_nr = pfn_to_section_nr(++pfn);
>>>>>
>>>>> -    section_nr = pfn_to_section_nr(++pfn);
>>>>>   if (present_section_nr(section_nr))
>>>>>       return pfn;
>>>>> -
>>>>> -    while (++section_nr <= __highest_present_section_nr) {
>>>>> -        if (present_section_nr(section_nr))
>>>>> -            return section_nr_to_pfn(section_nr);
>>>>> -    }
>>>>> -
>>>>> -    return -1;
>>>>> +    return section_nr_to_pfn(next_present_section_nr(section_nr));
>>>>
>>>> This changes behaviour in the corner case: if next_present_section_nr()
>>>> returns -1, we call section_nr_to_pfn() for it. It's unlikely would give
>>>> any valid pfn, but I can't say for sure for all archs. I guess the worst
>>>> case scenrio would be endless loop over the same secitons/pfns.
>>>>
>>>> Have you considered the case?
>>>
>>> Yes, see the patch description. We return -1 << PFN_SECTION_SHIFT, so a number close to the end of the address space (0xfff...000). (Will double check tomorrow if any 32bit arch could be problematic here)
>>
>> ... but thinking again, 0xfff... is certainly an invalid PFN, so this should work just fine.
>>
>> (biggest possible pfn is -1 >> PFN_SHIFT)
>>
>> But it‘s late in Germany, will double check tomorrow :)
> 
> If the end_pfn happens the be more than -1UL << PFN_SECTION_SHIFT we are
> screwed: the pfn is invalid, next_present_section_nr() returns -1, the
> next iterartion is on the same pfn and we have endless loop.
> 
> The question is whether we can prove end_pfn is always less than
> -1UL << PFN_SECTION_SHIFT in any configuration of any arch.
> 
> It is not obvious for me.

memmap_init_zone() is called for a physical memory region: pfn + size
(nr_pages)

The highest possible PFN you can have is "-1(unsigned long) >>
PFN_SHIFT". So even if you would want to add the very last section, the
PFN would still be smaller than -1UL << PFN_SECTION_SHIFT.
Kirill A . Shutemov Jan. 14, 2020, 3:52 p.m. UTC | #6
On Tue, Jan 14, 2020 at 11:49:19AM +0100, David Hildenbrand wrote:
> memmap_init_zone() is called for a physical memory region: pfn + size
> (nr_pages)
> 
> The highest possible PFN you can have is "-1(unsigned long) >>
> PFN_SHIFT". So even if you would want to add the very last section, the
> PFN would still be smaller than -1UL << PFN_SECTION_SHIFT.

PFN_SHIFT? I guess you mean PAGE_SHIFT.

Of course PFN can be more than -1UL >> PAGE_SHIFT. Like on 32-bit x86 with
PAE it is ((1ULL << 36) - 1) >> PAGE_SHIFT. That's the whole reason for
PAE.

The highest possible PFN must fit into phys_addr_t when shifted left by
PAGE_SHIFT and must fit into unsigned long. It can be -1UL if
phys_addr_t is 64-bit.

Any other limitation I miss?
David Hildenbrand Jan. 14, 2020, 4:50 p.m. UTC | #7
On 14.01.20 16:52, Kirill A. Shutemov wrote:
> On Tue, Jan 14, 2020 at 11:49:19AM +0100, David Hildenbrand wrote:
>> memmap_init_zone() is called for a physical memory region: pfn + size
>> (nr_pages)
>>
>> The highest possible PFN you can have is "-1(unsigned long) >>
>> PFN_SHIFT". So even if you would want to add the very last section, the
>> PFN would still be smaller than -1UL << PFN_SECTION_SHIFT.
> 
> PFN_SHIFT? I guess you mean PAGE_SHIFT.

Yes :)

> 
> Of course PFN can be more than -1UL >> PAGE_SHIFT. Like on 32-bit x86 with
> PAE it is ((1ULL << 36) - 1) >> PAGE_SHIFT. That's the whole reason for
> PAE.

You are right about PAE, but I think you agree that it is a special case.

> 
> The highest possible PFN must fit into phys_addr_t when shifted left by
> PAGE_SHIFT and must fit into unsigned long. It's can be -1UL if
> phys_addr_t is 64-bit.
> 

Right, and for 32bit, that would mean (assuming something like 12bit
PAGE_SHIFT) if you have -1 (0xffffffff) that the biggest possible
address is 0xfffffffffff (44bit). In that case, the existing code would
already break because "end_pfn" (is actually +1, pointing after the one
to initialize), would overflow to 0 and you would have an endless loop
in memmap_init_zone().

Now, after this change you not only get an endless loop when trying to
init the very last PFN, but also when trying to init a PFN in the very last
section (section_nr = -1 - e.g., the last 128MB).

I don't think there is any sane use case where you initialize something
partially in the last section that is possible with any hardware address
extension mechanism.
David Hildenbrand Jan. 14, 2020, 4:52 p.m. UTC | #8
On 14.01.20 17:50, David Hildenbrand wrote:
> On 14.01.20 16:52, Kirill A. Shutemov wrote:
>> On Tue, Jan 14, 2020 at 11:49:19AM +0100, David Hildenbrand wrote:
>>> memmap_init_zone() is called for a physical memory region: pfn + size
>>> (nr_pages)
>>>
>>> The highest possible PFN you can have is "-1(unsigned long) >>
>>> PFN_SHIFT". So even if you would want to add the very last section, the
>>> PFN would still be smaller than -1UL << PFN_SECTION_SHIFT.
>>
>> PFN_SHIFT? I guess you mean PAGE_SHIFT.
> 
> Yes :)
> 
>>
>> Of course PFN can be more than -1UL >> PAGE_SHIFT. Like on 32-bit x86 with
>> PAE it is ((1ULL << 36) - 1) >> PAGE_SHIFT. That's the whole reason for
>> PAE.
> 
> You are right about PAE, but I think you agree that is is a special case.
> 
>>
>> The highest possible PFN must fit into phys_addr_t when shifted left by
>> PAGE_SHIFT and must fit into unsigned long. It's can be -1UL if
>> phys_addr_t is 64-bit.
>>
> 
> Right, and for 32bit, that would mean (assuming something like 12bit
> PAGE_SHIFT) if you have -1 (0xffffffff) that the biggest possible
> address is 0xfffffffffff (44bit). In that case, the existing code would
> already break because "end_pfn" (is actually +1, pointing after the one
> to initialize), would overflow to 0 and you would have an endless loop
> in memmap_init_zone().

Correction: If end_pfn overflows to 0, you would get no loop iteration
at all.
diff mbox series

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c2bc309d1634..462f6873905a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1379,6 +1379,16 @@  static inline int pfn_present(unsigned long pfn)
 	return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
 }
 
+static inline unsigned long next_present_section_nr(unsigned long section_nr)
+{
+	while (++section_nr <= __highest_present_section_nr) {
+		if (present_section_nr(section_nr))
+			return section_nr;
+	}
+
+	return -1;
+}
+
 /*
  * These are _only_ used during initialisation, therefore they
  * can use __initdata ...  They could have names to indicate
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a92791512077..26e8044e9848 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5852,18 +5852,11 @@  overlap_memmap_init(unsigned long zone, unsigned long *pfn)
 /* Skip PFNs that belong to non-present sections */
 static inline __meminit unsigned long next_pfn(unsigned long pfn)
 {
-	unsigned long section_nr;
+	const unsigned long section_nr = pfn_to_section_nr(++pfn);
 
-	section_nr = pfn_to_section_nr(++pfn);
 	if (present_section_nr(section_nr))
 		return pfn;
-
-	while (++section_nr <= __highest_present_section_nr) {
-		if (present_section_nr(section_nr))
-			return section_nr_to_pfn(section_nr);
-	}
-
-	return -1;
+	return section_nr_to_pfn(next_present_section_nr(section_nr));
 }
 #else
 static inline __meminit unsigned long next_pfn(unsigned long pfn)
diff --git a/mm/sparse.c b/mm/sparse.c
index 3822ecbd8a1f..ac4a2bfae514 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -198,16 +198,6 @@  static void section_mark_present(struct mem_section *ms)
 	ms->section_mem_map |= SECTION_MARKED_PRESENT;
 }
 
-static inline unsigned long next_present_section_nr(unsigned long section_nr)
-{
-	do {
-		section_nr++;
-		if (present_section_nr(section_nr))
-			return section_nr;
-	} while ((section_nr <= __highest_present_section_nr));
-
-	return -1;
-}
 #define for_each_present_section_nr(start, section_nr)		\
 	for (section_nr = next_present_section_nr(start-1);	\
 	     ((section_nr != -1) &&				\