
mm: define pte_addr_end for consistency

Message ID 20200630031852.45383-1-richard.weiyang@linux.alibaba.com (mailing list archive)
State New, archived
Series mm: define pte_addr_end for consistency

Commit Message

Wei Yang June 30, 2020, 3:18 a.m. UTC
When walking page tables, we define several helpers to get the address of
the next boundary. But we don't have one for pte level.

Let's define it and consolidate the code in several places.

Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
---
 arch/x86/mm/init_64.c   | 6 ++----
 include/linux/pgtable.h | 7 +++++++
 mm/kasan/init.c         | 4 +---
 3 files changed, 10 insertions(+), 7 deletions(-)

Comments

David Hildenbrand June 30, 2020, 12:44 p.m. UTC | #1
On 30.06.20 05:18, Wei Yang wrote:
> When walking page tables, we define several helpers to get the address of
> the next boundary. But we don't have one for pte level.
> 
> Let's define it and consolidate the code in several places.
> 
> Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
> ---
>  arch/x86/mm/init_64.c   | 6 ++----
>  include/linux/pgtable.h | 7 +++++++
>  mm/kasan/init.c         | 4 +---
>  3 files changed, 10 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index dbae185511cd..f902fbd17f27 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -973,9 +973,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
>  
>  	pte = pte_start + pte_index(addr);
>  	for (; addr < end; addr = next, pte++) {
> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
> -		if (next > end)
> -			next = end;
> +		next = pte_addr_end(addr, end);
>  
>  		if (!pte_present(*pte))
>  			continue;
> @@ -1558,7 +1556,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
>  		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
>  
>  		if (!boot_cpu_has(X86_FEATURE_PSE)) {
> -			next = (addr + PAGE_SIZE) & PAGE_MASK;
> +			next = pte_addr_end(addr, end);
>  			pmd = pmd_offset(pud, addr);
>  			if (pmd_none(*pmd))
>  				continue;
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 32b6c52d41b9..0de09c6c89d2 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -706,6 +706,13 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
>  })
>  #endif
>  
> +#ifndef pte_addr_end
> +#define pte_addr_end(addr, end)						\
> +({	unsigned long __boundary = ((addr) + PAGE_SIZE) & PAGE_MASK;	\
> +	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
> +})
> +#endif
> +
>  /*
>   * When walking page tables, we usually want to skip any p?d_none entries;
>   * and any p?d_bad entries - reporting the error before resetting to none.
> diff --git a/mm/kasan/init.c b/mm/kasan/init.c
> index fe6be0be1f76..89f748601f74 100644
> --- a/mm/kasan/init.c
> +++ b/mm/kasan/init.c
> @@ -349,9 +349,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
>  	unsigned long next;
>  
>  	for (; addr < end; addr = next, pte++) {
> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
> -		if (next > end)
> -			next = end;
> +		next = pte_addr_end(addr, end);
>  
>  		if (!pte_present(*pte))
>  			continue;
> 

I'm not really a friend of this I have to say. We're simply iterating
over single pages, not much magic ....

What would definitely make sense is replacing (addr + PAGE_SIZE) &
PAGE_MASK; by PAGE_ALIGN() ...
Wei Yang July 1, 2020, 2:11 a.m. UTC | #2
On Tue, Jun 30, 2020 at 02:44:00PM +0200, David Hildenbrand wrote:
>On 30.06.20 05:18, Wei Yang wrote:
>> When walking page tables, we define several helpers to get the address of
>> the next boundary. But we don't have one for pte level.
>> 
>> Let's define it and consolidate the code in several places.
>> 
>> Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
>> ---
>>  arch/x86/mm/init_64.c   | 6 ++----
>>  include/linux/pgtable.h | 7 +++++++
>>  mm/kasan/init.c         | 4 +---
>>  3 files changed, 10 insertions(+), 7 deletions(-)
>> 
>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>> index dbae185511cd..f902fbd17f27 100644
>> --- a/arch/x86/mm/init_64.c
>> +++ b/arch/x86/mm/init_64.c
>> @@ -973,9 +973,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
>>  
>>  	pte = pte_start + pte_index(addr);
>>  	for (; addr < end; addr = next, pte++) {
>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>> -		if (next > end)
>> -			next = end;
>> +		next = pte_addr_end(addr, end);
>>  
>>  		if (!pte_present(*pte))
>>  			continue;
>> @@ -1558,7 +1556,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
>>  		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
>>  
>>  		if (!boot_cpu_has(X86_FEATURE_PSE)) {
>> -			next = (addr + PAGE_SIZE) & PAGE_MASK;
>> +			next = pte_addr_end(addr, end);
>>  			pmd = pmd_offset(pud, addr);
>>  			if (pmd_none(*pmd))
>>  				continue;
>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>> index 32b6c52d41b9..0de09c6c89d2 100644
>> --- a/include/linux/pgtable.h
>> +++ b/include/linux/pgtable.h
>> @@ -706,6 +706,13 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
>>  })
>>  #endif
>>  
>> +#ifndef pte_addr_end
>> +#define pte_addr_end(addr, end)						\
>> +({	unsigned long __boundary = ((addr) + PAGE_SIZE) & PAGE_MASK;	\
>> +	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
>> +})
>> +#endif
>> +
>>  /*
>>   * When walking page tables, we usually want to skip any p?d_none entries;
>>   * and any p?d_bad entries - reporting the error before resetting to none.
>> diff --git a/mm/kasan/init.c b/mm/kasan/init.c
>> index fe6be0be1f76..89f748601f74 100644
>> --- a/mm/kasan/init.c
>> +++ b/mm/kasan/init.c
>> @@ -349,9 +349,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
>>  	unsigned long next;
>>  
>>  	for (; addr < end; addr = next, pte++) {
>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>> -		if (next > end)
>> -			next = end;
>> +		next = pte_addr_end(addr, end);
>>  
>>  		if (!pte_present(*pte))
>>  			continue;
>> 
>
>I'm not really a friend of this I have to say. We're simply iterating
>over single pages, not much magic ....

Hmm... yes, we are iterating on page boundaries, but we may have cases
where addr or end is not page-aligned.

>
>What would definitely make sense is replacing (addr + PAGE_SIZE) &
>PAGE_MASK; by PAGE_ALIGN() ...
>

No, PAGE_ALIGN() expands to

	(addr + PAGE_SIZE - 1) & PAGE_MASK;

If we change the code to PAGE_ALIGN(), we would end up with an infinite loop.
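
For a concrete example (just a sketch, assuming addr is already page-aligned):

	PAGE_ALIGN(addr)               == ((addr + PAGE_SIZE - 1) & PAGE_MASK) == addr
	(addr + PAGE_SIZE) & PAGE_MASK == addr + PAGE_SIZE

With plain PAGE_ALIGN() we would get next == addr, so the loop would never
advance.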

>-- 
>Thanks,
>
>David / dhildenb
David Hildenbrand July 1, 2020, 8:29 a.m. UTC | #3
On 01.07.20 04:11, Wei Yang wrote:
> On Tue, Jun 30, 2020 at 02:44:00PM +0200, David Hildenbrand wrote:
>> On 30.06.20 05:18, Wei Yang wrote:
>>> When walking page tables, we define several helpers to get the address of
>>> the next boundary. But we don't have one for pte level.
>>>
>>> Let's define it and consolidate the code in several places.
>>>
>>> Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
>>> ---
>>>  arch/x86/mm/init_64.c   | 6 ++----
>>>  include/linux/pgtable.h | 7 +++++++
>>>  mm/kasan/init.c         | 4 +---
>>>  3 files changed, 10 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>>> index dbae185511cd..f902fbd17f27 100644
>>> --- a/arch/x86/mm/init_64.c
>>> +++ b/arch/x86/mm/init_64.c
>>> @@ -973,9 +973,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
>>>  
>>>  	pte = pte_start + pte_index(addr);
>>>  	for (; addr < end; addr = next, pte++) {
>>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>>> -		if (next > end)
>>> -			next = end;
>>> +		next = pte_addr_end(addr, end);
>>>  
>>>  		if (!pte_present(*pte))
>>>  			continue;
>>> @@ -1558,7 +1556,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
>>>  		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
>>>  
>>>  		if (!boot_cpu_has(X86_FEATURE_PSE)) {
>>> -			next = (addr + PAGE_SIZE) & PAGE_MASK;
>>> +			next = pte_addr_end(addr, end);
>>>  			pmd = pmd_offset(pud, addr);
>>>  			if (pmd_none(*pmd))
>>>  				continue;
>>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>>> index 32b6c52d41b9..0de09c6c89d2 100644
>>> --- a/include/linux/pgtable.h
>>> +++ b/include/linux/pgtable.h
>>> @@ -706,6 +706,13 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
>>>  })
>>>  #endif
>>>  
>>> +#ifndef pte_addr_end
>>> +#define pte_addr_end(addr, end)						\
>>> +({	unsigned long __boundary = ((addr) + PAGE_SIZE) & PAGE_MASK;	\
>>> +	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
>>> +})
>>> +#endif
>>> +
>>>  /*
>>>   * When walking page tables, we usually want to skip any p?d_none entries;
>>>   * and any p?d_bad entries - reporting the error before resetting to none.
>>> diff --git a/mm/kasan/init.c b/mm/kasan/init.c
>>> index fe6be0be1f76..89f748601f74 100644
>>> --- a/mm/kasan/init.c
>>> +++ b/mm/kasan/init.c
>>> @@ -349,9 +349,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
>>>  	unsigned long next;
>>>  
>>>  	for (; addr < end; addr = next, pte++) {
>>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>>> -		if (next > end)
>>> -			next = end;
>>> +		next = pte_addr_end(addr, end);
>>>  
>>>  		if (!pte_present(*pte))
>>>  			continue;
>>>
>>
>> I'm not really a friend of this I have to say. We're simply iterating
>> over single pages, not much magic ....
> 
> Hmm... yes, we are iterating on Page boundary, while we many have the case
> when addr or end is not PAGE_ALIGN.

I really do wonder if not having page aligned addresses actually happens
in real life. Page tables operate on page granularity, and
adding/removing unaligned parts feels wrong ... and that's also why I
dislike such a helper.

1. kasan_add_zero_shadow()/kasan_remove_zero_shadow(). If I understand
the logic (WARN_ON()) correctly, we bail out in case we would ever end
up in such a scenario, where we would want to add/remove things not
aligned to PAGE_SIZE.

2. remove_pagetable()...->remove_pte_table()

vmemmap_free() should never try to de-populate sub-pages. Even with
sub-section hot-add/remove (2MB / 512 pages), with valid struct page
sizes (56, 64, 72, 80), we always end up with full pages.

kernel_physical_mapping_remove() is only called via
arch_remove_memory(). That will never remove unaligned parts.

3. register_page_bootmem_memmap()

It operates on full pages only.


This needs in-depth analysis, but my gut feeling is that this alignment
is unnecessary.

> 
>>
>> What would definitely make sense is replacing (addr + PAGE_SIZE) &
>> PAGE_MASK; by PAGE_ALIGN() ...
>>
> 
> No, PAGE_ALIGN() is expanded to be 
> 
> 	(addr + PAGE_SIZE - 1) & PAGE_MASK;
> 
> If we change the code to PAGE_ALIGN(), we would end up with infinite loop.

Very right, it would have to be PAGE_ALIGN(addr + 1).
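
I.e., something like this (only a sketch, not an actual patch):

	next = PAGE_ALIGN(addr + 1);
	if (next > end)
		next = end;

which computes the same boundary as the open-coded
(addr + PAGE_SIZE) & PAGE_MASK.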
Wei Yang July 1, 2020, 11:54 a.m. UTC | #4
On Wed, Jul 01, 2020 at 10:29:08AM +0200, David Hildenbrand wrote:
>On 01.07.20 04:11, Wei Yang wrote:
>> On Tue, Jun 30, 2020 at 02:44:00PM +0200, David Hildenbrand wrote:
>>> On 30.06.20 05:18, Wei Yang wrote:
>>>> When walking page tables, we define several helpers to get the address of
>>>> the next boundary. But we don't have one for pte level.
>>>>
>>>> Let's define it and consolidate the code in several places.
>>>>
>>>> Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
>>>> ---
>>>>  arch/x86/mm/init_64.c   | 6 ++----
>>>>  include/linux/pgtable.h | 7 +++++++
>>>>  mm/kasan/init.c         | 4 +---
>>>>  3 files changed, 10 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>>>> index dbae185511cd..f902fbd17f27 100644
>>>> --- a/arch/x86/mm/init_64.c
>>>> +++ b/arch/x86/mm/init_64.c
>>>> @@ -973,9 +973,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
>>>>  
>>>>  	pte = pte_start + pte_index(addr);
>>>>  	for (; addr < end; addr = next, pte++) {
>>>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>> -		if (next > end)
>>>> -			next = end;
>>>> +		next = pte_addr_end(addr, end);
>>>>  
>>>>  		if (!pte_present(*pte))
>>>>  			continue;
>>>> @@ -1558,7 +1556,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
>>>>  		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
>>>>  
>>>>  		if (!boot_cpu_has(X86_FEATURE_PSE)) {
>>>> -			next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>> +			next = pte_addr_end(addr, end);
>>>>  			pmd = pmd_offset(pud, addr);
>>>>  			if (pmd_none(*pmd))
>>>>  				continue;
>>>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>>>> index 32b6c52d41b9..0de09c6c89d2 100644
>>>> --- a/include/linux/pgtable.h
>>>> +++ b/include/linux/pgtable.h
>>>> @@ -706,6 +706,13 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
>>>>  })
>>>>  #endif
>>>>  
>>>> +#ifndef pte_addr_end
>>>> +#define pte_addr_end(addr, end)						\
>>>> +({	unsigned long __boundary = ((addr) + PAGE_SIZE) & PAGE_MASK;	\
>>>> +	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
>>>> +})
>>>> +#endif
>>>> +
>>>>  /*
>>>>   * When walking page tables, we usually want to skip any p?d_none entries;
>>>>   * and any p?d_bad entries - reporting the error before resetting to none.
>>>> diff --git a/mm/kasan/init.c b/mm/kasan/init.c
>>>> index fe6be0be1f76..89f748601f74 100644
>>>> --- a/mm/kasan/init.c
>>>> +++ b/mm/kasan/init.c
>>>> @@ -349,9 +349,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
>>>>  	unsigned long next;
>>>>  
>>>>  	for (; addr < end; addr = next, pte++) {
>>>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>> -		if (next > end)
>>>> -			next = end;
>>>> +		next = pte_addr_end(addr, end);
>>>>  
>>>>  		if (!pte_present(*pte))
>>>>  			continue;
>>>>
>>>
>>> I'm not really a friend of this I have to say. We're simply iterating
>>> over single pages, not much magic ....
>> 
>> Hmm... yes, we are iterating on Page boundary, while we many have the case
>> when addr or end is not PAGE_ALIGN.
>
>I really do wonder if not having page aligned addresses actually happens
>in real life. Page tables operate on page granularity, and
>adding/removing unaligned parts feels wrong ... and that's also why I
>dislike such a helper.
>
>1. kasan_add_zero_shadow()/kasan_remove_zero_shadow(). If I understand
>the logic (WARN_ON()) correctly, we bail out in case we would ever end
>up in such a scenario, where we would want to add/remove things not
>aligned to PAGE_SIZE.
>
>2. remove_pagetable()...->remove_pte_table()
>
>vmemmap_free() should never try to de-populate sub-pages. Even with
>sub-section hot-add/remove (2MB / 512 pages), with valid struct page
>sizes (56, 64, 72, 80), we always end up with full pages.
>
>kernel_physical_mapping_remove() is only called via
>arch_remove_memory(). That will never remove unaligned parts.
>

My mind is not very clear on this right now, but when you look into
remove_pte_table(), it has two cases based on the alignment of addr and next.

If we always remove full pages, the second case won't happen?

>3. register_page_bootmem_memmap()
>
>It operates on full pages only.
>
>
>This needs in-depth analysis, but my gut feeling is that this alignment
>is unnecessary.
>
>> 
>>>
>>> What would definitely make sense is replacing (addr + PAGE_SIZE) &
>>> PAGE_MASK; by PAGE_ALIGN() ...
>>>
>> 
>> No, PAGE_ALIGN() is expanded to be 
>> 
>> 	(addr + PAGE_SIZE - 1) & PAGE_MASK;
>> 
>> If we change the code to PAGE_ALIGN(), we would end up with infinite loop.
>
>Very right, it would have to be PAGE_ALIGN(addr + 1).
>
>-- 
>Thanks,
>
>David / dhildenb
David Hildenbrand July 2, 2020, 4:28 p.m. UTC | #5
On 01.07.20 13:54, Wei Yang wrote:
> On Wed, Jul 01, 2020 at 10:29:08AM +0200, David Hildenbrand wrote:
>> On 01.07.20 04:11, Wei Yang wrote:
>>> On Tue, Jun 30, 2020 at 02:44:00PM +0200, David Hildenbrand wrote:
>>>> On 30.06.20 05:18, Wei Yang wrote:
>>>>> When walking page tables, we define several helpers to get the address of
>>>>> the next boundary. But we don't have one for pte level.
>>>>>
>>>>> Let's define it and consolidate the code in several places.
>>>>>
>>>>> Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
>>>>> ---
>>>>>  arch/x86/mm/init_64.c   | 6 ++----
>>>>>  include/linux/pgtable.h | 7 +++++++
>>>>>  mm/kasan/init.c         | 4 +---
>>>>>  3 files changed, 10 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>>>>> index dbae185511cd..f902fbd17f27 100644
>>>>> --- a/arch/x86/mm/init_64.c
>>>>> +++ b/arch/x86/mm/init_64.c
>>>>> @@ -973,9 +973,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
>>>>>  
>>>>>  	pte = pte_start + pte_index(addr);
>>>>>  	for (; addr < end; addr = next, pte++) {
>>>>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>>> -		if (next > end)
>>>>> -			next = end;
>>>>> +		next = pte_addr_end(addr, end);
>>>>>  
>>>>>  		if (!pte_present(*pte))
>>>>>  			continue;
>>>>> @@ -1558,7 +1556,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
>>>>>  		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
>>>>>  
>>>>>  		if (!boot_cpu_has(X86_FEATURE_PSE)) {
>>>>> -			next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>>> +			next = pte_addr_end(addr, end);
>>>>>  			pmd = pmd_offset(pud, addr);
>>>>>  			if (pmd_none(*pmd))
>>>>>  				continue;
>>>>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>>>>> index 32b6c52d41b9..0de09c6c89d2 100644
>>>>> --- a/include/linux/pgtable.h
>>>>> +++ b/include/linux/pgtable.h
>>>>> @@ -706,6 +706,13 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
>>>>>  })
>>>>>  #endif
>>>>>  
>>>>> +#ifndef pte_addr_end
>>>>> +#define pte_addr_end(addr, end)						\
>>>>> +({	unsigned long __boundary = ((addr) + PAGE_SIZE) & PAGE_MASK;	\
>>>>> +	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
>>>>> +})
>>>>> +#endif
>>>>> +
>>>>>  /*
>>>>>   * When walking page tables, we usually want to skip any p?d_none entries;
>>>>>   * and any p?d_bad entries - reporting the error before resetting to none.
>>>>> diff --git a/mm/kasan/init.c b/mm/kasan/init.c
>>>>> index fe6be0be1f76..89f748601f74 100644
>>>>> --- a/mm/kasan/init.c
>>>>> +++ b/mm/kasan/init.c
>>>>> @@ -349,9 +349,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
>>>>>  	unsigned long next;
>>>>>  
>>>>>  	for (; addr < end; addr = next, pte++) {
>>>>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>>> -		if (next > end)
>>>>> -			next = end;
>>>>> +		next = pte_addr_end(addr, end);
>>>>>  
>>>>>  		if (!pte_present(*pte))
>>>>>  			continue;
>>>>>
>>>>
>>>> I'm not really a friend of this I have to say. We're simply iterating
>>>> over single pages, not much magic ....
>>>
>>> Hmm... yes, we are iterating on Page boundary, while we many have the case
>>> when addr or end is not PAGE_ALIGN.
>>
>> I really do wonder if not having page aligned addresses actually happens
>> in real life. Page tables operate on page granularity, and
>> adding/removing unaligned parts feels wrong ... and that's also why I
>> dislike such a helper.
>>
>> 1. kasan_add_zero_shadow()/kasan_remove_zero_shadow(). If I understand
>> the logic (WARN_ON()) correctly, we bail out in case we would ever end
>> up in such a scenario, where we would want to add/remove things not
>> aligned to PAGE_SIZE.
>>
>> 2. remove_pagetable()...->remove_pte_table()
>>
>> vmemmap_free() should never try to de-populate sub-pages. Even with
>> sub-section hot-add/remove (2MB / 512 pages), with valid struct page
>> sizes (56, 64, 72, 80), we always end up with full pages.
>>
>> kernel_physical_mapping_remove() is only called via
>> arch_remove_memory(). That will never remove unaligned parts.
>>
> 
> I don't have a very clear mind now, while when you look into
> remove_pte_table(), it has two cases based on alignment of addr and next.
> 
> If we always remove a page, the second case won't happen?

So, the code says that the second case can only happen for vmemmap,
never for direct mappings.

I don't see how this could ever happen with current page sizes, even
with sub-section hotadd (2MB). Maybe that is a legacy leftover or was
never relevant? Or am I missing something important, where we could
have sub-4k-page vmemmap data?
Wei Yang July 3, 2020, 1:34 a.m. UTC | #6
On Thu, Jul 02, 2020 at 06:28:19PM +0200, David Hildenbrand wrote:
>On 01.07.20 13:54, Wei Yang wrote:
>> On Wed, Jul 01, 2020 at 10:29:08AM +0200, David Hildenbrand wrote:
>>> On 01.07.20 04:11, Wei Yang wrote:
>>>> On Tue, Jun 30, 2020 at 02:44:00PM +0200, David Hildenbrand wrote:
>>>>> On 30.06.20 05:18, Wei Yang wrote:
>>>>>> When walking page tables, we define several helpers to get the address of
>>>>>> the next boundary. But we don't have one for pte level.
>>>>>>
>>>>>> Let's define it and consolidate the code in several places.
>>>>>>
>>>>>> Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
>>>>>> ---
>>>>>>  arch/x86/mm/init_64.c   | 6 ++----
>>>>>>  include/linux/pgtable.h | 7 +++++++
>>>>>>  mm/kasan/init.c         | 4 +---
>>>>>>  3 files changed, 10 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>>>>>> index dbae185511cd..f902fbd17f27 100644
>>>>>> --- a/arch/x86/mm/init_64.c
>>>>>> +++ b/arch/x86/mm/init_64.c
>>>>>> @@ -973,9 +973,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
>>>>>>  
>>>>>>  	pte = pte_start + pte_index(addr);
>>>>>>  	for (; addr < end; addr = next, pte++) {
>>>>>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>>>> -		if (next > end)
>>>>>> -			next = end;
>>>>>> +		next = pte_addr_end(addr, end);
>>>>>>  
>>>>>>  		if (!pte_present(*pte))
>>>>>>  			continue;
>>>>>> @@ -1558,7 +1556,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
>>>>>>  		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
>>>>>>  
>>>>>>  		if (!boot_cpu_has(X86_FEATURE_PSE)) {
>>>>>> -			next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>>>> +			next = pte_addr_end(addr, end);
>>>>>>  			pmd = pmd_offset(pud, addr);
>>>>>>  			if (pmd_none(*pmd))
>>>>>>  				continue;
>>>>>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>>>>>> index 32b6c52d41b9..0de09c6c89d2 100644
>>>>>> --- a/include/linux/pgtable.h
>>>>>> +++ b/include/linux/pgtable.h
>>>>>> @@ -706,6 +706,13 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
>>>>>>  })
>>>>>>  #endif
>>>>>>  
>>>>>> +#ifndef pte_addr_end
>>>>>> +#define pte_addr_end(addr, end)						\
>>>>>> +({	unsigned long __boundary = ((addr) + PAGE_SIZE) & PAGE_MASK;	\
>>>>>> +	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
>>>>>> +})
>>>>>> +#endif
>>>>>> +
>>>>>>  /*
>>>>>>   * When walking page tables, we usually want to skip any p?d_none entries;
>>>>>>   * and any p?d_bad entries - reporting the error before resetting to none.
>>>>>> diff --git a/mm/kasan/init.c b/mm/kasan/init.c
>>>>>> index fe6be0be1f76..89f748601f74 100644
>>>>>> --- a/mm/kasan/init.c
>>>>>> +++ b/mm/kasan/init.c
>>>>>> @@ -349,9 +349,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
>>>>>>  	unsigned long next;
>>>>>>  
>>>>>>  	for (; addr < end; addr = next, pte++) {
>>>>>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>>>> -		if (next > end)
>>>>>> -			next = end;
>>>>>> +		next = pte_addr_end(addr, end);
>>>>>>  
>>>>>>  		if (!pte_present(*pte))
>>>>>>  			continue;
>>>>>>
>>>>>
>>>>> I'm not really a friend of this I have to say. We're simply iterating
>>>>> over single pages, not much magic ....
>>>>
>>>> Hmm... yes, we are iterating on Page boundary, while we many have the case
>>>> when addr or end is not PAGE_ALIGN.
>>>
>>> I really do wonder if not having page aligned addresses actually happens
>>> in real life. Page tables operate on page granularity, and
>>> adding/removing unaligned parts feels wrong ... and that's also why I
>>> dislike such a helper.
>>>
>>> 1. kasan_add_zero_shadow()/kasan_remove_zero_shadow(). If I understand
>>> the logic (WARN_ON()) correctly, we bail out in case we would ever end
>>> up in such a scenario, where we would want to add/remove things not
>>> aligned to PAGE_SIZE.
>>>
>>> 2. remove_pagetable()...->remove_pte_table()
>>>
>>> vmemmap_free() should never try to de-populate sub-pages. Even with
>>> sub-section hot-add/remove (2MB / 512 pages), with valid struct page
>>> sizes (56, 64, 72, 80), we always end up with full pages.
>>>
>>> kernel_physical_mapping_remove() is only called via
>>> arch_remove_memory(). That will never remove unaligned parts.
>>>
>> 
>> I don't have a very clear mind now, while when you look into
>> remove_pte_table(), it has two cases based on alignment of addr and next.
>> 
>> If we always remove a page, the second case won't happen?
>
>So, the code talks about that the second case can only happen for
>vmemmap, never for direct mappings.
>
>I don't see a way how this could ever happen with current page sizes,
>even with sub-section hotadd (2MB). Maybe that is a legacy leftover or
>was never relevant? Or I am missing something important, where we could
>have sub-4k-page vmemmap data.
>

I did the calculation for the sub-section struct page size: it is page size
(4K) aligned. This means you are right that we won't depopulate a sub-page.
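
For reference, my rough back-of-the-envelope check (assuming 512 pages per
2MB sub-section and the struct page sizes you listed):

	512 * 56 = 28672 =  7 * 4096
	512 * 64 = 32768 =  8 * 4096
	512 * 72 = 36864 =  9 * 4096
	512 * 80 = 40960 = 10 * 4096

so the vmemmap of a sub-section always covers whole 4K pages.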

And yes, I am not sure all those variants would fit this case, so I would like
to leave it as it is for now. What is your opinion?

>-- 
>Thanks,
>
>David / dhildenb
David Hildenbrand July 3, 2020, 7:23 a.m. UTC | #7
On 03.07.20 03:34, Wei Yang wrote:
> On Thu, Jul 02, 2020 at 06:28:19PM +0200, David Hildenbrand wrote:
>> On 01.07.20 13:54, Wei Yang wrote:
>>> On Wed, Jul 01, 2020 at 10:29:08AM +0200, David Hildenbrand wrote:
>>>> On 01.07.20 04:11, Wei Yang wrote:
>>>>> On Tue, Jun 30, 2020 at 02:44:00PM +0200, David Hildenbrand wrote:
>>>>>> On 30.06.20 05:18, Wei Yang wrote:
>>>>>>> When walking page tables, we define several helpers to get the address of
>>>>>>> the next boundary. But we don't have one for pte level.
>>>>>>>
>>>>>>> Let's define it and consolidate the code in several places.
>>>>>>>
>>>>>>> Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
>>>>>>> ---
>>>>>>>  arch/x86/mm/init_64.c   | 6 ++----
>>>>>>>  include/linux/pgtable.h | 7 +++++++
>>>>>>>  mm/kasan/init.c         | 4 +---
>>>>>>>  3 files changed, 10 insertions(+), 7 deletions(-)
>>>>>>>
>>>>>>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>>>>>>> index dbae185511cd..f902fbd17f27 100644
>>>>>>> --- a/arch/x86/mm/init_64.c
>>>>>>> +++ b/arch/x86/mm/init_64.c
>>>>>>> @@ -973,9 +973,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
>>>>>>>  
>>>>>>>  	pte = pte_start + pte_index(addr);
>>>>>>>  	for (; addr < end; addr = next, pte++) {
>>>>>>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>>>>> -		if (next > end)
>>>>>>> -			next = end;
>>>>>>> +		next = pte_addr_end(addr, end);
>>>>>>>  
>>>>>>>  		if (!pte_present(*pte))
>>>>>>>  			continue;
>>>>>>> @@ -1558,7 +1556,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
>>>>>>>  		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
>>>>>>>  
>>>>>>>  		if (!boot_cpu_has(X86_FEATURE_PSE)) {
>>>>>>> -			next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>>>>> +			next = pte_addr_end(addr, end);
>>>>>>>  			pmd = pmd_offset(pud, addr);
>>>>>>>  			if (pmd_none(*pmd))
>>>>>>>  				continue;
>>>>>>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>>>>>>> index 32b6c52d41b9..0de09c6c89d2 100644
>>>>>>> --- a/include/linux/pgtable.h
>>>>>>> +++ b/include/linux/pgtable.h
>>>>>>> @@ -706,6 +706,13 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
>>>>>>>  })
>>>>>>>  #endif
>>>>>>>  
>>>>>>> +#ifndef pte_addr_end
>>>>>>> +#define pte_addr_end(addr, end)						\
>>>>>>> +({	unsigned long __boundary = ((addr) + PAGE_SIZE) & PAGE_MASK;	\
>>>>>>> +	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
>>>>>>> +})
>>>>>>> +#endif
>>>>>>> +
>>>>>>>  /*
>>>>>>>   * When walking page tables, we usually want to skip any p?d_none entries;
>>>>>>>   * and any p?d_bad entries - reporting the error before resetting to none.
>>>>>>> diff --git a/mm/kasan/init.c b/mm/kasan/init.c
>>>>>>> index fe6be0be1f76..89f748601f74 100644
>>>>>>> --- a/mm/kasan/init.c
>>>>>>> +++ b/mm/kasan/init.c
>>>>>>> @@ -349,9 +349,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
>>>>>>>  	unsigned long next;
>>>>>>>  
>>>>>>>  	for (; addr < end; addr = next, pte++) {
>>>>>>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>>>>> -		if (next > end)
>>>>>>> -			next = end;
>>>>>>> +		next = pte_addr_end(addr, end);
>>>>>>>  
>>>>>>>  		if (!pte_present(*pte))
>>>>>>>  			continue;
>>>>>>>
>>>>>>
>>>>>> I'm not really a friend of this I have to say. We're simply iterating
>>>>>> over single pages, not much magic ....
>>>>>
>>>>> Hmm... yes, we are iterating on Page boundary, while we many have the case
>>>>> when addr or end is not PAGE_ALIGN.
>>>>
>>>> I really do wonder if not having page aligned addresses actually happens
>>>> in real life. Page tables operate on page granularity, and
>>>> adding/removing unaligned parts feels wrong ... and that's also why I
>>>> dislike such a helper.
>>>>
>>>> 1. kasan_add_zero_shadow()/kasan_remove_zero_shadow(). If I understand
>>>> the logic (WARN_ON()) correctly, we bail out in case we would ever end
>>>> up in such a scenario, where we would want to add/remove things not
>>>> aligned to PAGE_SIZE.
>>>>
>>>> 2. remove_pagetable()...->remove_pte_table()
>>>>
>>>> vmemmap_free() should never try to de-populate sub-pages. Even with
>>>> sub-section hot-add/remove (2MB / 512 pages), with valid struct page
>>>> sizes (56, 64, 72, 80), we always end up with full pages.
>>>>
>>>> kernel_physical_mapping_remove() is only called via
>>>> arch_remove_memory(). That will never remove unaligned parts.
>>>>
>>>
>>> I don't have a very clear mind now, while when you look into
>>> remove_pte_table(), it has two cases based on alignment of addr and next.
>>>
>>> If we always remove a page, the second case won't happen?
>>
>> So, the code talks about that the second case can only happen for
>> vmemmap, never for direct mappings.
>>
>> I don't see a way how this could ever happen with current page sizes,
>> even with sub-section hotadd (2MB). Maybe that is a legacy leftover or
>> was never relevant? Or I am missing something important, where we could
>> have sub-4k-page vmemmap data.
>>
> 
> I took a calculation on the sub-section page struct size, it is page size (4K)
> aligned. This means you are right, which we won't depopulate a sub-page.
> 
> And yes, I am not sure all those variants would fit this case. So I would like
> to leave as it now. How about your opinion?

I'd say we clean this up and protect it with WARN_ON_ONCE(). Then it
won't take another round of investigation to find out that handling
sub-pages is irrelevant.
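
Something like this, maybe (only a rough sketch of the idea, not a tested
patch):

	/* we only expect to remove full, page-aligned ranges here */
	if (WARN_ON_ONCE(!IS_ALIGNED(addr, PAGE_SIZE) ||
			 !IS_ALIGNED(end, PAGE_SIZE)))
		return;

	for (pte = pte_start + pte_index(addr); addr < end;
	     addr += PAGE_SIZE, pte++) {
		if (!pte_present(*pte))
			continue;
		/* always a full page, so the sub-page handling can go away */
		...
	}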

If you don't want to tackle this, I can have a look. Just let me know.
Wei Yang July 3, 2020, 8:33 a.m. UTC | #8
On Fri, Jul 03, 2020 at 09:23:30AM +0200, David Hildenbrand wrote:
>On 03.07.20 03:34, Wei Yang wrote:
>> On Thu, Jul 02, 2020 at 06:28:19PM +0200, David Hildenbrand wrote:
>>> On 01.07.20 13:54, Wei Yang wrote:
>>>> On Wed, Jul 01, 2020 at 10:29:08AM +0200, David Hildenbrand wrote:
>>>>> On 01.07.20 04:11, Wei Yang wrote:
>>>>>> On Tue, Jun 30, 2020 at 02:44:00PM +0200, David Hildenbrand wrote:
>>>>>>> On 30.06.20 05:18, Wei Yang wrote:
>>>>>>>> When walking page tables, we define several helpers to get the address of
>>>>>>>> the next boundary. But we don't have one for pte level.
>>>>>>>>
>>>>>>>> Let's define it and consolidate the code in several places.
>>>>>>>>
>>>>>>>> Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
>>>>>>>> ---
>>>>>>>>  arch/x86/mm/init_64.c   | 6 ++----
>>>>>>>>  include/linux/pgtable.h | 7 +++++++
>>>>>>>>  mm/kasan/init.c         | 4 +---
>>>>>>>>  3 files changed, 10 insertions(+), 7 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>>>>>>>> index dbae185511cd..f902fbd17f27 100644
>>>>>>>> --- a/arch/x86/mm/init_64.c
>>>>>>>> +++ b/arch/x86/mm/init_64.c
>>>>>>>> @@ -973,9 +973,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
>>>>>>>>  
>>>>>>>>  	pte = pte_start + pte_index(addr);
>>>>>>>>  	for (; addr < end; addr = next, pte++) {
>>>>>>>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>>>>>> -		if (next > end)
>>>>>>>> -			next = end;
>>>>>>>> +		next = pte_addr_end(addr, end);
>>>>>>>>  
>>>>>>>>  		if (!pte_present(*pte))
>>>>>>>>  			continue;
>>>>>>>> @@ -1558,7 +1556,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
>>>>>>>>  		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
>>>>>>>>  
>>>>>>>>  		if (!boot_cpu_has(X86_FEATURE_PSE)) {
>>>>>>>> -			next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>>>>>> +			next = pte_addr_end(addr, end);
>>>>>>>>  			pmd = pmd_offset(pud, addr);
>>>>>>>>  			if (pmd_none(*pmd))
>>>>>>>>  				continue;
>>>>>>>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>>>>>>>> index 32b6c52d41b9..0de09c6c89d2 100644
>>>>>>>> --- a/include/linux/pgtable.h
>>>>>>>> +++ b/include/linux/pgtable.h
>>>>>>>> @@ -706,6 +706,13 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
>>>>>>>>  })
>>>>>>>>  #endif
>>>>>>>>  
>>>>>>>> +#ifndef pte_addr_end
>>>>>>>> +#define pte_addr_end(addr, end)						\
>>>>>>>> +({	unsigned long __boundary = ((addr) + PAGE_SIZE) & PAGE_MASK;	\
>>>>>>>> +	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
>>>>>>>> +})
>>>>>>>> +#endif
>>>>>>>> +
>>>>>>>>  /*
>>>>>>>>   * When walking page tables, we usually want to skip any p?d_none entries;
>>>>>>>>   * and any p?d_bad entries - reporting the error before resetting to none.
>>>>>>>> diff --git a/mm/kasan/init.c b/mm/kasan/init.c
>>>>>>>> index fe6be0be1f76..89f748601f74 100644
>>>>>>>> --- a/mm/kasan/init.c
>>>>>>>> +++ b/mm/kasan/init.c
>>>>>>>> @@ -349,9 +349,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
>>>>>>>>  	unsigned long next;
>>>>>>>>  
>>>>>>>>  	for (; addr < end; addr = next, pte++) {
>>>>>>>> -		next = (addr + PAGE_SIZE) & PAGE_MASK;
>>>>>>>> -		if (next > end)
>>>>>>>> -			next = end;
>>>>>>>> +		next = pte_addr_end(addr, end);
>>>>>>>>  
>>>>>>>>  		if (!pte_present(*pte))
>>>>>>>>  			continue;
>>>>>>>>
>>>>>>>
>>>>>>> I'm not really a friend of this I have to say. We're simply iterating
>>>>>>> over single pages, not much magic ....
>>>>>>
>>>>>> Hmm... yes, we are iterating on Page boundary, while we many have the case
>>>>>> when addr or end is not PAGE_ALIGN.
>>>>>
>>>>> I really do wonder if not having page aligned addresses actually happens
>>>>> in real life. Page tables operate on page granularity, and
>>>>> adding/removing unaligned parts feels wrong ... and that's also why I
>>>>> dislike such a helper.
>>>>>
>>>>> 1. kasan_add_zero_shadow()/kasan_remove_zero_shadow(). If I understand
>>>>> the logic (WARN_ON()) correctly, we bail out in case we would ever end
>>>>> up in such a scenario, where we would want to add/remove things not
>>>>> aligned to PAGE_SIZE.
>>>>>
>>>>> 2. remove_pagetable()...->remove_pte_table()
>>>>>
>>>>> vmemmap_free() should never try to de-populate sub-pages. Even with
>>>>> sub-section hot-add/remove (2MB / 512 pages), with valid struct page
>>>>> sizes (56, 64, 72, 80), we always end up with full pages.
>>>>>
>>>>> kernel_physical_mapping_remove() is only called via
>>>>> arch_remove_memory(). That will never remove unaligned parts.
>>>>>
>>>>
>>>> I don't have a very clear mind now, while when you look into
>>>> remove_pte_table(), it has two cases based on alignment of addr and next.
>>>>
>>>> If we always remove a page, the second case won't happen?
>>>
>>> So, the code talks about that the second case can only happen for
>>> vmemmap, never for direct mappings.
>>>
>>> I don't see a way how this could ever happen with current page sizes,
>>> even with sub-section hotadd (2MB). Maybe that is a legacy leftover or
>>> was never relevant? Or I am missing something important, where we could
>>> have sub-4k-page vmemmap data.
>>>
>> 
>> I took a calculation on the sub-section page struct size, it is page size (4K)
>> aligned. This means you are right, which we won't depopulate a sub-page.
>> 
>> And yes, I am not sure all those variants would fit this case. So I would like
>> to leave as it now. How about your opinion?
>
>I'd say we clean this up and protect it by WARN_ON_ONCE(). Then, it
>won't need another round of investigation to find out that handling
>sub-pages is irrelevant.
>
>If you don't want to tackle this, I can have a look. Just let me know.
>

Actually, I don't quite see what you have in mind, so please go ahead and I
will review your change.

>-- 
>Thanks,
>
>David / dhildenb

Patch

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index dbae185511cd..f902fbd17f27 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -973,9 +973,7 @@  remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
 
 	pte = pte_start + pte_index(addr);
 	for (; addr < end; addr = next, pte++) {
-		next = (addr + PAGE_SIZE) & PAGE_MASK;
-		if (next > end)
-			next = end;
+		next = pte_addr_end(addr, end);
 
 		if (!pte_present(*pte))
 			continue;
@@ -1558,7 +1556,7 @@  void register_page_bootmem_memmap(unsigned long section_nr,
 		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
 
 		if (!boot_cpu_has(X86_FEATURE_PSE)) {
-			next = (addr + PAGE_SIZE) & PAGE_MASK;
+			next = pte_addr_end(addr, end);
 			pmd = pmd_offset(pud, addr);
 			if (pmd_none(*pmd))
 				continue;
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 32b6c52d41b9..0de09c6c89d2 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -706,6 +706,13 @@  static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 })
 #endif
 
+#ifndef pte_addr_end
+#define pte_addr_end(addr, end)						\
+({	unsigned long __boundary = ((addr) + PAGE_SIZE) & PAGE_MASK;	\
+	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
+})
+#endif
+
 /*
  * When walking page tables, we usually want to skip any p?d_none entries;
  * and any p?d_bad entries - reporting the error before resetting to none.
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index fe6be0be1f76..89f748601f74 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -349,9 +349,7 @@  static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
 	unsigned long next;
 
 	for (; addr < end; addr = next, pte++) {
-		next = (addr + PAGE_SIZE) & PAGE_MASK;
-		if (next > end)
-			next = end;
+		next = pte_addr_end(addr, end);
 
 		if (!pte_present(*pte))
 			continue;