
[v2,4/5] mm: swap: entirely map large folios found in swapcache

Message ID 20240409082631.187483-5-21cnbao@gmail.com (mailing list archive)
State New
Series large folios swap-in: handle refault cases first

Commit Message

Barry Song April 9, 2024, 8:26 a.m. UTC
From: Chuanhua Han <hanchuanhua@oppo.com>

When a large folio is found in the swapcache, the current implementation
requires calling do_swap_page() nr_pages times, resulting in nr_pages
page faults. This patch opts to map the entire large folio at once to
minimize page faults. Additionally, redundant checks and early exits
for ARM64 MTE restoring are removed.

Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
Co-developed-by: Barry Song <v-songbaohua@oppo.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 mm/memory.c | 64 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 52 insertions(+), 12 deletions(-)

Comments

Ryan Roberts April 11, 2024, 3:33 p.m. UTC | #1
On 09/04/2024 09:26, Barry Song wrote:
> From: Chuanhua Han <hanchuanhua@oppo.com>
> 
> When a large folio is found in the swapcache, the current implementation
> requires calling do_swap_page() nr_pages times, resulting in nr_pages
> page faults. This patch opts to map the entire large folio at once to
> minimize page faults. Additionally, redundant checks and early exits
> for ARM64 MTE restoring are removed.
> 
> Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
> Co-developed-by: Barry Song <v-songbaohua@oppo.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  mm/memory.c | 64 +++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 52 insertions(+), 12 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index c4a52e8d740a..9818dc1893c8 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3947,6 +3947,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	pte_t pte;
>  	vm_fault_t ret = 0;
>  	void *shadow = NULL;
> +	int nr_pages = 1;
> +	unsigned long start_address = vmf->address;
> +	pte_t *start_pte = vmf->pte;

possible bug?: there are code paths that assign to vmf->pte below in this
function, so couldn't start_pte be stale in some cases? I'd just do the
assignment (all 4 of these variables in fact) in an else clause below, after any
messing about with them is complete.

nit: rename start_pte -> start_ptep ?

> +	bool any_swap_shared = false;

Suggest you defer initialization of this to your "We hit large folios in
swapcache" block below, and init it to:

	any_swap_shared = !pte_swp_exclusive(vmf->pte);

Then the any_shared semantic in swap_pte_batch() can be the same as for
folio_pte_batch().
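
A rough sketch of that deferred init (illustrative only; since
pte_swp_exclusive() takes a pte_t value, the bit would come from vmf->orig_pte
or ptep_get(vmf->pte)):

	/* in the "We hit large folios in swapcache" block, before the batch check */
	any_swap_shared = !pte_swp_exclusive(vmf->orig_pte);
	if (swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
		goto check_pte;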

>  
>  	if (!pte_unmap_same(vmf))
>  		goto out;
> @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	 */
>  	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>  			&vmf->ptl);

bug: vmf->pte may be NULL and you are not checking it until check_pte:. But you
are using it in this block. It also seems odd to do all the work in the below
block under the PTL but before checking if the pte has changed. Suggest moving
both checks here.
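
i.e. something like this (untested sketch):

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
		goto out_nomap;

	if (folio_test_large(folio) && folio_test_swapcache(folio)) {
		/* batching logic; vmf->pte is now known to be valid and unchanged */
		...
	}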

> +
> +	/* We hit large folios in swapcache */
> +	if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {

What's the start_pte check protecting?

> +		int nr = folio_nr_pages(folio);
> +		int idx = folio_page_idx(folio, page);
> +		unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
> +		unsigned long folio_end = folio_start + nr * PAGE_SIZE;
> +		pte_t *folio_ptep;
> +		pte_t folio_pte;
> +
> +		if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
> +			goto check_pte;
> +		if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
> +			goto check_pte;
> +
> +		folio_ptep = vmf->pte - idx;
> +		folio_pte = ptep_get(folio_ptep);
> +		if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
> +		    swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
> +			goto check_pte;
> +
> +		start_address = folio_start;
> +		start_pte = folio_ptep;
> +		nr_pages = nr;
> +		entry = folio->swap;
> +		page = &folio->page;
> +	}
> +
> +check_pte:
>  	if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
>  		goto out_nomap;
>  
> @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  			 */
>  			exclusive = false;
>  		}
> +
> +		/* Reuse the whole large folio iff all entries are exclusive */
> +		if (nr_pages > 1 && any_swap_shared)
> +			exclusive = false;

If you init any_shared with the first pte as I suggested then you could just set
exclusive = !any_shared at the top of this if block without needing this
separate fixup.
>  	}
>  
>  	/*
> @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	 * We're already holding a reference on the page but haven't mapped it
>  	 * yet.
>  	 */
> -	swap_free(entry);
> +	swap_free_nr(entry, nr_pages);
>  	if (should_try_to_free_swap(folio, vma, vmf->flags))
>  		folio_free_swap(folio);
>  
> -	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> -	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
> +	folio_ref_add(folio, nr_pages - 1);
> +	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> +	add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
> +
>  	pte = mk_pte(page, vma->vm_page_prot);
>  
>  	/*
> @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	 * exclusivity.
>  	 */
>  	if (!folio_test_ksm(folio) &&
> -	    (exclusive || folio_ref_count(folio) == 1)) {
> +	    (exclusive || (folio_ref_count(folio) == nr_pages &&
> +			   folio_nr_pages(folio) == nr_pages))) {
>  		if (vmf->flags & FAULT_FLAG_WRITE) {
>  			pte = maybe_mkwrite(pte_mkdirty(pte), vma);
>  			vmf->flags &= ~FAULT_FLAG_WRITE;
>  		}
>  		rmap_flags |= RMAP_EXCLUSIVE;
>  	}
> -	flush_icache_page(vma, page);
> +	flush_icache_pages(vma, page, nr_pages);
>  	if (pte_swp_soft_dirty(vmf->orig_pte))
>  		pte = pte_mksoft_dirty(pte);
>  	if (pte_swp_uffd_wp(vmf->orig_pte))
>  		pte = pte_mkuffd_wp(pte);

I'm not sure about all this... you are smearing these SW bits from the faulting
PTE across all the ptes you are mapping. Although I guess actually that's ok
because swap_pte_batch() only returns a batch with all these bits the same?

> -	vmf->orig_pte = pte;

Instead of doing a readback below, perhaps:

	vmf->orig_pte = pte_advance_pfn(pte, nr_pages);

>  
>  	/* ksm created a completely new copy */
>  	if (unlikely(folio != swapcache && swapcache)) {
> -		folio_add_new_anon_rmap(folio, vma, vmf->address);
> +		folio_add_new_anon_rmap(folio, vma, start_address);
>  		folio_add_lru_vma(folio, vma);
>  	} else {
> -		folio_add_anon_rmap_pte(folio, page, vma, vmf->address,
> -					rmap_flags);
> +		folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address,
> +					 rmap_flags);
>  	}
>  
>  	VM_BUG_ON(!folio_test_anon(folio) ||
>  			(pte_write(pte) && !PageAnonExclusive(page)));
> -	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
> -	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
> +	set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages);
> +	vmf->orig_pte = ptep_get(vmf->pte);
> +	arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);
>  
>  	folio_unlock(folio);
>  	if (folio != swapcache && swapcache) {
> @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	}
>  
>  	/* No need to invalidate - it was non-present before */
> -	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
> +	update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages);
>  unlock:
>  	if (vmf->pte)
>  		pte_unmap_unlock(vmf->pte, vmf->ptl);
Barry Song April 11, 2024, 11:30 p.m. UTC | #2
On Fri, Apr 12, 2024 at 3:33 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 09/04/2024 09:26, Barry Song wrote:
> > From: Chuanhua Han <hanchuanhua@oppo.com>
> >
> > When a large folio is found in the swapcache, the current implementation
> > requires calling do_swap_page() nr_pages times, resulting in nr_pages
> > page faults. This patch opts to map the entire large folio at once to
> > minimize page faults. Additionally, redundant checks and early exits
> > for ARM64 MTE restoring are removed.
> >
> > Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
> > Co-developed-by: Barry Song <v-songbaohua@oppo.com>
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > ---
> >  mm/memory.c | 64 +++++++++++++++++++++++++++++++++++++++++++----------
> >  1 file changed, 52 insertions(+), 12 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index c4a52e8d740a..9818dc1893c8 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -3947,6 +3947,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >       pte_t pte;
> >       vm_fault_t ret = 0;
> >       void *shadow = NULL;
> > +     int nr_pages = 1;
> > +     unsigned long start_address = vmf->address;
> > +     pte_t *start_pte = vmf->pte;
>
> possible bug?: there are code paths that assign to vmf->pte below in this
> function, so couldn't start_pte be stale in some cases? I'd just do the
> assignment (all 4 of these variables in fact) in an else clause below, after any
> messing about with them is complete.
>
> nit: rename start_pte -> start_ptep ?

Agreed.

>
> > +     bool any_swap_shared = false;
>
> Suggest you defer initialization of this to your "We hit large folios in
> swapcache" block below, and init it to:
>
>         any_swap_shared = !pte_swp_exclusive(vmf->pte);
>
> Then the any_shared semantic in swap_pte_batch() can be the same as for
> folio_pte_batch().
>
> >
> >       if (!pte_unmap_same(vmf))
> >               goto out;
> > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >        */
> >       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
> >                       &vmf->ptl);
>
> bug: vmf->pte may be NULL and you are not checking it until check_pte:. But you
> are using it in this block. It also seems odd to do all the work in the below
> block under the PTL but before checking if the pte has changed. Suggest moving
> both checks here.

agreed.

>
> > +
> > +     /* We hit large folios in swapcache */
> > +     if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {
>
> What's the start_pte check protecting?

This is exactly protecting against the case vmf->pte == NULL, but start_pte was
incorrectly assigned at the beginning of the function. The intention of the code
was actually to do start_pte = vmf->pte after "vmf->pte = pte_offset_map_lock".

>
> > +             int nr = folio_nr_pages(folio);
> > +             int idx = folio_page_idx(folio, page);
> > +             unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
> > +             unsigned long folio_end = folio_start + nr * PAGE_SIZE;
> > +             pte_t *folio_ptep;
> > +             pte_t folio_pte;
> > +
> > +             if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
> > +                     goto check_pte;
> > +             if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
> > +                     goto check_pte;
> > +
> > +             folio_ptep = vmf->pte - idx;
> > +             folio_pte = ptep_get(folio_ptep);
> > +             if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
> > +                 swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
> > +                     goto check_pte;
> > +
> > +             start_address = folio_start;
> > +             start_pte = folio_ptep;
> > +             nr_pages = nr;
> > +             entry = folio->swap;
> > +             page = &folio->page;
> > +     }
> > +
> > +check_pte:
> >       if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> >               goto out_nomap;
> >
> > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >                        */
> >                       exclusive = false;
> >               }
> > +
> > +             /* Reuse the whole large folio iff all entries are exclusive */
> > +             if (nr_pages > 1 && any_swap_shared)
> > +                     exclusive = false;
>
> If you init any_shared with the first pte as I suggested then you could just set
> exclusive = !any_shared at the top of this if block without needing this
> separate fixup.

Since your swap_pte_batch() function checks that all PTEs have the same
exclusive bits, I'll be removing any_shared first in version 3 per David's
suggestions. We could potentially develop "any_shared" as an incremental
patchset later on.

> >       }
> >
> >       /*
> > @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >        * We're already holding a reference on the page but haven't mapped it
> >        * yet.
> >        */
> > -     swap_free(entry);
> > +     swap_free_nr(entry, nr_pages);
> >       if (should_try_to_free_swap(folio, vma, vmf->flags))
> >               folio_free_swap(folio);
> >
> > -     inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> > -     dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
> > +     folio_ref_add(folio, nr_pages - 1);
> > +     add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> > +     add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
> > +
> >       pte = mk_pte(page, vma->vm_page_prot);
> >
> >       /*
> > @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >        * exclusivity.
> >        */
> >       if (!folio_test_ksm(folio) &&
> > -         (exclusive || folio_ref_count(folio) == 1)) {
> > +         (exclusive || (folio_ref_count(folio) == nr_pages &&
> > +                        folio_nr_pages(folio) == nr_pages))) {
> >               if (vmf->flags & FAULT_FLAG_WRITE) {
> >                       pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> >                       vmf->flags &= ~FAULT_FLAG_WRITE;
> >               }
> >               rmap_flags |= RMAP_EXCLUSIVE;
> >       }
> > -     flush_icache_page(vma, page);
> > +     flush_icache_pages(vma, page, nr_pages);
> >       if (pte_swp_soft_dirty(vmf->orig_pte))
> >               pte = pte_mksoft_dirty(pte);
> >       if (pte_swp_uffd_wp(vmf->orig_pte))
> >               pte = pte_mkuffd_wp(pte);
>
> I'm not sure about all this... you are smearing these SW bits from the faulting
> PTE across all the ptes you are mapping. Although I guess actually that's ok
> because swap_pte_batch() only returns a batch with all these bits the same?

Initially, I didn't recognize the issue at all because the tested architecture,
arm64, didn't include these bits. However, after reviewing your latest swpout
series, which verifies that the soft_dirty and uffd_wp bits are consistent
across the batch, I now feel it is safe even for platforms with these bits.

>
> > -     vmf->orig_pte = pte;
>
> Instead of doing a readback below, perhaps:
>
>         vmf->orig_pte = pte_advance_pfn(pte, nr_pages);

Nice !

>
> >
> >       /* ksm created a completely new copy */
> >       if (unlikely(folio != swapcache && swapcache)) {
> > -             folio_add_new_anon_rmap(folio, vma, vmf->address);
> > +             folio_add_new_anon_rmap(folio, vma, start_address);
> >               folio_add_lru_vma(folio, vma);
> >       } else {
> > -             folio_add_anon_rmap_pte(folio, page, vma, vmf->address,
> > -                                     rmap_flags);
> > +             folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address,
> > +                                      rmap_flags);
> >       }
> >
> >       VM_BUG_ON(!folio_test_anon(folio) ||
> >                       (pte_write(pte) && !PageAnonExclusive(page)));
> > -     set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
> > -     arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
> > +     set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages);
> > +     vmf->orig_pte = ptep_get(vmf->pte);
> > +     arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);
> >
> >       folio_unlock(folio);
> >       if (folio != swapcache && swapcache) {
> > @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >       }
> >
> >       /* No need to invalidate - it was non-present before */
> > -     update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
> > +     update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages);
> >  unlock:
> >       if (vmf->pte)
> >               pte_unmap_unlock(vmf->pte, vmf->ptl);
>

Thanks
Barry
Ryan Roberts April 12, 2024, 11:31 a.m. UTC | #3
On 12/04/2024 00:30, Barry Song wrote:
> On Fri, Apr 12, 2024 at 3:33 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 09/04/2024 09:26, Barry Song wrote:
>>> From: Chuanhua Han <hanchuanhua@oppo.com>
>>>
>>> When a large folio is found in the swapcache, the current implementation
>>> requires calling do_swap_page() nr_pages times, resulting in nr_pages
>>> page faults. This patch opts to map the entire large folio at once to
>>> minimize page faults. Additionally, redundant checks and early exits
>>> for ARM64 MTE restoring are removed.
>>>
>>> Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
>>> Co-developed-by: Barry Song <v-songbaohua@oppo.com>
>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>> ---
>>>  mm/memory.c | 64 +++++++++++++++++++++++++++++++++++++++++++----------
>>>  1 file changed, 52 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index c4a52e8d740a..9818dc1893c8 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -3947,6 +3947,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>       pte_t pte;
>>>       vm_fault_t ret = 0;
>>>       void *shadow = NULL;
>>> +     int nr_pages = 1;
>>> +     unsigned long start_address = vmf->address;
>>> +     pte_t *start_pte = vmf->pte;
>>
>> possible bug?: there are code paths that assign to vmf->pte below in this
>> function, so couldn't start_pte be stale in some cases? I'd just do the
>> assignment (all 4 of these variables in fact) in an else clause below, after any
>> messing about with them is complete.
>>
>> nit: rename start_pte -> start_ptep ?
> 
> Agreed.
> 
>>
>>> +     bool any_swap_shared = false;
>>
>> Suggest you defer initialization of this to your "We hit large folios in
>> swapcache" block below, and init it to:
>>
>>         any_swap_shared = !pte_swp_exclusive(vmf->pte);
>>
>> Then the any_shared semantic in swap_pte_batch() can be the same as for
>> folio_pte_batch().
>>
>>>
>>>       if (!pte_unmap_same(vmf))
>>>               goto out;
>>> @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>        */
>>>       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>>>                       &vmf->ptl);
>>
>> bug: vmf->pte may be NULL and you are not checking it until check_pte:. But you
>> are using it in this block. It also seems odd to do all the work in the below
>> block under the PTL but before checking if the pte has changed. Suggest moving
>> both checks here.
> 
> agreed.
> 
>>
>>> +
>>> +     /* We hit large folios in swapcache */
>>> +     if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {
>>
>> What's the start_pte check protecting?
> 
> This is exactly protecting the case vmf->pte==NULL but for some reason it was
> assigned in the beginning of the function incorrectly. The intention of the code
> was actually doing start_pte = vmf->pte after "vmf->pte = pte_offset_map_lock".
> 
>>
>>> +             int nr = folio_nr_pages(folio);
>>> +             int idx = folio_page_idx(folio, page);
>>> +             unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
>>> +             unsigned long folio_end = folio_start + nr * PAGE_SIZE;
>>> +             pte_t *folio_ptep;
>>> +             pte_t folio_pte;
>>> +
>>> +             if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
>>> +                     goto check_pte;
>>> +             if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
>>> +                     goto check_pte;
>>> +
>>> +             folio_ptep = vmf->pte - idx;
>>> +             folio_pte = ptep_get(folio_ptep);
>>> +             if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
>>> +                 swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
>>> +                     goto check_pte;
>>> +
>>> +             start_address = folio_start;
>>> +             start_pte = folio_ptep;
>>> +             nr_pages = nr;
>>> +             entry = folio->swap;
>>> +             page = &folio->page;
>>> +     }
>>> +
>>> +check_pte:
>>>       if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
>>>               goto out_nomap;
>>>
>>> @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>                        */
>>>                       exclusive = false;
>>>               }
>>> +
>>> +             /* Reuse the whole large folio iff all entries are exclusive */
>>> +             if (nr_pages > 1 && any_swap_shared)
>>> +                     exclusive = false;
>>
>> If you init any_shared with the first pte as I suggested then you could just set
>> exclusive = !any_shared at the top of this if block without needing this
>> separate fixup.
> 
> Since your swap_pte_batch() function checks that all PTEs have the same
> exclusive bits, I'll be removing any_shared first in version 3 per David's
> suggestions. We could potentially develop "any_shared" as an incremental
> patchset later on .

Ahh yes, good point. I'll admit that your conversation about this went over my
head at the time since I hadn't yet looked at this.

> 
>>>       }
>>>
>>>       /*
>>> @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>        * We're already holding a reference on the page but haven't mapped it
>>>        * yet.
>>>        */
>>> -     swap_free(entry);
>>> +     swap_free_nr(entry, nr_pages);
>>>       if (should_try_to_free_swap(folio, vma, vmf->flags))
>>>               folio_free_swap(folio);
>>>
>>> -     inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>>> -     dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
>>> +     folio_ref_add(folio, nr_pages - 1);
>>> +     add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
>>> +     add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
>>> +
>>>       pte = mk_pte(page, vma->vm_page_prot);
>>>
>>>       /*
>>> @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>        * exclusivity.
>>>        */
>>>       if (!folio_test_ksm(folio) &&
>>> -         (exclusive || folio_ref_count(folio) == 1)) {
>>> +         (exclusive || (folio_ref_count(folio) == nr_pages &&
>>> +                        folio_nr_pages(folio) == nr_pages))) {
>>>               if (vmf->flags & FAULT_FLAG_WRITE) {
>>>                       pte = maybe_mkwrite(pte_mkdirty(pte), vma);
>>>                       vmf->flags &= ~FAULT_FLAG_WRITE;
>>>               }
>>>               rmap_flags |= RMAP_EXCLUSIVE;
>>>       }
>>> -     flush_icache_page(vma, page);
>>> +     flush_icache_pages(vma, page, nr_pages);
>>>       if (pte_swp_soft_dirty(vmf->orig_pte))
>>>               pte = pte_mksoft_dirty(pte);
>>>       if (pte_swp_uffd_wp(vmf->orig_pte))
>>>               pte = pte_mkuffd_wp(pte);
>>
>> I'm not sure about all this... you are smearing these SW bits from the faulting
>> PTE across all the ptes you are mapping. Although I guess actually that's ok
>> because swap_pte_batch() only returns a batch with all these bits the same?
> 
> Initially, I didn't recognize the issue at all because the tested
> architecture arm64
> didn't include these bits. However, after reviewing your latest swpout series,
> which verifies the consistent bits for soft_dirty and uffd_wp, I now
> feel  its safety
> even for platforms with these bits.

Yep, agreed.

> 
>>
>>> -     vmf->orig_pte = pte;
>>
>> Instead of doing a readback below, perhaps:
>>
>>         vmf->orig_pte = pte_advance_pfn(pte, nr_pages);
> 
> Nice !
> 
>>
>>>
>>>       /* ksm created a completely new copy */
>>>       if (unlikely(folio != swapcache && swapcache)) {
>>> -             folio_add_new_anon_rmap(folio, vma, vmf->address);
>>> +             folio_add_new_anon_rmap(folio, vma, start_address);
>>>               folio_add_lru_vma(folio, vma);
>>>       } else {
>>> -             folio_add_anon_rmap_pte(folio, page, vma, vmf->address,
>>> -                                     rmap_flags);
>>> +             folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address,
>>> +                                      rmap_flags);
>>>       }
>>>
>>>       VM_BUG_ON(!folio_test_anon(folio) ||
>>>                       (pte_write(pte) && !PageAnonExclusive(page)));
>>> -     set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
>>> -     arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
>>> +     set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages);
>>> +     vmf->orig_pte = ptep_get(vmf->pte);
>>> +     arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);
>>>
>>>       folio_unlock(folio);
>>>       if (folio != swapcache && swapcache) {
>>> @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>       }
>>>
>>>       /* No need to invalidate - it was non-present before */
>>> -     update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
>>> +     update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages);
>>>  unlock:
>>>       if (vmf->pte)
>>>               pte_unmap_unlock(vmf->pte, vmf->ptl);
>>
> 
> Thanks
> Barry
Huang, Ying April 15, 2024, 8:37 a.m. UTC | #4
Barry Song <21cnbao@gmail.com> writes:

> From: Chuanhua Han <hanchuanhua@oppo.com>
>
> When a large folio is found in the swapcache, the current implementation
> requires calling do_swap_page() nr_pages times, resulting in nr_pages
> page faults. This patch opts to map the entire large folio at once to
> minimize page faults. Additionally, redundant checks and early exits
> for ARM64 MTE restoring are removed.
>
> Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
> Co-developed-by: Barry Song <v-songbaohua@oppo.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  mm/memory.c | 64 +++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 52 insertions(+), 12 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index c4a52e8d740a..9818dc1893c8 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3947,6 +3947,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	pte_t pte;
>  	vm_fault_t ret = 0;
>  	void *shadow = NULL;
> +	int nr_pages = 1;
> +	unsigned long start_address = vmf->address;
> +	pte_t *start_pte = vmf->pte;

IMHO, it's better to rename the above 2 local variables to "address" and
"ptep".  Just my personal opinion.  Feel free to ignore the comments.

> +	bool any_swap_shared = false;
>  
>  	if (!pte_unmap_same(vmf))
>  		goto out;
> @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	 */
>  	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>  			&vmf->ptl);

We should move pte check here.  That is,

  	if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
  		goto out_nomap;

This will simplify the situation for large folio.

> +
> +	/* We hit large folios in swapcache */

The comment seems unnecessary because the code tells that already.

> +	if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {
> +		int nr = folio_nr_pages(folio);
> +		int idx = folio_page_idx(folio, page);
> +		unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
> +		unsigned long folio_end = folio_start + nr * PAGE_SIZE;
> +		pte_t *folio_ptep;
> +		pte_t folio_pte;
> +
> +		if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
> +			goto check_pte;
> +		if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
> +			goto check_pte;
> +
> +		folio_ptep = vmf->pte - idx;
> +		folio_pte = ptep_get(folio_ptep);

It's better to construct the pte based on the fault PTE via generalizing
pte_next_swp_offset() (maybe pte_move_swp_offset()).  Then we can find
inconsistent PTEs quicker.
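
For example, one possible shape for such a helper, generalizing the +1 step of
pte_next_swp_offset() to an arbitrary delta (sketch only; pte_move_swp_offset()
is just a suggested name, not existing code):

static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
{
	swp_entry_t entry = pte_to_swp_entry(pte);
	pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
						   swp_offset(entry) + delta));

	/* carry over the software bits stored in the swap pte */
	if (pte_swp_soft_dirty(pte))
		new = pte_swp_mksoft_dirty(new);
	if (pte_swp_exclusive(pte))
		new = pte_swp_mkexclusive(new);
	if (pte_swp_uffd_wp(pte))
		new = pte_swp_mkuffd_wp(new);

	return new;
}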

> +		if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
> +		    swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
> +			goto check_pte;
> +
> +		start_address = folio_start;
> +		start_pte = folio_ptep;
> +		nr_pages = nr;
> +		entry = folio->swap;
> +		page = &folio->page;
> +	}
> +
> +check_pte:
>  	if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
>  		goto out_nomap;
>  
> @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  			 */
>  			exclusive = false;
>  		}
> +
> +		/* Reuse the whole large folio iff all entries are exclusive */
> +		if (nr_pages > 1 && any_swap_shared)
> +			exclusive = false;
>  	}
>  
>  	/*
> @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	 * We're already holding a reference on the page but haven't mapped it
>  	 * yet.
>  	 */
> -	swap_free(entry);
> +	swap_free_nr(entry, nr_pages);
>  	if (should_try_to_free_swap(folio, vma, vmf->flags))
>  		folio_free_swap(folio);
>  
> -	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> -	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
> +	folio_ref_add(folio, nr_pages - 1);
> +	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> +	add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
> +
>  	pte = mk_pte(page, vma->vm_page_prot);
>  
>  	/*
> @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	 * exclusivity.
>  	 */
>  	if (!folio_test_ksm(folio) &&
> -	    (exclusive || folio_ref_count(folio) == 1)) {
> +	    (exclusive || (folio_ref_count(folio) == nr_pages &&
> +			   folio_nr_pages(folio) == nr_pages))) {
>  		if (vmf->flags & FAULT_FLAG_WRITE) {
>  			pte = maybe_mkwrite(pte_mkdirty(pte), vma);
>  			vmf->flags &= ~FAULT_FLAG_WRITE;
>  		}
>  		rmap_flags |= RMAP_EXCLUSIVE;
>  	}
> -	flush_icache_page(vma, page);
> +	flush_icache_pages(vma, page, nr_pages);
>  	if (pte_swp_soft_dirty(vmf->orig_pte))
>  		pte = pte_mksoft_dirty(pte);
>  	if (pte_swp_uffd_wp(vmf->orig_pte))
>  		pte = pte_mkuffd_wp(pte);
> -	vmf->orig_pte = pte;
>  
>  	/* ksm created a completely new copy */
>  	if (unlikely(folio != swapcache && swapcache)) {
> -		folio_add_new_anon_rmap(folio, vma, vmf->address);
> +		folio_add_new_anon_rmap(folio, vma, start_address);
>  		folio_add_lru_vma(folio, vma);
>  	} else {
> -		folio_add_anon_rmap_pte(folio, page, vma, vmf->address,
> -					rmap_flags);
> +		folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address,
> +					 rmap_flags);
>  	}
>  
>  	VM_BUG_ON(!folio_test_anon(folio) ||
>  			(pte_write(pte) && !PageAnonExclusive(page)));
> -	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
> -	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
> +	set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages);
> +	vmf->orig_pte = ptep_get(vmf->pte);
> +	arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);

Do we need to call arch_do_swap_page() for each subpage?  IIUC, the
corresponding arch_unmap_one() will be called for each subpage.
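
For reference, a per-subpage variant would presumably look something like the
below (illustrative only, reusing the pte_advance_pfn() idea mentioned earlier
in the thread and mirroring the (pte, pte) arguments used above; whether this
is needed at all is exactly the question):

	int i;

	for (i = 0; i < nr_pages; i++)
		arch_do_swap_page(vma->vm_mm, vma,
				  start_address + i * PAGE_SIZE,
				  pte_advance_pfn(pte, i),
				  pte_advance_pfn(pte, i));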

>  	folio_unlock(folio);
>  	if (folio != swapcache && swapcache) {
> @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	}
>  
>  	/* No need to invalidate - it was non-present before */
> -	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
> +	update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages);
>  unlock:
>  	if (vmf->pte)
>  		pte_unmap_unlock(vmf->pte, vmf->ptl);

--
Best Regards,
Huang, Ying
Barry Song April 15, 2024, 8:53 a.m. UTC | #5
On Mon, Apr 15, 2024 at 8:39 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Barry Song <21cnbao@gmail.com> writes:
>
> > From: Chuanhua Han <hanchuanhua@oppo.com>
> >
> > When a large folio is found in the swapcache, the current implementation
> > requires calling do_swap_page() nr_pages times, resulting in nr_pages
> > page faults. This patch opts to map the entire large folio at once to
> > minimize page faults. Additionally, redundant checks and early exits
> > for ARM64 MTE restoring are removed.
> >
> > Signed-off-by: Chuanhua Han <hanchuanhua@oppo.com>
> > Co-developed-by: Barry Song <v-songbaohua@oppo.com>
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > ---
> >  mm/memory.c | 64 +++++++++++++++++++++++++++++++++++++++++++----------
> >  1 file changed, 52 insertions(+), 12 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index c4a52e8d740a..9818dc1893c8 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -3947,6 +3947,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >       pte_t pte;
> >       vm_fault_t ret = 0;
> >       void *shadow = NULL;
> > +     int nr_pages = 1;
> > +     unsigned long start_address = vmf->address;
> > +     pte_t *start_pte = vmf->pte;
>
> IMHO, it's better to rename the above 2 local variables to "address" and
> "ptep".  Just my personal opinion.  Feel free to ignore the comments.

fine.

>
> > +     bool any_swap_shared = false;
> >
> >       if (!pte_unmap_same(vmf))
> >               goto out;
> > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >        */
> >       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
> >                       &vmf->ptl);
>
> We should move pte check here.  That is,
>
>         if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
>                 goto out_nomap;
>
> This will simplify the situation for large folio.

the plan is moving the whole code block

if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio))

after
        if (unlikely(!folio_test_uptodate(folio))) {
                ret = VM_FAULT_SIGBUS;
                goto out_nomap;
        }

though we can't actually hit !folio_test_uptodate(folio) when the folio comes
from the swapcache, it seems logically better for future use.
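
so the resulting order would roughly be (sketch):

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
		goto out_nomap;
	...
	if (unlikely(!folio_test_uptodate(folio))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}

	/* only then try to batch-map the large swapcache folio */
	if (folio_test_large(folio) && folio_test_swapcache(folio)) {
		...
	}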

>
> > +
> > +     /* We hit large folios in swapcache */
>
> The comments seems unnecessary because the code tells that already.
>
> > +     if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {
> > +             int nr = folio_nr_pages(folio);
> > +             int idx = folio_page_idx(folio, page);
> > +             unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
> > +             unsigned long folio_end = folio_start + nr * PAGE_SIZE;
> > +             pte_t *folio_ptep;
> > +             pte_t folio_pte;
> > +
> > +             if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
> > +                     goto check_pte;
> > +             if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
> > +                     goto check_pte;
> > +
> > +             folio_ptep = vmf->pte - idx;
> > +             folio_pte = ptep_get(folio_ptep);
>
> It's better to construct pte based on fault PTE via generalizing
> pte_next_swp_offset() (may be pte_move_swp_offset()).  Then we can find
> inconsistent PTEs quicker.

it seems your point is getting the pte of page0 by pte_next_swp_offset();
unfortunately pte_next_swp_offset() can't go back. on the other hand,
we have to check the real pte value of the 0th entry right now because
swap_pte_batch() only really reads ptes from the 1st entry. it assumes
the pte argument is the real value for the 0th pte entry.

static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
{
        pte_t expected_pte = pte_next_swp_offset(pte);
        const pte_t *end_ptep = start_ptep + max_nr;
        pte_t *ptep = start_ptep + 1;

        VM_WARN_ON(max_nr < 1);
        VM_WARN_ON(!is_swap_pte(pte));
        VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));

        while (ptep < end_ptep) {
                pte = ptep_get(ptep);

                if (!pte_same(pte, expected_pte))
                        break;

                expected_pte = pte_next_swp_offset(expected_pte);
                ptep++;
        }

        return ptep - start_ptep;
}

>
> > +             if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
> > +                 swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
> > +                     goto check_pte;
> > +
> > +             start_address = folio_start;
> > +             start_pte = folio_ptep;
> > +             nr_pages = nr;
> > +             entry = folio->swap;
> > +             page = &folio->page;
> > +     }
> > +
> > +check_pte:
> >       if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> >               goto out_nomap;
> >
> > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >                        */
> >                       exclusive = false;
> >               }
> > +
> > +             /* Reuse the whole large folio iff all entries are exclusive */
> > +             if (nr_pages > 1 && any_swap_shared)
> > +                     exclusive = false;
> >       }
> >
> >       /*
> > @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >        * We're already holding a reference on the page but haven't mapped it
> >        * yet.
> >        */
> > -     swap_free(entry);
> > +     swap_free_nr(entry, nr_pages);
> >       if (should_try_to_free_swap(folio, vma, vmf->flags))
> >               folio_free_swap(folio);
> >
> > -     inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> > -     dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
> > +     folio_ref_add(folio, nr_pages - 1);
> > +     add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> > +     add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
> > +
> >       pte = mk_pte(page, vma->vm_page_prot);
> >
> >       /*
> > @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >        * exclusivity.
> >        */
> >       if (!folio_test_ksm(folio) &&
> > -         (exclusive || folio_ref_count(folio) == 1)) {
> > +         (exclusive || (folio_ref_count(folio) == nr_pages &&
> > +                        folio_nr_pages(folio) == nr_pages))) {
> >               if (vmf->flags & FAULT_FLAG_WRITE) {
> >                       pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> >                       vmf->flags &= ~FAULT_FLAG_WRITE;
> >               }
> >               rmap_flags |= RMAP_EXCLUSIVE;
> >       }
> > -     flush_icache_page(vma, page);
> > +     flush_icache_pages(vma, page, nr_pages);
> >       if (pte_swp_soft_dirty(vmf->orig_pte))
> >               pte = pte_mksoft_dirty(pte);
> >       if (pte_swp_uffd_wp(vmf->orig_pte))
> >               pte = pte_mkuffd_wp(pte);
> > -     vmf->orig_pte = pte;
> >
> >       /* ksm created a completely new copy */
> >       if (unlikely(folio != swapcache && swapcache)) {
> > -             folio_add_new_anon_rmap(folio, vma, vmf->address);
> > +             folio_add_new_anon_rmap(folio, vma, start_address);
> >               folio_add_lru_vma(folio, vma);
> >       } else {
> > -             folio_add_anon_rmap_pte(folio, page, vma, vmf->address,
> > -                                     rmap_flags);
> > +             folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address,
> > +                                      rmap_flags);
> >       }
> >
> >       VM_BUG_ON(!folio_test_anon(folio) ||
> >                       (pte_write(pte) && !PageAnonExclusive(page)));
> > -     set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
> > -     arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
> > +     set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages);
> > +     vmf->orig_pte = ptep_get(vmf->pte);
> > +     arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);
>
> Do we need to call arch_do_swap_page() for each subpage?  IIUC, the
> corresponding arch_unmap_one() will be called for each subpage.

i actually thought about this very carefully. right now, the only one who
needs this is sparc, and it doesn't support THP_SWAPOUT at all. and
there is no proof that doing the restoration one by one won't break sparc,
so i'd like to defer this to when sparc really needs THP_SWAPOUT.
on the other hand, it seems really bad that we have both

arch_swap_restore - for this, arm64 has moved to using folio
and
arch_do_swap_page

we should somehow unify them later if sparc wants THP_SWAPOUT.

>
> >       folio_unlock(folio);
> >       if (folio != swapcache && swapcache) {
> > @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >       }
> >
> >       /* No need to invalidate - it was non-present before */
> > -     update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
> > +     update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages);
> >  unlock:
> >       if (vmf->pte)
> >               pte_unmap_unlock(vmf->pte, vmf->ptl);
>
> --
> Best Regards,
> Huang, Ying
Huang, Ying April 16, 2024, 2:25 a.m. UTC | #6
Added Khalid for arch_do_swap_page().

Barry Song <21cnbao@gmail.com> writes:

> On Mon, Apr 15, 2024 at 8:39 PM Huang, Ying <ying.huang@intel.com> wrote:
>>
>> Barry Song <21cnbao@gmail.com> writes:

[snip]

>>
>> > +     bool any_swap_shared = false;
>> >
>> >       if (!pte_unmap_same(vmf))
>> >               goto out;
>> > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> >        */
>> >       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>> >                       &vmf->ptl);
>>
>> We should move pte check here.  That is,
>>
>>         if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
>>                 goto out_nomap;
>>
>> This will simplify the situation for large folio.
>
> the plan is moving the whole code block
>
> if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio))
>
> after
>         if (unlikely(!folio_test_uptodate(folio))) {
>                 ret = VM_FAULT_SIGBUS;
>                 goto out_nomap;
>         }
>
> though we couldn't be !folio_test_uptodate(folio)) for hitting
> swapcache but it seems
> logically better for future use.

LGTM, Thanks!

>>
>> > +
>> > +     /* We hit large folios in swapcache */
>>
>> The comments seems unnecessary because the code tells that already.
>>
>> > +     if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {
>> > +             int nr = folio_nr_pages(folio);
>> > +             int idx = folio_page_idx(folio, page);
>> > +             unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
>> > +             unsigned long folio_end = folio_start + nr * PAGE_SIZE;
>> > +             pte_t *folio_ptep;
>> > +             pte_t folio_pte;
>> > +
>> > +             if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
>> > +                     goto check_pte;
>> > +             if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
>> > +                     goto check_pte;
>> > +
>> > +             folio_ptep = vmf->pte - idx;
>> > +             folio_pte = ptep_get(folio_ptep);
>>
>> It's better to construct pte based on fault PTE via generalizing
>> pte_next_swp_offset() (may be pte_move_swp_offset()).  Then we can find
>> inconsistent PTEs quicker.
>
> it seems your point is getting the pte of page0 by pte_next_swp_offset()
> unfortunately pte_next_swp_offset can't go back. on the other hand,
> > we have to check the real pte value of the 0th entry right now because
> swap_pte_batch() only really reads pte from the 1st entry. it assumes
> > pte argument is the real value for the 0th pte entry.
>
> static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
> {
>         pte_t expected_pte = pte_next_swp_offset(pte);
>         const pte_t *end_ptep = start_ptep + max_nr;
>         pte_t *ptep = start_ptep + 1;
>
>         VM_WARN_ON(max_nr < 1);
>         VM_WARN_ON(!is_swap_pte(pte));
>         VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
>
>         while (ptep < end_ptep) {
>                 pte = ptep_get(ptep);
>
>                 if (!pte_same(pte, expected_pte))
>                         break;
>
>                 expected_pte = pte_next_swp_offset(expected_pte);
>                 ptep++;
>         }
>
>         return ptep - start_ptep;
> }

Yes.  You are right.

But we may check whether the pte of page0 is the same as "vmf->orig_pte -
folio_page_idx()" (fake code).

You need to check the pte of page 0 anyway.
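
In other words, something like (still fake code, assuming a helper in the
spirit of the pte_move_swp_offset() idea above that can also move backwards):

	folio_ptep = vmf->pte - idx;
	folio_pte = ptep_get(folio_ptep);
	if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
	    swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
		goto check_pte;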

>>
>> > +             if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
>> > +                 swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
>> > +                     goto check_pte;
>> > +
>> > +             start_address = folio_start;
>> > +             start_pte = folio_ptep;
>> > +             nr_pages = nr;
>> > +             entry = folio->swap;
>> > +             page = &folio->page;
>> > +     }
>> > +
>> > +check_pte:
>> >       if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
>> >               goto out_nomap;
>> >
>> > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> >                        */
>> >                       exclusive = false;
>> >               }
>> > +
>> > +             /* Reuse the whole large folio iff all entries are exclusive */
>> > +             if (nr_pages > 1 && any_swap_shared)
>> > +                     exclusive = false;
>> >       }
>> >
>> >       /*
>> > @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> >        * We're already holding a reference on the page but haven't mapped it
>> >        * yet.
>> >        */
>> > -     swap_free(entry);
>> > +     swap_free_nr(entry, nr_pages);
>> >       if (should_try_to_free_swap(folio, vma, vmf->flags))
>> >               folio_free_swap(folio);
>> >
>> > -     inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>> > -     dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
>> > +     folio_ref_add(folio, nr_pages - 1);
>> > +     add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
>> > +     add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
>> > +
>> >       pte = mk_pte(page, vma->vm_page_prot);
>> >
>> >       /*
>> > @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> >        * exclusivity.
>> >        */
>> >       if (!folio_test_ksm(folio) &&
>> > -         (exclusive || folio_ref_count(folio) == 1)) {
>> > +         (exclusive || (folio_ref_count(folio) == nr_pages &&
>> > +                        folio_nr_pages(folio) == nr_pages))) {
>> >               if (vmf->flags & FAULT_FLAG_WRITE) {
>> >                       pte = maybe_mkwrite(pte_mkdirty(pte), vma);
>> >                       vmf->flags &= ~FAULT_FLAG_WRITE;
>> >               }
>> >               rmap_flags |= RMAP_EXCLUSIVE;
>> >       }
>> > -     flush_icache_page(vma, page);
>> > +     flush_icache_pages(vma, page, nr_pages);
>> >       if (pte_swp_soft_dirty(vmf->orig_pte))
>> >               pte = pte_mksoft_dirty(pte);
>> >       if (pte_swp_uffd_wp(vmf->orig_pte))
>> >               pte = pte_mkuffd_wp(pte);
>> > -     vmf->orig_pte = pte;
>> >
>> >       /* ksm created a completely new copy */
>> >       if (unlikely(folio != swapcache && swapcache)) {
>> > -             folio_add_new_anon_rmap(folio, vma, vmf->address);
>> > +             folio_add_new_anon_rmap(folio, vma, start_address);
>> >               folio_add_lru_vma(folio, vma);
>> >       } else {
>> > -             folio_add_anon_rmap_pte(folio, page, vma, vmf->address,
>> > -                                     rmap_flags);
>> > +             folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address,
>> > +                                      rmap_flags);
>> >       }
>> >
>> >       VM_BUG_ON(!folio_test_anon(folio) ||
>> >                       (pte_write(pte) && !PageAnonExclusive(page)));
>> > -     set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
>> > -     arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
>> > +     set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages);
>> > +     vmf->orig_pte = ptep_get(vmf->pte);
>> > +     arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);
>>
>> Do we need to call arch_do_swap_page() for each subpage?  IIUC, the
>> corresponding arch_unmap_one() will be called for each subpage.
>
> i actually thought about this very carefully, right now, the only one who
> needs this is sparc and it doesn't support THP_SWAPOUT at all. and
> there is no proof doing restoration one by one won't really break sparc.
> so i'd like to defer this to when sparc really needs THP_SWAPOUT.

Let's ask the SPARC developer (Cced) about this.

IMHO, even if we cannot get help, we need to change code with our
understanding instead of deferring it.

> on the other hand, it seems really bad we have both
> arch_swap_restore  - for this, arm64 has moved to using folio
> and
> arch_do_swap_page
>
> we should somehow unify them later if sparc wants THP_SWPOUT.
>
>>
>> >       folio_unlock(folio);
>> >       if (folio != swapcache && swapcache) {
>> > @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> >       }
>> >
>> >       /* No need to invalidate - it was non-present before */
>> > -     update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
>> > +     update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages);
>> >  unlock:
>> >       if (vmf->pte)
>> >               pte_unmap_unlock(vmf->pte, vmf->ptl);

--
Best Regards,
Huang, Ying
Barry Song April 16, 2024, 2:36 a.m. UTC | #7
On Tue, Apr 16, 2024 at 2:27 PM Huang, Ying <ying.huang@intel.com> wrote:
>
>
> Added Khalid for arch_do_swap_page().
>
> Barry Song <21cnbao@gmail.com> writes:
>
> > On Mon, Apr 15, 2024 at 8:39 PM Huang, Ying <ying.huang@intel.com> wrote:
> >>
> >> Barry Song <21cnbao@gmail.com> writes:
>
> [snip]
>
> >>
> >> > +     bool any_swap_shared = false;
> >> >
> >> >       if (!pte_unmap_same(vmf))
> >> >               goto out;
> >> > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> >        */
> >> >       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
> >> >                       &vmf->ptl);
> >>
> >> We should move pte check here.  That is,
> >>
> >>         if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> >>                 goto out_nomap;
> >>
> >> This will simplify the situation for large folio.
> >
> > the plan is moving the whole code block
> >
> > if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio))
> >
> > after
> >         if (unlikely(!folio_test_uptodate(folio))) {
> >                 ret = VM_FAULT_SIGBUS;
> >                 goto out_nomap;
> >         }
> >
> > though we couldn't be !folio_test_uptodate(folio)) for hitting
> > swapcache but it seems
> > logically better for future use.
>
> LGTM, Thanks!
>
> >>
> >> > +
> >> > +     /* We hit large folios in swapcache */
> >>
> >> The comments seems unnecessary because the code tells that already.
> >>
> >> > +     if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {
> >> > +             int nr = folio_nr_pages(folio);
> >> > +             int idx = folio_page_idx(folio, page);
> >> > +             unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
> >> > +             unsigned long folio_end = folio_start + nr * PAGE_SIZE;
> >> > +             pte_t *folio_ptep;
> >> > +             pte_t folio_pte;
> >> > +
> >> > +             if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
> >> > +                     goto check_pte;
> >> > +             if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
> >> > +                     goto check_pte;
> >> > +
> >> > +             folio_ptep = vmf->pte - idx;
> >> > +             folio_pte = ptep_get(folio_ptep);
> >>
> >> It's better to construct pte based on fault PTE via generalizing
> >> pte_next_swp_offset() (may be pte_move_swp_offset()).  Then we can find
> >> inconsistent PTEs quicker.
> >
> > it seems your point is getting the pte of page0 by pte_next_swp_offset()
> > unfortunately pte_next_swp_offset can't go back. on the other hand,
> > > we have to check the real pte value of the 0th entry right now because
> > swap_pte_batch() only really reads pte from the 1st entry. it assumes
> > > pte argument is the real value for the 0th pte entry.
> >
> > static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
> > {
> >         pte_t expected_pte = pte_next_swp_offset(pte);
> >         const pte_t *end_ptep = start_ptep + max_nr;
> >         pte_t *ptep = start_ptep + 1;
> >
> >         VM_WARN_ON(max_nr < 1);
> >         VM_WARN_ON(!is_swap_pte(pte));
> >         VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
> >
> >         while (ptep < end_ptep) {
> >                 pte = ptep_get(ptep);
> >
> >                 if (!pte_same(pte, expected_pte))
> >                         break;
> >
> >                 expected_pte = pte_next_swp_offset(expected_pte);
> >                 ptep++;
> >         }
> >
> >         return ptep - start_ptep;
> > }
>
> Yes.  You are right.
>
> But we may check whether the pte of page0 is same as "vmf->orig_pte -
> folio_page_idx()" (fake code).

Right, that is why we are reading and checking PTE0 before calling
swap_pte_batch() right now.

  folio_ptep = vmf->pte - idx;
  folio_pte = ptep_get(folio_ptep);
  if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
      swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
   goto check_pte;

So, if I understand correctly, you're proposing that we should directly check
PTE0 in swap_pte_batch(). Personally, I don't have any objections to this idea.
However, I'd also like to hear the feedback from Ryan and David :-)
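
For the record, one way "checking PTE0 directly in swap_pte_batch()" could look
is to treat the pte argument as the expected value of PTE0 and start the loop
at start_ptep (just a sketch for discussion):

static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t expected_pte)
{
	const pte_t *end_ptep = start_ptep + max_nr;
	pte_t *ptep = start_ptep;

	VM_WARN_ON(max_nr < 1);
	VM_WARN_ON(!is_swap_pte(expected_pte));
	VM_WARN_ON(non_swap_entry(pte_to_swp_entry(expected_pte)));

	while (ptep < end_ptep) {
		if (!pte_same(ptep_get(ptep), expected_pte))
			break;

		expected_pte = pte_next_swp_offset(expected_pte);
		ptep++;
	}

	return ptep - start_ptep;
}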

>
> You need to check the pte of page 0 anyway.
>
> >>
> >> > +             if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
> >> > +                 swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
> >> > +                     goto check_pte;
> >> > +
> >> > +             start_address = folio_start;
> >> > +             start_pte = folio_ptep;
> >> > +             nr_pages = nr;
> >> > +             entry = folio->swap;
> >> > +             page = &folio->page;
> >> > +     }
> >> > +
> >> > +check_pte:
> >> >       if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> >> >               goto out_nomap;
> >> >
> >> > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> >                        */
> >> >                       exclusive = false;
> >> >               }
> >> > +
> >> > +             /* Reuse the whole large folio iff all entries are exclusive */
> >> > +             if (nr_pages > 1 && any_swap_shared)
> >> > +                     exclusive = false;
> >> >       }
> >> >
> >> >       /*
> >> > @@ -4204,12 +4241,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> >        * We're already holding a reference on the page but haven't mapped it
> >> >        * yet.
> >> >        */
> >> > -     swap_free(entry);
> >> > +     swap_free_nr(entry, nr_pages);
> >> >       if (should_try_to_free_swap(folio, vma, vmf->flags))
> >> >               folio_free_swap(folio);
> >> >
> >> > -     inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> >> > -     dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
> >> > +     folio_ref_add(folio, nr_pages - 1);
> >> > +     add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> >> > +     add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
> >> > +
> >> >       pte = mk_pte(page, vma->vm_page_prot);
> >> >
> >> >       /*
> >> > @@ -4219,33 +4258,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> >        * exclusivity.
> >> >        */
> >> >       if (!folio_test_ksm(folio) &&
> >> > -         (exclusive || folio_ref_count(folio) == 1)) {
> >> > +         (exclusive || (folio_ref_count(folio) == nr_pages &&
> >> > +                        folio_nr_pages(folio) == nr_pages))) {
> >> >               if (vmf->flags & FAULT_FLAG_WRITE) {
> >> >                       pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> >> >                       vmf->flags &= ~FAULT_FLAG_WRITE;
> >> >               }
> >> >               rmap_flags |= RMAP_EXCLUSIVE;
> >> >       }
> >> > -     flush_icache_page(vma, page);
> >> > +     flush_icache_pages(vma, page, nr_pages);
> >> >       if (pte_swp_soft_dirty(vmf->orig_pte))
> >> >               pte = pte_mksoft_dirty(pte);
> >> >       if (pte_swp_uffd_wp(vmf->orig_pte))
> >> >               pte = pte_mkuffd_wp(pte);
> >> > -     vmf->orig_pte = pte;
> >> >
> >> >       /* ksm created a completely new copy */
> >> >       if (unlikely(folio != swapcache && swapcache)) {
> >> > -             folio_add_new_anon_rmap(folio, vma, vmf->address);
> >> > +             folio_add_new_anon_rmap(folio, vma, start_address);
> >> >               folio_add_lru_vma(folio, vma);
> >> >       } else {
> >> > -             folio_add_anon_rmap_pte(folio, page, vma, vmf->address,
> >> > -                                     rmap_flags);
> >> > +             folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address,
> >> > +                                      rmap_flags);
> >> >       }
> >> >
> >> >       VM_BUG_ON(!folio_test_anon(folio) ||
> >> >                       (pte_write(pte) && !PageAnonExclusive(page)));
> >> > -     set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
> >> > -     arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
> >> > +     set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages);
> >> > +     vmf->orig_pte = ptep_get(vmf->pte);
> >> > +     arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);
> >>
> >> Do we need to call arch_do_swap_page() for each subpage?  IIUC, the
> >> corresponding arch_unmap_one() will be called for each subpage.
> >
> > i actually thought about this very carefully, right now, the only one who
> > needs this is sparc and it doesn't support THP_SWAPOUT at all. and
> > there is no proof doing restoration one by one won't really break sparc.
> > so i'd like to defer this to when sparc really needs THP_SWAPOUT.
>
> Let's ask SPARC developer (Cced) for this.
>
> IMHO, even if we cannot get help, we need to change code with our
> understanding instead of deferring it.

ok. Thanks for Ccing sparc developers.

>
> > on the other hand, it seems really bad we have both
> > arch_swap_restore  - for this, arm64 has moved to using folio
> > and
> > arch_do_swap_page
> >
> > we should somehow unify them later if sparc wants THP_SWPOUT.
> >
> >>
> >> >       folio_unlock(folio);
> >> >       if (folio != swapcache && swapcache) {
> >> > @@ -4269,7 +4309,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> >       }
> >> >
> >> >       /* No need to invalidate - it was non-present before */
> >> > -     update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
> >> > +     update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages);
> >> >  unlock:
> >> >       if (vmf->pte)
> >> >               pte_unmap_unlock(vmf->pte, vmf->ptl);
>
> --
> Best Regards,
> Huang, Ying

Thanks
Barry
Huang, Ying April 16, 2024, 2:39 a.m. UTC | #8
Barry Song <21cnbao@gmail.com> writes:

> On Tue, Apr 16, 2024 at 2:27 PM Huang, Ying <ying.huang@intel.com> wrote:
>>
>>
>> Added Khalid for arch_do_swap_page().
>>
>> Barry Song <21cnbao@gmail.com> writes:
>>
>> > On Mon, Apr 15, 2024 at 8:39 PM Huang, Ying <ying.huang@intel.com> wrote:
>> >>
>> >> Barry Song <21cnbao@gmail.com> writes:
>>
>> [snip]
>>
>> >>
>> >> > +     bool any_swap_shared = false;
>> >> >
>> >> >       if (!pte_unmap_same(vmf))
>> >> >               goto out;
>> >> > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> >> >        */
>> >> >       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>> >> >                       &vmf->ptl);
>> >>
>> >> We should move pte check here.  That is,
>> >>
>> >>         if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
>> >>                 goto out_nomap;
>> >>
>> >> This will simplify the situation for large folio.
>> >
>> > the plan is moving the whole code block
>> >
>> > if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio))
>> >
>> > after
>> >         if (unlikely(!folio_test_uptodate(folio))) {
>> >                 ret = VM_FAULT_SIGBUS;
>> >                 goto out_nomap;
>> >         }
>> >
>> > though we couldn't be !folio_test_uptodate(folio)) for hitting
>> > swapcache but it seems
>> > logically better for future use.
>>
>> LGTM, Thanks!
>>
>> >>
>> >> > +
>> >> > +     /* We hit large folios in swapcache */
>> >>
>> >> The comments seems unnecessary because the code tells that already.
>> >>
>> >> > +     if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {
>> >> > +             int nr = folio_nr_pages(folio);
>> >> > +             int idx = folio_page_idx(folio, page);
>> >> > +             unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
>> >> > +             unsigned long folio_end = folio_start + nr * PAGE_SIZE;
>> >> > +             pte_t *folio_ptep;
>> >> > +             pte_t folio_pte;
>> >> > +
>> >> > +             if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
>> >> > +                     goto check_pte;
>> >> > +             if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
>> >> > +                     goto check_pte;
>> >> > +
>> >> > +             folio_ptep = vmf->pte - idx;
>> >> > +             folio_pte = ptep_get(folio_ptep);
>> >>
>> >> It's better to construct pte based on fault PTE via generalizing
>> >> pte_next_swp_offset() (may be pte_move_swp_offset()).  Then we can find
>> >> inconsistent PTEs quicker.
>> >
>> > it seems your point is getting the pte of page0 by pte_next_swp_offset()
>> > unfortunately pte_next_swp_offset can't go back. on the other hand,
>> > we have to check the real pte value of the 0nd entry right now because
>> > swap_pte_batch() only really reads pte from the 1st entry. it assumes
>> > pte argument is the real value for the 0nd pte entry.
>> >
>> > static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
>> > {
>> >         pte_t expected_pte = pte_next_swp_offset(pte);
>> >         const pte_t *end_ptep = start_ptep + max_nr;
>> >         pte_t *ptep = start_ptep + 1;
>> >
>> >         VM_WARN_ON(max_nr < 1);
>> >         VM_WARN_ON(!is_swap_pte(pte));
>> >         VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
>> >
>> >         while (ptep < end_ptep) {
>> >                 pte = ptep_get(ptep);
>> >
>> >                 if (!pte_same(pte, expected_pte))
>> >                         break;
>> >
>> >                 expected_pte = pte_next_swp_offset(expected_pte);
>> >                 ptep++;
>> >         }
>> >
>> >         return ptep - start_ptep;
>> > }
>>
>> Yes.  You are right.
>>
>> But we may check whether the pte of page0 is same as "vmf->orig_pte -
>> folio_page_idx()" (fake code).
>
> right, that is why we are reading and checking PTE0 before calling
> swap_pte_batch()
> right now.
>
>   folio_ptep = vmf->pte - idx;
>   folio_pte = ptep_get(folio_ptep);
>   if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
>       swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
>    goto check_pte;
>
> So, if I understand correctly, you're proposing that we should directly check
> PTE0 in swap_pte_batch(). Personally, I don't have any objections to this idea.
> However, I'd also like to hear the feedback from Ryan and David :-)

I mean that we can replace 

        !is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte))

in the above code with a pte_same() check against a constructed expected first PTE.
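
Something like the fake code below, where pte_move_swp_offset() stands in
for a not-yet-existing helper that adds a signed delta to the swap offset
of a swap PTE while carrying over the swp pte bits (soft-dirty, uffd-wp,
exclusive):

        /* construct the expected PTE of page 0 from the faulting PTE */
        folio_ptep = vmf->pte - idx;
        folio_pte = ptep_get(folio_ptep);
        if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
            swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
                goto check_pte;

A single pte_same() then rejects a non-swap or otherwise inconsistent PTE0
in one shot.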

>>
>> You need to check the pte of page 0 anyway.
>>
>> >>
>> >> > +             if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
>> >> > +                 swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
>> >> > +                     goto check_pte;
>> >> > +
>> >> > +             start_address = folio_start;
>> >> > +             start_pte = folio_ptep;
>> >> > +             nr_pages = nr;
>> >> > +             entry = folio->swap;
>> >> > +             page = &folio->page;
>> >> > +     }
>> >> > +
>> >> > +check_pte:
>> >> >       if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
>> >> >               goto out_nomap;
>> >> >
>> >> > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> >> >                        */
>> >> >                       exclusive = false;
>> >> >               }
>> >> > +
>> >> > +             /* Reuse the whole large folio iff all entries are exclusive */
>> >> > +             if (nr_pages > 1 && any_swap_shared)
>> >> > +                     exclusive = false;
>> >> >       }
>> >> >

[snip]

--
Best Regards,
Huang, Ying
Barry Song April 16, 2024, 2:52 a.m. UTC | #9
On Tue, Apr 16, 2024 at 2:41 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Barry Song <21cnbao@gmail.com> writes:
>
> > On Tue, Apr 16, 2024 at 2:27 PM Huang, Ying <ying.huang@intel.com> wrote:
> >>
> >>
> >> Added Khalid for arch_do_swap_page().
> >>
> >> Barry Song <21cnbao@gmail.com> writes:
> >>
> >> > On Mon, Apr 15, 2024 at 8:39 PM Huang, Ying <ying.huang@intel.com> wrote:
> >> >>
> >> >> Barry Song <21cnbao@gmail.com> writes:
> >>
> >> [snip]
> >>
> >> >>
> >> >> > +     bool any_swap_shared = false;
> >> >> >
> >> >> >       if (!pte_unmap_same(vmf))
> >> >> >               goto out;
> >> >> > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> >> >        */
> >> >> >       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
> >> >> >                       &vmf->ptl);
> >> >>
> >> >> We should move pte check here.  That is,
> >> >>
> >> >>         if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> >> >>                 goto out_nomap;
> >> >>
> >> >> This will simplify the situation for large folio.
> >> >
> >> > the plan is moving the whole code block
> >> >
> >> > if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio))
> >> >
> >> > after
> >> >         if (unlikely(!folio_test_uptodate(folio))) {
> >> >                 ret = VM_FAULT_SIGBUS;
> >> >                 goto out_nomap;
> >> >         }
> >> >
> >> > though we couldn't be !folio_test_uptodate(folio)) for hitting
> >> > swapcache but it seems
> >> > logically better for future use.
> >>
> >> LGTM, Thanks!
> >>
> >> >>
> >> >> > +
> >> >> > +     /* We hit large folios in swapcache */
> >> >>
> >> >> The comments seems unnecessary because the code tells that already.
> >> >>
> >> >> > +     if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {
> >> >> > +             int nr = folio_nr_pages(folio);
> >> >> > +             int idx = folio_page_idx(folio, page);
> >> >> > +             unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
> >> >> > +             unsigned long folio_end = folio_start + nr * PAGE_SIZE;
> >> >> > +             pte_t *folio_ptep;
> >> >> > +             pte_t folio_pte;
> >> >> > +
> >> >> > +             if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
> >> >> > +                     goto check_pte;
> >> >> > +             if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
> >> >> > +                     goto check_pte;
> >> >> > +
> >> >> > +             folio_ptep = vmf->pte - idx;
> >> >> > +             folio_pte = ptep_get(folio_ptep);
> >> >>
> >> >> It's better to construct pte based on fault PTE via generalizing
> >> >> pte_next_swp_offset() (may be pte_move_swp_offset()).  Then we can find
> >> >> inconsistent PTEs quicker.
> >> >
> >> > it seems your point is getting the pte of page0 by pte_next_swp_offset()
> >> > unfortunately pte_next_swp_offset can't go back. on the other hand,
> >> > we have to check the real pte value of the 0nd entry right now because
> >> > swap_pte_batch() only really reads pte from the 1st entry. it assumes
> >> > pte argument is the real value for the 0nd pte entry.
> >> >
> >> > static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
> >> > {
> >> >         pte_t expected_pte = pte_next_swp_offset(pte);
> >> >         const pte_t *end_ptep = start_ptep + max_nr;
> >> >         pte_t *ptep = start_ptep + 1;
> >> >
> >> >         VM_WARN_ON(max_nr < 1);
> >> >         VM_WARN_ON(!is_swap_pte(pte));
> >> >         VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
> >> >
> >> >         while (ptep < end_ptep) {
> >> >                 pte = ptep_get(ptep);
> >> >
> >> >                 if (!pte_same(pte, expected_pte))
> >> >                         break;
> >> >
> >> >                 expected_pte = pte_next_swp_offset(expected_pte);
> >> >                 ptep++;
> >> >         }
> >> >
> >> >         return ptep - start_ptep;
> >> > }
> >>
> >> Yes.  You are right.
> >>
> >> But we may check whether the pte of page0 is same as "vmf->orig_pte -
> >> folio_page_idx()" (fake code).
> >
> > right, that is why we are reading and checking PTE0 before calling
> > swap_pte_batch()
> > right now.
> >
> >   folio_ptep = vmf->pte - idx;
> >   folio_pte = ptep_get(folio_ptep);
> >   if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
> >       swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
> >    goto check_pte;
> >
> > So, if I understand correctly, you're proposing that we should directly check
> > PTE0 in swap_pte_batch(). Personally, I don't have any objections to this idea.
> > However, I'd also like to hear the feedback from Ryan and David :-)
>
> I mean that we can replace
>
>         !is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte))
>
> in above code with pte_same() with constructed expected first pte.

Got it. It could be quite tricky, especially with considerations like
pte_swp_soft_dirty, pte_swp_exclusive, and pte_swp_uffd_wp. We might
require a helper function similar to pte_next_swp_offset() but capable of
moving both forward and backward. For instance:

pte_move_swp_offset(pte_t pte, long delta)

pte_next_swp_offset() could then simply call it as:
pte_move_swp_offset(pte, 1);

Is that what you are proposing?
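
e.g. something completely untested like the below, simply generalizing
what pte_next_swp_offset() already does (the exact name and placement are
just an assumption at this point):

static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
{
        swp_entry_t entry = pte_to_swp_entry(pte);
        pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
                                                   swp_offset(entry) + delta));

        /* preserve the swp pte bits of the original entry */
        if (pte_swp_soft_dirty(pte))
                new = pte_swp_mksoft_dirty(new);
        if (pte_swp_exclusive(pte))
                new = pte_swp_mkexclusive(new);
        if (pte_swp_uffd_wp(pte))
                new = pte_swp_mkuffd_wp(new);

        return new;
}

static inline pte_t pte_next_swp_offset(pte_t pte)
{
        return pte_move_swp_offset(pte, 1);
}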

>
> >>
> >> You need to check the pte of page 0 anyway.
> >>
> >> >>
> >> >> > +             if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
> >> >> > +                 swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
> >> >> > +                     goto check_pte;
> >> >> > +
> >> >> > +             start_address = folio_start;
> >> >> > +             start_pte = folio_ptep;
> >> >> > +             nr_pages = nr;
> >> >> > +             entry = folio->swap;
> >> >> > +             page = &folio->page;
> >> >> > +     }
> >> >> > +
> >> >> > +check_pte:
> >> >> >       if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> >> >> >               goto out_nomap;
> >> >> >
> >> >> > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> >> >                        */
> >> >> >                       exclusive = false;
> >> >> >               }
> >> >> > +
> >> >> > +             /* Reuse the whole large folio iff all entries are exclusive */
> >> >> > +             if (nr_pages > 1 && any_swap_shared)
> >> >> > +                     exclusive = false;
> >> >> >       }
> >> >> >
>
> [snip]
>
> --
> Best Regards,
> Huang, Ying
Huang, Ying April 16, 2024, 3:17 a.m. UTC | #10
Barry Song <21cnbao@gmail.com> writes:

> On Tue, Apr 16, 2024 at 2:41 PM Huang, Ying <ying.huang@intel.com> wrote:
>>
>> Barry Song <21cnbao@gmail.com> writes:
>>
>> > On Tue, Apr 16, 2024 at 2:27 PM Huang, Ying <ying.huang@intel.com> wrote:
>> >>
>> >>
>> >> Added Khalid for arch_do_swap_page().
>> >>
>> >> Barry Song <21cnbao@gmail.com> writes:
>> >>
>> >> > On Mon, Apr 15, 2024 at 8:39 PM Huang, Ying <ying.huang@intel.com> wrote:
>> >> >>
>> >> >> Barry Song <21cnbao@gmail.com> writes:
>> >>
>> >> [snip]
>> >>
>> >> >>
>> >> >> > +     bool any_swap_shared = false;
>> >> >> >
>> >> >> >       if (!pte_unmap_same(vmf))
>> >> >> >               goto out;
>> >> >> > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> >> >> >        */
>> >> >> >       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>> >> >> >                       &vmf->ptl);
>> >> >>
>> >> >> We should move pte check here.  That is,
>> >> >>
>> >> >>         if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
>> >> >>                 goto out_nomap;
>> >> >>
>> >> >> This will simplify the situation for large folio.
>> >> >
>> >> > the plan is moving the whole code block
>> >> >
>> >> > if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio))
>> >> >
>> >> > after
>> >> >         if (unlikely(!folio_test_uptodate(folio))) {
>> >> >                 ret = VM_FAULT_SIGBUS;
>> >> >                 goto out_nomap;
>> >> >         }
>> >> >
>> >> > though we couldn't be !folio_test_uptodate(folio)) for hitting
>> >> > swapcache but it seems
>> >> > logically better for future use.
>> >>
>> >> LGTM, Thanks!
>> >>
>> >> >>
>> >> >> > +
>> >> >> > +     /* We hit large folios in swapcache */
>> >> >>
>> >> >> The comments seems unnecessary because the code tells that already.
>> >> >>
>> >> >> > +     if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {
>> >> >> > +             int nr = folio_nr_pages(folio);
>> >> >> > +             int idx = folio_page_idx(folio, page);
>> >> >> > +             unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
>> >> >> > +             unsigned long folio_end = folio_start + nr * PAGE_SIZE;
>> >> >> > +             pte_t *folio_ptep;
>> >> >> > +             pte_t folio_pte;
>> >> >> > +
>> >> >> > +             if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
>> >> >> > +                     goto check_pte;
>> >> >> > +             if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
>> >> >> > +                     goto check_pte;
>> >> >> > +
>> >> >> > +             folio_ptep = vmf->pte - idx;
>> >> >> > +             folio_pte = ptep_get(folio_ptep);
>> >> >>
>> >> >> It's better to construct pte based on fault PTE via generalizing
>> >> >> pte_next_swp_offset() (may be pte_move_swp_offset()).  Then we can find
>> >> >> inconsistent PTEs quicker.
>> >> >
>> >> > it seems your point is getting the pte of page0 by pte_next_swp_offset()
>> >> > unfortunately pte_next_swp_offset can't go back. on the other hand,
>> >> > we have to check the real pte value of the 0nd entry right now because
>> >> > swap_pte_batch() only really reads pte from the 1st entry. it assumes
>> >> > pte argument is the real value for the 0nd pte entry.
>> >> >
>> >> > static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
>> >> > {
>> >> >         pte_t expected_pte = pte_next_swp_offset(pte);
>> >> >         const pte_t *end_ptep = start_ptep + max_nr;
>> >> >         pte_t *ptep = start_ptep + 1;
>> >> >
>> >> >         VM_WARN_ON(max_nr < 1);
>> >> >         VM_WARN_ON(!is_swap_pte(pte));
>> >> >         VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
>> >> >
>> >> >         while (ptep < end_ptep) {
>> >> >                 pte = ptep_get(ptep);
>> >> >
>> >> >                 if (!pte_same(pte, expected_pte))
>> >> >                         break;
>> >> >
>> >> >                 expected_pte = pte_next_swp_offset(expected_pte);
>> >> >                 ptep++;
>> >> >         }
>> >> >
>> >> >         return ptep - start_ptep;
>> >> > }
>> >>
>> >> Yes.  You are right.
>> >>
>> >> But we may check whether the pte of page0 is same as "vmf->orig_pte -
>> >> folio_page_idx()" (fake code).
>> >
>> > right, that is why we are reading and checking PTE0 before calling
>> > swap_pte_batch()
>> > right now.
>> >
>> >   folio_ptep = vmf->pte - idx;
>> >   folio_pte = ptep_get(folio_ptep);
>> >   if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
>> >       swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
>> >    goto check_pte;
>> >
>> > So, if I understand correctly, you're proposing that we should directly check
>> > PTE0 in swap_pte_batch(). Personally, I don't have any objections to this idea.
>> > However, I'd also like to hear the feedback from Ryan and David :-)
>>
>> I mean that we can replace
>>
>>         !is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte))
>>
>> in above code with pte_same() with constructed expected first pte.
>
> Got it. It could be quite tricky, especially with considerations like
> pte_swp_soft_dirty, pte_swp_exclusive, and pte_swp_uffd_wp. We might
> require a helper function similar to pte_next_swp_offset() but capable of
> moving both forward and backward. For instance:
>
> pte_move_swp_offset(pte_t pte, long delta)
>
> pte_next_swp_offset can insteadly call it by:
> pte_move_swp_offset(pte, 1);
>
> Is it what you are proposing?

Yes.  Exactly.

--
Best Regards,
Huang, Ying

>>
>> >>
>> >> You need to check the pte of page 0 anyway.
>> >>
>> >> >>
>> >> >> > +             if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
>> >> >> > +                 swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
>> >> >> > +                     goto check_pte;
>> >> >> > +
>> >> >> > +             start_address = folio_start;
>> >> >> > +             start_pte = folio_ptep;
>> >> >> > +             nr_pages = nr;
>> >> >> > +             entry = folio->swap;
>> >> >> > +             page = &folio->page;
>> >> >> > +     }
>> >> >> > +
>> >> >> > +check_pte:
>> >> >> >       if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
>> >> >> >               goto out_nomap;
>> >> >> >
>> >> >> > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> >> >> >                        */
>> >> >> >                       exclusive = false;
>> >> >> >               }
>> >> >> > +
>> >> >> > +             /* Reuse the whole large folio iff all entries are exclusive */
>> >> >> > +             if (nr_pages > 1 && any_swap_shared)
>> >> >> > +                     exclusive = false;
>> >> >> >       }
>> >> >> >
>>
>> [snip]
>>
>> --
>> Best Regards,
>> Huang, Ying
Barry Song April 16, 2024, 4:40 a.m. UTC | #11
On Tue, Apr 16, 2024 at 3:19 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Barry Song <21cnbao@gmail.com> writes:
>
> > On Tue, Apr 16, 2024 at 2:41 PM Huang, Ying <ying.huang@intel.com> wrote:
> >>
> >> Barry Song <21cnbao@gmail.com> writes:
> >>
> >> > On Tue, Apr 16, 2024 at 2:27 PM Huang, Ying <ying.huang@intel.com> wrote:
> >> >>
> >> >>
> >> >> Added Khalid for arch_do_swap_page().
> >> >>
> >> >> Barry Song <21cnbao@gmail.com> writes:
> >> >>
> >> >> > On Mon, Apr 15, 2024 at 8:39 PM Huang, Ying <ying.huang@intel.com> wrote:
> >> >> >>
> >> >> >> Barry Song <21cnbao@gmail.com> writes:
> >> >>
> >> >> [snip]
> >> >>
> >> >> >>
> >> >> >> > +     bool any_swap_shared = false;
> >> >> >> >
> >> >> >> >       if (!pte_unmap_same(vmf))
> >> >> >> >               goto out;
> >> >> >> > @@ -4137,6 +4141,35 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> >> >> >        */
> >> >> >> >       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
> >> >> >> >                       &vmf->ptl);
> >> >> >>
> >> >> >> We should move pte check here.  That is,
> >> >> >>
> >> >> >>         if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> >> >> >>                 goto out_nomap;
> >> >> >>
> >> >> >> This will simplify the situation for large folio.
> >> >> >
> >> >> > the plan is moving the whole code block
> >> >> >
> >> >> > if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio))
> >> >> >
> >> >> > after
> >> >> >         if (unlikely(!folio_test_uptodate(folio))) {
> >> >> >                 ret = VM_FAULT_SIGBUS;
> >> >> >                 goto out_nomap;
> >> >> >         }
> >> >> >
> >> >> > though we couldn't be !folio_test_uptodate(folio)) for hitting
> >> >> > swapcache but it seems
> >> >> > logically better for future use.
> >> >>
> >> >> LGTM, Thanks!
> >> >>
> >> >> >>
> >> >> >> > +
> >> >> >> > +     /* We hit large folios in swapcache */
> >> >> >>
> >> >> >> The comments seems unnecessary because the code tells that already.
> >> >> >>
> >> >> >> > +     if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {
> >> >> >> > +             int nr = folio_nr_pages(folio);
> >> >> >> > +             int idx = folio_page_idx(folio, page);
> >> >> >> > +             unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
> >> >> >> > +             unsigned long folio_end = folio_start + nr * PAGE_SIZE;
> >> >> >> > +             pte_t *folio_ptep;
> >> >> >> > +             pte_t folio_pte;
> >> >> >> > +
> >> >> >> > +             if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
> >> >> >> > +                     goto check_pte;
> >> >> >> > +             if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
> >> >> >> > +                     goto check_pte;
> >> >> >> > +
> >> >> >> > +             folio_ptep = vmf->pte - idx;
> >> >> >> > +             folio_pte = ptep_get(folio_ptep);
> >> >> >>
> >> >> >> It's better to construct pte based on fault PTE via generalizing
> >> >> >> pte_next_swp_offset() (may be pte_move_swp_offset()).  Then we can find
> >> >> >> inconsistent PTEs quicker.
> >> >> >
> >> >> > it seems your point is getting the pte of page0 by pte_next_swp_offset()
> >> >> > unfortunately pte_next_swp_offset can't go back. on the other hand,
> >> >> > we have to check the real pte value of the 0nd entry right now because
> >> >> > swap_pte_batch() only really reads pte from the 1st entry. it assumes
> >> >> > pte argument is the real value for the 0nd pte entry.
> >> >> >
> >> >> > static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
> >> >> > {
> >> >> >         pte_t expected_pte = pte_next_swp_offset(pte);
> >> >> >         const pte_t *end_ptep = start_ptep + max_nr;
> >> >> >         pte_t *ptep = start_ptep + 1;
> >> >> >
> >> >> >         VM_WARN_ON(max_nr < 1);
> >> >> >         VM_WARN_ON(!is_swap_pte(pte));
> >> >> >         VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
> >> >> >
> >> >> >         while (ptep < end_ptep) {
> >> >> >                 pte = ptep_get(ptep);
> >> >> >
> >> >> >                 if (!pte_same(pte, expected_pte))
> >> >> >                         break;
> >> >> >
> >> >> >                 expected_pte = pte_next_swp_offset(expected_pte);
> >> >> >                 ptep++;
> >> >> >         }
> >> >> >
> >> >> >         return ptep - start_ptep;
> >> >> > }
> >> >>
> >> >> Yes.  You are right.
> >> >>
> >> >> But we may check whether the pte of page0 is same as "vmf->orig_pte -
> >> >> folio_page_idx()" (fake code).
> >> >
> >> > right, that is why we are reading and checking PTE0 before calling
> >> > swap_pte_batch()
> >> > right now.
> >> >
> >> >   folio_ptep = vmf->pte - idx;
> >> >   folio_pte = ptep_get(folio_ptep);
> >> >   if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
> >> >       swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
> >> >    goto check_pte;
> >> >
> >> > So, if I understand correctly, you're proposing that we should directly check
> >> > PTE0 in swap_pte_batch(). Personally, I don't have any objections to this idea.
> >> > However, I'd also like to hear the feedback from Ryan and David :-)
> >>
> >> I mean that we can replace
> >>
> >>         !is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte))
> >>
> >> in above code with pte_same() with constructed expected first pte.
> >
> > Got it. It could be quite tricky, especially with considerations like
> > pte_swp_soft_dirty, pte_swp_exclusive, and pte_swp_uffd_wp. We might
> > require a helper function similar to pte_next_swp_offset() but capable of
> > moving both forward and backward. For instance:
> >
> > pte_move_swp_offset(pte_t pte, long delta)
> >
> > pte_next_swp_offset can insteadly call it by:
> > pte_move_swp_offset(pte, 1);
> >
> > Is it what you are proposing?
>
> Yes.  Exactly.

Great. I agree that this appears to be much cleaner than the current code.

>
> --
> Best Regards,
> Huang, Ying
>
> >>
> >> >>
> >> >> You need to check the pte of page 0 anyway.
> >> >>
> >> >> >>
> >> >> >> > +             if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
> >> >> >> > +                 swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
> >> >> >> > +                     goto check_pte;
> >> >> >> > +
> >> >> >> > +             start_address = folio_start;
> >> >> >> > +             start_pte = folio_ptep;
> >> >> >> > +             nr_pages = nr;
> >> >> >> > +             entry = folio->swap;
> >> >> >> > +             page = &folio->page;
> >> >> >> > +     }
> >> >> >> > +
> >> >> >> > +check_pte:
> >> >> >> >       if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
> >> >> >> >               goto out_nomap;
> >> >> >> >
> >> >> >> > @@ -4190,6 +4223,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> >> >> >                        */
> >> >> >> >                       exclusive = false;
> >> >> >> >               }
> >> >> >> > +
> >> >> >> > +             /* Reuse the whole large folio iff all entries are exclusive */
> >> >> >> > +             if (nr_pages > 1 && any_swap_shared)
> >> >> >> > +                     exclusive = false;
> >> >> >> >       }
> >> >> >> >
> >>
> >> [snip]
> >>
> >> --
> >> Best Regards,
> >> Huang, Ying
Barry Song April 18, 2024, 9:55 a.m. UTC | #12
[snip]

> > >> >
> > >> >       VM_BUG_ON(!folio_test_anon(folio) ||
> > >> >                       (pte_write(pte) && !PageAnonExclusive(page)));
> > >> > -     set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
> > >> > -     arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
> > >> > +     set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages);
> > >> > +     vmf->orig_pte = ptep_get(vmf->pte);
> > >> > +     arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);
> > >>
> > >> Do we need to call arch_do_swap_page() for each subpage?  IIUC, the
> > >> corresponding arch_unmap_one() will be called for each subpage.
> > >
> > > i actually thought about this very carefully, right now, the only one who
> > > needs this is sparc and it doesn't support THP_SWAPOUT at all. and
> > > there is no proof doing restoration one by one won't really break sparc.
> > > so i'd like to defer this to when sparc really needs THP_SWAPOUT.
> >
> > Let's ask SPARC developer (Cced) for this.
> >
> > IMHO, even if we cannot get help, we need to change code with our
> > understanding instead of deferring it.
>
> ok. Thanks for Ccing sparc developers.

Hi Khalid & Ying (also CCed the sparc mailing list),

SPARC is the only platform that needs arch_do_swap_page(), and right now
its THP_SWAPOUT is not enabled, so we will never really hit a large folio
in the swapcache there. Just in case you need THP_SWAPOUT later, I am
changing the code as below:

@@ -4286,7 +4285,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        VM_BUG_ON(!folio_test_anon(folio) ||
                        (pte_write(pte) && !PageAnonExclusive(page)));
        set_ptes(vma->vm_mm, start_address, start_ptep, pte, nr_pages);
-       arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);
+       for (int i = 0; i < nr_pages; i++) {
+               arch_do_swap_page(vma->vm_mm, vma, start_address + i * PAGE_SIZE,
+                                 pte, pte);
+               pte = pte_advance_pfn(pte, 1);
+       }

        folio_unlock(folio);
        if (folio != swapcache && swapcache) {

For sparc, nr_pages will always be 1 (THP_SWAPOUT not enabled). For
arm64/x86/riscv, it seems redundant to run the
"for (int i = 0; i < nr_pages; i++)" loop at all.

So another option is adding a helper as below to avoid the idle loop
on arm64/x86/riscv etc.:

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e2f45e22a6d1..ea314a5f9b5e 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1085,6 +1085,28 @@ static inline void arch_do_swap_page(struct mm_struct *mm,
 {

 }
+
+static inline void arch_do_swap_page_nr(struct mm_struct *mm,
+                                    struct vm_area_struct *vma,
+                                    unsigned long addr,
+                                    pte_t pte, pte_t oldpte,
+                                    int nr)
+{
+
+}
+#else
+static inline void arch_do_swap_page_nr(struct mm_struct *mm,
+                                    struct vm_area_struct *vma,
+                                    unsigned long addr,
+                                    pte_t pte, pte_t oldpte,
+                                    int nr)
+{
+       for (int i = 0; i < nr; i++) {
+               arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE,
+                                pte_advance_pfn(pte, i),
+                                pte_advance_pfn(oldpte, i));
+       }
+}
 #endif
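
With option 2, the call site in do_swap_page() would then collapse to a
single call, roughly (untested):

        set_ptes(vma->vm_mm, start_address, start_ptep, pte, nr_pages);
        arch_do_swap_page_nr(vma->vm_mm, vma, start_address, pte, pte,
                             nr_pages);

so the loop only exists for architectures that actually implement
arch_do_swap_page(), and the stub compiles away everywhere else.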

Please tell me your preference.

BTW, I found that oldpte and pte are always the same when do_swap_page()
calls arch_do_swap_page(). Is something wrong there? Does
arch_do_swap_page() really need two identical arguments?


vmf->orig_pte = pte;
...
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);


>
> >
> > > on the other hand, it seems really bad we have both
> > > arch_swap_restore  - for this, arm64 has moved to using folio
> > > and
> > > arch_do_swap_page
> > >
> > > we should somehow unify them later if sparc wants THP_SWPOUT.

Thanks
Barry

Patch

diff --git a/mm/memory.c b/mm/memory.c
index c4a52e8d740a..9818dc1893c8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3947,6 +3947,10 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 	pte_t pte;
 	vm_fault_t ret = 0;
 	void *shadow = NULL;
+	int nr_pages = 1;
+	unsigned long start_address = vmf->address;
+	pte_t *start_pte = vmf->pte;
+	bool any_swap_shared = false;
 
 	if (!pte_unmap_same(vmf))
 		goto out;
@@ -4137,6 +4141,35 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 	 */
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
 			&vmf->ptl);
+
+	/* We hit large folios in swapcache */
+	if (start_pte && folio_test_large(folio) && folio_test_swapcache(folio)) {
+		int nr = folio_nr_pages(folio);
+		int idx = folio_page_idx(folio, page);
+		unsigned long folio_start = vmf->address - idx * PAGE_SIZE;
+		unsigned long folio_end = folio_start + nr * PAGE_SIZE;
+		pte_t *folio_ptep;
+		pte_t folio_pte;
+
+		if (unlikely(folio_start < max(vmf->address & PMD_MASK, vma->vm_start)))
+			goto check_pte;
+		if (unlikely(folio_end > pmd_addr_end(vmf->address, vma->vm_end)))
+			goto check_pte;
+
+		folio_ptep = vmf->pte - idx;
+		folio_pte = ptep_get(folio_ptep);
+		if (!is_swap_pte(folio_pte) || non_swap_entry(pte_to_swp_entry(folio_pte)) ||
+		    swap_pte_batch(folio_ptep, nr, folio_pte, &any_swap_shared) != nr)
+			goto check_pte;
+
+		start_address = folio_start;
+		start_pte = folio_ptep;
+		nr_pages = nr;
+		entry = folio->swap;
+		page = &folio->page;
+	}
+
+check_pte:
 	if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
 		goto out_nomap;
 
@@ -4190,6 +4223,10 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 			 */
 			exclusive = false;
 		}
+
+		/* Reuse the whole large folio iff all entries are exclusive */
+		if (nr_pages > 1 && any_swap_shared)
+			exclusive = false;
 	}
 
 	/*
@@ -4204,12 +4241,14 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 	 * We're already holding a reference on the page but haven't mapped it
 	 * yet.
 	 */
-	swap_free(entry);
+	swap_free_nr(entry, nr_pages);
 	if (should_try_to_free_swap(folio, vma, vmf->flags))
 		folio_free_swap(folio);
 
-	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
-	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+	folio_ref_add(folio, nr_pages - 1);
+	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+	add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
+
 	pte = mk_pte(page, vma->vm_page_prot);
 
 	/*
@@ -4219,33 +4258,34 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 	 * exclusivity.
 	 */
 	if (!folio_test_ksm(folio) &&
-	    (exclusive || folio_ref_count(folio) == 1)) {
+	    (exclusive || (folio_ref_count(folio) == nr_pages &&
+			   folio_nr_pages(folio) == nr_pages))) {
 		if (vmf->flags & FAULT_FLAG_WRITE) {
 			pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 			vmf->flags &= ~FAULT_FLAG_WRITE;
 		}
 		rmap_flags |= RMAP_EXCLUSIVE;
 	}
-	flush_icache_page(vma, page);
+	flush_icache_pages(vma, page, nr_pages);
 	if (pte_swp_soft_dirty(vmf->orig_pte))
 		pte = pte_mksoft_dirty(pte);
 	if (pte_swp_uffd_wp(vmf->orig_pte))
 		pte = pte_mkuffd_wp(pte);
-	vmf->orig_pte = pte;
 
 	/* ksm created a completely new copy */
 	if (unlikely(folio != swapcache && swapcache)) {
-		folio_add_new_anon_rmap(folio, vma, vmf->address);
+		folio_add_new_anon_rmap(folio, vma, start_address);
 		folio_add_lru_vma(folio, vma);
 	} else {
-		folio_add_anon_rmap_pte(folio, page, vma, vmf->address,
-					rmap_flags);
+		folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, start_address,
+					 rmap_flags);
 	}
 
 	VM_BUG_ON(!folio_test_anon(folio) ||
 			(pte_write(pte) && !PageAnonExclusive(page)));
-	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
-	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
+	set_ptes(vma->vm_mm, start_address, start_pte, pte, nr_pages);
+	vmf->orig_pte = ptep_get(vmf->pte);
+	arch_do_swap_page(vma->vm_mm, vma, start_address, pte, pte);
 
 	folio_unlock(folio);
 	if (folio != swapcache && swapcache) {
@@ -4269,7 +4309,7 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 	}
 
 	/* No need to invalidate - it was non-present before */
-	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
+	update_mmu_cache_range(vmf, vma, start_address, start_pte, nr_pages);
 unlock:
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);