diff mbox series

[v2,5/5] mm: Add ZAP_FLAG_SKIP_SWAP and zap_flags

Message ID 20210902201836.53605-1-peterx@redhat.com (mailing list archive)
State New
Headers show
Series mm: A few cleanup patches around zap, shmem and uffd | expand

Commit Message

Peter Xu Sept. 2, 2021, 8:18 p.m. UTC
Firstly, the comment in zap_pte_range() is misleading because it checks against
details rather than check_mappings, so it's against what the code did.

Meanwhile, it's confusing too on not explaining why passing in the details
pointer would mean to skip all swap entries.  New user of zap_details could
very possibly miss this fact if they don't read deep until zap_pte_range()
because there's no comment at zap_details talking about it at all, so swap
entries could be errornously skipped without being noticed.

Actually very recently we introduced unmap_mapping_page() in 22061a1ffabd, I
think that should also look into swap entries.  Add a comment there.  IOW, this
patch will be a functional change to unmap_mapping_page() but hopefully in the
right way to do it.

This partly reverts 3e8715fdc03e ("mm: drop zap_details::check_swap_entries"),
but introduce ZAP_FLAG_SKIP_SWAP flag, which means the opposite of previous
"details" parameter: the caller should explicitly set this to skip swap
entries, otherwise swap entries will always be considered (which should still
be the major case here).

Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/linux/mm.h | 16 ++++++++++++++++
 mm/memory.c        |  6 +++---
 2 files changed, 19 insertions(+), 3 deletions(-)

Comments

David Hildenbrand Sept. 3, 2021, 7:25 a.m. UTC | #1
On 02.09.21 22:18, Peter Xu wrote:
> Firstly, the comment in zap_pte_range() is misleading because it checks against
> details rather than check_mappings, so it's against what the code did.
> 
> Meanwhile, it's confusing too on not explaining why passing in the details

s/on//

> pointer would mean to skip all swap entries.  New user of zap_details could
> very possibly miss this fact if they don't read deep until zap_pte_range()
> because there's no comment at zap_details talking about it at all, so swap
> entries could be errornously skipped without being noticed.

s/errornously/erroneously/

> 
> Actually very recently we introduced unmap_mapping_page() in 22061a1ffabd, I
> think that should also look into swap entries.  Add a comment there.  IOW, this
> patch will be a functional change to unmap_mapping_page() but hopefully in the
> right way to do it.
> 
> This partly reverts 3e8715fdc03e ("mm: drop zap_details::check_swap_entries"),
> but introduce ZAP_FLAG_SKIP_SWAP flag, which means the opposite of previous
> "details" parameter: the caller should explicitly set this to skip swap
> entries, otherwise swap entries will always be considered (which should still
> be the major case here).
> 
> Cc: Kirill A. Shutemov <kirill@shutemov.name>
> Cc: Hugh Dickins <hughd@google.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>   include/linux/mm.h | 16 ++++++++++++++++
>   mm/memory.c        |  6 +++---
>   2 files changed, 19 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 81e402a5fbc9..a7bcdb2ec956 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1716,12 +1716,18 @@ static inline bool can_do_mlock(void) { return false; }
>   extern int user_shm_lock(size_t, struct ucounts *);
>   extern void user_shm_unlock(size_t, struct ucounts *);
>   
> +typedef unsigned int __bitwise zap_flags_t;
> +
> +/* Whether to skip zapping swap entries */
> +#define  ZAP_FLAG_SKIP_SWAP  ((__force zap_flags_t) BIT(0))

Interestingly, this will also skip fake some swap entries (e.g., 
migration entries but not private/exclusive entries). Maybe extend that 
documentation a bit.

... but, looking into zap_pmd_range(), we don't care about "details" 
when calling zap_huge_pmd(), which will zap pmd migration entries IIUC 
... so it's really unclear to me what the flag (and current behavior) 
really is and what should be documented. Should we maybe really only 
care about "real" swap entries?

Most probably I'm just missing something important.

> +
>   /*
>    * Parameter block passed down to zap_pte_range in exceptional cases.
>    */
>   struct zap_details {
>   	struct address_space *zap_mapping;	/* Check page->mapping if set */
>   	struct page *single_page;		/* Locked page to be unmapped */
> +	zap_flags_t zap_flags;			/* Extra flags for zapping */
>   };
>   
>   /*
> @@ -1737,6 +1743,16 @@ zap_skip_check_mapping(struct zap_details *details, struct page *page)
>   	return details->zap_mapping != page_rmapping(page);
>   }
>   
> +/* Return true if skip swap entries, false otherwise */
> +static inline bool
> +zap_skip_swap(struct zap_details *details)
> +{
> +	if (!details)
> +		return false;
> +
> +	return details->zap_flags & ZAP_FLAG_SKIP_SWAP;
> +}
> +
>   struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
>   			     pte_t pte);
>   struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
> diff --git a/mm/memory.c b/mm/memory.c
> index e5ee8399d270..4cb269ca8249 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1379,8 +1379,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>   			continue;
>   		}
>   
> -		/* If details->check_mapping, we leave swap entries. */
> -		if (unlikely(details))
> +		if (unlikely(zap_skip_swap(details)))
>   			continue;
>   
>   		if (!non_swap_entry(entry))
> @@ -3351,6 +3350,7 @@ void unmap_mapping_page(struct page *page)
>   	first_index = page->index;
>   	last_index = page->index + thp_nr_pages(page) - 1;
>   
> +	/* Keep ZAP_FLAG_SKIP_SWAP cleared because we're truncating */
>   	details.zap_mapping = mapping;
>   	details.single_page = page;
>   
> @@ -3377,7 +3377,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
>   		pgoff_t nr, bool even_cows)
>   {
>   	pgoff_t	first_index = start, last_index = start + nr - 1;
> -	struct zap_details details = { };
> +	struct zap_details details = { .zap_flags = ZAP_FLAG_SKIP_SWAP };
>   
>   	details.zap_mapping = even_cows ? NULL : mapping;
>   	if (last_index < first_index)
> 

I think what would really help is to add a high-level description what 
unmap_mapping_page() vs. unmap_mapping_pages() really does, and what the 
expectations/use cases are. The names are just way too similar ...

I wonder if it would make sense to split this into two parts

a) Introduce ZAP_FLAG_SKIP_SWAP and use it accordingly for existing cases
b) Stop setting it for unmap_mapping_page()

So we'd have the change in behavior isolated. But not sure if it's worth 
the trouble, especially if we want to backport the fix ...
David Hildenbrand Sept. 3, 2021, 7:31 a.m. UTC | #2
>> @@ -3377,7 +3377,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
>>    		pgoff_t nr, bool even_cows)
>>    {
>>    	pgoff_t	first_index = start, last_index = start + nr - 1;
>> -	struct zap_details details = { };
>> +	struct zap_details details = { .zap_flags = ZAP_FLAG_SKIP_SWAP };
>>    
>>    	details.zap_mapping = even_cows ? NULL : mapping;
>>    	if (last_index < first_index)
>>
> 
> I think what would really help is to add a high-level description what
> unmap_mapping_page() vs. unmap_mapping_pages() really does, and what the
> expectations/use cases are. The names are just way too similar ...

aaaaand staring only at this patch I missed that we have nice 
descriptions already :)
Peter Xu Sept. 8, 2021, 12:43 a.m. UTC | #3
On Fri, Sep 03, 2021 at 09:25:51AM +0200, David Hildenbrand wrote:
> On 02.09.21 22:18, Peter Xu wrote:
> > Firstly, the comment in zap_pte_range() is misleading because it checks against
> > details rather than check_mappings, so it's against what the code did.
> > 
> > Meanwhile, it's confusing too on not explaining why passing in the details
> 
> s/on//
> 
> > pointer would mean to skip all swap entries.  New user of zap_details could
> > very possibly miss this fact if they don't read deep until zap_pte_range()
> > because there's no comment at zap_details talking about it at all, so swap
> > entries could be errornously skipped without being noticed.
> 
> s/errornously/erroneously/

Will fix, thanks.

> 
> > 
> > Actually very recently we introduced unmap_mapping_page() in 22061a1ffabd, I
> > think that should also look into swap entries.  Add a comment there.  IOW, this
> > patch will be a functional change to unmap_mapping_page() but hopefully in the
> > right way to do it.

I'll remove this paragraph, as explained elsewhere.

> > 
> > This partly reverts 3e8715fdc03e ("mm: drop zap_details::check_swap_entries"),
> > but introduce ZAP_FLAG_SKIP_SWAP flag, which means the opposite of previous
> > "details" parameter: the caller should explicitly set this to skip swap
> > entries, otherwise swap entries will always be considered (which should still
> > be the major case here).
> > 
> > Cc: Kirill A. Shutemov <kirill@shutemov.name>
> > Cc: Hugh Dickins <hughd@google.com>
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >   include/linux/mm.h | 16 ++++++++++++++++
> >   mm/memory.c        |  6 +++---
> >   2 files changed, 19 insertions(+), 3 deletions(-)
> > 
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 81e402a5fbc9..a7bcdb2ec956 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1716,12 +1716,18 @@ static inline bool can_do_mlock(void) { return false; }
> >   extern int user_shm_lock(size_t, struct ucounts *);
> >   extern void user_shm_unlock(size_t, struct ucounts *);
> > +typedef unsigned int __bitwise zap_flags_t;
> > +
> > +/* Whether to skip zapping swap entries */
> > +#define  ZAP_FLAG_SKIP_SWAP  ((__force zap_flags_t) BIT(0))
> 
> Interestingly, this will also skip fake some swap entries (e.g., migration
> entries but not private/exclusive entries). Maybe extend that documentation
> a bit.
> 
> ... but, looking into zap_pmd_range(), we don't care about "details" when
> calling zap_huge_pmd(), which will zap pmd migration entries IIUC ... so
> it's really unclear to me what the flag (and current behavior) really is and
> what should be documented. Should we maybe really only care about "real"
> swap entries?

Indeed, I tried to look into it today and see why we wanted to skip swap
entries but I failed to figure it out easily - it goes back to the 1st git
commit of Linux.

Maybe there'll be some experienced developer who knows the history, but before
that I decided to just keep the behavior.

The final goal of mine is to make the code clean and also prepares for the
uffd-wp that will allow free-style use of "details" pointer, rather than have
an implicit hint on "skip swap entry" then it'll be enough for this patch.

> 
> Most probably I'm just missing something important.
> 
> > +
> >   /*
> >    * Parameter block passed down to zap_pte_range in exceptional cases.
> >    */
> >   struct zap_details {
> >   	struct address_space *zap_mapping;	/* Check page->mapping if set */
> >   	struct page *single_page;		/* Locked page to be unmapped */
> > +	zap_flags_t zap_flags;			/* Extra flags for zapping */
> >   };
> >   /*
> > @@ -1737,6 +1743,16 @@ zap_skip_check_mapping(struct zap_details *details, struct page *page)
> >   	return details->zap_mapping != page_rmapping(page);
> >   }
> > +/* Return true if skip swap entries, false otherwise */
> > +static inline bool
> > +zap_skip_swap(struct zap_details *details)
> > +{
> > +	if (!details)
> > +		return false;
> > +
> > +	return details->zap_flags & ZAP_FLAG_SKIP_SWAP;
> > +}
> > +
> >   struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
> >   			     pte_t pte);
> >   struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
> > diff --git a/mm/memory.c b/mm/memory.c
> > index e5ee8399d270..4cb269ca8249 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -1379,8 +1379,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> >   			continue;
> >   		}
> > -		/* If details->check_mapping, we leave swap entries. */
> > -		if (unlikely(details))
> > +		if (unlikely(zap_skip_swap(details)))
> >   			continue;
> >   		if (!non_swap_entry(entry))
> > @@ -3351,6 +3350,7 @@ void unmap_mapping_page(struct page *page)
> >   	first_index = page->index;
> >   	last_index = page->index + thp_nr_pages(page) - 1;
> > +	/* Keep ZAP_FLAG_SKIP_SWAP cleared because we're truncating */

As to keep the behavior, I shouldn't fiddle around with this, so I'll also
attach ZAP_FLAG_SKIP_SWAP to unmap_mapping_page() too in v3.

> >   	details.zap_mapping = mapping;
> >   	details.single_page = page;

Thanks,
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 81e402a5fbc9..a7bcdb2ec956 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1716,12 +1716,18 @@  static inline bool can_do_mlock(void) { return false; }
 extern int user_shm_lock(size_t, struct ucounts *);
 extern void user_shm_unlock(size_t, struct ucounts *);
 
+typedef unsigned int __bitwise zap_flags_t;
+
+/* Whether to skip zapping swap entries */
+#define  ZAP_FLAG_SKIP_SWAP  ((__force zap_flags_t) BIT(0))
+
 /*
  * Parameter block passed down to zap_pte_range in exceptional cases.
  */
 struct zap_details {
 	struct address_space *zap_mapping;	/* Check page->mapping if set */
 	struct page *single_page;		/* Locked page to be unmapped */
+	zap_flags_t zap_flags;			/* Extra flags for zapping */
 };
 
 /*
@@ -1737,6 +1743,16 @@  zap_skip_check_mapping(struct zap_details *details, struct page *page)
 	return details->zap_mapping != page_rmapping(page);
 }
 
+/* Return true if skip swap entries, false otherwise */
+static inline bool
+zap_skip_swap(struct zap_details *details)
+{
+	if (!details)
+		return false;
+
+	return details->zap_flags & ZAP_FLAG_SKIP_SWAP;
+}
+
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
diff --git a/mm/memory.c b/mm/memory.c
index e5ee8399d270..4cb269ca8249 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1379,8 +1379,7 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			continue;
 		}
 
-		/* If details->check_mapping, we leave swap entries. */
-		if (unlikely(details))
+		if (unlikely(zap_skip_swap(details)))
 			continue;
 
 		if (!non_swap_entry(entry))
@@ -3351,6 +3350,7 @@  void unmap_mapping_page(struct page *page)
 	first_index = page->index;
 	last_index = page->index + thp_nr_pages(page) - 1;
 
+	/* Keep ZAP_FLAG_SKIP_SWAP cleared because we're truncating */
 	details.zap_mapping = mapping;
 	details.single_page = page;
 
@@ -3377,7 +3377,7 @@  void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
 		pgoff_t nr, bool even_cows)
 {
 	pgoff_t	first_index = start, last_index = start + nr - 1;
-	struct zap_details details = { };
+	struct zap_details details = { .zap_flags = ZAP_FLAG_SKIP_SWAP };
 
 	details.zap_mapping = even_cows ? NULL : mapping;
 	if (last_index < first_index)