diff mbox series

[v4,1/4] mm: Don't skip swap entry even if zap_details specified

Message ID 20220216094810.60572-2-peterx@redhat.com (mailing list archive)
State New
Headers show
Series mm: Rework zap ptes on swap entries | expand

Commit Message

Peter Xu Feb. 16, 2022, 9:48 a.m. UTC
The "details" pointer shouldn't be the token to decide whether we should skip
swap entries.  For example, when the user specified details->zap_mapping==NULL,
it means the user wants to zap all the pages (including COWed pages), then we
need to look into swap entries because there can be private COWed pages that
was swapped out.

Skipping some swap entries when details is non-NULL may lead to wrongly leaving
some of the swap entries while we should have zapped them.

A reproducer of the problem:

===8<===
        #define _GNU_SOURCE         /* See feature_test_macros(7) */
        #include <stdio.h>
        #include <assert.h>
        #include <unistd.h>
        #include <sys/mman.h>
        #include <sys/types.h>

        int page_size;
        int shmem_fd;
        char *buffer;

        void main(void)
        {
                int ret;
                char val;

                page_size = getpagesize();
                shmem_fd = memfd_create("test", 0);
                assert(shmem_fd >= 0);

                ret = ftruncate(shmem_fd, page_size * 2);
                assert(ret == 0);

                buffer = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
                                MAP_PRIVATE, shmem_fd, 0);
                assert(buffer != MAP_FAILED);

                /* Write private page, swap it out */
                buffer[page_size] = 1;
                madvise(buffer, page_size * 2, MADV_PAGEOUT);

                /* This should drop private buffer[page_size] already */
                ret = ftruncate(shmem_fd, page_size);
                assert(ret == 0);
                /* Recover the size */
                ret = ftruncate(shmem_fd, page_size * 2);
                assert(ret == 0);

                /* Re-read the data, it should be all zero */
                val = buffer[page_size];
                if (val == 0)
                        printf("Good\n");
                else
                        printf("BUG\n");
        }
===8<===

We don't need to touch up the pmd path, because pmd never had a issue with swap
entries.  For example, shmem pmd migration will always be split into pte level,
and same to swapping on anonymous.

Add another helper should_zap_cows() so that we can also check whether we
should zap private mappings when there's no page pointer specified.

This patch drops that trick, so we handle swap ptes coherently.  Meanwhile we
should do the same check upon migration entry, hwpoison entry and genuine swap
entries too.  To be explicit, we should still remember to keep the private
entries if even_cows==false, and always zap them when even_cows==true.

The issue seems to exist starting from the initial commit of git.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 mm/memory.c | 45 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 9 deletions(-)

Comments

Andrew Morton Feb. 17, 2022, 2:54 a.m. UTC | #1
On Wed, 16 Feb 2022 17:48:07 +0800 Peter Xu <peterx@redhat.com> wrote:

> The "details" pointer shouldn't be the token to decide whether we should skip
> swap entries.  For example, when the user specified details->zap_mapping==NULL,
> it means the user wants to zap all the pages (including COWed pages), then we
> need to look into swap entries because there can be private COWed pages that
> was swapped out.

I assume "user" here means "caller".

> Skipping some swap entries when details is non-NULL may lead to wrongly leaving
> some of the swap entries while we should have zapped them.
> 
> A reproducer of the problem:
> 
> ...
>
> The issue seems to exist starting from the initial commit of git.

I'll add cc:stable to this one, thanks.
John Hubbard Feb. 17, 2022, 3:15 a.m. UTC | #2
On 2/16/22 1:48 AM, Peter Xu wrote:
> The "details" pointer shouldn't be the token to decide whether we should skip
> swap entries.  For example, when the user specified details->zap_mapping==NULL,
> it means the user wants to zap all the pages (including COWed pages), then we
> need to look into swap entries because there can be private COWed pages that
> was swapped out.

Hi Peter,

The changes look good, just some minor readability suggestions below:

(btw, hch is going to ask you to reflow all of the commit descriptions
to 72 cols, so you might as well do it in advance. :)

> 
> Skipping some swap entries when details is non-NULL may lead to wrongly leaving
> some of the swap entries while we should have zapped them.
> 
> A reproducer of the problem:
> 
> ===8<===
>         #define _GNU_SOURCE         /* See feature_test_macros(7) */
>         #include <stdio.h>
>         #include <assert.h>
>         #include <unistd.h>
>         #include <sys/mman.h>
>         #include <sys/types.h>
> 
>         int page_size;
>         int shmem_fd;
>         char *buffer;
> 
>         void main(void)
>         {
>                 int ret;
>                 char val;
> 
>                 page_size = getpagesize();
>                 shmem_fd = memfd_create("test", 0);
>                 assert(shmem_fd >= 0);
> 
>                 ret = ftruncate(shmem_fd, page_size * 2);
>                 assert(ret == 0);
> 
>                 buffer = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
>                                 MAP_PRIVATE, shmem_fd, 0);
>                 assert(buffer != MAP_FAILED);
> 
>                 /* Write private page, swap it out */
>                 buffer[page_size] = 1;
>                 madvise(buffer, page_size * 2, MADV_PAGEOUT);
> 
>                 /* This should drop private buffer[page_size] already */
>                 ret = ftruncate(shmem_fd, page_size);
>                 assert(ret == 0);
>                 /* Recover the size */
>                 ret = ftruncate(shmem_fd, page_size * 2);
>                 assert(ret == 0);
> 
>                 /* Re-read the data, it should be all zero */
>                 val = buffer[page_size];
>                 if (val == 0)
>                         printf("Good\n");
>                 else
>                         printf("BUG\n");
>         }
> ===8<===
> 
> We don't need to touch up the pmd path, because pmd never had a issue with swap
> entries.  For example, shmem pmd migration will always be split into pte level,
> and same to swapping on anonymous.
> 
> Add another helper should_zap_cows() so that we can also check whether we
> should zap private mappings when there's no page pointer specified.
> 
> This patch drops that trick, so we handle swap ptes coherently.  Meanwhile we
> should do the same check upon migration entry, hwpoison entry and genuine swap
> entries too.  To be explicit, we should still remember to keep the private
> entries if even_cows==false, and always zap them when even_cows==true.
> 
> The issue seems to exist starting from the initial commit of git.
> 
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  mm/memory.c | 45 ++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 36 insertions(+), 9 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index c125c4969913..4bfeaca7cbc7 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1313,6 +1313,17 @@ struct zap_details {
>  	struct folio *single_folio;	/* Locked folio to be unmapped */
>  };
>  
> +/* Whether we should zap all COWed (private) pages too */
> +static inline bool should_zap_cows(struct zap_details *details)
> +{
> +	/* By default, zap all pages */
> +	if (!details)
> +		return true;
> +
> +	/* Or, we zap COWed pages only if the caller wants to */
> +	return !details->zap_mapping;
> +}
> +
>  /*
>   * We set details->zap_mapping when we want to unmap shared but keep private
>   * pages. Return true if skip zapping this page, false otherwise.
> @@ -1320,11 +1331,15 @@ struct zap_details {
>  static inline bool
>  zap_skip_check_mapping(struct zap_details *details, struct page *page)
>  {
> -	if (!details || !page)
> +	/* If we can make a decision without *page.. */
> +	if (should_zap_cows(details))
>  		return false;
>  
> -	return details->zap_mapping &&
> -		(details->zap_mapping != page_rmapping(page));
> +	/* E.g. zero page */

It's a bit confusing to see a comment that "this could be the zero page", if 
the value is NULL. Maybe, "the caller passes NULL for the case of a zero 
page", or something along those lines? 


> +	if (!page)
> +		return false;
> +
> +	return details->zap_mapping != page_rmapping(page);
>  }
>  
>  static unsigned long zap_pte_range(struct mmu_gather *tlb,
> @@ -1405,17 +1420,29 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  			continue;
>  		}
>  
> -		/* If details->check_mapping, we leave swap entries. */
> -		if (unlikely(details))
> -			continue;
> -
> -		if (!non_swap_entry(entry))
> +		if (!non_swap_entry(entry)) {
> +			/*
> +			 * If this is a genuine swap entry, then it must be an
> +			 * private anon page.  If the caller wants to skip
> +			 * COWed pages, ignore it.
> +			 */

How about this instead:

			/* Genuine swap entry, and therefore a private anon page. */

> +			if (!should_zap_cows(details))
> +				continue;
>  			rss[MM_SWAPENTS]--;
> -		else if (is_migration_entry(entry)) {

Can we put a newline here, and before each "else" block? Because now it
is getting very dense, and the visual separation really helps.

> +		} else if (is_migration_entry(entry)) {
>  			struct page *page;
>  
>  			page = pfn_swap_entry_to_page(entry);
> +			if (zap_skip_check_mapping(details, page))
> +				continue;
>  			rss[mm_counter(page)]--;

Newline here.

> +		} else if (is_hwpoison_entry(entry)) {
> +			/* If the caller wants to skip COWed pages, ignore it */

Likewise, I'd prefer we delete that comment, because it exactly matches 
what the following line of code says.

> +			if (!should_zap_cows(details))
> +				continue;

And newline here too.

> +		} else {
> +			/* We should have covered all the swap entry types */
> +			WARN_ON_ONCE(1);
>  		}
>  		if (unlikely(!free_swap_and_cache(entry)))
>  			print_bad_pte(vma, addr, ptent, NULL);

Those are all just nits, and as I mentioned, the actual changes look good
to me, so:

Reviewed-by: John Hubbard <jhubbard@nvidia.com>

thanks,
Peter Xu Feb. 17, 2022, 4:22 a.m. UTC | #3
On Wed, Feb 16, 2022 at 06:54:55PM -0800, Andrew Morton wrote:
> On Wed, 16 Feb 2022 17:48:07 +0800 Peter Xu <peterx@redhat.com> wrote:
> 
> > The "details" pointer shouldn't be the token to decide whether we should skip
> > swap entries.  For example, when the user specified details->zap_mapping==NULL,
> > it means the user wants to zap all the pages (including COWed pages), then we
> > need to look into swap entries because there can be private COWed pages that
> > was swapped out.
> 
> I assume "user" here means "caller".

Right.

> 
> > Skipping some swap entries when details is non-NULL may lead to wrongly leaving
> > some of the swap entries while we should have zapped them.
> > 
> > A reproducer of the problem:
> > 
> > ...
> >
> > The issue seems to exist starting from the initial commit of git.
> 
> I'll add cc:stable to this one, thanks.

Yes, thanks.

I didn't yet check the backports to stable yet, some of them might need some
tweaks on the patch.  I'll follow that up when it hits it.
Peter Xu Feb. 17, 2022, 4:46 a.m. UTC | #4
On Wed, Feb 16, 2022 at 07:15:30PM -0800, John Hubbard wrote:
> On 2/16/22 1:48 AM, Peter Xu wrote:
> > The "details" pointer shouldn't be the token to decide whether we should skip
> > swap entries.  For example, when the user specified details->zap_mapping==NULL,
> > it means the user wants to zap all the pages (including COWed pages), then we
> > need to look into swap entries because there can be private COWed pages that
> > was swapped out.
> 
> Hi Peter,
> 
> The changes look good, just some minor readability suggestions below:
> 
> (btw, hch is going to ask you to reflow all of the commit descriptions
> to 72 cols, so you might as well do it in advance. :)

Thanks for the heads-up. :)

I personally used 78/79 col width for a long time for different projects, but
sure I can adjust my config.  I found that the "official guide" points us to
75 instead:

https://www.kernel.org/doc/html/v4.17/process/submitting-patches.html

  The body of the explanation, line wrapped at 75 columns, which will be copied
  to the permanent changelog to describe this patch.

I'll follow that.

[...]

> > @@ -1320,11 +1331,15 @@ struct zap_details {
> >  static inline bool
> >  zap_skip_check_mapping(struct zap_details *details, struct page *page)
> >  {
> > -	if (!details || !page)
> > +	/* If we can make a decision without *page.. */
> > +	if (should_zap_cows(details))
> >  		return false;
> >  
> > -	return details->zap_mapping &&
> > -		(details->zap_mapping != page_rmapping(page));
> > +	/* E.g. zero page */
> 
> It's a bit confusing to see a comment that "this could be the zero page", if 
> the value is NULL. Maybe, "the caller passes NULL for the case of a zero 
> page", or something along those lines? 

It didn't show much difference here.. but for sure I can coordinate.

> 
> 
> > +	if (!page)
> > +		return false;
> > +
> > +	return details->zap_mapping != page_rmapping(page);
> >  }
> >  
> >  static unsigned long zap_pte_range(struct mmu_gather *tlb,
> > @@ -1405,17 +1420,29 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> >  			continue;
> >  		}
> >  
> > -		/* If details->check_mapping, we leave swap entries. */
> > -		if (unlikely(details))
> > -			continue;
> > -
> > -		if (!non_swap_entry(entry))
> > +		if (!non_swap_entry(entry)) {
> > +			/*
> > +			 * If this is a genuine swap entry, then it must be an
> > +			 * private anon page.  If the caller wants to skip
> > +			 * COWed pages, ignore it.
> > +			 */
> 
> How about this instead:
> 
> 			/* Genuine swap entry, and therefore a private anon page. */

Yes the last sentence is kind of redundant.

> 
> > +			if (!should_zap_cows(details))
> > +				continue;
> >  			rss[MM_SWAPENTS]--;
> > -		else if (is_migration_entry(entry)) {
> 
> Can we put a newline here, and before each "else" block? Because now it
> is getting very dense, and the visual separation really helps.

The thing is we don't have a rule to add empty lines here, or do we?  Changing
it could make it less like what we have had.

Personally it looks fine, because separations are done with either new lines or
indents.  Here it's done via indents, IMHO.

> 
> > +		} else if (is_migration_entry(entry)) {
> >  			struct page *page;
> >  
> >  			page = pfn_swap_entry_to_page(entry);
> > +			if (zap_skip_check_mapping(details, page))
> > +				continue;
> >  			rss[mm_counter(page)]--;
> 
> Newline here.
> 
> > +		} else if (is_hwpoison_entry(entry)) {
> > +			/* If the caller wants to skip COWed pages, ignore it */
> 
> Likewise, I'd prefer we delete that comment, because it exactly matches 
> what the following line of code says.

Will do.

> 
> > +			if (!should_zap_cows(details))
> > +				continue;
> 
> And newline here too.
> 
> > +		} else {
> > +			/* We should have covered all the swap entry types */
> > +			WARN_ON_ONCE(1);
> >  		}
> >  		if (unlikely(!free_swap_and_cache(entry)))
> >  			print_bad_pte(vma, addr, ptent, NULL);
> 
> Those are all just nits, and as I mentioned, the actual changes look good
> to me, so:
> 
> Reviewed-by: John Hubbard <jhubbard@nvidia.com>

Thanks,
John Hubbard Feb. 17, 2022, 5:45 a.m. UTC | #5
On 2/16/22 20:46, Peter Xu wrote:
...
> I personally used 78/79 col width for a long time for different projects, but
> sure I can adjust my config.  I found that the "official guide" points us to
> 75 instead:
> 
> https://www.kernel.org/doc/html/v4.17/process/submitting-patches.html
> 
>    The body of the explanation, line wrapped at 75 columns, which will be copied
>    to the permanent changelog to describe this patch.
> 
> I'll follow that.
> 

Oh, I'll change to that too, then. Thanks for spotting that.

> [...]
>>> +			if (!should_zap_cows(details))
>>> +				continue;
>>>   			rss[MM_SWAPENTS]--;
>>> -		else if (is_migration_entry(entry)) {
>>
>> Can we put a newline here, and before each "else" block? Because now it
>> is getting very dense, and the visual separation really helps.
> 
> The thing is we don't have a rule to add empty lines here, or do we?  Changing
> it could make it less like what we have had.
> 

I'm not claiming rules or standards, this is just a special case of trying
to make it a little nicer. No big deal either way, of course.


thanks,
diff mbox series

Patch

diff --git a/mm/memory.c b/mm/memory.c
index c125c4969913..4bfeaca7cbc7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1313,6 +1313,17 @@  struct zap_details {
 	struct folio *single_folio;	/* Locked folio to be unmapped */
 };
 
+/* Whether we should zap all COWed (private) pages too */
+static inline bool should_zap_cows(struct zap_details *details)
+{
+	/* By default, zap all pages */
+	if (!details)
+		return true;
+
+	/* Or, we zap COWed pages only if the caller wants to */
+	return !details->zap_mapping;
+}
+
 /*
  * We set details->zap_mapping when we want to unmap shared but keep private
  * pages. Return true if skip zapping this page, false otherwise.
@@ -1320,11 +1331,15 @@  struct zap_details {
 static inline bool
 zap_skip_check_mapping(struct zap_details *details, struct page *page)
 {
-	if (!details || !page)
+	/* If we can make a decision without *page.. */
+	if (should_zap_cows(details))
 		return false;
 
-	return details->zap_mapping &&
-		(details->zap_mapping != page_rmapping(page));
+	/* E.g. zero page */
+	if (!page)
+		return false;
+
+	return details->zap_mapping != page_rmapping(page);
 }
 
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1405,17 +1420,29 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			continue;
 		}
 
-		/* If details->check_mapping, we leave swap entries. */
-		if (unlikely(details))
-			continue;
-
-		if (!non_swap_entry(entry))
+		if (!non_swap_entry(entry)) {
+			/*
+			 * If this is a genuine swap entry, then it must be an
+			 * private anon page.  If the caller wants to skip
+			 * COWed pages, ignore it.
+			 */
+			if (!should_zap_cows(details))
+				continue;
 			rss[MM_SWAPENTS]--;
-		else if (is_migration_entry(entry)) {
+		} else if (is_migration_entry(entry)) {
 			struct page *page;
 
 			page = pfn_swap_entry_to_page(entry);
+			if (zap_skip_check_mapping(details, page))
+				continue;
 			rss[mm_counter(page)]--;
+		} else if (is_hwpoison_entry(entry)) {
+			/* If the caller wants to skip COWed pages, ignore it */
+			if (!should_zap_cows(details))
+				continue;
+		} else {
+			/* We should have covered all the swap entry types */
+			WARN_ON_ONCE(1);
 		}
 		if (unlikely(!free_swap_and_cache(entry)))
 			print_bad_pte(vma, addr, ptent, NULL);