diff mbox series

[v7,3/8] mm/rmap: Split try_to_munlock from try_to_unmap

Message ID 20210326000805.2518-4-apopple@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Add support for SVM atomics in Nouveau | expand

Commit Message

Alistair Popple March 26, 2021, 12:08 a.m. UTC
The behaviour of try_to_unmap_one() is difficult to follow because it
performs different operations based on a fairly large set of flags used
in different combinations.

TTU_MUNLOCK is one such flag. However it is exclusively used by
try_to_munlock() which specifies no other flags. Therefore rather than
overload try_to_unmap_one() with unrelated behaviour split this out into
it's own function and remove the flag.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>

---

v7:
* Added Christoph's Reviewed-by

v4:
* Removed redundant check for VM_LOCKED
---
 include/linux/rmap.h |  1 -
 mm/rmap.c            | 40 ++++++++++++++++++++++++++++++++--------
 2 files changed, 32 insertions(+), 9 deletions(-)

Comments

Jason Gunthorpe March 30, 2021, 6:49 p.m. UTC | #1
On Fri, Mar 26, 2021 at 11:08:00AM +1100, Alistair Popple wrote:

> +static bool try_to_munlock_one(struct page *page, struct vm_area_struct *vma,
> +		     unsigned long address, void *arg)
> +{

Is this function name right?

> +	struct page_vma_mapped_walk pvmw = {
> +		.page = page,
> +		.vma = vma,
> +		.address = address,
> +	};
> +
> +	/* munlock has nothing to gain from examining un-locked vmas */
> +	if (!(vma->vm_flags & VM_LOCKED))
> +		return true;
> +
> +	while (page_vma_mapped_walk(&pvmw)) {
> +		/* PTE-mapped THP are never mlocked */
> +		if (!PageTransCompound(page)) {
> +			/*
> +			 * Holding pte lock, we do *not* need
> +			 * mmap_lock here
> +			 */
> +			mlock_vma_page(page);

Because the only action this function seems to take is to call
*mlock*_vma_page()

> +		}
> +		page_vma_mapped_walk_done(&pvmw);
> +
> +		/* found a mlocked page, no point continuing munlock check */
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
>  /**
>   * try_to_munlock - try to munlock a page
>   * @page: the page to be munlocked
> @@ -1796,8 +1821,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
>  void try_to_munlock(struct page *page)
>  {

But this is also called try_to_munlock ??

/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code.  Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 */

So what clears PG_mlocked on this call path?

Something needs attention here..

Jason
Alistair Popple March 30, 2021, 10:09 p.m. UTC | #2
On Wednesday, 31 March 2021 5:49:03 AM AEDT Jason Gunthorpe wrote:
> On Fri, Mar 26, 2021 at 11:08:00AM +1100, Alistair Popple wrote:
> 
> > +static bool try_to_munlock_one(struct page *page, struct vm_area_struct 
*vma,
> > +		     unsigned long address, void *arg)
> > +{
> 
> Is this function name right?

Perhaps. This is called from try_to_munlock() hence the name, but see below 
for some commentary on that naming.

> > +	struct page_vma_mapped_walk pvmw = {
> > +		.page = page,
> > +		.vma = vma,
> > +		.address = address,
> > +	};
> > +
> > +	/* munlock has nothing to gain from examining un-locked vmas */
> > +	if (!(vma->vm_flags & VM_LOCKED))
> > +		return true;
> > +
> > +	while (page_vma_mapped_walk(&pvmw)) {
> > +		/* PTE-mapped THP are never mlocked */
> > +		if (!PageTransCompound(page)) {
> > +			/*
> > +			 * Holding pte lock, we do *not* need
> > +			 * mmap_lock here
> > +			 */
> > +			mlock_vma_page(page);
> 
> Because the only action this function seems to take is to call
> *mlock*_vma_page()
> 
> > +		}
> > +		page_vma_mapped_walk_done(&pvmw);
> > +
> > +		/* found a mlocked page, no point continuing munlock check */
> > +		return false;
> > +	}
> > +
> > +	return true;
> > +}
> > +
> >  /**
> >   * try_to_munlock - try to munlock a page
> >   * @page: the page to be munlocked
> > @@ -1796,8 +1821,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags 
flags)
> >  void try_to_munlock(struct page *page)
> >  {
> 
> But this is also called try_to_munlock ??

As far as I can tell this has always been called try_to_munlock() even though 
it appears to do the opposite.

> /**
>  * try_to_munlock - try to munlock a page
>  * @page: the page to be munlocked
>  *
>  * Called from munlock code.  Checks all of the VMAs mapping the page
>  * to make sure nobody else has this page mlocked. The page will be
>  * returned with PG_mlocked cleared if no other vmas have it mlocked.
>  */

In other words it sets PG_mlocked if one or more vmas has it mlocked. So 
try_to_mlock() might be a better name, except that seems to have the potential 
for confusion as well because it's only called from the munlock code path and 
never for mlock.

> So what clears PG_mlocked on this call path?

See munlock_vma_page(). munlock works by clearing PG_mlocked, then calling 
try_to_munlock to check if any VMAs still need it locked in which case 
PG_mlocked gets set again. There are no other callers of try_to_munlock().

> Something needs attention here..

I think the code is correct, but perhaps the naming could be better. Would be 
interested hearing any thoughts on renaming try_to_munlock() to try_to_mlock() 
as the current name appears based on the context it is called from (munlock) 
rather than what it does (mlock).

 - Alistair

> Jason
>
Alistair Popple March 30, 2021, 10:16 p.m. UTC | #3
On Wednesday, 31 March 2021 9:09:30 AM AEDT Alistair Popple wrote:
> On Wednesday, 31 March 2021 5:49:03 AM AEDT Jason Gunthorpe wrote:
> > On Fri, Mar 26, 2021 at 11:08:00AM +1100, Alistair Popple wrote:

<snip>

> > So what clears PG_mlocked on this call path?
> 
> See munlock_vma_page(). munlock works by clearing PG_mlocked, then calling 
> try_to_munlock to check if any VMAs still need it locked in which case 
> PG_mlocked gets set again. There are no other callers of try_to_munlock().
> 
> > Something needs attention here..
> 
> I think the code is correct, but perhaps the naming could be better. Would be 
> interested hearing any thoughts on renaming try_to_munlock() to try_to_mlock() 
> as the current name appears based on the context it is called from (munlock) 
> rather than what it does (mlock).

Actually Documentation/vm/unevictable-lru.rst contains a better suggestion:

try_to_munlock() Reverse Map Scan
---------------------------------

.. warning::
   [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
   page_referenced() reverse map walker.

Thoughts on renaming try_to_unlock() -> page_mlocked() and try_to_munlock_one() -> page_mlock_one()?

>  - Alistair
> 
> > Jason
> > 
> 
> 
> 
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
> 
>
Jason Gunthorpe March 30, 2021, 10:24 p.m. UTC | #4
On Wed, Mar 31, 2021 at 09:09:30AM +1100, Alistair Popple wrote:

> > > @@ -1796,8 +1821,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags 
> flags)
> > >  void try_to_munlock(struct page *page)
> > >  {
> > 
> > But this is also called try_to_munlock ??
> 
> As far as I can tell this has always been called try_to_munlock() even though 
> it appears to do the opposite.

Maybe we should change it then?

> > /**
> >  * try_to_munlock - try to munlock a page
> >  * @page: the page to be munlocked
> >  *
> >  * Called from munlock code.  Checks all of the VMAs mapping the page
> >  * to make sure nobody else has this page mlocked. The page will be
> >  * returned with PG_mlocked cleared if no other vmas have it mlocked.
> >  */
> 
> In other words it sets PG_mlocked if one or more vmas has it mlocked. So
> try_to_mlock() might be a better name, except that seems to have the potential 
> for confusion as well because it's only called from the munlock code path and 
> never for mlock.

That explanation makes more sense.. This function looks like it is
'set PG_mlocked of the page if any vm->flags has VM_LOCKED'

Maybe call it check_vm_locked or something then and reword the above
comment?

(and why is it OK to read vm->flags for this without any locking?)

> > Something needs attention here..
> 
> I think the code is correct, but perhaps the naming could be better. Would be 
> interested hearing any thoughts on renaming try_to_munlock() to try_to_mlock() 
> as the current name appears based on the context it is called from (munlock) 
> rather than what it does (mlock).

The point of this patch is to make it clearer, after all, so I'd
change something and maybe slightly clarify the comment.

Jason
John Hubbard March 30, 2021, 10:43 p.m. UTC | #5
On 3/30/21 3:24 PM, Jason Gunthorpe wrote:
...
>> As far as I can tell this has always been called try_to_munlock() even though
>> it appears to do the opposite.
> 
> Maybe we should change it then?
> 
>>> /**
>>>   * try_to_munlock - try to munlock a page
>>>   * @page: the page to be munlocked
>>>   *
>>>   * Called from munlock code.  Checks all of the VMAs mapping the page
>>>   * to make sure nobody else has this page mlocked. The page will be
>>>   * returned with PG_mlocked cleared if no other vmas have it mlocked.
>>>   */
>>
>> In other words it sets PG_mlocked if one or more vmas has it mlocked. So
>> try_to_mlock() might be a better name, except that seems to have the potential
>> for confusion as well because it's only called from the munlock code path and
>> never for mlock.
> 
> That explanation makes more sense.. This function looks like it is
> 'set PG_mlocked of the page if any vm->flags has VM_LOCKED'
> 
> Maybe call it check_vm_locked or something then and reword the above
> comment?
> 
> (and why is it OK to read vm->flags for this without any locking?)
> 
>>> Something needs attention here..
>>
>> I think the code is correct, but perhaps the naming could be better. Would be
>> interested hearing any thoughts on renaming try_to_munlock() to try_to_mlock()
>> as the current name appears based on the context it is called from (munlock)
>> rather than what it does (mlock).
> 
> The point of this patch is to make it clearer, after all, so I'd
> change something and maybe slightly clarify the comment.
> 

I'd add that, after looking around the calling code, this is a really unhappy
pre-existing situation. Anyone reading this has to remember at which point in the
call stack the naming transitions from "do the opposite of what the name says",
to "do what the name says".

+1 for renaming "munlock*" items to "mlock*", where applicable. good grief.

Although, it seems reasonable to tack such renaming patches onto the tail end
of this series. But whatever works.

thanks,
Alistair Popple March 30, 2021, 10:56 p.m. UTC | #6
On Wednesday, 31 March 2021 9:43:19 AM AEDT John Hubbard wrote:
> On 3/30/21 3:24 PM, Jason Gunthorpe wrote:
> ...
> >> As far as I can tell this has always been called try_to_munlock() even 
though
> >> it appears to do the opposite.
> > 
> > Maybe we should change it then?
> > 
> >>> /**
> >>>   * try_to_munlock - try to munlock a page
> >>>   * @page: the page to be munlocked
> >>>   *
> >>>   * Called from munlock code.  Checks all of the VMAs mapping the page
> >>>   * to make sure nobody else has this page mlocked. The page will be
> >>>   * returned with PG_mlocked cleared if no other vmas have it mlocked.
> >>>   */
> >>
> >> In other words it sets PG_mlocked if one or more vmas has it mlocked. So
> >> try_to_mlock() might be a better name, except that seems to have the 
potential
> >> for confusion as well because it's only called from the munlock code path 
and
> >> never for mlock.
> > 
> > That explanation makes more sense.. This function looks like it is
> > 'set PG_mlocked of the page if any vm->flags has VM_LOCKED'
> > 
> > Maybe call it check_vm_locked or something then and reword the above
> > comment?
> > 
> > (and why is it OK to read vm->flags for this without any locking?)
> > 
> >>> Something needs attention here..
> >>
> >> I think the code is correct, but perhaps the naming could be better. 
Would be
> >> interested hearing any thoughts on renaming try_to_munlock() to 
try_to_mlock()
> >> as the current name appears based on the context it is called from 
(munlock)
> >> rather than what it does (mlock).
> > 
> > The point of this patch is to make it clearer, after all, so I'd
> > change something and maybe slightly clarify the comment.
> > 

Yep, agree with that.
 
> I'd add that, after looking around the calling code, this is a really 
unhappy
> pre-existing situation. Anyone reading this has to remember at which point 
in the
> call stack the naming transitions from "do the opposite of what the name 
says",
> to "do what the name says".
>
> +1 for renaming "munlock*" items to "mlock*", where applicable. good grief.

At least the situation was weird enough to prompt further investigation :) 

Renaming to mlock* doesn't feel like the right solution to me either though. I 
am not sure if you saw me responding to myself earlier but I am thinking 
renaming try_to_munlock() -> page_mlocked() and try_to_munlock_one() -> 
page_mlock_one() might be better. Thoughts?

This is actually inspired from a suggestion in Documentation/vm/unevictable-
lru.rst which warns about this problem:

try_to_munlock() Reverse Map Scan
---------------------------------

.. warning::
   [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
   page_referenced() reverse map walker.

> Although, it seems reasonable to tack such renaming patches onto the tail 
end
> of this series. But whatever works.

Unless anyone objects strongly I will roll the rename into this patch as there 
is only one caller of try_to_munlock.

 - Alistair

> thanks,
>
John Hubbard March 31, 2021, 3:56 a.m. UTC | #7
On 3/30/21 3:56 PM, Alistair Popple wrote:
...
>> +1 for renaming "munlock*" items to "mlock*", where applicable. good grief.
> 
> At least the situation was weird enough to prompt further investigation :)
> 
> Renaming to mlock* doesn't feel like the right solution to me either though. I
> am not sure if you saw me responding to myself earlier but I am thinking
> renaming try_to_munlock() -> page_mlocked() and try_to_munlock_one() ->
> page_mlock_one() might be better. Thoughts?
> 

Quite confused by this naming idea. Because: try_to_munlock() returns
void, so a boolean-style name such as "page_mlocked()" is already not a
good fit.

Even more important, though, is that try_to_munlock() is mlock-ing the
page, right? Is there some subtle point I'm missing? It really is doing
an mlock to the best of my knowledge here. Although the kerneldoc
comment for try_to_munlock() seems questionable too:

/**
  * try_to_munlock - try to munlock a page
  * @page: the page to be munlocked
  *
  * Called from munlock code.  Checks all of the VMAs mapping the page
  * to make sure nobody else has this page mlocked. The page will be
  * returned with PG_mlocked cleared if no other vmas have it mlocked.
  */

...because I don't see where, in *this* routine, it clears PG_mlocked!

Obviously we agree that a routine should be named based on what it does,
rather than on who calls it. So I think that still leads to:

      try_to_munlock() --> try_to_mlock()
      try_to_munlock_one() --> try_to_mlock_one()

Sorry if I'm missing something really obvious.


> This is actually inspired from a suggestion in Documentation/vm/unevictable-
> lru.rst which warns about this problem:
> 
> try_to_munlock() Reverse Map Scan
> ---------------------------------
> 
> .. warning::
>     [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
>     page_referenced() reverse map walker.
> 

This is actually rather bad advice! page_referenced() returns an
int-that-is-really-a-boolean, whereas try_to_munlock(), at least as it
stands now, returns void. Usually when I'm writing a TODO item, I'm in a
hurry, and I think that's what probably happened here, too. :)


>> Although, it seems reasonable to tack such renaming patches onto the tail
> end
>> of this series. But whatever works.
> 
> Unless anyone objects strongly I will roll the rename into this patch as there
> is only one caller of try_to_munlock.
> 
>   - Alistair
> 

No objections here. :)

thanks,
John Hubbard March 31, 2021, 4:09 a.m. UTC | #8
On 3/30/21 8:56 PM, John Hubbard wrote:
> On 3/30/21 3:56 PM, Alistair Popple wrote:
> ...
>>> +1 for renaming "munlock*" items to "mlock*", where applicable. good grief.
>>
>> At least the situation was weird enough to prompt further investigation :)
>>
>> Renaming to mlock* doesn't feel like the right solution to me either though. I
>> am not sure if you saw me responding to myself earlier but I am thinking
>> renaming try_to_munlock() -> page_mlocked() and try_to_munlock_one() ->
>> page_mlock_one() might be better. Thoughts?
>>
> 
> Quite confused by this naming idea. Because: try_to_munlock() returns
> void, so a boolean-style name such as "page_mlocked()" is already not a
> good fit.
> 
> Even more important, though, is that try_to_munlock() is mlock-ing the
> page, right? Is there some subtle point I'm missing? It really is doing
> an mlock to the best of my knowledge here. Although the kerneldoc
> comment for try_to_munlock() seems questionable too:
> 
> /**
> * try_to_munlock - try to munlock a page
> * @page: the page to be munlocked
> *
> * Called from munlock code.  Checks all of the VMAs mapping the page
> * to make sure nobody else has this page mlocked. The page will be
> * returned with PG_mlocked cleared if no other vmas have it mlocked.
> */
> 
> ...because I don't see where, in *this* routine, it clears PG_mlocked!
> 
> Obviously we agree that a routine should be named based on what it does,
> rather than on who calls it. So I think that still leads to:
> 
>      try_to_munlock() --> try_to_mlock()
>      try_to_munlock_one() --> try_to_mlock_one()
> 
> Sorry if I'm missing something really obvious.

Actually, re-reading your and Jason's earlier points in the thread, I see
that I'm *not* missing anything, and we are actually in agreement about how
the code operates. OK, good!

Also, as you point out above, maybe the "try_" prefix is not really accurate
either, given how this works. So maybe we have arrived at something like:

     try_to_munlock() --> page_mlock() // or mlock_page()...
     try_to_munlock_one() --> page_mlock_one()



thanks,
Alistair Popple March 31, 2021, 4:15 a.m. UTC | #9
On Wednesday, 31 March 2021 2:56:38 PM AEDT John Hubbard wrote:
> On 3/30/21 3:56 PM, Alistair Popple wrote:
> ...
> >> +1 for renaming "munlock*" items to "mlock*", where applicable. good 
grief.
> > 
> > At least the situation was weird enough to prompt further investigation :)
> > 
> > Renaming to mlock* doesn't feel like the right solution to me either 
though. I
> > am not sure if you saw me responding to myself earlier but I am thinking
> > renaming try_to_munlock() -> page_mlocked() and try_to_munlock_one() ->
> > page_mlock_one() might be better. Thoughts?
> > 
> 
> Quite confused by this naming idea. Because: try_to_munlock() returns
> void, so a boolean-style name such as "page_mlocked()" is already not a
> good fit.
> 
> Even more important, though, is that try_to_munlock() is mlock-ing the
> page, right? Is there some subtle point I'm missing? It really is doing
> an mlock to the best of my knowledge here. Although the kerneldoc
> comment for try_to_munlock() seems questionable too:

It's mlocking the page if it turns out it still needs to be locked after 
unlocking it. But I don't think you're missing anything.

> /**
>   * try_to_munlock - try to munlock a page
>   * @page: the page to be munlocked
>   *
>   * Called from munlock code.  Checks all of the VMAs mapping the page
>   * to make sure nobody else has this page mlocked. The page will be
>   * returned with PG_mlocked cleared if no other vmas have it mlocked.
>   */
> 
> ...because I don't see where, in *this* routine, it clears PG_mlocked!
>
> Obviously we agree that a routine should be named based on what it does,
> rather than on who calls it. So I think that still leads to:
> 
>       try_to_munlock() --> try_to_mlock()
>       try_to_munlock_one() --> try_to_mlock_one()
> 
> Sorry if I'm missing something really obvious.

Nope, I confused things somewhat by blindly quoting the documentation whilst 
forgetting that try_to_munlock() returns void rather than a bool.

> > This is actually inspired from a suggestion in Documentation/vm/
unevictable-
> > lru.rst which warns about this problem:
> > 
> > try_to_munlock() Reverse Map Scan
> > ---------------------------------
> > 
> > .. warning::
> >     [!] TODO/FIXME: a better name might be page_mlocked() - analogous to 
the
> >     page_referenced() reverse map walker.
> > 
> 
> This is actually rather bad advice! page_referenced() returns an
> int-that-is-really-a-boolean, whereas try_to_munlock(), at least as it
> stands now, returns void. Usually when I'm writing a TODO item, I'm in a
> hurry, and I think that's what probably happened here, too. :)

So I think we're in agreement. The naming is bad and the advice in the 
documentation is also questionable :-)

Thanks for the thoughts, I will re-send this with naming and documentation 
updates.

> >> Although, it seems reasonable to tack such renaming patches onto the tail
> > end
> >> of this series. But whatever works.
> > 
> > Unless anyone objects strongly I will roll the rename into this patch as 
there
> > is only one caller of try_to_munlock.
> > 
> >   - Alistair
> > 
> 
> No objections here. :)
> 
> thanks,
>
Jason Gunthorpe March 31, 2021, 11:57 a.m. UTC | #10
On Wed, Mar 31, 2021 at 03:15:47PM +1100, Alistair Popple wrote:
> On Wednesday, 31 March 2021 2:56:38 PM AEDT John Hubbard wrote:
> > On 3/30/21 3:56 PM, Alistair Popple wrote:
> > ...
> > >> +1 for renaming "munlock*" items to "mlock*", where applicable. good 
> grief.
> > > 
> > > At least the situation was weird enough to prompt further investigation :)
> > > 
> > > Renaming to mlock* doesn't feel like the right solution to me either 
> though. I
> > > am not sure if you saw me responding to myself earlier but I am thinking
> > > renaming try_to_munlock() -> page_mlocked() and try_to_munlock_one() ->
> > > page_mlock_one() might be better. Thoughts?
> > > 
> > 
> > Quite confused by this naming idea. Because: try_to_munlock() returns
> > void, so a boolean-style name such as "page_mlocked()" is already not a
> > good fit.
> > 
> > Even more important, though, is that try_to_munlock() is mlock-ing the
> > page, right? Is there some subtle point I'm missing? It really is doing
> > an mlock to the best of my knowledge here. Although the kerneldoc
> > comment for try_to_munlock() seems questionable too:
> 
> It's mlocking the page if it turns out it still needs to be locked after 
> unlocking it. But I don't think you're missing anything.

It is really searching all VMA's to see if the VMA flag is set and if
any are found then it mlocks the page.

But presenting this rountine in its simplified form raises lots of
questions:

 - What locking is being used to read the VMA flag?
 - Why do we need to manipulate global struct page flags under the
   page table locks of a single VMA?
 - Why do we need to check for huge pages inside the VMA loop, not
   before going to the rmap? PageTransCompoundHead() is not sensitive to
   the PTEs. (and what happens if the huge page breaks up concurrently?)
 - Why do we clear the mlock bit then run around to try and set it?
   Feels racey.

Jason
Alistair Popple April 1, 2021, 4:36 a.m. UTC | #11
On Wednesday, 31 March 2021 10:57:46 PM AEDT Jason Gunthorpe wrote:
> On Wed, Mar 31, 2021 at 03:15:47PM +1100, Alistair Popple wrote:
> > On Wednesday, 31 March 2021 2:56:38 PM AEDT John Hubbard wrote:
> > > On 3/30/21 3:56 PM, Alistair Popple wrote:
> > > ...
> > > >> +1 for renaming "munlock*" items to "mlock*", where applicable. good 
> > grief.
> > > > 
> > > > At least the situation was weird enough to prompt further 
investigation :)
> > > > 
> > > > Renaming to mlock* doesn't feel like the right solution to me either 
> > though. I
> > > > am not sure if you saw me responding to myself earlier but I am 
thinking
> > > > renaming try_to_munlock() -> page_mlocked() and try_to_munlock_one() -
>
> > > > page_mlock_one() might be better. Thoughts?
> > > > 
> > > 
> > > Quite confused by this naming idea. Because: try_to_munlock() returns
> > > void, so a boolean-style name such as "page_mlocked()" is already not a
> > > good fit.
> > > 
> > > Even more important, though, is that try_to_munlock() is mlock-ing the
> > > page, right? Is there some subtle point I'm missing? It really is doing
> > > an mlock to the best of my knowledge here. Although the kerneldoc
> > > comment for try_to_munlock() seems questionable too:
> > 
> > It's mlocking the page if it turns out it still needs to be locked after 
> > unlocking it. But I don't think you're missing anything.
> 
> It is really searching all VMA's to see if the VMA flag is set and if
> any are found then it mlocks the page.
> 
> But presenting this rountine in its simplified form raises lots of
> questions:
> 
>  - What locking is being used to read the VMA flag?
>  - Why do we need to manipulate global struct page flags under the
>    page table locks of a single VMA?

I was wondering that and questioned it in an earlier version of this series. I 
have done some digging and the commit log for b87537d9e2fe ("mm: rmap use pte 
lock not mmap_sem to set PageMlocked") provides the original justification.

It's fairly long so I won't quote it here but the summary seems to be that 
among other things the combination of page lock and ptl makes this safe. I 
have yet to verify if everything there still holds and is sensible, but the 
last paragraph certainly is :-)

"Stopped short of separating try_to_munlock_one() from try_to_munmap_one()
on this occasion, but that's probably the sensible next step - with a
rename, given that try_to_munlock()'s business is to try to set Mlocked."

>  - Why do we need to check for huge pages inside the VMA loop, not
>    before going to the rmap? PageTransCompoundHead() is not sensitive to
>    the PTEs. (and what happens if the huge page breaks up concurrently?)
>  - Why do we clear the mlock bit then run around to try and set it?

I don't have an answer for that as I'm not (yet) across all the mlock code 
paths, but I'm hoping this patch at least won't change anything.

>    Feels racey.
>
> Jason
>
Shakeel Butt April 1, 2021, 7:21 p.m. UTC | #12
CC: Hugh Dickins

On Wed, Mar 31, 2021 at 9:37 PM Alistair Popple <apopple@nvidia.com> wrote:
>
> On Wednesday, 31 March 2021 10:57:46 PM AEDT Jason Gunthorpe wrote:
> > On Wed, Mar 31, 2021 at 03:15:47PM +1100, Alistair Popple wrote:
> > > On Wednesday, 31 March 2021 2:56:38 PM AEDT John Hubbard wrote:
> > > > On 3/30/21 3:56 PM, Alistair Popple wrote:
> > > > ...
> > > > >> +1 for renaming "munlock*" items to "mlock*", where applicable. good
> > > grief.
> > > > >
> > > > > At least the situation was weird enough to prompt further
> investigation :)
> > > > >
> > > > > Renaming to mlock* doesn't feel like the right solution to me either
> > > though. I
> > > > > am not sure if you saw me responding to myself earlier but I am
> thinking
> > > > > renaming try_to_munlock() -> page_mlocked() and try_to_munlock_one() -
> >
> > > > > page_mlock_one() might be better. Thoughts?
> > > > >
> > > >
> > > > Quite confused by this naming idea. Because: try_to_munlock() returns
> > > > void, so a boolean-style name such as "page_mlocked()" is already not a
> > > > good fit.
> > > >
> > > > Even more important, though, is that try_to_munlock() is mlock-ing the
> > > > page, right? Is there some subtle point I'm missing? It really is doing
> > > > an mlock to the best of my knowledge here. Although the kerneldoc
> > > > comment for try_to_munlock() seems questionable too:
> > >
> > > It's mlocking the page if it turns out it still needs to be locked after
> > > unlocking it. But I don't think you're missing anything.
> >
> > It is really searching all VMA's to see if the VMA flag is set and if
> > any are found then it mlocks the page.
> >
> > But presenting this rountine in its simplified form raises lots of
> > questions:
> >
> >  - What locking is being used to read the VMA flag?
> >  - Why do we need to manipulate global struct page flags under the
> >    page table locks of a single VMA?
>
> I was wondering that and questioned it in an earlier version of this series. I
> have done some digging and the commit log for b87537d9e2fe ("mm: rmap use pte
> lock not mmap_sem to set PageMlocked") provides the original justification.
>
> It's fairly long so I won't quote it here but the summary seems to be that
> among other things the combination of page lock and ptl makes this safe. I
> have yet to verify if everything there still holds and is sensible, but the
> last paragraph certainly is :-)
>
> "Stopped short of separating try_to_munlock_one() from try_to_munmap_one()
> on this occasion, but that's probably the sensible next step - with a
> rename, given that try_to_munlock()'s business is to try to set Mlocked."
>
> >  - Why do we need to check for huge pages inside the VMA loop, not
> >    before going to the rmap? PageTransCompoundHead() is not sensitive to
> >    the PTEs. (and what happens if the huge page breaks up concurrently?)
> >  - Why do we clear the mlock bit then run around to try and set it?
>
> I don't have an answer for that as I'm not (yet) across all the mlock code
> paths, but I'm hoping this patch at least won't change anything.
>

It would be good to ask the person who has the most answers?

Hugh, the thread started at
https://lore.kernel.org/dri-devel/20210326000805.2518-4-apopple@nvidia.com/
diff mbox series

Patch

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index def5c62c93b3..e26ac2d71346 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -87,7 +87,6 @@  struct anon_vma_chain {
 
 enum ttu_flags {
 	TTU_MIGRATION		= 0x1,	/* migration mode */
-	TTU_MUNLOCK		= 0x2,	/* munlock mode */
 
 	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
 	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
diff --git a/mm/rmap.c b/mm/rmap.c
index 977e70803ed8..d02bade5245b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1405,10 +1405,6 @@  static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
 
-	/* munlock has nothing to gain from examining un-locked vmas */
-	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
-		return true;
-
 	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
 	    is_zone_device_page(page) && !is_device_private_page(page))
 		return true;
@@ -1469,8 +1465,6 @@  static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 				page_vma_mapped_walk_done(&pvmw);
 				break;
 			}
-			if (flags & TTU_MUNLOCK)
-				continue;
 		}
 
 		/* Unexpected PMD-mapped THP? */
@@ -1784,6 +1778,37 @@  bool try_to_unmap(struct page *page, enum ttu_flags flags)
 	return !page_mapcount(page) ? true : false;
 }
 
+static bool try_to_munlock_one(struct page *page, struct vm_area_struct *vma,
+		     unsigned long address, void *arg)
+{
+	struct page_vma_mapped_walk pvmw = {
+		.page = page,
+		.vma = vma,
+		.address = address,
+	};
+
+	/* munlock has nothing to gain from examining un-locked vmas */
+	if (!(vma->vm_flags & VM_LOCKED))
+		return true;
+
+	while (page_vma_mapped_walk(&pvmw)) {
+		/* PTE-mapped THP are never mlocked */
+		if (!PageTransCompound(page)) {
+			/*
+			 * Holding pte lock, we do *not* need
+			 * mmap_lock here
+			 */
+			mlock_vma_page(page);
+		}
+		page_vma_mapped_walk_done(&pvmw);
+
+		/* found a mlocked page, no point continuing munlock check */
+		return false;
+	}
+
+	return true;
+}
+
 /**
  * try_to_munlock - try to munlock a page
  * @page: the page to be munlocked
@@ -1796,8 +1821,7 @@  bool try_to_unmap(struct page *page, enum ttu_flags flags)
 void try_to_munlock(struct page *page)
 {
 	struct rmap_walk_control rwc = {
-		.rmap_one = try_to_unmap_one,
-		.arg = (void *)TTU_MUNLOCK,
+		.rmap_one = try_to_munlock_one,
 		.done = page_not_mapped,
 		.anon_lock = page_lock_anon_vma_read,