diff mbox series

[v4,3/9] mm: pagewalk: Don't split transhuge pmds when a pmd_entry is present

Message ID 20191008091508.2682-4-thomas_os@shipmail.org (mailing list archive)
State New, archived
Headers show
Series Emulated coherent graphics memory take 2 | expand

Commit Message

Thomas Hellström (Intel) Oct. 8, 2019, 9:15 a.m. UTC
From: Thomas Hellstrom <thellstrom@vmware.com>

The pagewalk code was unconditionally splitting transhuge pmds when a
pte_entry was present. However ideally we'd want to handle transhuge pmds
in the pmd_entry function and ptes in pte_entry function. So don't split
huge pmds when there is a pmd_entry function present, but let the callback
take care of it if necessary.

In order to make sure a virtual address range is handled by one and only
one callback, and since pmd entries may be unstable, we introduce a
pmd_entry return code that tells the walk code to continue processing this
pmd entry rather than to move on. Since caller-defined positive return
codes (up to 2) are used by current callers, use a high value that allows a
large range of positive caller-defined return codes for future users.

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
---
 include/linux/pagewalk.h |  8 ++++++++
 mm/pagewalk.c            | 28 +++++++++++++++++++++-------
 2 files changed, 29 insertions(+), 7 deletions(-)

Comments

Kirill A . Shutemov Oct. 9, 2019, 3:27 p.m. UTC | #1
On Tue, Oct 08, 2019 at 11:15:02AM +0200, Thomas Hellström (VMware) wrote:
> From: Thomas Hellstrom <thellstrom@vmware.com>
> 
> The pagewalk code was unconditionally splitting transhuge pmds when a
> pte_entry was present. However ideally we'd want to handle transhuge pmds
> in the pmd_entry function and ptes in pte_entry function. So don't split
> huge pmds when there is a pmd_entry function present, but let the callback
> take care of it if necessary.

Do we have any current user that expect split_huge_pmd() in this scenario.

> 
> In order to make sure a virtual address range is handled by one and only
> one callback, and since pmd entries may be unstable, we introduce a
> pmd_entry return code that tells the walk code to continue processing this
> pmd entry rather than to move on. Since caller-defined positive return
> codes (up to 2) are used by current callers, use a high value that allows a
> large range of positive caller-defined return codes for future users.
> 
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Will Deacon <will.deacon@arm.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Rik van Riel <riel@surriel.com>
> Cc: Minchan Kim <minchan@kernel.org>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Huang Ying <ying.huang@intel.com>
> Cc: Jérôme Glisse <jglisse@redhat.com>
> Cc: Kirill A. Shutemov <kirill@shutemov.name>
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
> ---
>  include/linux/pagewalk.h |  8 ++++++++
>  mm/pagewalk.c            | 28 +++++++++++++++++++++-------
>  2 files changed, 29 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
> index bddd9759bab9..c4a013eb445d 100644
> --- a/include/linux/pagewalk.h
> +++ b/include/linux/pagewalk.h
> @@ -4,6 +4,11 @@
>  
>  #include <linux/mm.h>
>  
> +/* Highest positive pmd_entry caller-specific return value */
> +#define PAGE_WALK_CALLER_MAX     (INT_MAX / 2)
> +/* The handler did not handle the entry. Fall back to the next level */
> +#define PAGE_WALK_FALLBACK       (PAGE_WALK_CALLER_MAX + 1)
> +

That's hacky.

Maybe just use an error code for this? -EAGAIN?

>  struct mm_walk;
>  
>  /**
> @@ -16,6 +21,9 @@ struct mm_walk;
>   *			this handler is required to be able to handle
>   *			pmd_trans_huge() pmds.  They may simply choose to
>   *			split_huge_page() instead of handling it explicitly.
> + *                      If the handler did not handle the PMD, or split the
> + *                      PMD and wants it handled by the PTE handler, it
> + *                      should return PAGE_WALK_FALLBACK.

Indentation is broken. Use tabs.

>   * @pte_entry:		if set, called for each non-empty PTE (4th-level) entry
>   * @pte_hole:		if set, called for each hole at all levels
>   * @hugetlb_entry:	if set, called for each hugetlb entry
> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
> index 83c0b78363b4..f844c2a2aa60 100644
> --- a/mm/pagewalk.c
> +++ b/mm/pagewalk.c
> @@ -50,10 +50,18 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
>  		 * This implies that each ->pmd_entry() handler
>  		 * needs to know about pmd_trans_huge() pmds
>  		 */
> -		if (ops->pmd_entry)
> +		if (ops->pmd_entry) {
>  			err = ops->pmd_entry(pmd, addr, next, walk);
> -		if (err)
> -			break;
> +			if (!err)
> +				continue;
> +			else if (err <= PAGE_WALK_CALLER_MAX)
> +				break;
> +			WARN_ON(err != PAGE_WALK_FALLBACK);
> +			err = 0;
> +			if (pmd_trans_unstable(pmd))
> +				goto again;
> +			/* Fall through */
> +		}
>  
>  		/*
>  		 * Check this here so we only break down trans_huge
> @@ -61,8 +69,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
>  		 */
>  		if (!ops->pte_entry)
>  			continue;
> -
> -		split_huge_pmd(walk->vma, pmd, addr);
> +		if (!ops->pmd_entry)
> +			split_huge_pmd(walk->vma, pmd, addr);
>  		if (pmd_trans_unstable(pmd))
>  			goto again;
>  		err = walk_pte_range(pmd, addr, next, walk);
> @@ -281,11 +289,17 @@ static int __walk_page_range(unsigned long start, unsigned long end,
>   *
>   *  - 0  : succeeded to handle the current entry, and if you don't reach the
>   *         end address yet, continue to walk.
> - *  - >0 : succeeded to handle the current entry, and return to the caller
> - *         with caller specific value.
> + *  - >0, and <= PAGE_WALK_CALLER_MAX : succeeded to handle the current entry,
> + *         and return to the caller with caller specific value.
>   *  - <0 : failed to handle the current entry, and return to the caller
>   *         with error code.
>   *
> + * For pmd_entry(), a value <= PAGE_WALK_CALLER_MAX indicates that the entry
> + * was handled by the callback. PAGE_WALK_FALLBACK indicates that the entry
> + * could not be handled by the callback and should be re-checked. If the
> + * callback needs the entry to be handled by the next level, it should
> + * split the entry and then return PAGE_WALK_FALLBACK.
> + *
>   * Before starting to walk page table, some callers want to check whether
>   * they really want to walk over the current vma, typically by checking
>   * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
> -- 
> 2.21.0
>
Thomas Hellström (Intel) Oct. 9, 2019, 4:20 p.m. UTC | #2
Hi, Kirill.

Thanks for reviewing.

On 10/9/19 5:27 PM, Kirill A. Shutemov wrote:
> On Tue, Oct 08, 2019 at 11:15:02AM +0200, Thomas Hellström (VMware) wrote:
>> From: Thomas Hellstrom <thellstrom@vmware.com>
>>
>> The pagewalk code was unconditionally splitting transhuge pmds when a
>> pte_entry was present. However ideally we'd want to handle transhuge pmds
>> in the pmd_entry function and ptes in pte_entry function. So don't split
>> huge pmds when there is a pmd_entry function present, but let the callback
>> take care of it if necessary.
> Do we have any current user that expect split_huge_pmd() in this scenario.

No. All current users either have pmd_entry (no splitting) or pte_entry 
(unconditional splitting)

>
>> In order to make sure a virtual address range is handled by one and only
>> one callback, and since pmd entries may be unstable, we introduce a
>> pmd_entry return code that tells the walk code to continue processing this
>> pmd entry rather than to move on. Since caller-defined positive return
>> codes (up to 2) are used by current callers, use a high value that allows a
>> large range of positive caller-defined return codes for future users.
>>
>> Cc: Matthew Wilcox <willy@infradead.org>
>> Cc: Will Deacon <will.deacon@arm.com>
>> Cc: Peter Zijlstra <peterz@infradead.org>
>> Cc: Rik van Riel <riel@surriel.com>
>> Cc: Minchan Kim <minchan@kernel.org>
>> Cc: Michal Hocko <mhocko@suse.com>
>> Cc: Huang Ying <ying.huang@intel.com>
>> Cc: Jérôme Glisse <jglisse@redhat.com>
>> Cc: Kirill A. Shutemov <kirill@shutemov.name>
>> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
>> Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
>> ---
>>   include/linux/pagewalk.h |  8 ++++++++
>>   mm/pagewalk.c            | 28 +++++++++++++++++++++-------
>>   2 files changed, 29 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
>> index bddd9759bab9..c4a013eb445d 100644
>> --- a/include/linux/pagewalk.h
>> +++ b/include/linux/pagewalk.h
>> @@ -4,6 +4,11 @@
>>   
>>   #include <linux/mm.h>
>>   
>> +/* Highest positive pmd_entry caller-specific return value */
>> +#define PAGE_WALK_CALLER_MAX     (INT_MAX / 2)
>> +/* The handler did not handle the entry. Fall back to the next level */
>> +#define PAGE_WALK_FALLBACK       (PAGE_WALK_CALLER_MAX + 1)
>> +
> That's hacky.
>
> Maybe just use an error code for this? -EAGAIN?

I agree this is hacky. But IMO it's a reasonably safe option. My 
thinking was that in the long run we'd move the positive return codes to 
the mm_walk private and introduce a PAGE_WALK_TERMINATE code as well.

Perhaps a completely clean and safe way would be to add an "int 
walk_control" in the struct mm_walk?

I'm pretty sure using an error code will come back and bite us at some 
point, if someone just blindly forwards error messages. But if you 
insist, I'll use -EAGAIN.

Please let me know what you think.

Thanks,

Thomas
Linus Torvalds Oct. 9, 2019, 4:21 p.m. UTC | #3
On Wed, Oct 9, 2019 at 8:27 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> Do we have any current user that expect split_huge_pmd() in this scenario.

No. There are no current users of the pmd callback and the pte
callback at all, that I could find.

But it looks like the new drm use does want a "I can't handle the
hugepage, please split it and I'll fo the ptes instead".

> That's hacky.
>
> Maybe just use an error code for this? -EAGAIN?

I actually like the PAGE_WALK_FALLBACK thing as more documentation
than "it's an error, but not one you return", although I do detest the
particular value chosen, which is just a nasty bitpattern.

Maybe it could use an error value, just one that makes no sense, and
is hidden by the PAGE_WALK_FALLBACK define, ie something like

  #define PAGE_WALK_FALLBACK (-ECHILD)

or something like that.

And I suspect the conditional would be cleaner if it was written something like

        if (!err)
                continue;
        if (err != PAGE_WALK_FALLBACK)
                break;
        err = 0;
        if (pmd_trans_unstable(pmd))
                goto again;
        .. do the split ..

and skip the WARN_ON() and the odd "non-zero but smaller than MAX test"

            Linus
Thomas Hellström (Intel) Oct. 9, 2019, 5:03 p.m. UTC | #4
On 10/9/19 6:21 PM, Linus Torvalds wrote:
> On Wed, Oct 9, 2019 at 8:27 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>> Do we have any current user that expect split_huge_pmd() in this scenario.
> No. There are no current users of the pmd callback and the pte
> callback at all, that I could find.
>
> But it looks like the new drm use does want a "I can't handle the
> hugepage, please split it and I'll fo the ptes instead".
>
Nope, it handles the hugepages by ignoring them, since they should be 
read-only, but if pmd_entry() was called with something else than a 
hugepage, then it requests the fallback, but never a split.

/Thomas
Linus Torvalds Oct. 9, 2019, 5:16 p.m. UTC | #5
On Wed, Oct 9, 2019 at 10:03 AM Thomas Hellström (VMware)
<thomas_os@shipmail.org> wrote:
>
> Nope, it handles the hugepages by ignoring them, since they should be
> read-only, but if pmd_entry() was called with something else than a
> hugepage, then it requests the fallback, but never a split.

But  PAGE_WALK_FALLBACK _is_ a split.

Oh, except you did this

-               split_huge_pmd(walk->vma, pmd, addr);
+               if (!ops->pmd_entry)
+                       split_huge_pmd(walk->vma, pmd, addr);


so it avoids the split.

No, that's unacceptable. And makes no sense anyway. If it doesn't
split the pmd, then it shouldn't walk the pte's - because they don't
exist. And if it's not a hugepmd, then the split is a no-op, so the
test makes no sense.

I hadn't noticed that part of the patch. That simply can't be right. I
don't think you've tested this, because you never actually have
hugepages, do you?

You didn't notice or realize that split_huge_pmd() just does that

                if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)   \
                                        || pmd_devmap(*____pmd))        \

thing and doesn't do anythign at all if it's not huge.

So no. That code makes no sense at all, and I didn't realize how
senseless it was, becasue I stupidly missed that "make the split
conditional" - which is insane and wrong - and I thought that you
wanted PAGE_WALK_FALLBACK to split a pmd and fall back to per-pte
entries, which is what the name implies.

But that's not what you wanted at all.

Just get rid of PAGE_WALK_FALLBACK entirely then, and make the rule be
that a zero return value just means "split and do ptes". Which is what
you want (see above why "split" simply is wrong, and isn't an issue
for you anyway.

That won't change any existing cases, since even if they do have a
zero return value, they don't have a pte_entry() callback, so they
won't do that "split and do ptes" anyway.

             Linus
Thomas Hellstrom Oct. 9, 2019, 6:52 p.m. UTC | #6
Hi,

On 10/9/19 7:17 PM, Linus Torvalds wrote:
> On Wed, Oct 9, 2019 at 10:03 AM Thomas Hellström (VMware)
> <thomas_os@shipmail.org> wrote:
>> Nope, it handles the hugepages by ignoring them, since they should be
>> read-only, but if pmd_entry() was called with something else than a
>> hugepage, then it requests the fallback, but never a split.
> But  PAGE_WALK_FALLBACK _is_ a split.
>
> Oh, except you did this
>
> -               split_huge_pmd(walk->vma, pmd, addr);
> +               if (!ops->pmd_entry)
> +                       split_huge_pmd(walk->vma, pmd, addr);
>
>
> so it avoids the split.
>
> No, that's unacceptable. And makes no sense anyway. If it doesn't
> split the pmd, then it shouldn't walk the pte's - because they don't
> exist. And if it's not a hugepmd, then the split is a no-op, so the
> test makes no sense.
>
> I hadn't noticed that part of the patch. That simply can't be right. I
> don't think you've tested this, because you never actually have
> hugepages, do you?
>
> You didn't notice or realize that split_huge_pmd() just does that
>
>                 if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)   \
>                                         || pmd_devmap(*____pmd))        \
>
> thing and doesn't do anythign at all if it's not huge.
>
> So no. That code makes no sense at all, and I didn't realize how
> senseless it was, becasue I stupidly missed that "make the split
> conditional" - which is insane and wrong - and I thought that you
> wanted PAGE_WALK_FALLBACK to split a pmd and fall back to per-pte
> entries, which is what the name implies.
>
> But that's not what you wanted at all.
>
> Just get rid of PAGE_WALK_FALLBACK entirely then, and make the rule be
> that a zero return value just means "split and do ptes". Which is what
> you want (see above why "split" simply is wrong, and isn't an issue
> for you anyway.
>
> That won't change any existing cases, since even if they do have a
> zero return value, they don't have a pte_entry() callback, so they
> won't do that "split and do ptes" anyway.
>
>              Linus
>
Hmm, so we have the following cases we need to handle when returning
from the pmd_entry() handler.

1) Huge pmd was handled - Returns 0 and continues.
2) A pmd is otherwise unstable, typically someone just zapped a huge
pmd. Returns PAGE_WALK_FALLBACK, gets caught in the pmd_trans_unstable()
test and retries.
3) A pte directory - Returns PAGE_WALK_FALLBACK, falls through, avoids
the split and continues to the next level. Yeah that split avoidance
test is indeed made unnecessary by the preceding pmd_trans_unstable() test.

-               split_huge_pmd(walk->vma, pmd, addr);
+               if (!ops->pmd_entry)
+                       split_huge_pmd(walk->vma, pmd, addr);

But as the commit message says, PAGE_WALK_FALLBACK is necessary to have
a virtual address range being handled once and only once. Therefore we
must distinguish between 1) and 2) since 2) must be retried until it's
handled correctly.

So we need the PAGE_WALK_FALLBACK. And if we instead were to combine 1)
and 3) in a single return value and use, for example PAGE_WALK_RETRY for
2)  the following could happen.

a) we handle the huge pmd and return 0 from pte_entry().
b) another process splits it.
c) we fall through to the pte level and handle the same address range
again...

So to summarize, yes the test in the code you cite is unnecessary. But
if we want to guarantee a virtual address range being handled once and
only once we need the PAGE_WALK_FALLBACK, (perhaps renamed). If not, we
can do without it similar to your original patch.

Thanks,

/Thomas
Linus Torvalds Oct. 9, 2019, 7:20 p.m. UTC | #7
On Wed, Oct 9, 2019 at 11:52 AM Thomas Hellstrom <thellstrom@vmware.com> wrote:
>
> Hmm, so we have the following cases we need to handle when returning
> from the pmd_entry() handler.

No, we really don't.

> 1) Huge pmd was handled - Returns 0 and continues.

No.

That case simply DOES NOT EXIST.

The only case that exists is

"pmd was seen, we return 0 and then look at wherer pte level is relevant".

Note that this has nothing to do with huge or not.

> 2) A pmd is otherwise unstable, typically someone just zapped a huge
> pmd. Returns PAGE_WALK_FALLBACK, gets caught in the pmd_trans_unstable()
> test and retries.

No. PAGE_WALK_FALLBACK doesn't exist, is completely broken in your
patch, and is immaterial.

It falls under the previous heading: a pmd was seen, returns zero, and
we go on with life.

If you don't have a pte callback - like EVERY SINGLE CURRENT USER -
that "goes on with life" is just "go to the next pmd entry".

And if you do have a pte callback - like your new case, that "go on
with life" is to look at the pte cases.

> 3) A pte directory - Returns PAGE_WALK_FALLBACK, falls through, avoids
> the split and continues to the next level. Yeah that split avoidance
> test is indeed made unnecessary by the preceding pmd_trans_unstable() test.

Again, no. This case does not exist. It's the same case as above: it
returns 0 and goes on to the pte level.


> -               split_huge_pmd(walk->vma, pmd, addr);
> +               if (!ops->pmd_entry)
> +                       split_huge_pmd(walk->vma, pmd, addr);
>
> But as the commit message says, PAGE_WALK_FALLBACK is necessary to have
> a virtual address range being handled once and only once.

No. Your logic is garbage. The above code is completely broken.

YOU CAN NOT AVOID TRHE SPLIT AND THEN GO ON AT THE PTE LEVEL.

Don't you get it? There *is* no PTE level if you didn't split.

And your "being handled once and only once" is garbage too. If you ask
for both a pmd callback and a pte callback, you get both. It's that
simple.

There are zero users that actually do it now, and you don't want to do
it either, so all your arguments are just pointless.

> So we need the PAGE_WALK_FALLBACK.

No we don't. You make no sense. Your case doesn't want it, no existing
cases want it, nobody wants it.

When you actually have a case that wants it, let's look at it then.
Right now, you introduced fundamentally buggy code because your
thinking is fuzzy and broken.

So what you should do is to just always return 0 in your pmd_entry().
Boom, done. The only reason for the pmd_entry existing at all is to
get the warning. Then, if you don't want to split it, you make that
warning just return an error (or a positive value) instead and say
"ok, that was bad, we don't handle it at all".

And in some _future_ life, if anybody wants to actually say "yeah,
let's not split it", make it have some "yeah I handled it" case.

In fact, I would suggest that positive return values be exactly that
"I did it" case, and that they just add up instead of breaking out.
Only an actual error would break out, and 0 would then (continue to)
mean "continue with next level".

But right now, no such user even exists.

               Linus
Thomas Hellström (Intel) Oct. 9, 2019, 8:06 p.m. UTC | #8
On 10/9/19 9:20 PM, Linus Torvalds wrote:
>
> No. Your logic is garbage. The above code is completely broken.
>
> YOU CAN NOT AVOID TRHE SPLIT AND THEN GO ON AT THE PTE LEVEL.
>
> Don't you get it? There *is* no PTE level if you didn't split.

Hmm, This paragraph makes me think we have very different perceptions about what I'm trying to achieve.

I wanted the pte level to *only* get called for *pre-existing* pte entries.
Surely those must be able to exist even if we don't split occasional huge pmds in the pagewalk code?

>
> So what you should do is to just always return 0 in your pmd_entry().
> Boom, done. The only reason for the pmd_entry existing at all is to
> get the warning. Then, if you don't want to split it, you make that
> warning just return an error (or a positive value) instead and say
> "ok, that was bad, we don't handle it at all".
>
> And in some _future_ life, if anybody wants to actually say "yeah,
> let's not split it", make it have some "yeah I handled it" case.

Well yes, this is exactly what I want. Because any huge pmd we encounter 
should be read-only.

/Thomas
Linus Torvalds Oct. 9, 2019, 8:20 p.m. UTC | #9
On Wed, Oct 9, 2019 at 1:06 PM Thomas Hellström (VMware)
<thomas_os@shipmail.org> wrote:
>
> On 10/9/19 9:20 PM, Linus Torvalds wrote:
> >
> > Don't you get it? There *is* no PTE level if you didn't split.
>
> Hmm, This paragraph makes me think we have very different perceptions about what I'm trying to achieve.

It's not about what you're trying to achieve.

It's about the actual code.

You cannot do that

> -               split_huge_pmd(walk->vma, pmd, addr);
> +               if (!ops->pmd_entry)
> +                       split_huge_pmd(walk->vma, pmd, addr);

it's insane.

You *have* to call split_huge_pmd() if you're doing to call the
pte_entry() function.

I don't understand why you are arguing. This is not about "feelings"
and "intentions" or about "trying to achieve".

This is about cold hard "you can't do that", and this is now the third
time I tell you _why_ you can't do that: you can't walk the last level
if you don't _have_ a last level. You have to split the pmd to do so.

End of story.

> I wanted the pte level to *only* get called for *pre-existing* pte entries.

Again, I told you what the solution was.

But the fact is, it's not what your other code even wants or does.

Seriously. You have two cases you care about in your callbacks

 - an actual hugepmd. This is an ERROR for you and you do a huge
WARN_ON() for it to let people know.

 - regular pages. This is what your other code actually handles.

So for the hugepomd case, you have two choices:

 - handle it by splitting and deal with the regular pages: "return 0;"

 - actually error out: "return -EINVAL".

There simply are no other choices from looking at the users you added.

> Surely those must be able to exist even if we don't split occasional huge pmds in the pagewalk code?

And I told you how to handle that case when you eventually hit it.

But that is immaterial. Because it's not what you actually do right
now. Right now you have a huge WARN_ON().

                Linus
Thomas Hellström (Intel) Oct. 9, 2019, 10:30 p.m. UTC | #10
On 10/9/19 10:20 PM, Linus Torvalds wrote:
> On Wed, Oct 9, 2019 at 1:06 PM Thomas Hellström (VMware)
> <thomas_os@shipmail.org> wrote:
>> On 10/9/19 9:20 PM, Linus Torvalds wrote:
>>> Don't you get it? There *is* no PTE level if you didn't split.
>> Hmm, This paragraph makes me think we have very different perceptions about what I'm trying to achieve.
> It's not about what you're trying to achieve.
>
> It's about the actual code.
>
> You cannot do that
>
>> -               split_huge_pmd(walk->vma, pmd, addr);
>> +               if (!ops->pmd_entry)
>> +                       split_huge_pmd(walk->vma, pmd, addr);
> it's insane.
>
> You *have* to call split_huge_pmd() if you're doing to call the
> pte_entry() function.
>
> I don't understand why you are arguing. This is not about "feelings"
> and "intentions" or about "trying to achieve".
>
> This is about cold hard "you can't do that", and this is now the third
> time I tell you _why_ you can't do that: you can't walk the last level
> if you don't _have_ a last level. You have to split the pmd to do so.
It's not so much arguing but rather trying to understand your concerns 
and your perception of what the final code should look like.
>
> End of story.

So is it that you want pte_entry() to be strictly called for *each* 
virtual address, even if we have a pmd_entry()?
In that case I completely follow your arguments, meaning we skip this 
patch completely?

My take on the change was that pmd_entry() returning 0 would mean we 
could actually skip the pte level completely and nothing would otherwise 
pass down to the next level for which split_huge_pmd() wasn't a NOP, 
similar to how pud_entry does things. FWIW, see

https://lore.kernel.org/lkml/20191004123732.xpr3vroee5mhg2zt@box.shutemov.name/

and we could in the long run transform the pte walk in many pmd_entry 
callbacks into pte_entry callbacks.


>
>> I wanted the pte level to *only* get called for *pre-existing* pte entries.
> Again, I told you what the solution was.
>
> But the fact is, it's not what your other code even wants or does.
>
> Seriously. You have two cases you care about in your callbacks
>
>   - an actual hugepmd. This is an ERROR for you and you do a huge
> WARN_ON() for it to let people know.
No, it's typically a NOP, since the hugepmd should be read-only. 
Write-enabled huge pages are split in fault().
>
>   - regular pages. This is what your other code actually handles.
>
> So for the hugepomd case, you have two choices:
>
>   - handle it by splitting and deal with the regular pages: "return 0;"
Well, this is not what we want to do, and the reason we have the patch 
in the first place.
>
>   - actually error out: "return -EINVAL".

/Thomas
Thomas Hellström (Intel) Oct. 9, 2019, 11:50 p.m. UTC | #11
On 10/10/19 12:30 AM, Thomas Hellström (VMware) wrote:
> On 10/9/19 10:20 PM, Linus Torvalds wrote:
>> On Wed, Oct 9, 2019 at 1:06 PM Thomas Hellström (VMware)
>> <thomas_os@shipmail.org> wrote:
>>> On 10/9/19 9:20 PM, Linus Torvalds wrote:
>>>> Don't you get it? There *is* no PTE level if you didn't split.
>>> Hmm, This paragraph makes me think we have very different 
>>> perceptions about what I'm trying to achieve.
>> It's not about what you're trying to achieve.
>>
>> It's about the actual code.
>>
>> You cannot do that
>>
>>> - split_huge_pmd(walk->vma, pmd, addr);
>>> +               if (!ops->pmd_entry)
>>> +                       split_huge_pmd(walk->vma, pmd, addr);
>> it's insane.
>>
>> You *have* to call split_huge_pmd() if you're doing to call the
>> pte_entry() function.
>>
>> I don't understand why you are arguing. This is not about "feelings"
>> and "intentions" or about "trying to achieve".
>>
>> This is about cold hard "you can't do that", and this is now the third
>> time I tell you _why_ you can't do that: you can't walk the last level
>> if you don't _have_ a last level. You have to split the pmd to do so.
> It's not so much arguing but rather trying to understand your concerns 
> and your perception of what the final code should look like.
>>
>> End of story.
>
> So is it that you want pte_entry() to be strictly called for *each* 
> virtual address, even if we have a pmd_entry()?
> In that case I completely follow your arguments, meaning we skip this 
> patch completely?

Or if you're still OK with your original patch

https://lore.kernel.org/lkml/CAHk-=wj5NiFPouYd6zUgY4K7VovOAxQT-xhDRjD6j5hifBWi_g@mail.gmail.com/

I'd happily use that instead.

Thanks,

Thomas
Linus Torvalds Oct. 9, 2019, 11:51 p.m. UTC | #12
On Wed, Oct 9, 2019 at 3:31 PM Thomas Hellström (VMware)
<thomas_os@shipmail.org> wrote:
>
> On 10/9/19 10:20 PM, Linus Torvalds wrote:
> >
> > You *have* to call split_huge_pmd() if you're doing to call the
> > pte_entry() function.
> >
> > End of story.
>
> So is it that you want pte_entry() to be strictly called for *each*
> virtual address, even if we have a pmd_entry()?

Thomas, you're not reading the emails.

You are conflating two issues:

 (a) the conditional split_huge_pmd()

 (b) the what to do about the return value of pmd_entry().

and I'm now FOR THE LAST TIME telling you that (a) is completely
wrong, buggy crap. It will not happen. I missed that part of the patch
in my original read-through, because it was so senseless.

Get it through you head. The "conditional split_huge_pmd()" thing is
wrong, wrong, wrong.

And it is COMPLETELY WRONG regardless of any "each virtual address"
thing that you keep bringing up. The "each virtual address" argument
is irrelevant, pointless, and does not matter.

So stop bringing that thing up. Really. The conditional
split_huge_pmd() is wrong.

It's wrong,  because the whole notion of "don't split pmd and then
call the pte walker" is completely idiotic and utterly senseless,
because without the splitting the pte's DO NOT EXIST.

What part of that is not getting through?

> In that case I completely follow your arguments, meaning we skip this
> patch completely?

Well, yes and no.

The part of the patch that is complete and utter garbage, and that you
really need to *understand* why it's complete and utter garbage is
this part:

                if (!ops->pte_entry)
                        continue;
-
-               split_huge_pmd(walk->vma, pmd, addr);
+               if (!ops->pmd_entry)
+                       split_huge_pmd(walk->vma, pmd, addr);
                if (pmd_trans_unstable(pmd))
                        goto again;
                err = walk_pte_range(pmd, addr, next, walk);

Look, you cannot call "walk_pte_range()" without calling
split_huge_pmd(), because walk_pte_range() cannot deal with a huge
pmd.

walk_pte_range() does that pte_offset_map() on the pmd (well, with
your other patch in place it does the locking version), and then it
walks every pte entry of it. But that cannot possibly work if you
didn't split it.

See what I've been trying tog tell you? YOU CANNOT MAKE THAT SPLITTING
BE CONDITIONAL!

What can be done - and what the line above it does - is to skip
walking entirely. We do it when we don't have a pte_entry walker at
all, obviously.

But we can't do is "oh, we had a pmd walker, so now I'll skip
splitting". That is insane, and cannot possibly make sense.

Now, that gets us back to the (b) part above - what to do about the
return value of "pmd_entry()".

What *would* make sense, for example, is saying "ok, pmd_entry()
already handled this, so we'll just continue, and not bother with the
pte_range() at all".

Note how that is VERY VERY different from "let's not split".

Yes, for that case we also don't split, but we don't split because
we're not even walking it. See the difference?

Not splitting and then walking: wrong, insane, and not going to happen.

Nor splitting because we're not going to walk it: sane, and we already
have one such case.

> My take on the change was that pmd_entry() returning 0 would mean we
> could actually skip the pte level completely and nothing would otherwise
> pass down to the next level for which split_huge_pmd() wasn't a NOP,
> similar to how pud_entry does things.

And that would have been a sensible operation, and yes, you had that

+                       if (!err)
+                               continue;

and that was it. Fine.

BUT.

That was not all that your patch did. Kirill felt that your
PAGE_WALK_FALLBACK thing was hacky, which is why I looked more at the
patch to see what it really wanted to do, and noticed that crazy
conditional splitting that didn't make sense, and has *NOTHING* to do
with your "skip the level completely".

I _agree_ that skipping the level completely makes sense.

But the "don't split and then don't skip the level" makes zero sense
what-so-ever. Ever. Under no circumstances can that be valid as per
above.

There's also the argument that right now, there are no users that
actually want to skip the level.

Even your use case doesn't really want that, because in your use-case,
the only case that would do it is the error case that isn't supposed
to happen.

And if it *is* supposed to happen, in many ways it would be more
sensible to just return a positive value saying "I took care of it, go
on to the next entry", wouldn't you agree?

Now, I actually tried to look through every single existing pmd_entry
case, because I wanted to see who is returning positive values right
now, and which of them might *want* to say "ok, I handled it" or "now
do the pte walking".

Quite a lot of them seem to really want to do that "ok, now do the pte
walking", because they currently do it inside the pmd function because
of how the original interface was badly done. So while we _currently_
do not have a "ok, I did everything for this pmd, skip it" vs a "ok,
continue with pte" case, it clearly does make sense. No question about
it.

I did find one single user of a positive return value:
queue_pages_pte_range() returns

 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.

to match queue_pages_range(), but that function has two callers:

 - the first ignores the return value entirely

 - the second would be perfectly fine with -EBUSY as a return value
instead, which would actually make more sense.

Everybody else returns 0 or a negative error, as far as I can tell.

End result:

 (a) right now nobody wants the "skip" behavior. You think you'll
eventually want it

 (b) right now, the "return positive value" is actually a horribly
ugly pointless hack, which could be made to be an error value and
cleaned up in the process

 (c) to me that really argues that we should just make the rule be

     - negative error means break out with error

     - 0 means continue down the next level

     - positive could be trivially be made to just mean "ok, I did it,
you can just continue".

and I think that would make a lot of sense.

So I do think we can get rid of the PAGE_WALK_FALLBACK hack entirely.
It becomes "return 0", which is what everybody really does right now.
If there's a pte_entry(), it will split and call that pte_entry for
each pte, the way it does now. Sensible and simple.

And then - for _future_ uses - we can add that "positive return value
means that I already handled it" and the walker would just skip to the
next entry.

But - again - in no alternate reality, past, future or present, does
it make sense to skip the splitting if you walk pte's.

               Linus
Linus Torvalds Oct. 10, 2019, 12:18 a.m. UTC | #13
On Wed, Oct 9, 2019 at 4:51 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
>  (a) right now nobody wants the "skip" behavior. You think you'll
> eventually want it
>
>  (b) right now, the "return positive value" is actually a horribly
> ugly pointless hack, which could be made to be an error value and
> cleaned up in the process
>
>  (c) to me that really argues that we should just make the rule be
>
>      - negative error means break out with error
>
>      - 0 means continue down the next level
>
>      - positive could be trivially be made to just mean "ok, I did it,
> you can just continue".
>
> and I think that would make a lot of sense.

So here's an ENTIRELY untested patch, but the return value of
pmd_entry() now makes conceptual sense to me. The whole "I hit an
error, I did nothing, I already did it myself" to me is the intuitive
meaning of {neg,0,pos} handling.

I think we probably should do this same thing for the upper levels too
to be consistent, but I think this at least makes sense, and is
simple, and avoids any hacky PAGE_WALK_CALLER_MAX magic.

I also wonder if some caller might want to get a count of "how many X
handled", and we'd just sum up all the positive return values as we're
traversing things, but that falls under "nobody seems to want it right
now", so I'm not adding extra code for something that might not be
useful.

And it is possible that I missed some other pmd_entry() callback that
returns a positive value. I did check them all, but mistakes happen
and I might have missed some case...

Again: entirely and utterly untested. It compiles. That's all I'm
going to guarantee, and even that might be a fluke.

                 Linus
Thomas Hellström (Intel) Oct. 10, 2019, 1:09 a.m. UTC | #14
On 10/10/19 1:51 AM, Linus Torvalds wrote:
> On Wed, Oct 9, 2019 at 3:31 PM Thomas Hellström (VMware)
> <thomas_os@shipmail.org> wrote:
>> On 10/9/19 10:20 PM, Linus Torvalds wrote:
>>> You *have* to call split_huge_pmd() if you're doing to call the
>>> pte_entry() function.
>>>
>>> End of story.
>> So is it that you want pte_entry() to be strictly called for *each*
>> virtual address, even if we have a pmd_entry()?
> Thomas, you're not reading the emails.
>
> You are conflating two issues:
>
>   (a) the conditional split_huge_pmd()
>
>   (b) the what to do about the return value of pmd_entry().
>
> and I'm now FOR THE LAST TIME telling you that (a) is completely
> wrong, buggy crap. It will not happen. I missed that part of the patch
> in my original read-through, because it was so senseless.
>
> Get it through you head. The "conditional split_huge_pmd()" thing is
> wrong, wrong, wrong.
>
> And it is COMPLETELY WRONG regardless of any "each virtual address"
> thing that you keep bringing up. The "each virtual address" argument
> is irrelevant, pointless, and does not matter.
>
> So stop bringing that thing up. Really. The conditional
> split_huge_pmd() is wrong.
>
> It's wrong,  because the whole notion of "don't split pmd and then
> call the pte walker" is completely idiotic and utterly senseless,
> because without the splitting the pte's DO NOT EXIST.
>
> What part of that is not getting through?
>
>> In that case I completely follow your arguments, meaning we skip this
>> patch completely?
> Well, yes and no.
>
> The part of the patch that is complete and utter garbage, and that you
> really need to *understand* why it's complete and utter garbage is
> this part:
>
>                  if (!ops->pte_entry)
>                          continue;
> -
> -               split_huge_pmd(walk->vma, pmd, addr);
> +               if (!ops->pmd_entry)
> +                       split_huge_pmd(walk->vma, pmd, addr);
>                  if (pmd_trans_unstable(pmd))
>                          goto again;
>                  err = walk_pte_range(pmd, addr, next, walk);
>
> Look, you cannot call "walk_pte_range()" without calling
> split_huge_pmd(), because walk_pte_range() cannot deal with a huge
> pmd.
>
> walk_pte_range() does that pte_offset_map() on the pmd (well, with
> your other patch in place it does the locking version), and then it
> walks every pte entry of it. But that cannot possibly work if you
> didn't split it.

Thank you for your patience!

Yes, I very well *do* understand that we need to split a huge pmd to 
walk the pte range, and I've never been against removing that 
conditional. What I've said is that it is pointless anyway, because 
we've already verified that the only path coming from the pmd_entry 
(with the patch applied) has the pmd *already split* and stable.

Your original patch does exactly the same!

So please let's move on from the splitting issue. We don't disagree 
here. The conditional is gone to never be resurrected.

>
> Now, that gets us back to the (b) part above - what to do about the
> return value of "pmd_entry()".
>
> What *would* make sense, for example, is saying "ok, pmd_entry()
> already handled this, so we'll just continue, and not bother with the
> pte_range() at all".
>
> Note how that is VERY VERY different from "let's not split".
>
> Yes, for that case we also don't split, but we don't split because
> we're not even walking it. See the difference?
>
> Not splitting and then walking: wrong, insane, and not going to happen.
>
> Nor splitting because we're not going to walk it: sane, and we already
> have one such case.
>
> But the "don't split and then don't skip the level" makes zero sense
> what-so-ever. Ever. Under no circumstances can that be valid as per
> above.

Agreed.

>
> There's also the argument that right now, there are no users that
> actually want to skip the level.
>
> Even your use case doesn't really want that, because in your use-case,
> the only case that would do it is the error case that isn't supposed
> to happen.
>
> And if it *is* supposed to happen, in many ways it would be more
> sensible to just return a positive value saying "I took care of it, go
> on to the next entry", wouldn't you agree?

Indeed.

>
> Now, I actually tried to look through every single existing pmd_entry
> case, because I wanted to see who is returning positive values right
> now, and which of them might *want* to say "ok, I handled it" or "now
> do the pte walking".
>
> Quite a lot of them seem to really want to do that "ok, now do the pte
> walking", because they currently do it inside the pmd function because
> of how the original interface was badly done. So while we _currently_
> do not have a "ok, I did everything for this pmd, skip it" vs a "ok,
> continue with pte" case, it clearly does make sense. No question about
> it.
>
> I did find one single user of a positive return value:
> queue_pages_pte_range() returns
>
>   * 0 - pages are placed on the right node or queued successfully.
>   * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
>   *     specified.
>   * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
>   *        on a node that does not follow the policy.
>
> to match queue_pages_range(), but that function has two callers:
>
>   - the first ignores the return value entirely
>
>   - the second would be perfectly fine with -EBUSY as a return value
> instead, which would actually make more sense.
>
> Everybody else returns 0 or a negative error, as far as I can tell.
>
> End result:
>
>   (a) right now nobody wants the "skip" behavior. You think you'll
> eventually want it
>
>   (b) right now, the "return positive value" is actually a horribly
> ugly pointless hack, which could be made to be an error value and
> cleaned up in the process
>
>   (c) to me that really argues that we should just make the rule be
>
>       - negative error means break out with error
>
>       - 0 means continue down the next level
>
>       - positive could be trivially be made to just mean "ok, I did it,
> you can just continue".
>
> and I think that would make a lot of sense.
>
Sure. I'm fine with this. So for next spin, I'll skip this patch, and do 
the positive value rework as part of the prerequisites for the huge page 
enablement. IIRC there is another user of positive values that should be 
trivial to fix.

Does that sound OK?

Thanks,

Thomas
Linus Torvalds Oct. 10, 2019, 2:07 a.m. UTC | #15
On Wed, Oct 9, 2019 at 6:10 PM Thomas Hellström (VMware)
<thomas_os@shipmail.org> wrote:
>
> Your original patch does exactly the same!

Oh, no. You misread my original patch.

Look again.

The logic in my original patch was very different. It said that

 - *if* we have a pmd_entry function, then we obviously call that one.

    And if - after calling the pmd_entry function - we are still a
hugepage, then we skip the pte_entry case entirely.

   And part of skipping is obviously "don't split" - but it never had
that "don't split and then call the pte walker" case.

 - and if we *don't* have a pmd_entry function, but we do have a
pte_entry function, then we always split before calling it.

Notice the difference?

So instead of looking at the return value of the pmd_entry() function,
the approach of that suggested patch was to basically say that if the
pmd_entry function wants us to go another level deeper and it was a
hugepmd, it needed to split the pmd to make that happen.

That's actually very different from what your patch did. My original
patch never tried to walk the pte level without having one - either it
*checked* that it had one, or it split.

But I see where you might have misread the patch, particularly if you
only looked at it as a patch, not as the end result of the patch.

The end result of that patch was this:

                if (ops->pmd_entry) {
                        err = ops->pmd_entry(pmd, addr, next, walk);
                        if (err)
                                break;
                        /* No pte level walking? */
                        if (!ops->pte_entry)
                                continue;
                        /* No pte level at all? */
                        if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)
|| pmd_devmap(*pmd))
                                continue;
                } else {
                        if (!ops->pte_entry)
                                continue;

                        split_huge_pmd(walk->vma, pmd, addr);
                        if (pmd_trans_unstable(pmd))
                                goto again;
                }
                err = walk_pte_range(pmd, addr, next, walk);

and look at thew two different sides of the if-statement: if they get
to "walk_pte_range()", both cases wil have verified that there
actually _is_ a pte level. They will just have done it differently. -
the "we didn't have a pmd function" will have split the pmd if it was
a hugepmd, while the "we do have a pmd_entry" case will just check
whether it's still a huge-pmd, and done a "continue" if it was and
never even tried to walk the ptes.

But I think the "change pmd_entry to have a sane return code" is a
simpler and more flexible model, and then the pmd_entry code can just
let the pte walker split the pmd if needed.

So I liked that part of your patch.

           Linus
Thomas Hellström (Intel) Oct. 10, 2019, 6:15 a.m. UTC | #16
On 10/10/19 4:07 AM, Linus Torvalds wrote:
> On Wed, Oct 9, 2019 at 6:10 PM Thomas Hellström (VMware)
> <thomas_os@shipmail.org> wrote:
>> Your original patch does exactly the same!
> Oh, no. You misread my original patch.
>
> Look again.
>
> The logic in my original patch was very different. It said that
>
>   - *if* we have a pmd_entry function, then we obviously call that one.
>
>      And if - after calling the pmd_entry function - we are still a
> hugepage, then we skip the pte_entry case entirely.
>
>     And part of skipping is obviously "don't split" - but it never had
> that "don't split and then call the pte walker" case.
>
>   - and if we *don't* have a pmd_entry function, but we do have a
> pte_entry function, then we always split before calling it.
>
> Notice the difference?

 From what I can tell, my patch is doing the same. At least that always 
was the intention. To determine whether to skip pte and skip split, your 
patch uses

                         /* No pte level at all? */
                         if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)
|| pmd_devmap(*pmd))
                                 continue;

whereas my patch does

                         if (pmd_trans_unstable(pmd))
                                 goto again;
			/* Fall through */

which is the same (pmd_trans_unstable() is the same test as you do, but 
not racy). Yes, it's missing the test for pmd_devmap() but I think 
that's an mm bug been discussed elsewhere, and we also rerun because a 
huge / none pmd at this (FALLBACK) point is probably a race and unintended.

>
> But I think the "change pmd_entry to have a sane return code" is a
> simpler and more flexible model, and then the pmd_entry code can just
> let the pte walker split the pmd if needed.

OK, let's aim for that then.

Thanks,

Thomas


>
> So I liked that part of your patch.
>
>             Linus
diff mbox series

Patch

diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index bddd9759bab9..c4a013eb445d 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -4,6 +4,11 @@ 
 
 #include <linux/mm.h>
 
+/* Highest positive pmd_entry caller-specific return value */
+#define PAGE_WALK_CALLER_MAX     (INT_MAX / 2)
+/* The handler did not handle the entry. Fall back to the next level */
+#define PAGE_WALK_FALLBACK       (PAGE_WALK_CALLER_MAX + 1)
+
 struct mm_walk;
 
 /**
@@ -16,6 +21,9 @@  struct mm_walk;
  *			this handler is required to be able to handle
  *			pmd_trans_huge() pmds.  They may simply choose to
  *			split_huge_page() instead of handling it explicitly.
+ *                      If the handler did not handle the PMD, or split the
+ *                      PMD and wants it handled by the PTE handler, it
+ *                      should return PAGE_WALK_FALLBACK.
  * @pte_entry:		if set, called for each non-empty PTE (4th-level) entry
  * @pte_hole:		if set, called for each hole at all levels
  * @hugetlb_entry:	if set, called for each hugetlb entry
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 83c0b78363b4..f844c2a2aa60 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -50,10 +50,18 @@  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 		 * This implies that each ->pmd_entry() handler
 		 * needs to know about pmd_trans_huge() pmds
 		 */
-		if (ops->pmd_entry)
+		if (ops->pmd_entry) {
 			err = ops->pmd_entry(pmd, addr, next, walk);
-		if (err)
-			break;
+			if (!err)
+				continue;
+			else if (err <= PAGE_WALK_CALLER_MAX)
+				break;
+			WARN_ON(err != PAGE_WALK_FALLBACK);
+			err = 0;
+			if (pmd_trans_unstable(pmd))
+				goto again;
+			/* Fall through */
+		}
 
 		/*
 		 * Check this here so we only break down trans_huge
@@ -61,8 +69,8 @@  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 		 */
 		if (!ops->pte_entry)
 			continue;
-
-		split_huge_pmd(walk->vma, pmd, addr);
+		if (!ops->pmd_entry)
+			split_huge_pmd(walk->vma, pmd, addr);
 		if (pmd_trans_unstable(pmd))
 			goto again;
 		err = walk_pte_range(pmd, addr, next, walk);
@@ -281,11 +289,17 @@  static int __walk_page_range(unsigned long start, unsigned long end,
  *
  *  - 0  : succeeded to handle the current entry, and if you don't reach the
  *         end address yet, continue to walk.
- *  - >0 : succeeded to handle the current entry, and return to the caller
- *         with caller specific value.
+ *  - >0, and <= PAGE_WALK_CALLER_MAX : succeeded to handle the current entry,
+ *         and return to the caller with caller specific value.
  *  - <0 : failed to handle the current entry, and return to the caller
  *         with error code.
  *
+ * For pmd_entry(), a value <= PAGE_WALK_CALLER_MAX indicates that the entry
+ * was handled by the callback. PAGE_WALK_FALLBACK indicates that the entry
+ * could not be handled by the callback and should be re-checked. If the
+ * callback needs the entry to be handled by the next level, it should
+ * split the entry and then return PAGE_WALK_FALLBACK.
+ *
  * Before starting to walk page table, some callers want to check whether
  * they really want to walk over the current vma, typically by checking
  * its vm_flags. walk_page_test() and @ops->test_walk() are used for this