diff mbox series

mm/hmm: Simplify hmm_vma_walk_pud slightly

Message ID 20200312102813.56699-1-steven.price@arm.com (mailing list archive)
State New, archived
Headers show
Series mm/hmm: Simplify hmm_vma_walk_pud slightly | expand

Commit Message

Steven Price March 12, 2020, 10:28 a.m. UTC
By refactoring to deal with the !pud_huge(pud) || !pud_devmap(pud)
condition early it's possible to remove the 'ret' variable and remove a
level of indentation from half the function making the code easier to
read.

No functional change.

Signed-off-by: Steven Price <steven.price@arm.com>
---
Thanks to Jason's changes there were only two code paths left using
the out_unlock label so it seemed like a good opportunity to refactor.
---
 mm/hmm.c | 69 ++++++++++++++++++++++++++------------------------------
 1 file changed, 32 insertions(+), 37 deletions(-)

Comments

Jason Gunthorpe March 12, 2020, 2:27 p.m. UTC | #1
On Thu, Mar 12, 2020 at 10:28:13AM +0000, Steven Price wrote:
> By refactoring to deal with the !pud_huge(pud) || !pud_devmap(pud)
> condition early it's possible to remove the 'ret' variable and remove a
> level of indentation from half the function making the code easier to
> read.
> 
> No functional change.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Thanks to Jason's changes there were only two code paths left using
> the out_unlock label so it seemed like a good opportunity to
> refactor.

Yes, I made something very similar, what do you think of this:

https://github.com/jgunthorpe/linux/commit/93f0ed42ab3f9ceb27b58fb7c7c3ecaf60f16b36

From 93f0ed42ab3f9ceb27b58fb7c7c3ecaf60f16b36 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 4 Mar 2020 17:11:10 -0400
Subject: [PATCH] mm/hmm: rework hmm_vma_walk_pud()

At least since commit 3afc423632a1 ("mm: pagewalk: add p4d_entry() and
pgd_entry()") this code has developed a number of strange control flows.

The purpose of the routine is to copy the pfns of a huge devmap PUD into
the pfns output array, without splitting the PUD. Everything that is not a
huge devmap PUD should go back to the walker for splitting.

Rework the logic to show this goal and remove redundant stuff:

- If pud_trans_huge_lock returns !NULL then this is already
  'pud_trans_huge() || pud_devmap()' and 'pud_huge() || pud_devmap()'
  so some of the logic is redundant.

- Hitting pud_none() is a race, treat it as such and return back to the
  walker using ACTION_AGAIN

- !pud_present() gives 0 cpu_flags, so the extra checks are redundant

- Once the *pudp is read there is no need to continue holding the pud
  lock, so drop it. The only thing the following code cares about is the
  pfn from the devmap, and if there is racing then the notifiers will
  resolve everything. Perhaps the unlocked READ_ONCE in an earlier version
  was correct

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 79 +++++++++++++++++++++++---------------------------------
 1 file changed, 33 insertions(+), 46 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 8fec801a33c9e2..87a376659b5ad4 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -455,66 +455,53 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
-	unsigned long addr = start;
+	unsigned long i, npages, pfn;
+	unsigned int required_fault;
+	uint64_t cpu_flags;
+	uint64_t *pfns;
+	spinlock_t *ptl;
 	pud_t pud;
-	int ret = 0;
-	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
 
+	/*
+	 * This only handles huge devmap pages, the default return is
+	 * ACTION_SUBTREE, so everything else is split by the walker and passed
+	 * to the other routines.
+	 */
+	ptl = pud_trans_huge_lock(pudp, walk->vma);
 	if (!ptl)
 		return 0;
+	pud = *pudp;
+	spin_unlock(ptl);
 
-	/* Normally we don't want to split the huge page */
-	walk->action = ACTION_CONTINUE;
-
-	pud = READ_ONCE(*pudp);
 	if (pud_none(pud)) {
-		spin_unlock(ptl);
-		return hmm_vma_walk_hole(start, end, -1, walk);
+		walk->action = ACTION_AGAIN;
+		return 0;
 	}
 
-	if (pud_huge(pud) && pud_devmap(pud)) {
-		unsigned long i, npages, pfn;
-		unsigned int required_flags;
-		uint64_t *pfns, cpu_flags;
-
-		if (!pud_present(pud)) {
-			spin_unlock(ptl);
-			return hmm_vma_walk_hole(start, end, -1, walk);
-		}
-
-		i = (addr - range->start) >> PAGE_SHIFT;
-		npages = (end - addr) >> PAGE_SHIFT;
-		pfns = &range->pfns[i];
+	if (!pud_devmap(pud))
+		return 0;
 
-		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+	pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
+	cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+	required_fault =
 		hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags);
-		if (required_flags) {
-			spin_unlock(ptl);
-			return hmm_vma_walk_hole_(addr, end, required_flags,
-						  walk);
-		}
+	if (required_fault)
+		return hmm_vma_walk_hole_(start, end, required_fault, walk);
 
-		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-		for (i = 0; i < npages; ++i, ++pfn) {
-			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
-					      hmm_vma_walk->pgmap);
-			if (unlikely(!hmm_vma_walk->pgmap)) {
-				ret = -EBUSY;
-				goto out_unlock;
-			}
-			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
-				  cpu_flags;
-		}
-		hmm_vma_walk->last = end;
-		goto out_unlock;
+	pfn = pud_pfn(pud) + ((start & ~PUD_MASK) >> PAGE_SHIFT);
+	npages = (end - start) >> PAGE_SHIFT;
+	for (i = 0; i < npages; ++i, ++pfn) {
+		hmm_vma_walk->pgmap = get_dev_pagemap(pfn, hmm_vma_walk->pgmap);
+		if (unlikely(!hmm_vma_walk->pgmap))
+			return -EBUSY;
+		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
 	}
 
-	/* Ask for the PUD to be split */
-	walk->action = ACTION_SUBTREE;
+	hmm_vma_walk->last = end;
 
-out_unlock:
-	spin_unlock(ptl);
-	return ret;
+	/* Do not split the pud */
+	walk->action = ACTION_CONTINUE;
+	return 0;
 }
 #else
 #define hmm_vma_walk_pud	NULL
Steven Price March 12, 2020, 2:40 p.m. UTC | #2
On 12/03/2020 14:27, Jason Gunthorpe wrote:
> On Thu, Mar 12, 2020 at 10:28:13AM +0000, Steven Price wrote:
>> By refactoring to deal with the !pud_huge(pud) || !pud_devmap(pud)
>> condition early it's possible to remove the 'ret' variable and remove a
>> level of indentation from half the function making the code easier to
>> read.
>>
>> No functional change.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Thanks to Jason's changes there were only two code paths left using
>> the out_unlock label so it seemed like a good opportunity to
>> refactor.
> 
> Yes, I made something very similar, what do you think of this:
> 
> https://github.com/jgunthorpe/linux/commit/93f0ed42ab3f9ceb27b58fb7c7c3ecaf60f16b36

Even better! Sorry I didn't realise you'd already done this. I just saw 
that the function was needlessly complicated after your fix, so I 
thought I'd do a drive-by cleanup since part of the mess was my fault! :)

Thanks,

Steve
Jason Gunthorpe March 12, 2020, 3:11 p.m. UTC | #3
On Thu, Mar 12, 2020 at 02:40:08PM +0000, Steven Price wrote:
> On 12/03/2020 14:27, Jason Gunthorpe wrote:
> > On Thu, Mar 12, 2020 at 10:28:13AM +0000, Steven Price wrote:
> > > By refactoring to deal with the !pud_huge(pud) || !pud_devmap(pud)
> > > condition early it's possible to remove the 'ret' variable and remove a
> > > level of indentation from half the function making the code easier to
> > > read.
> > > 
> > > No functional change.
> > > 
> > > Signed-off-by: Steven Price <steven.price@arm.com>
> > > Thanks to Jason's changes there were only two code paths left using
> > > the out_unlock label so it seemed like a good opportunity to
> > > refactor.
> > 
> > Yes, I made something very similar, what do you think of this:
> > 
> > https://github.com/jgunthorpe/linux/commit/93f0ed42ab3f9ceb27b58fb7c7c3ecaf60f16b36
> 
> Even better! Sorry I didn't realise you'd already done this. I just saw that
> the function was needlessly complicated after your fix, so I thought I'd do
> a drive-by cleanup since part of the mess was my fault! :)

No worries, I've got a lot of patches for hmm_range_fault right now,
just trying to organize them, test them and post them. Haven't posted
that one yet.

Actually, while you are looking at this, do you think we should be
adding at least READ_ONCE in the pagewalk.c walk_* functions? The
multiple references of pmd, pud, etc without locking seems sketchy to
me.

Jason
Steven Price March 12, 2020, 4:16 p.m. UTC | #4
On 12/03/2020 15:11, Jason Gunthorpe wrote:
> On Thu, Mar 12, 2020 at 02:40:08PM +0000, Steven Price wrote:
>> On 12/03/2020 14:27, Jason Gunthorpe wrote:
>>> On Thu, Mar 12, 2020 at 10:28:13AM +0000, Steven Price wrote:
>>>> By refactoring to deal with the !pud_huge(pud) || !pud_devmap(pud)
>>>> condition early it's possible to remove the 'ret' variable and remove a
>>>> level of indentation from half the function making the code easier to
>>>> read.
>>>>
>>>> No functional change.
>>>>
>>>> Signed-off-by: Steven Price <steven.price@arm.com>
>>>> Thanks to Jason's changes there were only two code paths left using
>>>> the out_unlock label so it seemed like a good opportunity to
>>>> refactor.
>>>
>>> Yes, I made something very similar, what do you think of this:
>>>
>>> https://github.com/jgunthorpe/linux/commit/93f0ed42ab3f9ceb27b58fb7c7c3ecaf60f16b36
>>
>> Even better! Sorry I didn't realise you'd already done this. I just saw that
>> the function was needlessly complicated after your fix, so I thought I'd do
>> a drive-by cleanup since part of the mess was my fault! :)
> 
> No worries, I've got a lot of patches for hmm_range_fault right now,
> just trying to organize them, test them and post them. Haven't posted
> that one yet.
> 
> Actually, while you are looking at this, do you think we should be
> adding at least READ_ONCE in the pagewalk.c walk_* functions? The
> multiple references of pmd, pud, etc without locking seems sketchy to
> me.

I agree it seems worrying. I'm not entirely sure whether the holding of 
mmap_sem is sufficient, this isn't something that I changed so I've just 
been hoping that it's sufficient since it seems to have been working 
(whether that's by chance because the compiler didn't generate multiple 
reads I've no idea). For walking the kernel's page tables the lack of 
READ_ONCE is also not great, but at least for PTDUMP we don't care too 
much about accuracy and it should be crash proof because there's no RCU 
grace period. And again the code I was replacing didn't have any special 
protection.

I can't see any harm in updating the code to include READ_ONCE and I'm 
happy to review a patch.

Thanks,

Steve
Jason Gunthorpe March 12, 2020, 4:37 p.m. UTC | #5
On Thu, Mar 12, 2020 at 04:16:33PM +0000, Steven Price wrote:
> > Actually, while you are looking at this, do you think we should be
> > adding at least READ_ONCE in the pagewalk.c walk_* functions? The
> > multiple references of pmd, pud, etc without locking seems sketchy to
> > me.
> 
> I agree it seems worrying. I'm not entirely sure whether the holding of
> mmap_sem is sufficient,

I looked at this question, and at least for PMD, mmap_sem is not
sufficient. I didn't easily figure it out for the other ones

I'm guessing if PMD is not safe then none of them are.

> this isn't something that I changed so I've just
> been hoping that it's sufficient since it seems to have been working
> (whether that's by chance because the compiler didn't generate multiple
> reads I've no idea). For walking the kernel's page tables the lack of
> READ_ONCE is also not great, but at least for PTDUMP we don't care too much
> about accuracy and it should be crash proof because there's no RCU grace
> period. And again the code I was replacing didn't have any special
> protection.
>
> I can't see any harm in updating the code to include READ_ONCE and I'm happy
> to review a patch.

The reason I ask is because hmm's walkers often have this pattern
where they get the pointer and then de-ref it (again) then
immediately have to recheck the 'again' conditions of the walker
itself because the re-read may have given a different value.

Having the walker deref the pointer and pass the value it into the ops
for use rather than repeatedly de-refing an unlocked value seems like
a much safer design to me.

If this also implicitly relies on a RCU grace period then it is also
missing RCU locking...

I also didn't quite understand why walk_pte_range() skipped locking
the pte in the no_vma case - I don't get why vma would be related to
locking here.

I also saw that hmm open coded the pte walk, presumably for
performance, so I was thinking of adding some kind of pte_range()
callback to avoid the expensive indirect function call per pte, but
hmm also can't have the pmd locked...

Jason
Steven Price March 12, 2020, 5:02 p.m. UTC | #6
On 12/03/2020 16:37, Jason Gunthorpe wrote:
> On Thu, Mar 12, 2020 at 04:16:33PM +0000, Steven Price wrote:
>>> Actually, while you are looking at this, do you think we should be
>>> adding at least READ_ONCE in the pagewalk.c walk_* functions? The
>>> multiple references of pmd, pud, etc without locking seems sketchy to
>>> me.
>>
>> I agree it seems worrying. I'm not entirely sure whether the holding of
>> mmap_sem is sufficient,
> 
> I looked at this question, and at least for PMD, mmap_sem is not
> sufficient. I didn't easilly figure it out for the other ones
> 
> I'm guessing if PMD is not safe then none of them are.
> 
>> this isn't something that I changed so I've just
>> been hoping that it's sufficient since it seems to have been working
>> (whether that's by chance because the compiler didn't generate multiple
>> reads I've no idea). For walking the kernel's page tables the lack of
>> READ_ONCE is also not great, but at least for PTDUMP we don't care too much
>> about accuracy and it should be crash proof because there's no RCU grace
>> period. And again the code I was replacing didn't have any special
>> protection.
>>
>> I can't see any harm in updating the code to include READ_ONCE and I'm happy
>> to review a patch.
> 
> The reason I ask is because hmm's walkers often have this pattern
> where they get the pointer and then de-ref it (again) then
> immediately have to recheck the 'again' conditions of the walker
> itself because the re-read may have given a different value.
> 
> Having the walker deref the pointer and pass the value it into the ops
> for use rather than repeatedly de-refing an unlocked value seems like
> a much safer design to me.

Yeah that sounds like a good idea.

> If this also implicitly relies on a RCU grace period then it is also
> missing RCU locking...

True - I'm not 100% sure in what situations a page table entry can be 
freed. Anshuman has added locking to deal with memory hotplug[1]. I 
believe this is sufficient.

[1] bf2b59f60ee1 ("arm64/mm: Hold memory hotplug lock while walking for 
kernel page table dump")

> I also didn't quite understand why walk_pte_range() skipped locking
> the pte in the no_vma case - I don't get why vma would be related to
> locking here.

The no_vma case is for walking the kernel's page tables and they may 
have entries that are not backed by struct page, so there isn't 
(reliably) a PTE lock to take.

> I also saw that hmm open coded the pte walk, presumably for
> performance, so I was thinking of adding some kind of pte_range()
> callback to avoid the expensive indirect function call per pte, but
> hmm also can't have the pmd locked...

Yeah the callback per PTE is a bit heavy because of the indirect 
function call. I'm not sure how to optimise it beyond open coding at the 
PMD level. One option would be to provide helper functions to make it a 
bit more generic.

Do you have an idea of what pte_range() would look like?

Steve
Jason Gunthorpe March 12, 2020, 5:17 p.m. UTC | #7
On Thu, Mar 12, 2020 at 05:02:18PM +0000, Steven Price wrote:

> > Having the walker deref the pointer and pass the value it into the ops
> > for use rather than repeatedly de-refing an unlocked value seems like
> > a much safer design to me.
> 
> Yeah that sounds like a good idea.

Ok.. let see when I get this hmm & odp stuff cleared off
 
> > I also didn't quite understand why walk_pte_range() skipped locking
> > the pte in the no_vma case - I don't get why vma would be related to
> > locking here.
> 
> The no_vma case is for walking the kernel's page tables and they may have
> entries that are not backed by struct page, so there isn't (reliably) a PTE
> lock to take.

Oh, that is an interesting bit of insight..

> > I also saw that hmm open coded the pte walk, presumably for
> > performance, so I was thinking of adding some kind of pte_range()
> > callback to avoid the expensive indirect function call per pte, but
> > hmm also can't have the pmd locked...
> 
> Yeah the callback per PTE is a bit heavy because of the indirect function
> call. I'm not sure how to optimise it beyond open coding at the PMD level.
> One option would be to provide helper functions to make it a bit more
> generic.
> 
> Do you have an idea of what pte_range() would look like?

Basically just pass in the already mapped pte array just like is
already done at the tail of the pmd

The reason to do it like this is so that the common code in the walker
can correctly prove the pmd is pointing at a pte before trying to map
it.

This is complicated, and hmm at least already got it wrong when trying
to open code at the PMD level.

Jason
Jason Gunthorpe March 13, 2020, 7:55 p.m. UTC | #8
On Thu, Mar 12, 2020 at 05:02:18PM +0000, Steven Price wrote:
> On 12/03/2020 16:37, Jason Gunthorpe wrote:
> > On Thu, Mar 12, 2020 at 04:16:33PM +0000, Steven Price wrote:
> > > > Actually, while you are looking at this, do you think we should be
> > > > adding at least READ_ONCE in the pagewalk.c walk_* functions? The
> > > > multiple references of pmd, pud, etc without locking seems sketchy to
> > > > me.
> > > 
> > > I agree it seems worrying. I'm not entirely sure whether the holding of
> > > mmap_sem is sufficient,
> > 
> > I looked at this question, and at least for PMD, mmap_sem is not
> > sufficient. I didn't easilly figure it out for the other ones
> > 
> > I'm guessing if PMD is not safe then none of them are.
> > 
> > > this isn't something that I changed so I've just
> > > been hoping that it's sufficient since it seems to have been working
> > > (whether that's by chance because the compiler didn't generate multiple
> > > reads I've no idea). For walking the kernel's page tables the lack of
> > > READ_ONCE is also not great, but at least for PTDUMP we don't care too much
> > > about accuracy and it should be crash proof because there's no RCU grace
> > > period. And again the code I was replacing didn't have any special
> > > protection.
> > > 
> > > I can't see any harm in updating the code to include READ_ONCE and I'm happy
> > > to review a patch.
> > 
> > The reason I ask is because hmm's walkers often have this pattern
> > where they get the pointer and then de-ref it (again) then
> > immediately have to recheck the 'again' conditions of the walker
> > itself because the re-read may have given a different value.
> > 
> > Having the walker deref the pointer and pass the value it into the ops
> > for use rather than repeatedly de-refing an unlocked value seems like
> > a much safer design to me.
> 
> Yeah that sounds like a good idea.

I'm looking at this now.. The PUD is also changing under the read
mmap_sem - and I was able to think up some race conditiony bugs
related to this. Have some patches now..

However, I haven't been able to understand why walk_page_range()
doesn't check pud_present() or pmd_present() before calling
pmd_offset_map() or pte_offset_map().

As far as I can see a non-present entry has a swap entry encoded in
it, and thus it seems like it is a bad idea to pass a non-present
entry to the two map functions. I think those should only be called
when the entry points to the next level in the page table  (so there
is something to map?)

I see you added !present tests for the !vma case, but why only there?

Is this a bug? Do you know how it works?

Is it something that was missed when people added non-present PUD and
PMD's?

Thanks,
Jason
Matthew Wilcox March 13, 2020, 9:04 p.m. UTC | #9
On Fri, Mar 13, 2020 at 04:55:50PM -0300, Jason Gunthorpe wrote:
> On Thu, Mar 12, 2020 at 05:02:18PM +0000, Steven Price wrote:
> > On 12/03/2020 16:37, Jason Gunthorpe wrote:
> > > On Thu, Mar 12, 2020 at 04:16:33PM +0000, Steven Price wrote:
> > > > > Actually, while you are looking at this, do you think we should be
> > > > > adding at least READ_ONCE in the pagewalk.c walk_* functions? The
> > > > > multiple references of pmd, pud, etc without locking seems sketchy to
> > > > > me.
> > > > 
> > > > I agree it seems worrying. I'm not entirely sure whether the holding of
> > > > mmap_sem is sufficient,
> > > 
> > > I looked at this question, and at least for PMD, mmap_sem is not
> > > sufficient. I didn't easilly figure it out for the other ones
> > > 
> > > I'm guessing if PMD is not safe then none of them are.
> > > 
> > > > this isn't something that I changed so I've just
> > > > been hoping that it's sufficient since it seems to have been working
> > > > (whether that's by chance because the compiler didn't generate multiple
> > > > reads I've no idea). For walking the kernel's page tables the lack of
> > > > READ_ONCE is also not great, but at least for PTDUMP we don't care too much
> > > > about accuracy and it should be crash proof because there's no RCU grace
> > > > period. And again the code I was replacing didn't have any special
> > > > protection.
> > > > 
> > > > I can't see any harm in updating the code to include READ_ONCE and I'm happy
> > > > to review a patch.
> > > 
> > > The reason I ask is because hmm's walkers often have this pattern
> > > where they get the pointer and then de-ref it (again) then
> > > immediately have to recheck the 'again' conditions of the walker
> > > itself because the re-read may have given a different value.
> > > 
> > > Having the walker deref the pointer and pass the value it into the ops
> > > for use rather than repeatedly de-refing an unlocked value seems like
> > > a much safer design to me.
> > 
> > Yeah that sounds like a good idea.
> 
> I'm looking at this now.. The PUD is also changing under the read
> mmap_sem - and I was able to think up some race conditiony bugs
> related to this. Have some patches now..
> 
> However, I haven't been able to understand why walk_page_range()
> doesn't check pud_present() or pmd_present() before calling
> pmd_offset_map() or pte_offset_map().
> 
> As far as I can see a non-present entry has a swap entry encoded in
> it, and thus it seems like it is a bad idea to pass a non-present
> entry to the two map functions. I think those should only be called
> when the entry points to the next level in the page table  (so there
> is something to map?)
> 
> I see you added !present tests for the !vma case, but why only there?
> 
> Is this a bug? Do you know how it works?
> 
> Is it something that was missed when people added non-present PUD and
> PMD's?

... I'm sorry, I did what now?  As far as I can tell, you're talking
about mm/pagewalk.c, and the only commit I have in that file is
a00cc7d9dd93d66a3fb83fc52aa57a4bec51c517 ("mm, x86: add support for
PUD-sized transparent hugepages", which I think I was pretty clear
from the commit message is basically copy-and-paste from the PMD code.
I have no clue why most of the decisions in the MM were made.
Jason Gunthorpe March 13, 2020, 10:51 p.m. UTC | #10
On Fri, Mar 13, 2020 at 02:04:46PM -0700, Matthew Wilcox wrote:
> On Fri, Mar 13, 2020 at 04:55:50PM -0300, Jason Gunthorpe wrote:
> > On Thu, Mar 12, 2020 at 05:02:18PM +0000, Steven Price wrote:
> > > On 12/03/2020 16:37, Jason Gunthorpe wrote:
> > > > On Thu, Mar 12, 2020 at 04:16:33PM +0000, Steven Price wrote:
> > > > > > Actually, while you are looking at this, do you think we should be
> > > > > > adding at least READ_ONCE in the pagewalk.c walk_* functions? The
> > > > > > multiple references of pmd, pud, etc without locking seems sketchy to
> > > > > > me.
> > > > > 
> > > > > I agree it seems worrying. I'm not entirely sure whether the holding of
> > > > > mmap_sem is sufficient,
> > > > 
> > > > I looked at this question, and at least for PMD, mmap_sem is not
> > > > sufficient. I didn't easilly figure it out for the other ones
> > > > 
> > > > I'm guessing if PMD is not safe then none of them are.
> > > > 
> > > > > this isn't something that I changed so I've just
> > > > > been hoping that it's sufficient since it seems to have been working
> > > > > (whether that's by chance because the compiler didn't generate multiple
> > > > > reads I've no idea). For walking the kernel's page tables the lack of
> > > > > READ_ONCE is also not great, but at least for PTDUMP we don't care too much
> > > > > about accuracy and it should be crash proof because there's no RCU grace
> > > > > period. And again the code I was replacing didn't have any special
> > > > > protection.
> > > > > 
> > > > > I can't see any harm in updating the code to include READ_ONCE and I'm happy
> > > > > to review a patch.
> > > > 
> > > > The reason I ask is because hmm's walkers often have this pattern
> > > > where they get the pointer and then de-ref it (again) then
> > > > immediately have to recheck the 'again' conditions of the walker
> > > > itself because the re-read may have given a different value.
> > > > 
> > > > Having the walker deref the pointer and pass the value it into the ops
> > > > for use rather than repeatedly de-refing an unlocked value seems like
> > > > a much safer design to me.
> > > 
> > > Yeah that sounds like a good idea.
> > 
> > I'm looking at this now.. The PUD is also changing under the read
> > mmap_sem - and I was able to think up some race conditiony bugs
> > related to this. Have some patches now..
> > 
> > However, I haven't been able to understand why walk_page_range()
> > doesn't check pud_present() or pmd_present() before calling
> > pmd_offset_map() or pte_offset_map().
> > 
> > As far as I can see a non-present entry has a swap entry encoded in
> > it, and thus it seems like it is a bad idea to pass a non-present
> > entry to the two map functions. I think those should only be called
> > when the entry points to the next level in the page table  (so there
> > is something to map?)
> > 
> > I see you added !present tests for the !vma case, but why only there?
> > 
> > Is this a bug? Do you know how it works?
> > 
> > Is it something that was missed when people added non-present PUD and
> > PMD's?
> 
> ... I'm sorry, I did what now? 

No, no, just widening to see if someone knows

> As far as I can tell, you're talking
> about mm/pagewalk.c, and the only commit I have in that file is
> a00cc7d9dd93d66a3fb83fc52aa57a4bec51c517 ("mm, x86: add support for
> PUD-sized transparent hugepages", which I think I was pretty clear
> from the commit message is basically copy-and-paste from the PMD
> code.

Right, which added the split_huge_pud() which seems maybe related to
pud_present, or maybe not, I don't know.

> I have no clue why most of the decisions in the MM were made.

Fun!

Jason
diff mbox series

Patch

diff --git a/mm/hmm.c b/mm/hmm.c
index ca33d086bdc1..0117c86426d1 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -466,8 +466,10 @@  static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
 	struct hmm_range *range = hmm_vma_walk->range;
 	unsigned long addr = start;
 	pud_t pud;
-	int ret = 0;
 	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
+	unsigned long i, npages, pfn;
+	uint64_t *pfns, cpu_flags;
+	bool fault, write_fault;
 
 	if (!ptl)
 		return 0;
@@ -481,50 +483,43 @@  static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
 		return hmm_vma_walk_hole(start, end, -1, walk);
 	}
 
-	if (pud_huge(pud) && pud_devmap(pud)) {
-		unsigned long i, npages, pfn;
-		uint64_t *pfns, cpu_flags;
-		bool fault, write_fault;
+	if (!pud_huge(pud) || !pud_devmap(pud)) {
+		/* Ask for the PUD to be split */
+		walk->action = ACTION_SUBTREE;
+		spin_unlock(ptl);
+		return 0;
+	}
 
-		if (!pud_present(pud)) {
-			spin_unlock(ptl);
-			return hmm_vma_walk_hole(start, end, -1, walk);
-		}
+	if (!pud_present(pud)) {
+		spin_unlock(ptl);
+		return hmm_vma_walk_hole(start, end, -1, walk);
+	}
 
-		i = (addr - range->start) >> PAGE_SHIFT;
-		npages = (end - addr) >> PAGE_SHIFT;
-		pfns = &range->pfns[i];
+	i = (addr - range->start) >> PAGE_SHIFT;
+	npages = (end - addr) >> PAGE_SHIFT;
+	pfns = &range->pfns[i];
 
-		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
-		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
-				     cpu_flags, &fault, &write_fault);
-		if (fault || write_fault) {
-			spin_unlock(ptl);
-			return hmm_vma_walk_hole_(addr, end, fault, write_fault,
-						  walk);
-		}
+	cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+			     cpu_flags, &fault, &write_fault);
+	if (fault || write_fault) {
+		spin_unlock(ptl);
+		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+	}
 
-		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-		for (i = 0; i < npages; ++i, ++pfn) {
-			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
-					      hmm_vma_walk->pgmap);
-			if (unlikely(!hmm_vma_walk->pgmap)) {
-				ret = -EBUSY;
-				goto out_unlock;
-			}
-			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
-				  cpu_flags;
+	pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	for (i = 0; i < npages; ++i, ++pfn) {
+		hmm_vma_walk->pgmap = get_dev_pagemap(pfn, hmm_vma_walk->pgmap);
+		if (unlikely(!hmm_vma_walk->pgmap)) {
+			spin_unlock(ptl);
+			return -EBUSY;
 		}
-		hmm_vma_walk->last = end;
-		goto out_unlock;
+		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
 	}
+	hmm_vma_walk->last = end;
 
-	/* Ask for the PUD to be split */
-	walk->action = ACTION_SUBTREE;
-
-out_unlock:
 	spin_unlock(ptl);
-	return ret;
+	return 0;
 }
 #else
 #define hmm_vma_walk_pud	NULL