diff mbox series

[v16,11/25] mm: pagewalk: Add p4d_entry() and pgd_entry()

Message ID 20191206135316.47703-12-steven.price@arm.com
State New, archived
Headers show
Series Generic page walk and ptdump | expand

Commit Message

Steven Price Dec. 6, 2019, 1:53 p.m. UTC
pgd_entry() and pud_entry() were removed by commit 0b1fbfe50006c410
("mm/pagewalk: remove pgd_entry() and pud_entry()") because there were
no users. We're about to add users so reintroduce them, along with
p4d_entry() as we now have 5 levels of tables.

Note that commit a00cc7d9dd93d66a ("mm, x86: add support for
PUD-sized transparent hugepages") already re-added pud_entry() but with
different semantics to the other callbacks. Since there have never
been upstream users of this, revert the semantics back to match the
other callbacks. This means pud_entry() is called for all entries, not
just transparent huge pages.

Tested-by: Zong Li <zong.li@sifive.com>
Signed-off-by: Steven Price <steven.price@arm.com>
---
 include/linux/pagewalk.h | 19 +++++++++++++------
 mm/pagewalk.c            | 27 ++++++++++++++++-----------
 2 files changed, 29 insertions(+), 17 deletions(-)

Comments

Thomas Hellström (Intel) Dec. 12, 2019, 11:23 a.m. UTC | #1
On 12/6/19 2:53 PM, Steven Price wrote:
> pgd_entry() and pud_entry() were removed by commit 0b1fbfe50006c410
> ("mm/pagewalk: remove pgd_entry() and pud_entry()") because there were
> no users. We're about to add users so reintroduce them, along with
> p4d_entry() as we now have 5 levels of tables.
>
> Note that commit a00cc7d9dd93d66a ("mm, x86: add support for
> PUD-sized transparent hugepages") already re-added pud_entry() but with
> different semantics to the other callbacks. Since there have never
> been upstream users of this, revert the semantics back to match the
> other callbacks. This means pud_entry() is called for all entries, not
> just transparent huge pages.

Actually, there are two users of pud_entry(), in hmm.c and since 5.5rc1 
also mapping_dirty_helpers.c. The latter one is unproblematic and 
requires no attention but the one in hmm.c is probably largely untested, 
and seems to assume it was called outside of the spinlock.

The problem with the current patch is that the hmm pud_entry will 
traverse also pmds, so that will be done twice now.

In another thread we were discussing a means of rerunning the level (in 
case of a race), or continuing after a level, based on the return value 
after the callback. The change was fairly invasive,


> Tested-by: Zong Li <zong.li@sifive.com>
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
>   include/linux/pagewalk.h | 19 +++++++++++++------
>   mm/pagewalk.c            | 27 ++++++++++++++++-----------
>   2 files changed, 29 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
> index 6ec82e92c87f..06790f23957f 100644
> --- a/include/linux/pagewalk.h
> +++ b/include/linux/pagewalk.h
> @@ -8,15 +8,15 @@ struct mm_walk;
>   
>   /**
>    * mm_walk_ops - callbacks for walk_page_range
> - * @pud_entry:		if set, called for each non-empty PUD (2nd-level) entry
> - *			this handler should only handle pud_trans_huge() puds.
> - *			the pmd_entry or pte_entry callbacks will be used for
> - *			regular PUDs.
> - * @pmd_entry:		if set, called for each non-empty PMD (3rd-level) entry
> + * @pgd_entry:		if set, called for each non-empty PGD (top-level) entry
> + * @p4d_entry:		if set, called for each non-empty P4D entry
> + * @pud_entry:		if set, called for each non-empty PUD entry
> + * @pmd_entry:		if set, called for each non-empty PMD entry
>    *			this handler is required to be able to handle
>    *			pmd_trans_huge() pmds.  They may simply choose to
>    *			split_huge_page() instead of handling it explicitly.
> - * @pte_entry:		if set, called for each non-empty PTE (4th-level) entry
> + * @pte_entry:		if set, called for each non-empty PTE (lowest-level)
> + *			entry
>    * @pte_hole:		if set, called for each hole at all levels
>    * @hugetlb_entry:	if set, called for each hugetlb entry
>    * @test_walk:		caller specific callback function to determine whether
> @@ -27,8 +27,15 @@ struct mm_walk;
>    * @pre_vma:            if set, called before starting walk on a non-null vma.
>    * @post_vma:           if set, called after a walk on a non-null vma, provided
>    *                      that @pre_vma and the vma walk succeeded.
> + *
> + * p?d_entry callbacks are called even if those levels are folded on a
> + * particular architecture/configuration.
>    */
>   struct mm_walk_ops {
> +	int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
> +			 unsigned long next, struct mm_walk *walk);
> +	int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
> +			 unsigned long next, struct mm_walk *walk);
>   	int (*pud_entry)(pud_t *pud, unsigned long addr,
>   			 unsigned long next, struct mm_walk *walk);
>   	int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
> index ea0b9e606ad1..c089786e7a7f 100644
> --- a/mm/pagewalk.c
> +++ b/mm/pagewalk.c
> @@ -94,15 +94,9 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
>   		}
>   
>   		if (ops->pud_entry) {
> -			spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
> -
> -			if (ptl) {
> -				err = ops->pud_entry(pud, addr, next, walk);
> -				spin_unlock(ptl);
> -				if (err)
> -					break;
> -				continue;
> -			}
> +			err = ops->pud_entry(pud, addr, next, walk);
> +			if (err)
> +				break;

Actually, there are two current users of pud_entry(), in hmm.c and since 
5.5rc1 also mapping_dirty_helpers.c. The latter one is unproblematic and 
requires no attention but the one in hmm.c is probably largely untested, 
and seems to assume it was called outside of the spinlock.

The problem with the current patch is that the hmm pud_entry will
also traverse the pmds, so that traversal will now be done twice.

/Thomas
Thomas Hellström (Intel) Dec. 12, 2019, 11:33 a.m. UTC | #2
On 12/12/19 12:23 PM, Thomas Hellström (VMware) wrote:
> On 12/6/19 2:53 PM, Steven Price wrote:
>> pgd_entry() and pud_entry() were removed by commit 0b1fbfe50006c410
>> ("mm/pagewalk: remove pgd_entry() and pud_entry()") because there were
>> no users. We're about to add users so reintroduce them, along with
>> p4d_entry() as we now have 5 levels of tables.
>>
>> Note that commit a00cc7d9dd93d66a ("mm, x86: add support for
>> PUD-sized transparent hugepages") already re-added pud_entry() but with
>> different semantics to the other callbacks. Since there have never
>> been upstream users of this, revert the semantics back to match the
>> other callbacks. This means pud_entry() is called for all entries, not
>> just transparent huge pages.
>
> Actually, there are two users of pud_entry(), in hmm.c and since 
> 5.5rc1 also mapping_dirty_helpers.c. The latter one is unproblematic 
> and requires no attention but the one in hmm.c is probably largely 
> untested, and seems to assume it was called outside of the spinlock.
>
> The problem with the current patch is that the hmm pud_entry will 
> traverse also pmds, so that will be done twice now.
>
> In another thread we were discussing a means of rerunning the level 
> (in case of a race), or continuing after a level, based on the return 
> value after the callback. The change was fairly invasive,
>
Hmm. Forgot to remove the above text that appears twice. :(. The correct 
one is inline below.

>
>> Tested-by: Zong Li <zong.li@sifive.com>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>>   include/linux/pagewalk.h | 19 +++++++++++++------
>>   mm/pagewalk.c            | 27 ++++++++++++++++-----------
>>   2 files changed, 29 insertions(+), 17 deletions(-)
>>
>> diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
>> index 6ec82e92c87f..06790f23957f 100644
>> --- a/include/linux/pagewalk.h
>> +++ b/include/linux/pagewalk.h
>> @@ -8,15 +8,15 @@ struct mm_walk;
>>     /**
>>    * mm_walk_ops - callbacks for walk_page_range
>> - * @pud_entry:        if set, called for each non-empty PUD 
>> (2nd-level) entry
>> - *            this handler should only handle pud_trans_huge() puds.
>> - *            the pmd_entry or pte_entry callbacks will be used for
>> - *            regular PUDs.
>> - * @pmd_entry:        if set, called for each non-empty PMD 
>> (3rd-level) entry
>> + * @pgd_entry:        if set, called for each non-empty PGD 
>> (top-level) entry
>> + * @p4d_entry:        if set, called for each non-empty P4D entry
>> + * @pud_entry:        if set, called for each non-empty PUD entry
>> + * @pmd_entry:        if set, called for each non-empty PMD entry
>>    *            this handler is required to be able to handle
>>    *            pmd_trans_huge() pmds.  They may simply choose to
>>    *            split_huge_page() instead of handling it explicitly.
>> - * @pte_entry:        if set, called for each non-empty PTE 
>> (4th-level) entry
>> + * @pte_entry:        if set, called for each non-empty PTE 
>> (lowest-level)
>> + *            entry
>>    * @pte_hole:        if set, called for each hole at all levels
>>    * @hugetlb_entry:    if set, called for each hugetlb entry
>>    * @test_walk:        caller specific callback function to 
>> determine whether
>> @@ -27,8 +27,15 @@ struct mm_walk;
>>    * @pre_vma:            if set, called before starting walk on a 
>> non-null vma.
>>    * @post_vma:           if set, called after a walk on a non-null 
>> vma, provided
>>    *                      that @pre_vma and the vma walk succeeded.
>> + *
>> + * p?d_entry callbacks are called even if those levels are folded on a
>> + * particular architecture/configuration.
>>    */
>>   struct mm_walk_ops {
>> +    int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
>> +             unsigned long next, struct mm_walk *walk);
>> +    int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
>> +             unsigned long next, struct mm_walk *walk);
>>       int (*pud_entry)(pud_t *pud, unsigned long addr,
>>                unsigned long next, struct mm_walk *walk);
>>       int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
>> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
>> index ea0b9e606ad1..c089786e7a7f 100644
>> --- a/mm/pagewalk.c
>> +++ b/mm/pagewalk.c
>> @@ -94,15 +94,9 @@ static int walk_pud_range(p4d_t *p4d, unsigned 
>> long addr, unsigned long end,
>>           }
>>             if (ops->pud_entry) {
>> -            spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
>> -
>> -            if (ptl) {
>> -                err = ops->pud_entry(pud, addr, next, walk);
>> -                spin_unlock(ptl);
>> -                if (err)
>> -                    break;
>> -                continue;
>> -            }
>> +            err = ops->pud_entry(pud, addr, next, walk);
>> +            if (err)
>> +                break;
>
> Actually, there are two current users of pud_entry(), in hmm.c and 
> since 5.5rc1 also mapping_dirty_helpers.c. The latter one is 
> unproblematic and requires no attention but the one in hmm.c is 
> probably largely untested, and seems to assume it was called outside 
> of the spinlock.
>
> The problem with the current patch is that the hmm pud_entry will 
> traverse also pmds, so that will now be done twice.
>
> /Thomas
>
Steven Price Dec. 12, 2019, 1:15 p.m. UTC | #3
On 12/12/2019 11:33, Thomas Hellström (VMware) wrote:
> On 12/12/19 12:23 PM, Thomas Hellström (VMware) wrote:
>> On 12/6/19 2:53 PM, Steven Price wrote:
>>> pgd_entry() and pud_entry() were removed by commit 0b1fbfe50006c410
>>> ("mm/pagewalk: remove pgd_entry() and pud_entry()") because there were
>>> no users. We're about to add users so reintroduce them, along with
>>> p4d_entry() as we now have 5 levels of tables.
>>>
>>> Note that commit a00cc7d9dd93d66a ("mm, x86: add support for
>>> PUD-sized transparent hugepages") already re-added pud_entry() but with
>>> different semantics to the other callbacks. Since there have never
>>> been upstream users of this, revert the semantics back to match the
>>> other callbacks. This means pud_entry() is called for all entries, not
>>> just transparent huge pages.

When I wrote that, there were no upstream users, which sadly shows how
long ago that was :(

>> Actually, there are two users of pud_entry(), in hmm.c and since 
>> 5.5rc1 also mapping_dirty_helpers.c. The latter one is unproblematic 
>> and requires no attention but the one in hmm.c is probably largely 
>> untested, and seems to assume it was called outside of the spinlock.
>>
>> The problem with the current patch is that the hmm pud_entry will 
>> traverse also pmds, so that will be done twice now.
>>
>> In another thread we were discussing a means of rerunning the level 
>> (in case of a race), or continuing after a level, based on the return 
>> value after the callback. The change was fairly invasive,
>>
> Hmm. Forgot to remove the above text that appears twice. :(. The correct 
> one is inline below.
> 
>>
>>> Tested-by: Zong Li <zong.li@sifive.com>
>>> Signed-off-by: Steven Price <steven.price@arm.com>
>>> ---
>>>   include/linux/pagewalk.h | 19 +++++++++++++------
>>>   mm/pagewalk.c            | 27 ++++++++++++++++-----------
>>>   2 files changed, 29 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
>>> index 6ec82e92c87f..06790f23957f 100644
>>> --- a/include/linux/pagewalk.h
>>> +++ b/include/linux/pagewalk.h
>>> @@ -8,15 +8,15 @@ struct mm_walk;
>>>     /**
>>>    * mm_walk_ops - callbacks for walk_page_range
>>> - * @pud_entry:        if set, called for each non-empty PUD 
>>> (2nd-level) entry
>>> - *            this handler should only handle pud_trans_huge() puds.
>>> - *            the pmd_entry or pte_entry callbacks will be used for
>>> - *            regular PUDs.
>>> - * @pmd_entry:        if set, called for each non-empty PMD 
>>> (3rd-level) entry
>>> + * @pgd_entry:        if set, called for each non-empty PGD 
>>> (top-level) entry
>>> + * @p4d_entry:        if set, called for each non-empty P4D entry
>>> + * @pud_entry:        if set, called for each non-empty PUD entry
>>> + * @pmd_entry:        if set, called for each non-empty PMD entry
>>>    *            this handler is required to be able to handle
>>>    *            pmd_trans_huge() pmds.  They may simply choose to
>>>    *            split_huge_page() instead of handling it explicitly.
>>> - * @pte_entry:        if set, called for each non-empty PTE 
>>> (4th-level) entry
>>> + * @pte_entry:        if set, called for each non-empty PTE 
>>> (lowest-level)
>>> + *            entry
>>>    * @pte_hole:        if set, called for each hole at all levels
>>>    * @hugetlb_entry:    if set, called for each hugetlb entry
>>>    * @test_walk:        caller specific callback function to 
>>> determine whether
>>> @@ -27,8 +27,15 @@ struct mm_walk;
>>>    * @pre_vma:            if set, called before starting walk on a 
>>> non-null vma.
>>>    * @post_vma:           if set, called after a walk on a non-null 
>>> vma, provided
>>>    *                      that @pre_vma and the vma walk succeeded.
>>> + *
>>> + * p?d_entry callbacks are called even if those levels are folded on a
>>> + * particular architecture/configuration.
>>>    */
>>>   struct mm_walk_ops {
>>> +    int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
>>> +             unsigned long next, struct mm_walk *walk);
>>> +    int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
>>> +             unsigned long next, struct mm_walk *walk);
>>>       int (*pud_entry)(pud_t *pud, unsigned long addr,
>>>                unsigned long next, struct mm_walk *walk);
>>>       int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
>>> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
>>> index ea0b9e606ad1..c089786e7a7f 100644
>>> --- a/mm/pagewalk.c
>>> +++ b/mm/pagewalk.c
>>> @@ -94,15 +94,9 @@ static int walk_pud_range(p4d_t *p4d, unsigned 
>>> long addr, unsigned long end,
>>>           }
>>>             if (ops->pud_entry) {
>>> -            spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
>>> -
>>> -            if (ptl) {
>>> -                err = ops->pud_entry(pud, addr, next, walk);
>>> -                spin_unlock(ptl);
>>> -                if (err)
>>> -                    break;
>>> -                continue;
>>> -            }
>>> +            err = ops->pud_entry(pud, addr, next, walk);
>>> +            if (err)
>>> +                break;
>>
>> Actually, there are two current users of pud_entry(), in hmm.c and 
>> since 5.5rc1 also mapping_dirty_helpers.c. The latter one is 
>> unproblematic and requires no attention but the one in hmm.c is 
>> probably largely untested, and seems to assume it was called outside 
>> of the spinlock.

Thanks for pointing that out, I guess the simplest fix would be to
squash in something like the below which should restore the old
behaviour for hmm.c without affecting others.

Steve

---8<----
diff --git a/mm/hmm.c b/mm/hmm.c
index d379cb6496ae..744b6644d0e4 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -478,19 +478,26 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
  	pmd_t *pmdp;
  	pud_t pud;
  	int ret;
+	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
+	if (!ptl)
+		return 0;
  
  again:
  	pud = READ_ONCE(*pudp);
-	if (pud_none(pud))
-		return hmm_vma_walk_hole(start, end, walk);
+	if (pud_none(pud)) {
+		ret = hmm_vma_walk_hole(start, end, walk);
+		goto out_unlock;
+	}
  
  	if (pud_huge(pud) && pud_devmap(pud)) {
  		unsigned long i, npages, pfn;
  		uint64_t *pfns, cpu_flags;
  		bool fault, write_fault;
  
-		if (!pud_present(pud))
-			return hmm_vma_walk_hole(start, end, walk);
+		if (!pud_present(pud)) {
+			ret = hmm_vma_walk_hole(start, end, walk);
+			goto out_unlock;
+		}
  
  		i = (addr - range->start) >> PAGE_SHIFT;
  		npages = (end - addr) >> PAGE_SHIFT;
@@ -499,16 +506,20 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
  		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
  		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
  				     cpu_flags, &fault, &write_fault);
-		if (fault || write_fault)
-			return hmm_vma_walk_hole_(addr, end, fault,
-						write_fault, walk);
+		if (fault || write_fault) {
+			ret = hmm_vma_walk_hole_(addr, end, fault,
+						 write_fault, walk);
+			goto out_unlock;
+		}
  
  		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
  		for (i = 0; i < npages; ++i, ++pfn) {
  			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
  					      hmm_vma_walk->pgmap);
-			if (unlikely(!hmm_vma_walk->pgmap))
-				return -EBUSY;
+			if (unlikely(!hmm_vma_walk->pgmap)) {
+				ret = -EBUSY;
+				goto out_unlock;
+			}
  			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
  				  cpu_flags;
  		}
@@ -517,7 +528,8 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
  			hmm_vma_walk->pgmap = NULL;
  		}
  		hmm_vma_walk->last = end;
-		return 0;
+		ret = 0;
+		goto out_unlock;
  	}
  
  	split_huge_pud(walk->vma, pudp, addr);
@@ -529,10 +541,14 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
  		next = pmd_addr_end(addr, end);
  		ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
  		if (ret)
-			return ret;
+			goto out_unlock;
  	} while (pmdp++, addr = next, addr != end);
  
-	return 0;
+	ret = 0;
+
+out_unlock:
+	spin_unlock(ptl);
+	return ret;
  }
  #else
  #define hmm_vma_walk_pud	NULL
Thomas Hellström (Intel) Dec. 12, 2019, 2:04 p.m. UTC | #4
On 12/12/19 2:15 PM, Steven Price wrote:
> On 12/12/2019 11:33, Thomas Hellström (VMware) wrote:
>> On 12/12/19 12:23 PM, Thomas Hellström (VMware) wrote:
>>> On 12/6/19 2:53 PM, Steven Price wrote:
>>>> pgd_entry() and pud_entry() were removed by commit 0b1fbfe50006c410
>>>> ("mm/pagewalk: remove pgd_entry() and pud_entry()") because there were
>>>> no users. We're about to add users so reintroduce them, along with
>>>> p4d_entry() as we now have 5 levels of tables.
>>>>
>>>> Note that commit a00cc7d9dd93d66a ("mm, x86: add support for
>>>> PUD-sized transparent hugepages") already re-added pud_entry() but 
>>>> with
>>>> different semantics to the other callbacks. Since there have never
>>>> been upstream users of this, revert the semantics back to match the
>>>> other callbacks. This means pud_entry() is called for all entries, not
>>>> just transparent huge pages.
>
> When I wrote that there were no upstream users, which sadly shows how
> long ago that was :(
>
>>> Actually, there are two users of pud_entry(), in hmm.c and since 
>>> 5.5rc1 also mapping_dirty_helpers.c. The latter one is unproblematic 
>>> and requires no attention but the one in hmm.c is probably largely 
>>> untested, and seems to assume it was called outside of the spinlock.
>>>
>>> The problem with the current patch is that the hmm pud_entry will 
>>> traverse also pmds, so that will be done twice now.
>>>
>>> In another thread we were discussing a means of rerunning the level 
>>> (in case of a race), or continuing after a level, based on the 
>>> return value after the callback. The change was fairly invasive,
>>>
>> Hmm. Forgot to remove the above text that appears twice. :(. The 
>> correct one is inline below.
>>
>>>
>>>> Tested-by: Zong Li <zong.li@sifive.com>
>>>> Signed-off-by: Steven Price <steven.price@arm.com>
>>>> ---
>>>>   include/linux/pagewalk.h | 19 +++++++++++++------
>>>>   mm/pagewalk.c            | 27 ++++++++++++++++-----------
>>>>   2 files changed, 29 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
>>>> index 6ec82e92c87f..06790f23957f 100644
>>>> --- a/include/linux/pagewalk.h
>>>> +++ b/include/linux/pagewalk.h
>>>> @@ -8,15 +8,15 @@ struct mm_walk;
>>>>     /**
>>>>    * mm_walk_ops - callbacks for walk_page_range
>>>> - * @pud_entry:        if set, called for each non-empty PUD 
>>>> (2nd-level) entry
>>>> - *            this handler should only handle pud_trans_huge() puds.
>>>> - *            the pmd_entry or pte_entry callbacks will be used for
>>>> - *            regular PUDs.
>>>> - * @pmd_entry:        if set, called for each non-empty PMD 
>>>> (3rd-level) entry
>>>> + * @pgd_entry:        if set, called for each non-empty PGD 
>>>> (top-level) entry
>>>> + * @p4d_entry:        if set, called for each non-empty P4D entry
>>>> + * @pud_entry:        if set, called for each non-empty PUD entry
>>>> + * @pmd_entry:        if set, called for each non-empty PMD entry
>>>>    *            this handler is required to be able to handle
>>>>    *            pmd_trans_huge() pmds.  They may simply choose to
>>>>    *            split_huge_page() instead of handling it explicitly.
>>>> - * @pte_entry:        if set, called for each non-empty PTE 
>>>> (4th-level) entry
>>>> + * @pte_entry:        if set, called for each non-empty PTE 
>>>> (lowest-level)
>>>> + *            entry
>>>>    * @pte_hole:        if set, called for each hole at all levels
>>>>    * @hugetlb_entry:    if set, called for each hugetlb entry
>>>>    * @test_walk:        caller specific callback function to 
>>>> determine whether
>>>> @@ -27,8 +27,15 @@ struct mm_walk;
>>>>    * @pre_vma:            if set, called before starting walk on a 
>>>> non-null vma.
>>>>    * @post_vma:           if set, called after a walk on a non-null 
>>>> vma, provided
>>>>    *                      that @pre_vma and the vma walk succeeded.
>>>> + *
>>>> + * p?d_entry callbacks are called even if those levels are folded 
>>>> on a
>>>> + * particular architecture/configuration.
>>>>    */
>>>>   struct mm_walk_ops {
>>>> +    int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
>>>> +             unsigned long next, struct mm_walk *walk);
>>>> +    int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
>>>> +             unsigned long next, struct mm_walk *walk);
>>>>       int (*pud_entry)(pud_t *pud, unsigned long addr,
>>>>                unsigned long next, struct mm_walk *walk);
>>>>       int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
>>>> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
>>>> index ea0b9e606ad1..c089786e7a7f 100644
>>>> --- a/mm/pagewalk.c
>>>> +++ b/mm/pagewalk.c
>>>> @@ -94,15 +94,9 @@ static int walk_pud_range(p4d_t *p4d, unsigned 
>>>> long addr, unsigned long end,
>>>>           }
>>>>             if (ops->pud_entry) {
>>>> -            spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
>>>> -
>>>> -            if (ptl) {
>>>> -                err = ops->pud_entry(pud, addr, next, walk);
>>>> -                spin_unlock(ptl);
>>>> -                if (err)
>>>> -                    break;
>>>> -                continue;
>>>> -            }
>>>> +            err = ops->pud_entry(pud, addr, next, walk);
>>>> +            if (err)
>>>> +                break;
>>>
>>> Actually, there are two current users of pud_entry(), in hmm.c and 
>>> since 5.5rc1 also mapping_dirty_helpers.c. The latter one is 
>>> unproblematic and requires no attention but the one in hmm.c is 
>>> probably largely untested, and seems to assume it was called outside 
>>> of the spinlock.
>
> Thanks for pointing that out, I guess the simplest fix would be to
> squash in something like the below which should restore the old
> behaviour for hmm.c without affecting others.
>
> Steve 

I'm not fully sure that the old behaviour is the correct one, but definitely hmm's pud_entry needs some fixing.
I'm more concerned with the pagewalk code. With your patch it actually splits all huge puds present in the page table
on each page walk, which is not what we want.

One idea would be to add a new member to struct mm_walk:

enum page_walk_ret_action {
	ACTION_SUBTREE = 0,
	ACTION_CONTINUE = 1,
	ACTION_AGAIN = 2 /* Only for levels that have p?d_unstable */
};

struct mm_walk {
	...
	enum page_walk_ret_action action; /* or perhaps as an enum */
};


if (ops->pud_entry) {
	walk->action = ACTION_SUBTREE;
	...
	...
	...
	if (walk->action == ACTION_AGAIN)  /* Callback tried to split huge entry, but failed */
		goto again;
	else if (walk->action == ACTION_CONTINUE) /* Done with this subtree. Probably huge entry handled. */
		continue;
	/* ACTION_SUBTREE falls through */
}

We discussed something similar before on linux-mm, but the idea then was to redefine
the positive return value of the callback as the action, which meant changing those existing callbacks that relied on
a positive return value. The above would also be helpful for pmd_entry.

/Thomas
Steven Price Dec. 12, 2019, 3:18 p.m. UTC | #5
On 12/12/2019 14:04, Thomas Hellström (VMware) wrote:
> On 12/12/19 2:15 PM, Steven Price wrote:
>> On 12/12/2019 11:33, Thomas Hellström (VMware) wrote:
>>> On 12/12/19 12:23 PM, Thomas Hellström (VMware) wrote:
>>>> On 12/6/19 2:53 PM, Steven Price wrote:
>>>>> pgd_entry() and pud_entry() were removed by commit 0b1fbfe50006c410
>>>>> ("mm/pagewalk: remove pgd_entry() and pud_entry()") because there were
>>>>> no users. We're about to add users so reintroduce them, along with
>>>>> p4d_entry() as we now have 5 levels of tables.
>>>>>
>>>>> Note that commit a00cc7d9dd93d66a ("mm, x86: add support for
>>>>> PUD-sized transparent hugepages") already re-added pud_entry() but with
>>>>> different semantics to the other callbacks. Since there have never
>>>>> been upstream users of this, revert the semantics back to match the
>>>>> other callbacks. This means pud_entry() is called for all entries, not
>>>>> just transparent huge pages.
>>
>> When I wrote that there were no upstream users, which sadly shows how
>> long ago that was :(
>>
>>>> Actually, there are two users of pud_entry(), in hmm.c and since 5.5rc1 also mapping_dirty_helpers.c. The latter one is unproblematic and requires no attention but the one in hmm.c is probably largely untested, and seems to assume it was called outside of the spinlock.
>>>>
>>>> The problem with the current patch is that the hmm pud_entry will traverse also pmds, so that will be done twice now.
>>>>
>>>> In another thread we were discussing a means of rerunning the level (in case of a race), or continuing after a level, based on the return value after the callback. The change was fairly invasive,
>>>>
>>> Hmm. Forgot to remove the above text that appears twice. :(. The correct one is inline below.
>>>
>>>>
>>>>> Tested-by: Zong Li <zong.li@sifive.com>
>>>>> Signed-off-by: Steven Price <steven.price@arm.com>
>>>>> ---
>>>>>   include/linux/pagewalk.h | 19 +++++++++++++------
>>>>>   mm/pagewalk.c            | 27 ++++++++++++++++-----------
>>>>>   2 files changed, 29 insertions(+), 17 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
>>>>> index 6ec82e92c87f..06790f23957f 100644
>>>>> --- a/include/linux/pagewalk.h
>>>>> +++ b/include/linux/pagewalk.h
>>>>> @@ -8,15 +8,15 @@ struct mm_walk;
>>>>>     /**
>>>>>    * mm_walk_ops - callbacks for walk_page_range
>>>>> - * @pud_entry:        if set, called for each non-empty PUD (2nd-level) entry
>>>>> - *            this handler should only handle pud_trans_huge() puds.
>>>>> - *            the pmd_entry or pte_entry callbacks will be used for
>>>>> - *            regular PUDs.
>>>>> - * @pmd_entry:        if set, called for each non-empty PMD (3rd-level) entry
>>>>> + * @pgd_entry:        if set, called for each non-empty PGD (top-level) entry
>>>>> + * @p4d_entry:        if set, called for each non-empty P4D entry
>>>>> + * @pud_entry:        if set, called for each non-empty PUD entry
>>>>> + * @pmd_entry:        if set, called for each non-empty PMD entry
>>>>>    *            this handler is required to be able to handle
>>>>>    *            pmd_trans_huge() pmds.  They may simply choose to
>>>>>    *            split_huge_page() instead of handling it explicitly.
>>>>> - * @pte_entry:        if set, called for each non-empty PTE (4th-level) entry
>>>>> + * @pte_entry:        if set, called for each non-empty PTE (lowest-level)
>>>>> + *            entry
>>>>>    * @pte_hole:        if set, called for each hole at all levels
>>>>>    * @hugetlb_entry:    if set, called for each hugetlb entry
>>>>>    * @test_walk:        caller specific callback function to determine whether
>>>>> @@ -27,8 +27,15 @@ struct mm_walk;
>>>>>    * @pre_vma:            if set, called before starting walk on a non-null vma.
>>>>>    * @post_vma:           if set, called after a walk on a non-null vma, provided
>>>>>    *                      that @pre_vma and the vma walk succeeded.
>>>>> + *
>>>>> + * p?d_entry callbacks are called even if those levels are folded on a
>>>>> + * particular architecture/configuration.
>>>>>    */
>>>>>   struct mm_walk_ops {
>>>>> +    int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
>>>>> +             unsigned long next, struct mm_walk *walk);
>>>>> +    int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
>>>>> +             unsigned long next, struct mm_walk *walk);
>>>>>       int (*pud_entry)(pud_t *pud, unsigned long addr,
>>>>>                unsigned long next, struct mm_walk *walk);
>>>>>       int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
>>>>> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
>>>>> index ea0b9e606ad1..c089786e7a7f 100644
>>>>> --- a/mm/pagewalk.c
>>>>> +++ b/mm/pagewalk.c
>>>>> @@ -94,15 +94,9 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
>>>>>           }
>>>>>             if (ops->pud_entry) {
>>>>> -            spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
>>>>> -
>>>>> -            if (ptl) {
>>>>> -                err = ops->pud_entry(pud, addr, next, walk);
>>>>> -                spin_unlock(ptl);
>>>>> -                if (err)
>>>>> -                    break;
>>>>> -                continue;
>>>>> -            }
>>>>> +            err = ops->pud_entry(pud, addr, next, walk);
>>>>> +            if (err)
>>>>> +                break;
>>>>
>>>> Actually, there are two current users of pud_entry(), in hmm.c and since 5.5rc1 also mapping_dirty_helpers.c. The latter one is unproblematic and requires no attention but the one in hmm.c is probably largely untested, and seems to assume it was called outside of the spinlock.
>>
>> Thanks for pointing that out, I guess the simplest fix would be to
>> squash in something like the below which should restore the old
>> behaviour for hmm.c without affecting others.
>>
>> Steve 
> 
> I'm not fully sure that the old behaviour is the correct one, but definitely hmm's pud_entry needs some fixing.
> I'm more concerned with the pagewalk code. With your patch it actually splits all huge puds present in the page-table
> on each page walk which is not what we want.

Good catch - yes that's certainly not ideal.

> One idea would be to add a new member to struct_mm_walk:
> 
> enum page_walk_ret_action {
>      ACTION_SUBTREE = 0,
>      ACTION_CONTINUE = 1,
>      ACTION_AGAIN = 2 /* Only for levels that thave p?d_unstable */
> };
> 
> struct mm_walk {
>      ...
>      enum page_walk_ret_action action; /* or perhaps as an enum */
> };
> 
> 
> if (ops->pud_entry) {
>      walk->action = ACTION_SUBTREE;
>      ...
>      ...
>      ...
>      if (walk->action == ACTION_AGAIN)  /* Callback tried to split huge entry, but failed */
>          goto again;
>      else if (walk->action == ACTION_CONTINUE) /* Done with this subtree. Probably huge entry handled. */
>          continue;
>      /* ACTION_SUBTREE falls through */
> }

I'll have a go at implementing the above - this might also allow removing the test_p?d() callbacks as they can simply return ACTION_CONTINUE.

Steve

> we discussed something similar before on linux-mm, but the idea then was to redefine
> the positive return value of the callback to the action, but that meant changing those existing callbacks that relied on
> a positive return value. The above would be helpful also for pmd_entry.
> 
> /Thomas
> 
> 
> 
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
diff mbox series

Patch

diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 6ec82e92c87f..06790f23957f 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -8,15 +8,15 @@  struct mm_walk;
 
 /**
  * mm_walk_ops - callbacks for walk_page_range
- * @pud_entry:		if set, called for each non-empty PUD (2nd-level) entry
- *			this handler should only handle pud_trans_huge() puds.
- *			the pmd_entry or pte_entry callbacks will be used for
- *			regular PUDs.
- * @pmd_entry:		if set, called for each non-empty PMD (3rd-level) entry
+ * @pgd_entry:		if set, called for each non-empty PGD (top-level) entry
+ * @p4d_entry:		if set, called for each non-empty P4D entry
+ * @pud_entry:		if set, called for each non-empty PUD entry
+ * @pmd_entry:		if set, called for each non-empty PMD entry
  *			this handler is required to be able to handle
  *			pmd_trans_huge() pmds.  They may simply choose to
  *			split_huge_page() instead of handling it explicitly.
- * @pte_entry:		if set, called for each non-empty PTE (4th-level) entry
+ * @pte_entry:		if set, called for each non-empty PTE (lowest-level)
+ *			entry
  * @pte_hole:		if set, called for each hole at all levels
  * @hugetlb_entry:	if set, called for each hugetlb entry
  * @test_walk:		caller specific callback function to determine whether
@@ -27,8 +27,15 @@  struct mm_walk;
  * @pre_vma:            if set, called before starting walk on a non-null vma.
  * @post_vma:           if set, called after a walk on a non-null vma, provided
  *                      that @pre_vma and the vma walk succeeded.
+ *
+ * p?d_entry callbacks are called even if those levels are folded on a
+ * particular architecture/configuration.
  */
 struct mm_walk_ops {
+	int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
+			 unsigned long next, struct mm_walk *walk);
+	int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
+			 unsigned long next, struct mm_walk *walk);
 	int (*pud_entry)(pud_t *pud, unsigned long addr,
 			 unsigned long next, struct mm_walk *walk);
 	int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index ea0b9e606ad1..c089786e7a7f 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -94,15 +94,9 @@  static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 		}
 
 		if (ops->pud_entry) {
-			spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
-
-			if (ptl) {
-				err = ops->pud_entry(pud, addr, next, walk);
-				spin_unlock(ptl);
-				if (err)
-					break;
-				continue;
-			}
+			err = ops->pud_entry(pud, addr, next, walk);
+			if (err)
+				break;
 		}
 
 		split_huge_pud(walk->vma, pud, addr);
@@ -136,7 +130,12 @@  static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 				break;
 			continue;
 		}
-		if (ops->pmd_entry || ops->pte_entry)
+		if (ops->p4d_entry) {
+			err = ops->p4d_entry(p4d, addr, next, walk);
+			if (err)
+				break;
+		}
+		if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
 			err = walk_pud_range(p4d, addr, next, walk);
 		if (err)
 			break;
@@ -163,7 +162,13 @@  static int walk_pgd_range(unsigned long addr, unsigned long end,
 				break;
 			continue;
 		}
-		if (ops->pmd_entry || ops->pte_entry)
+		if (ops->pgd_entry) {
+			err = ops->pgd_entry(pgd, addr, next, walk);
+			if (err)
+				break;
+		}
+		if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
+		    ops->pte_entry)
 			err = walk_p4d_range(pgd, addr, next, walk);
 		if (err)
 			break;