
[v1,5/5] mm, hwpoison: enable memory error handling on 1GB hugepage

Message ID: 20220602050631.771414-6-naoya.horiguchi@linux.dev (mailing list archive)
State: New
Series: mm, hwpoison: enable 1GB hugepage support

Commit Message

Naoya Horiguchi June 2, 2022, 5:06 a.m. UTC
From: Naoya Horiguchi <naoya.horiguchi@nec.com>

Now that the error handling code is prepared, remove the blocking code
and enable memory error handling on 1GB hugepages.

Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
---
 include/linux/mm.h      |  1 -
 include/ras/ras_event.h |  1 -
 mm/memory-failure.c     | 16 ----------------
 3 files changed, 18 deletions(-)

Comments

Miaohe Lin June 7, 2022, 2:11 p.m. UTC | #1
On 2022/6/2 13:06, Naoya Horiguchi wrote:
> From: Naoya Horiguchi <naoya.horiguchi@nec.com>
> 
> Now that the error handling code is prepared, remove the blocking code
> and enable memory error handling on 1GB hugepages.
> 

I'm nervous about this change. It seems there are many code paths that are not aware of pud swap entries.
I browsed some of them:
apply_to_pud_range called from apply_to_page_range:

apply_to_pud_range:
	next = pud_addr_end(addr, end);
	if (pud_none(*pud) && !create)
		continue;
	if (WARN_ON_ONCE(pud_leaf(*pud)))
		return -EINVAL;
	if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
		if (!create)
			continue;
		pud_clear_bad(pud);
	}
	err = apply_to_pmd_range(mm, pud, addr, next,
				 fn, data, create, mask);

In the !pud_present case, it will mostly reach apply_to_pmd_range and call pmd_offset on it, and an invalid
pointer will be dereferenced.
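
To make the hazard concrete, here is a minimal sketch (not code from this
series; walk_pmd_table() is just a stand-in name) of the ordering of checks
a pud-level walker would need before descending:

	/*
	 * Sketch only: a hwpoison swap entry installed for a poisoned 1GB
	 * hugetlb page is neither none, nor present, nor a normal leaf, so
	 * without the !pud_present() check pmd_offset() would be applied to
	 * it and yield a garbage PMD-table pointer.
	 */
	static int walk_one_pud(pud_t *pud, unsigned long addr)
	{
		if (pud_none(*pud))
			return 0;		/* nothing mapped: skip */
		if (!pud_present(*pud))
			return -EINVAL;		/* swap/hwpoison entry: do not descend */
		if (pud_leaf(*pud))
			return -EINVAL;		/* huge mapping: no PMD table below */
		return walk_pmd_table(pmd_offset(pud, addr), addr);
	}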

Another example might be copy_pud_range and so on. So I think either it is not ready to enable
1GB hugepages, or all of these places should be fixed?

Thanks!

> Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> ---
>  include/linux/mm.h      |  1 -
>  include/ras/ras_event.h |  1 -
>  mm/memory-failure.c     | 16 ----------------
>  3 files changed, 18 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 04de0c3e4f9f..58a6aa916e4f 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3238,7 +3238,6 @@ enum mf_action_page_type {
>  	MF_MSG_DIFFERENT_COMPOUND,
>  	MF_MSG_HUGE,
>  	MF_MSG_FREE_HUGE,
> -	MF_MSG_NON_PMD_HUGE,
>  	MF_MSG_UNMAP_FAILED,
>  	MF_MSG_DIRTY_SWAPCACHE,
>  	MF_MSG_CLEAN_SWAPCACHE,
> diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
> index d0337a41141c..cbd3ddd7c33d 100644
> --- a/include/ras/ras_event.h
> +++ b/include/ras/ras_event.h
> @@ -360,7 +360,6 @@ TRACE_EVENT(aer_event,
>  	EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
>  	EM ( MF_MSG_HUGE, "huge page" )					\
>  	EM ( MF_MSG_FREE_HUGE, "free huge page" )			\
> -	EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" )		\
>  	EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" )		\
>  	EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" )		\
>  	EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" )		\
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index babeb34f7477..ced033a99e19 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -725,7 +725,6 @@ static const char * const action_page_types[] = {
>  	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
>  	[MF_MSG_HUGE]			= "huge page",
>  	[MF_MSG_FREE_HUGE]		= "free huge page",
> -	[MF_MSG_NON_PMD_HUGE]		= "non-pmd-sized huge page",
>  	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
>  	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
>  	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
> @@ -1614,21 +1613,6 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
>  
>  	page_flags = head->flags;
>  
> -	/*
> -	 * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
> -	 * simply disable it. In order to make it work properly, we need
> -	 * make sure that:
> -	 *  - conversion of a pud that maps an error hugetlb into hwpoison
> -	 *    entry properly works, and
> -	 *  - other mm code walking over page table is aware of pud-aligned
> -	 *    hwpoison entries.
> -	 */
> -	if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
> -		action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
> -		res = -EBUSY;
> -		goto out;
> -	}
> -
>  	if (!hwpoison_user_mappings(p, pfn, flags, head)) {
>  		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
>  		res = -EBUSY;
>
HORIGUCHI NAOYA(堀口 直也) June 8, 2022, 6:16 a.m. UTC | #2
On Tue, Jun 07, 2022 at 10:11:24PM +0800, Miaohe Lin wrote:
> On 2022/6/2 13:06, Naoya Horiguchi wrote:
> > From: Naoya Horiguchi <naoya.horiguchi@nec.com>
> > 
> > Now that the error handling code is prepared, remove the blocking code
> > and enable memory error handling on 1GB hugepages.
> > 
> 
> I'm nervous about this change. It seems there are many code paths that are not aware of pud swap entries.
> I browsed some of them:
> apply_to_pud_range called from apply_to_page_range:
> 
> apply_to_pud_range:
> 	next = pud_addr_end(addr, end);
> 	if (pud_none(*pud) && !create)
> 		continue;
> 	if (WARN_ON_ONCE(pud_leaf(*pud)))
> 		return -EINVAL;
> 	if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
> 		if (!create)
> 			continue;
> 		pud_clear_bad(pud);
> 	}
> 	err = apply_to_pmd_range(mm, pud, addr, next,
> 				 fn, data, create, mask);
> 
> In the !pud_present case, it will mostly reach apply_to_pmd_range and call pmd_offset on it, and an invalid
> pointer will be dereferenced.

apply_to_pmd_range() has BUG_ON(pud_huge(*pud)) and apply_to_pte_range() has
BUG_ON(pmd_huge(*pmd)), so this page table walking code does not seem to
expect to handle pmd/pud-level mappings.
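
Condensed, the guard looks roughly like this (locking and error paths are
dropped, and apply_to_one_pmd() is a stand-in for the real loop body):

	/* The BUG_ON() fires before a huge PUD could ever be misread as a
	 * PMD-table pointer, so these walkers are simply out of scope for
	 * pud-level hugetlb mappings. */
	static int apply_to_pmd_range_sketch(struct mm_struct *mm, pud_t *pud,
					     unsigned long addr, unsigned long end)
	{
		BUG_ON(pud_huge(*pud));	/* refuse pud-level leaf mappings */
		return apply_to_one_pmd(mm, pmd_offset(pud, addr), addr, end);
	}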

> 
> Another example might be copy_pud_range and so on. So I think either it is not ready to enable
> 1GB hugepages, or all of these places should be fixed?

I think that most page table walkers for user address space should first
check is_vm_hugetlb_page() and call hugetlb-specific walking code for vmas
with VM_HUGETLB.
copy_page_range() is a good example.  It calls copy_hugetlb_page_range()
for vmas with VM_HUGETLB, and that function should support hwpoison entries.
But I feel that I need testing for confirmation.
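
For reference, a trimmed sketch of that dispatch (the wrprotect fast path
and mmu-notifier setup are dropped; copy_nonhugetlb_range() is a stand-in
for the generic pgd/p4d/pud walk):

	/* Hugetlb vmas take the early return, so on this path only
	 * copy_hugetlb_page_range() has to understand pud-level hwpoison
	 * entries. */
	int copy_page_range_sketch(struct vm_area_struct *dst_vma,
				   struct vm_area_struct *src_vma)
	{
		if (is_vm_hugetlb_page(src_vma))
			return copy_hugetlb_page_range(dst_vma->vm_mm,
						       src_vma->vm_mm, src_vma);
		return copy_nonhugetlb_range(dst_vma, src_vma);
	}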

And I'm not sure that all the others are prepared for non-present pud mappings,
so I'll need some code inspection and testing for each.

Thanks,
Naoya Horiguchi
Miaohe Lin June 8, 2022, 12:57 p.m. UTC | #3
On 2022/6/8 14:16, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Tue, Jun 07, 2022 at 10:11:24PM +0800, Miaohe Lin wrote:
>> On 2022/6/2 13:06, Naoya Horiguchi wrote:
>>> From: Naoya Horiguchi <naoya.horiguchi@nec.com>
>>>
>>> Now that the error handling code is prepared, remove the blocking code
>>> and enable memory error handling on 1GB hugepages.
>>>
>>
>> I'm nervous about this change. It seems there are many code paths that are not aware of pud swap entries.
>> I browsed some of them:
>> apply_to_pud_range called from apply_to_page_range:
>>
>> apply_to_pud_range:
>> 	next = pud_addr_end(addr, end);
>> 	if (pud_none(*pud) && !create)
>> 		continue;
>> 	if (WARN_ON_ONCE(pud_leaf(*pud)))
>> 		return -EINVAL;
>> 	if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
>> 		if (!create)
>> 			continue;
>> 		pud_clear_bad(pud);
>> 	}
>> 	err = apply_to_pmd_range(mm, pud, addr, next,
>> 				 fn, data, create, mask);
>>
>> In the !pud_present case, it will mostly reach apply_to_pmd_range and call pmd_offset on it, and an invalid
>> pointer will be dereferenced.
> 
> apply_to_pmd_range() has BUG_ON(pud_huge(*pud)) and apply_to_pte_range() has
> BUG_ON(pmd_huge(*pmd)), so this page table walking code does not seem to
> expect to handle pmd/pud-level mappings.

Yes, you're right. These functions are not intended to handle pmd/pud-level mappings.

> 
>>
>> Another example might be copy_pud_range and so on. So I think either it is not ready to enable
>> 1GB hugepages, or all of these places should be fixed?
> 
> I think that most page table walkers for user address space should first
> check is_vm_hugetlb_page() and call hugetlb-specific walking code for vmas
> with VM_HUGETLB.
> copy_page_range() is a good example.  It calls copy_hugetlb_page_range()
> for vmas with VM_HUGETLB, and that function should support hwpoison entries.
> But I feel that I need testing for confirmation.

Sorry, I missed that these would be handled by the hugetlb variants.

> 
> And I'm not sure that all the others are prepared for non-present pud mappings,
> so I'll need some code inspection and testing for each.

I browsed the code again; there still might be some problematic code paths:

1. For follow_pud_mask, the !pud_present case will mostly reach follow_pmd_mask(), and this can be
called for a hugetlb page (a condensed sketch follows below). (Note that gup_pud_range was fixed
in 15494520b776 ("mm: fix gup_pud_range").)

2. Even for huge_pte_alloc, pud_offset will be called in pud_alloc, so pudp will be an invalid pointer
and will be dereferenced later.
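
The condensed sketch of the first path (based on follow_pud_mask() in
mm/gup.c; the hugepd/devmap branches and the follow_page_context argument
are omitted):

	/*
	 * x86's pud_huge() tests only _PAGE_PSE here, so a non-present
	 * hwpoison entry skips the hugetlb branch; whether it then trips
	 * pud_bad() or falls through to follow_pmd_mask(), it is never
	 * handled as a poisoned hugetlb page.
	 */
	static struct page *follow_pud_mask_sketch(struct vm_area_struct *vma,
						   unsigned long address,
						   p4d_t *p4dp, unsigned int flags)
	{
		pud_t *pud = pud_offset(p4dp, address);

		if (pud_none(*pud))
			return no_page_table(vma, flags);
		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))	/* false for a swap entry */
			return follow_huge_pud(vma->vm_mm, address, pud, flags);
		if (unlikely(pud_bad(*pud)))
			return no_page_table(vma, flags);
		return follow_pmd_mask(vma, address, pud, flags);
	}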

I hope I'm not missing something again this time. ;)

> 
> Thanks,
> Naoya Horiguchi

Thanks!

>
HORIGUCHI NAOYA(堀口 直也) June 9, 2022, 8:45 a.m. UTC | #4
On Wed, Jun 08, 2022 at 08:57:24PM +0800, Miaohe Lin wrote:
...
> >
> > I think that most page table walkers for user address space should first
> > check is_vm_hugetlb_page() and call hugetlb-specific walking code for vmas
> > with VM_HUGETLB.
> > copy_page_range() is a good example.  It calls copy_hugetlb_page_range()
> > for vmas with VM_HUGETLB, and that function should support hwpoison entries.
> > But I feel that I need testing for confirmation.
>
> Sorry, I missed that these would be handled by the hugetlb variants.
>
> >
> > And I'm not sure that all the others are prepared for non-present pud mappings,
> > so I'll need some code inspection and testing for each.
>
> I browsed the code again; there still might be some problematic code paths:
>
> 1. For follow_pud_mask, the !pud_present case will mostly reach follow_pmd_mask(), and this can be
> called for a hugetlb page. (Note that gup_pud_range was fixed in 15494520b776 ("mm: fix gup_pud_range").)
>
> 2. Even for huge_pte_alloc, pud_offset will be called in pud_alloc, so pudp will be an invalid pointer
> and will be dereferenced later.

Yes, these paths need to support non-present pud entries, so I'll update/add
the patches.  It seems that I did similar work for pmd a few years ago
(cbef8478bee5 ("mm/hugetlb: pmd_huge() returns true for non-present hugepage")).
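
For context, the x86 side of that pmd fix, together with a hypothetical pud
analogue (the latter is only a guess at the shape of the follow-up, not code
from this series):

	/* arch/x86/mm/hugetlbpage.c after cbef8478bee5: a migration or
	 * hwpoison entry is !present but also !none, so pmd_huge() reports
	 * it as huge instead of letting walkers treat it as a page table. */
	int pmd_huge(pmd_t pmd)
	{
		return !pmd_none(pmd) &&
		       (pmd_val(pmd) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT;
	}

	/* Hypothetical pud analogue; at this point x86's pud_huge() still
	 * tested only _PAGE_PSE. */
	int pud_huge(pud_t pud)
	{
		return !pud_none(pud) &&
		       (pud_val(pud) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT;
	}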

Thanks,
Naoya Horiguchi
Miaohe Lin June 9, 2022, 11:28 a.m. UTC | #5
On 2022/6/9 16:45, HORIGUCHI NAOYA(堀口 直也) wrote:
> On Wed, Jun 08, 2022 at 08:57:24PM +0800, Miaohe Lin wrote:
> ...
>>>
>>> I think that most page table walkers for user address space should first
>>> check is_vm_hugetlb_page() and call hugetlb-specific walking code for vmas
>>> with VM_HUGETLB.
>>> copy_page_range() is a good example.  It calls copy_hugetlb_page_range()
>>> for vmas with VM_HUGETLB, and that function should support hwpoison entries.
>>> But I feel that I need testing for confirmation.
>>
>> Sorry, I missed that these would be handled by the hugetlb variants.
>>
>>>
>>> And I'm not sure that all the others are prepared for non-present pud mappings,
>>> so I'll need some code inspection and testing for each.
>>
>> I browsed the code again; there still might be some problematic code paths:
>>
>> 1. For follow_pud_mask, the !pud_present case will mostly reach follow_pmd_mask(), and this can be
>> called for a hugetlb page. (Note that gup_pud_range was fixed in 15494520b776 ("mm: fix gup_pud_range").)
>>
>> 2. Even for huge_pte_alloc, pud_offset will be called in pud_alloc, so pudp will be an invalid pointer
>> and will be dereferenced later.
> 
> Yes, these paths need to support non-present pud entries, so I'll update/add
> the patches.  It seems that I did similar work for pmd a few years ago
> (cbef8478bee5 ("mm/hugetlb: pmd_huge() returns true for non-present hugepage")).

Yes, this should be similar work. Thanks for your hard work. :)

> 
> Thanks,
> Naoya Horiguchi
>

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 04de0c3e4f9f..58a6aa916e4f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3238,7 +3238,6 @@ enum mf_action_page_type {
 	MF_MSG_DIFFERENT_COMPOUND,
 	MF_MSG_HUGE,
 	MF_MSG_FREE_HUGE,
-	MF_MSG_NON_PMD_HUGE,
 	MF_MSG_UNMAP_FAILED,
 	MF_MSG_DIRTY_SWAPCACHE,
 	MF_MSG_CLEAN_SWAPCACHE,
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index d0337a41141c..cbd3ddd7c33d 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -360,7 +360,6 @@ TRACE_EVENT(aer_event,
 	EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
 	EM ( MF_MSG_HUGE, "huge page" )					\
 	EM ( MF_MSG_FREE_HUGE, "free huge page" )			\
-	EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" )		\
 	EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" )		\
 	EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" )		\
 	EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" )		\
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index babeb34f7477..ced033a99e19 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -725,7 +725,6 @@ static const char * const action_page_types[] = {
 	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
 	[MF_MSG_HUGE]			= "huge page",
 	[MF_MSG_FREE_HUGE]		= "free huge page",
-	[MF_MSG_NON_PMD_HUGE]		= "non-pmd-sized huge page",
 	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
 	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
 	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
@@ -1614,21 +1613,6 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
 
 	page_flags = head->flags;
 
-	/*
-	 * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
-	 * simply disable it. In order to make it work properly, we need
-	 * make sure that:
-	 *  - conversion of a pud that maps an error hugetlb into hwpoison
-	 *    entry properly works, and
-	 *  - other mm code walking over page table is aware of pud-aligned
-	 *    hwpoison entries.
-	 */
-	if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
-		action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
-		res = -EBUSY;
-		goto out;
-	}
-
 	if (!hwpoison_user_mappings(p, pfn, flags, head)) {
 		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
 		res = -EBUSY;