diff mbox series

[RFC,2/2] x86/mm/cpa: drop pgprot_clear_protnone_bits()

Message ID 20220614063933.13030-3-42.hyeyoo@gmail.com (mailing list archive)
State New
Headers show
Series CPA improvements | expand

Commit Message

Hyeonggon Yoo June 14, 2022, 6:39 a.m. UTC
commit a8aed3e0752b4 ("x86/mm/pageattr: Prevent PSE and GLOABL leftovers
to confuse pmd/pte_present and pmd_huge") made CPA clear _PAGE_GLOBAL when
_PAGE_PRESENT is not set. This prevents kernel crashing when kernel reads
a page with !_PAGE_PRESENT and _PAGE_PROTNONE (_PAGE_GLOBAL). And then it
set _PAGE_GLOBAL back when setting _PAGE_PRESENT again.

After commit d1440b23c922d ("x86/mm: Factor out pageattr _PAGE_GLOBAL
setting") made kernel not set unconditionally _PAGE_GLOBAL, pages lose
global flag after _set_pages_np() and _set_pages_p() are called.

But after commit 3166851142411 ("x86: skip check for spurious faults for
non-present faults"), spurious_kernel_fault() does not confuse
pte/pmd entries with _PAGE_PROTNONE as present anymore. So simply
drop pgprot_clear_protnone_bits().

Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
---
 arch/x86/mm/pat/set_memory.c | 24 ------------------------
 1 file changed, 24 deletions(-)

Comments

Hyeonggon Yoo June 14, 2022, 6:53 a.m. UTC | #1
On Tue, Jun 14, 2022 at 03:39:33PM +0900, Hyeonggon Yoo wrote:
> commit a8aed3e0752b4 ("x86/mm/pageattr: Prevent PSE and GLOABL leftovers
> to confuse pmd/pte_present and pmd_huge") made CPA clear _PAGE_GLOBAL when
> _PAGE_PRESENT is not set. This prevents kernel crashing when kernel reads
> a page with !_PAGE_PRESENT and _PAGE_PROTNONE (_PAGE_GLOBAL). And then it
> set _PAGE_GLOBAL back when setting _PAGE_PRESENT again.
> 
> After commit d1440b23c922d ("x86/mm: Factor out pageattr _PAGE_GLOBAL
> setting") made kernel not set unconditionally _PAGE_GLOBAL, pages lose
> global flag after _set_pages_np() and _set_pages_p() are called.
> 
> But after commit 3166851142411 ("x86: skip check for spurious faults for
> non-present faults"), spurious_kernel_fault() does not confuse
> pte/pmd entries with _PAGE_PROTNONE as present anymore. So simply
> drop pgprot_clear_protnone_bits().
 
Looks like I forgot to Cc: Andrea Arcangeli <aarcange@redhat.com>

Plus I did check that kernel does not crash when reading from/writing to
non-present pages with this patch applied.

> Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
> ---
>  arch/x86/mm/pat/set_memory.c | 24 ------------------------
>  1 file changed, 24 deletions(-)
> 
> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
> index 67cf969fed0d..8a8ce8d78694 100644
> --- a/arch/x86/mm/pat/set_memory.c
> +++ b/arch/x86/mm/pat/set_memory.c
> @@ -746,23 +746,6 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
>  #endif
>  }
>  
> -static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
> -{
> -	/*
> -	 * _PAGE_GLOBAL means "global page" for present PTEs.
> -	 * But, it is also used to indicate _PAGE_PROTNONE
> -	 * for non-present PTEs.
> -	 *
> -	 * This ensures that a _PAGE_GLOBAL PTE going from
> -	 * present to non-present is not confused as
> -	 * _PAGE_PROTNONE.
> -	 */
> -	if (!(pgprot_val(prot) & _PAGE_PRESENT))
> -		pgprot_val(prot) &= ~_PAGE_GLOBAL;
> -
> -	return prot;
> -}
> -
>  static int __should_split_large_page(pte_t *kpte, unsigned long address,
>  				     struct cpa_data *cpa)
>  {
> @@ -824,7 +807,6 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
>  	 * different bit positions in the two formats.
>  	 */
>  	req_prot = pgprot_4k_2_large(req_prot);
> -	req_prot = pgprot_clear_protnone_bits(req_prot);
>  	if (pgprot_val(req_prot) & _PAGE_PRESENT)
>  		pgprot_val(req_prot) |= _PAGE_PSE;
>  
> @@ -1013,8 +995,6 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
>  		return 1;
>  	}
>  
> -	ref_prot = pgprot_clear_protnone_bits(ref_prot);
> -
>  	/*
>  	 * Get the target pfn from the original entry:
>  	 */
> @@ -1246,8 +1226,6 @@ static void populate_pte(struct cpa_data *cpa,
>  
>  	pte = pte_offset_kernel(pmd, start);
>  
> -	pgprot = pgprot_clear_protnone_bits(pgprot);
> -
>  	while (num_pages-- && start < end) {
>  		set_pte(pte, pfn_pte(cpa->pfn, pgprot));
>  
> @@ -1542,8 +1520,6 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
>  		new_prot = static_protections(new_prot, address, pfn, 1, 0,
>  					      CPA_PROTECT);
>  
> -		new_prot = pgprot_clear_protnone_bits(new_prot);
> -
>  		/*
>  		 * We need to keep the pfn from the existing PTE,
>  		 * after all we're only going to change it's attributes
> -- 
> 2.32.0
>
Edgecombe, Rick P June 14, 2022, 6:23 p.m. UTC | #2
On Tue, 2022-06-14 at 15:53 +0900, Hyeonggon Yoo wrote:
> On Tue, Jun 14, 2022 at 03:39:33PM +0900, Hyeonggon Yoo wrote:
> > commit a8aed3e0752b4 ("x86/mm/pageattr: Prevent PSE and GLOABL
> > leftovers
> > to confuse pmd/pte_present and pmd_huge") made CPA clear
> > _PAGE_GLOBAL when
> > _PAGE_PRESENT is not set. This prevents kernel crashing when kernel
> > reads
> > a page with !_PAGE_PRESENT and _PAGE_PROTNONE (_PAGE_GLOBAL). And
> > then it
> > set _PAGE_GLOBAL back when setting _PAGE_PRESENT again.
> > 
> > After commit d1440b23c922d ("x86/mm: Factor out pageattr
> > _PAGE_GLOBAL
> > setting") made kernel not set unconditionally _PAGE_GLOBAL, pages
> > lose
> > global flag after _set_pages_np() and _set_pages_p() are called.
> > 
> > But after commit 3166851142411 ("x86: skip check for spurious
> > faults for
> > non-present faults"), spurious_kernel_fault() does not confuse
> > pte/pmd entries with _PAGE_PROTNONE as present anymore. So simply
> > drop pgprot_clear_protnone_bits().
> 
>  
> Looks like I forgot to Cc: Andrea Arcangeli <aarcange@redhat.com>
> 
> Plus I did check that kernel does not crash when reading from/writing
> to
> non-present pages with this patch applied.

Thanks for the history.

I think we should still fix pte_present() to not check prot_none if the
user bit is clear. The spurious fault handler infinite loop may no
longer be a problem, but pte_present() still would return true for
kernel NP pages, so be fragile. Today I see at least the oops message
and memory hotunplug (see remove_pagetable()) that would get confused.
Hyeonggon Yoo June 15, 2022, 3:47 a.m. UTC | #3
On Tue, Jun 14, 2022 at 06:23:43PM +0000, Edgecombe, Rick P wrote:
> On Tue, 2022-06-14 at 15:53 +0900, Hyeonggon Yoo wrote:
> > On Tue, Jun 14, 2022 at 03:39:33PM +0900, Hyeonggon Yoo wrote:
> > > commit a8aed3e0752b4 ("x86/mm/pageattr: Prevent PSE and GLOABL
> > > leftovers
> > > to confuse pmd/pte_present and pmd_huge") made CPA clear
> > > _PAGE_GLOBAL when
> > > _PAGE_PRESENT is not set. This prevents kernel crashing when kernel
> > > reads
> > > a page with !_PAGE_PRESENT and _PAGE_PROTNONE (_PAGE_GLOBAL). And
> > > then it
> > > set _PAGE_GLOBAL back when setting _PAGE_PRESENT again.
> > > 
> > > After commit d1440b23c922d ("x86/mm: Factor out pageattr
> > > _PAGE_GLOBAL
> > > setting") made kernel not set unconditionally _PAGE_GLOBAL, pages
> > > lose
> > > global flag after _set_pages_np() and _set_pages_p() are called.
> > > 
> > > But after commit 3166851142411 ("x86: skip check for spurious
> > > faults for
> > > non-present faults"), spurious_kernel_fault() does not confuse
> > > pte/pmd entries with _PAGE_PROTNONE as present anymore. So simply
> > > drop pgprot_clear_protnone_bits().
> > 
> >  
> > Looks like I forgot to Cc: Andrea Arcangeli <aarcange@redhat.com>
> > 
> > Plus I did check that kernel does not crash when reading from/writing
> > to
> > non-present pages with this patch applied.
> 
> Thanks for the history.
> 
> I think we should still fix pte_present() to not check prot_none if the
> user bit is clear.

I tried, but realized it wouldn't work :(

For example, when a pte entry is used as swap entry, _PAGE_PRESENT is
cleared and _PAGE_PROTNONE is set.

And other bits are used as type and offset of swap entry.
In that case, _PAGE_BIT_USER bit does not represent _PAGE_USER.
It is just one of bits that represents type of swap entry.

So checking if _PAGE_PROTNONE set only when _PAGE_USER is set
will confuse some swap entries as non-present.

> The spurious fault handler infinite loop may no
> longer be a problem, but pte_present() still would return true for
> kernel NP pages, so be fragile. Today I see at least the oops message
> and memory hotunplug (see remove_pagetable()) that would get confused.

As explained above, I don't think it's possible to make pte_present() 
accurate for both kernel and user ptes.

Maybe we can implement pte_present_kernel()/pte_present_user()
for when kernel knows it is user or kernel pte.

or pte_present_with_address(pte, address) if we don't
know it is user pte or kernel pte.
Edgecombe, Rick P June 15, 2022, 6:18 p.m. UTC | #4
On Wed, 2022-06-15 at 12:47 +0900, Hyeonggon Yoo wrote:
> On Tue, Jun 14, 2022 at 06:23:43PM +0000, Edgecombe, Rick P wrote:
> > On Tue, 2022-06-14 at 15:53 +0900, Hyeonggon Yoo wrote:
> > > On Tue, Jun 14, 2022 at 03:39:33PM +0900, Hyeonggon Yoo wrote:
> > > > commit a8aed3e0752b4 ("x86/mm/pageattr: Prevent PSE and GLOABL
> > > > leftovers
> > > > to confuse pmd/pte_present and pmd_huge") made CPA clear
> > > > _PAGE_GLOBAL when
> > > > _PAGE_PRESENT is not set. This prevents kernel crashing when
> > > > kernel
> > > > reads
> > > > a page with !_PAGE_PRESENT and _PAGE_PROTNONE (_PAGE_GLOBAL).
> > > > And
> > > > then it
> > > > set _PAGE_GLOBAL back when setting _PAGE_PRESENT again.
> > > > 
> > > > After commit d1440b23c922d ("x86/mm: Factor out pageattr
> > > > _PAGE_GLOBAL
> > > > setting") made kernel not set unconditionally _PAGE_GLOBAL,
> > > > pages
> > > > lose
> > > > global flag after _set_pages_np() and _set_pages_p() are
> > > > called.
> > > > 
> > > > But after commit 3166851142411 ("x86: skip check for spurious
> > > > faults for
> > > > non-present faults"), spurious_kernel_fault() does not confuse
> > > > pte/pmd entries with _PAGE_PROTNONE as present anymore. So
> > > > simply
> > > > drop pgprot_clear_protnone_bits().
> > > 
> > >  
> > > Looks like I forgot to Cc: Andrea Arcangeli <aarcange@redhat.com>
> > > 
> > > Plus I did check that kernel does not crash when reading
> > > from/writing
> > > to
> > > non-present pages with this patch applied.
> > 
> > Thanks for the history.
> > 
> > I think we should still fix pte_present() to not check prot_none if
> > the
> > user bit is clear.
> 
> I tried, but realized it wouldn't work :(
> 
> For example, when a pte entry is used as swap entry, _PAGE_PRESENT is
> cleared and _PAGE_PROTNONE is set.
> 
> And other bits are used as type and offset of swap entry.
> In that case, _PAGE_BIT_USER bit does not represent _PAGE_USER.
> It is just one of bits that represents type of swap entry.
> 
> So checking if _PAGE_PROTNONE set only when _PAGE_USER is set
> will confuse some swap entries as non-present.

Oooh, right. So the user bit records "when a pagetable is
writeprotected by userfaultfd WP support". I'm not sure if maybe PCD is
available to move that to and leave the user bit in place, but it
sounds like an errata sensitive area to be tweaking.

> 
> > The spurious fault handler infinite loop may no
> > longer be a problem, but pte_present() still would return true for
> > kernel NP pages, so be fragile. Today I see at least the oops
> > message
> > and memory hotunplug (see remove_pagetable()) that would get
> > confused.
> 
> As explained above, I don't think it's possible to make
> pte_present() 
> accurate for both kernel and user ptes.
> 
> Maybe we can implement pte_present_kernel()/pte_present_user()
> for when kernel knows it is user or kernel pte.

This seems like a decent option to me. It seems there are only a few
places that are isolated to arch/x86.

> 
> or pte_present_with_address(pte, address) if we don't
> know it is user pte or kernel pte.
>
Hyeonggon Yoo June 19, 2022, 12:20 p.m. UTC | #5
On Wed, Jun 15, 2022 at 06:18:15PM +0000, Edgecombe, Rick P wrote:
> On Wed, 2022-06-15 at 12:47 +0900, Hyeonggon Yoo wrote:
> > On Tue, Jun 14, 2022 at 06:23:43PM +0000, Edgecombe, Rick P wrote:
> > > On Tue, 2022-06-14 at 15:53 +0900, Hyeonggon Yoo wrote:
> > > > On Tue, Jun 14, 2022 at 03:39:33PM +0900, Hyeonggon Yoo wrote:
> > > > > commit a8aed3e0752b4 ("x86/mm/pageattr: Prevent PSE and GLOABL
> > > > > leftovers
> > > > > to confuse pmd/pte_present and pmd_huge") made CPA clear
> > > > > _PAGE_GLOBAL when
> > > > > _PAGE_PRESENT is not set. This prevents kernel crashing when
> > > > > kernel
> > > > > reads
> > > > > a page with !_PAGE_PRESENT and _PAGE_PROTNONE (_PAGE_GLOBAL).
> > > > > And
> > > > > then it
> > > > > set _PAGE_GLOBAL back when setting _PAGE_PRESENT again.
> > > > > 
> > > > > After commit d1440b23c922d ("x86/mm: Factor out pageattr
> > > > > _PAGE_GLOBAL
> > > > > setting") made kernel not set unconditionally _PAGE_GLOBAL,
> > > > > pages
> > > > > lose
> > > > > global flag after _set_pages_np() and _set_pages_p() are
> > > > > called.
> > > > > 
> > > > > But after commit 3166851142411 ("x86: skip check for spurious
> > > > > faults for
> > > > > non-present faults"), spurious_kernel_fault() does not confuse
> > > > > pte/pmd entries with _PAGE_PROTNONE as present anymore. So
> > > > > simply
> > > > > drop pgprot_clear_protnone_bits().
> > > > 
> > > >  
> > > > Looks like I forgot to Cc: Andrea Arcangeli <aarcange@redhat.com>
> > > > 
> > > > Plus I did check that kernel does not crash when reading
> > > > from/writing
> > > > to
> > > > non-present pages with this patch applied.
> > > 
> > > Thanks for the history.
> > > 
> > > I think we should still fix pte_present() to not check prot_none if
> > > the
> > > user bit is clear.
> > 
> > I tried, but realized it wouldn't work :(
> > 
> > For example, when a pte entry is used as swap entry, _PAGE_PRESENT is
> > cleared and _PAGE_PROTNONE is set.
> > 
> > And other bits are used as type and offset of swap entry.
> > In that case, _PAGE_BIT_USER bit does not represent _PAGE_USER.
> > It is just one of bits that represents type of swap entry.
> > 
> > So checking if _PAGE_PROTNONE set only when _PAGE_USER is set
> > will confuse some swap entries as non-present.
> 
> Oooh, right. So the user bit records "when a pagetable is
> writeprotected by userfaultfd WP support". I'm not sure if maybe PCD is
> available to move that to and leave the user bit in place, but it
> sounds like an errata sensitive area to be tweaking.
> 
> > 
> > > The spurious fault handler infinite loop may no
> > > longer be a problem, but pte_present() still would return true for
> > > kernel NP pages, so be fragile. Today I see at least the oops
> > > message
> > > and memory hotunplug (see remove_pagetable()) that would get
> > > confused.
> > 
> > As explained above, I don't think it's possible to make
> > pte_present() 
> > accurate for both kernel and user ptes.
> > 
> > Maybe we can implement pte_present_kernel()/pte_present_user()
> > for when kernel knows it is user or kernel pte.
> 
> This seems like a decent option to me. It seems there are only a few
> places that are isolated to arch/x86.

But there are some places where kernel does not know if it's
kernel pte or user pte.

For example show_fault_oops() can be called for both kernel and user
address. Is something like this acceptable?

static inline bool pte_present_address(pte_t pte, address)
{
	if (kernel address)
		return pte_present_kernel(pte);
	return pte_present_user(pte);
}

> > 
> > or pte_present_with_address(pte, address) if we don't
> > know it is user pte or kernel pte.
> >
diff mbox series

Patch

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 67cf969fed0d..8a8ce8d78694 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -746,23 +746,6 @@  static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 #endif
 }
 
-static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
-{
-	/*
-	 * _PAGE_GLOBAL means "global page" for present PTEs.
-	 * But, it is also used to indicate _PAGE_PROTNONE
-	 * for non-present PTEs.
-	 *
-	 * This ensures that a _PAGE_GLOBAL PTE going from
-	 * present to non-present is not confused as
-	 * _PAGE_PROTNONE.
-	 */
-	if (!(pgprot_val(prot) & _PAGE_PRESENT))
-		pgprot_val(prot) &= ~_PAGE_GLOBAL;
-
-	return prot;
-}
-
 static int __should_split_large_page(pte_t *kpte, unsigned long address,
 				     struct cpa_data *cpa)
 {
@@ -824,7 +807,6 @@  static int __should_split_large_page(pte_t *kpte, unsigned long address,
 	 * different bit positions in the two formats.
 	 */
 	req_prot = pgprot_4k_2_large(req_prot);
-	req_prot = pgprot_clear_protnone_bits(req_prot);
 	if (pgprot_val(req_prot) & _PAGE_PRESENT)
 		pgprot_val(req_prot) |= _PAGE_PSE;
 
@@ -1013,8 +995,6 @@  __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 		return 1;
 	}
 
-	ref_prot = pgprot_clear_protnone_bits(ref_prot);
-
 	/*
 	 * Get the target pfn from the original entry:
 	 */
@@ -1246,8 +1226,6 @@  static void populate_pte(struct cpa_data *cpa,
 
 	pte = pte_offset_kernel(pmd, start);
 
-	pgprot = pgprot_clear_protnone_bits(pgprot);
-
 	while (num_pages-- && start < end) {
 		set_pte(pte, pfn_pte(cpa->pfn, pgprot));
 
@@ -1542,8 +1520,6 @@  static int __change_page_attr(struct cpa_data *cpa, int primary)
 		new_prot = static_protections(new_prot, address, pfn, 1, 0,
 					      CPA_PROTECT);
 
-		new_prot = pgprot_clear_protnone_bits(new_prot);
-
 		/*
 		 * We need to keep the pfn from the existing PTE,
 		 * after all we're only going to change it's attributes