
[PATCHv3] x86/xen: avoid m2p lookup when setting early page table entries

Message ID 1466675466-7392-1-git-send-email-david.vrabel@citrix.com (mailing list archive)
State New, archived

Commit Message

David Vrabel June 23, 2016, 9:51 a.m. UTC
When page table entries are set using xen_set_pte_init() during early
boot there is no page fault handler that could handle a fault when
performing an M2P lookup.

In 64 bit guests (usually dom0) early_ioremap() would fault in
xen_set_pte_init() because an M2P lookup faults because the MFN is in
MMIO space and not mapped in the M2P.  This lookup is done to see if
the PFN is in the range used for the initial page table pages, so that
the PTE may be set as read-only.

The M2P lookup can be avoided by moving the check (and clear of RW)
earlier when the PFN is still available.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
Cc: Kevin Moraga <kmoragas@riseup.net>

v3:
- fold mask_rw_pte()/mask_rw_pteval() into their callers.

v2:
- Remove __init annotation from xen_make_pte_init() since
  PV_CALLEE_SAVE_REGS_THUNK always puts the thunk in .text.

- mask_rw_pte() -> mask_rw_pteval() for x86-64.
---
 arch/x86/xen/mmu.c | 76 +++++++++++++++++++++++++-----------------------------
 1 file changed, 35 insertions(+), 41 deletions(-)
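
The core of the change: do the p2m-list range test while the PTE still carries a PFN, and only afterwards translate the PFN to an MFN, so no M2P (machine-to-physical) lookup is needed. Below is a rough, userspace-only sketch of that ordering; every helper and constant is a simplified stand-in for illustration, not the kernel's implementation.

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT  12
#define DEMO_PAGE_RW     (1ULL << 1)
#define DEMO_PFN_MASK    0x000ffffffffff000ULL  /* assumed frame-number bits */

/* hypothetical stand-ins for the xen_start_info p2m-list fields */
static uint64_t demo_first_p2m_pfn = 0x1000;
static uint64_t demo_nr_p2m_frames = 16;

/* hypothetical pfn->mfn translation (identity here, just for the demo) */
static uint64_t demo_pfn_to_mfn(uint64_t pfn) { return pfn; }

/* New ordering: clear RW while the PFN is still known, then translate. */
static uint64_t demo_make_pte(uint64_t pte)
{
        uint64_t pfn = (pte & DEMO_PFN_MASK) >> DEMO_PAGE_SHIFT;

        if (pfn >= demo_first_p2m_pfn &&
            pfn < demo_first_p2m_pfn + demo_nr_p2m_frames)
                pte &= ~DEMO_PAGE_RW;   /* initial p2m/page-table pages stay RO */

        /* only now replace the PFN with the MFN (kernel: pte_pfn_to_mfn()) */
        return (pte & ~DEMO_PFN_MASK) |
               (demo_pfn_to_mfn(pfn) << DEMO_PAGE_SHIFT);
}

int main(void)
{
        uint64_t pte = (0x1004ULL << DEMO_PAGE_SHIFT) | DEMO_PAGE_RW | 1;

        printf("pte %#llx -> %#llx\n", (unsigned long long)pte,
               (unsigned long long)demo_make_pte(pte));
        return 0;
}

The previous code did this range test in xen_set_pte_init(), after the PFN had already been replaced by an MFN, so pte_pfn() had to perform an M2P lookup that can fault for MMIO MFNs during early boot.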

Comments

Boris Ostrovsky June 23, 2016, 11:59 a.m. UTC | #1
On 06/23/2016 05:51 AM, David Vrabel wrote:
> When page table entries are set using xen_set_pte_init() during early
> boot there is no page fault handler that could handle a fault when
> performing an M2P lookup.
>
> In 64 bit guests (usually dom0) early_ioremap() would fault in
> xen_set_pte_init() because an M2P lookup faults because the MFN is in
> MMIO space and not mapped in the M2P.  This lookup is done to see if
> the PFN is in the range used for the initial page table pages, so that
> the PTE may be set as read-only.
>
> The M2P lookup can be avoided by moving the check (and clear of RW)
> earlier when the PFN is still available.
>
> Signed-off-by: David Vrabel <david.vrabel@citrix.com>
> ---
> Cc: Kevin Moraga <kmoragas@riseup.net>

Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Jürgen Groß June 23, 2016, 12:13 p.m. UTC | #2
On 23/06/16 11:51, David Vrabel wrote:
> When page table entries are set using xen_set_pte_init() during early
> boot there is no page fault handler that could handle a fault when
> performing an M2P lookup.
> 
> In 64 bit guests (usually dom0) early_ioremap() would fault in
> xen_set_pte_init() because an M2P lookup faults because the MFN is in
> MMIO space and not mapped in the M2P.  This lookup is done to see if
> the PFN is in the range used for the initial page table pages, so that
> the PTE may be set as read-only.
> 
> The M2P lookup can be avoided by moving the check (and clear of RW)
> earlier when the PFN is still available.
> 
> Signed-off-by: David Vrabel <david.vrabel@citrix.com>
> ---
> Cc: Kevin Moraga <kmoragas@riseup.net>
> 
> v3:
> - fold mask_rw_pte()/mask_rw_pteval() into their callers.
> 
> v2:
> - Remove __init annotation from xen_make_pte_init() since
>   PV_CALLEE_SAVE_REGS_THUNK always puts the thunk in .text.
> 
> - mask_rw_pte() -> mask_rw_pteval() for x86-64.
> ---
>  arch/x86/xen/mmu.c | 76 +++++++++++++++++++++++++-----------------------------
>  1 file changed, 35 insertions(+), 41 deletions(-)
> 
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index 478a2de..64d8f0b 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -1551,41 +1551,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
>  #endif
>  }
>  
> -#ifdef CONFIG_X86_32
> -static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
> -{
> -	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
> -	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
> -		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
> -			       pte_val_ma(pte));
> -
> -	return pte;
> -}
> -#else /* CONFIG_X86_64 */
> -static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
> -{
> -	unsigned long pfn;
> -
> -	if (xen_feature(XENFEAT_writable_page_tables) ||
> -	    xen_feature(XENFEAT_auto_translated_physmap) ||
> -	    xen_start_info->mfn_list >= __START_KERNEL_map)
> -		return pte;
> -
> -	/*
> -	 * Pages belonging to the initial p2m list mapped outside the default
> -	 * address range must be mapped read-only. This region contains the
> -	 * page tables for mapping the p2m list, too, and page tables MUST be
> -	 * mapped read-only.
> -	 */
> -	pfn = pte_pfn(pte);
> -	if (pfn >= xen_start_info->first_p2m_pfn &&
> -	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
> -		pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
> -
> -	return pte;
> -}
> -#endif /* CONFIG_X86_64 */
> -
>  /*
>   * Init-time set_pte while constructing initial pagetables, which
>   * doesn't allow RO page table pages to be remapped RW.
> @@ -1600,13 +1565,41 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
>   * so always write the PTE directly and rely on Xen trapping and
>   * emulating any updates as necessary.
>   */
> -static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
> +__visible pte_t xen_make_pte_init(pteval_t pte)
>  {
> -	if (pte_mfn(pte) != INVALID_P2M_ENTRY)
> -		pte = mask_rw_pte(ptep, pte);
> -	else
> -		pte = __pte_ma(0);
> +#ifdef CONFIG_X86_64
> +	unsigned long pfn;
> +
> +	/*
> +	 * Pages belonging to the initial p2m list mapped outside the default
> +	 * address range must be mapped read-only. This region contains the
> +	 * page tables for mapping the p2m list, too, and page tables MUST be
> +	 * mapped read-only.
> +	 */
> +	pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
> +	if (xen_start_info->mfn_list < __START_KERNEL_map &&
> +	    pfn >= xen_start_info->first_p2m_pfn &&
> +	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
> +		pte &= ~_PAGE_RW;
> +#endif
> +	pte = pte_pfn_to_mfn(pte);
>  
> +	if ((pte & PTE_PFN_MASK) >> PAGE_SHIFT == INVALID_P2M_ENTRY)

How can this ever be true? I know this is just the open coded
variant from the original xen_set_pte_init(). Either the if isn't
needed at all or it should be corrected.

> +		pte = 0;
> +
> +	return native_make_pte(pte);
> +}
> +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
> +
> +static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
> +{
> +#ifdef CONFIG_X86_32
> +	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
> +	if (pte_mfn(pte) != INVALID_P2M_ENTRY
> +	    && pte_val_ma(*ptep) & _PAGE_PRESENT)

Even more obvious: do we really create ptes with INVALID_P2M_ENTRY and
_PAGE_PRESENT? I think the first part of the if can be dropped again.


Juergen

> +		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
> +			       pte_val_ma(pte));
> +#endif
>  	native_set_pte(ptep, pte);
>  }
>  
> @@ -2407,6 +2400,7 @@ static void __init xen_post_allocator_init(void)
>  	pv_mmu_ops.alloc_pud = xen_alloc_pud;
>  	pv_mmu_ops.release_pud = xen_release_pud;
>  #endif
> +	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte);
>  
>  #ifdef CONFIG_X86_64
>  	pv_mmu_ops.write_cr3 = &xen_write_cr3;
> @@ -2455,7 +2449,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
>  	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
>  	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
>  
> -	.make_pte = PV_CALLEE_SAVE(xen_make_pte),
> +	.make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
>  	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
>  
>  #ifdef CONFIG_X86_PAE
>
David Vrabel June 23, 2016, 1:06 p.m. UTC | #3
On 23/06/16 13:13, Juergen Gross wrote:
> On 23/06/16 11:51, David Vrabel wrote:
>> When page table entries are set using xen_set_pte_init() during early
>> boot there is no page fault handler that could handle a fault when
>> performing an M2P lookup.
>>
>> In 64 bit guests (usually dom0) early_ioremap() would fault in
>> xen_set_pte_init() because an M2P lookup faults because the MFN is in
>> MMIO space and not mapped in the M2P.  This lookup is done to see if
>> the PFN is in the range used for the initial page table pages, so that
>> the PTE may be set as read-only.
>>
>> The M2P lookup can be avoided by moving the check (and clear of RW)
>> earlier when the PFN is still available.
>>
>> Signed-off-by: David Vrabel <david.vrabel@citrix.com>
>> ---
>> Cc: Kevin Moraga <kmoragas@riseup.net>
>>
>> v3:
>> - fold mask_rw_pte()/mask_rw_pteval() into their callers.
>>
>> v2:
>> - Remove __init annotation from xen_make_pte_init() since
>>   PV_CALLEE_SAVE_REGS_THUNK always puts the thunk in .text.
>>
>> - mask_rw_pte() -> mask_rw_pteval() for x86-64.
>> ---
>>  arch/x86/xen/mmu.c | 76 +++++++++++++++++++++++++-----------------------------
>>  1 file changed, 35 insertions(+), 41 deletions(-)
>>
>> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
>> index 478a2de..64d8f0b 100644
>> --- a/arch/x86/xen/mmu.c
>> +++ b/arch/x86/xen/mmu.c
>> @@ -1551,41 +1551,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
>>  #endif
>>  }
>>  
>> -#ifdef CONFIG_X86_32
>> -static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
>> -{
>> -	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
>> -	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
>> -		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
>> -			       pte_val_ma(pte));
>> -
>> -	return pte;
>> -}
>> -#else /* CONFIG_X86_64 */
>> -static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
>> -{
>> -	unsigned long pfn;
>> -
>> -	if (xen_feature(XENFEAT_writable_page_tables) ||
>> -	    xen_feature(XENFEAT_auto_translated_physmap) ||
>> -	    xen_start_info->mfn_list >= __START_KERNEL_map)
>> -		return pte;
>> -
>> -	/*
>> -	 * Pages belonging to the initial p2m list mapped outside the default
>> -	 * address range must be mapped read-only. This region contains the
>> -	 * page tables for mapping the p2m list, too, and page tables MUST be
>> -	 * mapped read-only.
>> -	 */
>> -	pfn = pte_pfn(pte);
>> -	if (pfn >= xen_start_info->first_p2m_pfn &&
>> -	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
>> -		pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
>> -
>> -	return pte;
>> -}
>> -#endif /* CONFIG_X86_64 */
>> -
>>  /*
>>   * Init-time set_pte while constructing initial pagetables, which
>>   * doesn't allow RO page table pages to be remapped RW.
>> @@ -1600,13 +1565,41 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
>>   * so always write the PTE directly and rely on Xen trapping and
>>   * emulating any updates as necessary.
>>   */
>> -static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
>> +__visible pte_t xen_make_pte_init(pteval_t pte)
>>  {
>> -	if (pte_mfn(pte) != INVALID_P2M_ENTRY)
>> -		pte = mask_rw_pte(ptep, pte);
>> -	else
>> -		pte = __pte_ma(0);
>> +#ifdef CONFIG_X86_64
>> +	unsigned long pfn;
>> +
>> +	/*
>> +	 * Pages belonging to the initial p2m list mapped outside the default
>> +	 * address range must be mapped read-only. This region contains the
>> +	 * page tables for mapping the p2m list, too, and page tables MUST be
>> +	 * mapped read-only.
>> +	 */
>> +	pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
>> +	if (xen_start_info->mfn_list < __START_KERNEL_map &&
>> +	    pfn >= xen_start_info->first_p2m_pfn &&
>> +	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
>> +		pte &= ~_PAGE_RW;
>> +#endif
>> +	pte = pte_pfn_to_mfn(pte);
>>  
>> +	if ((pte & PTE_PFN_MASK) >> PAGE_SHIFT == INVALID_P2M_ENTRY)
> 
> How can this ever be true? I know this is just the open coded
> variant from the original xen_set_pte_init(). Either the if isn't
> needed at all or it should be corrected.

The frame might be ballooned out.

>> +		pte = 0;
>> +
>> +	return native_make_pte(pte);
>> +}
>> +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
>> +
>> +static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
>> +{
>> +#ifdef CONFIG_X86_32
>> +	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
>> +	if (pte_mfn(pte) != INVALID_P2M_ENTRY
>> +	    && pte_val_ma(*ptep) & _PAGE_PRESENT)
> 
> Even more obvious: do we really create ptes with INVALID_P2M_ENTRY and
> _PAGE_PRESENT? I think the first part of the if can be dropped again.

Again, the frame might be ballooned out.

David
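
A brief expansion of the ballooned-out case: a PFN whose backing frame has been returned to the hypervisor has no MFN, so a p2m lookup yields an invalid sentinel and the PTE has to be written as zero rather than left pointing at a stale frame. A toy illustration (the array and sentinel below are invented for the example, not the kernel's p2m):

#include <stdio.h>

#define DEMO_INVALID_MFN (~0UL)

/* toy p2m: pfn 2 is "ballooned out" and has no machine frame */
static unsigned long demo_p2m[] = { 0x100, 0x101, DEMO_INVALID_MFN, 0x103 };

int main(void)
{
        for (unsigned long pfn = 0; pfn < 4; pfn++) {
                unsigned long mfn = demo_p2m[pfn];

                if (mfn == DEMO_INVALID_MFN)
                        printf("pfn %lu: ballooned out, write a zero PTE\n", pfn);
                else
                        printf("pfn %lu -> mfn %#lx\n", pfn, mfn);
        }
        return 0;
}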
Jürgen Groß June 23, 2016, 1:27 p.m. UTC | #4
On 23/06/16 15:06, David Vrabel wrote:
> On 23/06/16 13:13, Juergen Gross wrote:
>> On 23/06/16 11:51, David Vrabel wrote:
>>> When page table entries are set using xen_set_pte_init() during early
>>> boot there is no page fault handler that could handle a fault when
>>> performing an M2P lookup.
>>>
>>> In 64 bit guests (usually dom0) early_ioremap() would fault in
>>> xen_set_pte_init() because an M2P lookup faults because the MFN is in
>>> MMIO space and not mapped in the M2P.  This lookup is done to see if
>>> the PFN is in the range used for the initial page table pages, so that
>>> the PTE may be set as read-only.
>>>
>>> The M2P lookup can be avoided by moving the check (and clear of RW)
>>> earlier when the PFN is still available.
>>>
>>> Signed-off-by: David Vrabel <david.vrabel@citrix.com>
>>> ---
>>> Cc: Kevin Moraga <kmoragas@riseup.net>
>>>
>>> v3:
>>> - fold mask_rw_pte()/mask_rw_pteval() into their callers.
>>>
>>> v2:
>>> - Remove __init annotation from xen_make_pte_init() since
>>>   PV_CALLEE_SAVE_REGS_THUNK always puts the thunk in .text.
>>>
>>> - mask_rw_pte() -> mask_rw_pteval() for x86-64.
>>> ---
>>>  arch/x86/xen/mmu.c | 76 +++++++++++++++++++++++++-----------------------------
>>>  1 file changed, 35 insertions(+), 41 deletions(-)
>>>
>>> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
>>> index 478a2de..64d8f0b 100644
>>> --- a/arch/x86/xen/mmu.c
>>> +++ b/arch/x86/xen/mmu.c
>>> @@ -1551,41 +1551,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
>>>  #endif
>>>  }
>>>  
>>> -#ifdef CONFIG_X86_32
>>> -static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
>>> -{
>>> -	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
>>> -	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
>>> -		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
>>> -			       pte_val_ma(pte));
>>> -
>>> -	return pte;
>>> -}
>>> -#else /* CONFIG_X86_64 */
>>> -static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
>>> -{
>>> -	unsigned long pfn;
>>> -
>>> -	if (xen_feature(XENFEAT_writable_page_tables) ||
>>> -	    xen_feature(XENFEAT_auto_translated_physmap) ||
>>> -	    xen_start_info->mfn_list >= __START_KERNEL_map)
>>> -		return pte;
>>> -
>>> -	/*
>>> -	 * Pages belonging to the initial p2m list mapped outside the default
>>> -	 * address range must be mapped read-only. This region contains the
>>> -	 * page tables for mapping the p2m list, too, and page tables MUST be
>>> -	 * mapped read-only.
>>> -	 */
>>> -	pfn = pte_pfn(pte);
>>> -	if (pfn >= xen_start_info->first_p2m_pfn &&
>>> -	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
>>> -		pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
>>> -
>>> -	return pte;
>>> -}
>>> -#endif /* CONFIG_X86_64 */
>>> -
>>>  /*
>>>   * Init-time set_pte while constructing initial pagetables, which
>>>   * doesn't allow RO page table pages to be remapped RW.
>>> @@ -1600,13 +1565,41 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
>>>   * so always write the PTE directly and rely on Xen trapping and
>>>   * emulating any updates as necessary.
>>>   */
>>> -static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
>>> +__visible pte_t xen_make_pte_init(pteval_t pte)
>>>  {
>>> -	if (pte_mfn(pte) != INVALID_P2M_ENTRY)
>>> -		pte = mask_rw_pte(ptep, pte);
>>> -	else
>>> -		pte = __pte_ma(0);
>>> +#ifdef CONFIG_X86_64
>>> +	unsigned long pfn;
>>> +
>>> +	/*
>>> +	 * Pages belonging to the initial p2m list mapped outside the default
>>> +	 * address range must be mapped read-only. This region contains the
>>> +	 * page tables for mapping the p2m list, too, and page tables MUST be
>>> +	 * mapped read-only.
>>> +	 */
>>> +	pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
>>> +	if (xen_start_info->mfn_list < __START_KERNEL_map &&
>>> +	    pfn >= xen_start_info->first_p2m_pfn &&
>>> +	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
>>> +		pte &= ~_PAGE_RW;
>>> +#endif
>>> +	pte = pte_pfn_to_mfn(pte);
>>>  
>>> +	if ((pte & PTE_PFN_MASK) >> PAGE_SHIFT == INVALID_P2M_ENTRY)
>>
>> How can this ever be true? I know this is just the open coded
>> variant from the original xen_set_pte_init(). Either the if isn't
>> needed at all or it should be corrected.
> 
> The frame might be ballooned out.

Okay. But how do you get the 64 bit value with all bits set by shifting
an unsigned value to the right (I admit that this is possible on a 32
bit dom0, but I doubt that was the intention of the test)?


Juergen
David Vrabel June 23, 2016, 1:37 p.m. UTC | #5
On 23/06/16 14:27, Juergen Gross wrote:
> On 23/06/16 15:06, David Vrabel wrote:
>> On 23/06/16 13:13, Juergen Gross wrote:
>>> On 23/06/16 11:51, David Vrabel wrote:
>>>> When page table entries are set using xen_set_pte_init() during early
>>>> boot there is no page fault handler that could handle a fault when
>>>> performing an M2P lookup.
>>>>
>>>> In 64 bit guests (usually dom0) early_ioremap() would fault in
>>>> xen_set_pte_init() because an M2P lookup faults because the MFN is in
>>>> MMIO space and not mapped in the M2P.  This lookup is done to see if
>>>> the PFN is in the range used for the initial page table pages, so that
>>>> the PTE may be set as read-only.
>>>>
>>>> The M2P lookup can be avoided by moving the check (and clear of RW)
>>>> earlier when the PFN is still available.
>>>>
>>>> Signed-off-by: David Vrabel <david.vrabel@citrix.com>
>>>> ---
>>>> Cc: Kevin Moraga <kmoragas@riseup.net>
>>>>
>>>> v3:
>>>> - fold mask_rw_pte()/mask_rw_pteval() into their callers.
>>>>
>>>> v2:
>>>> - Remove __init annotation from xen_make_pte_init() since
>>>>   PV_CALLEE_SAVE_REGS_THUNK always puts the thunk in .text.
>>>>
>>>> - mask_rw_pte() -> mask_rw_pteval() for x86-64.
>>>> ---
>>>>  arch/x86/xen/mmu.c | 76 +++++++++++++++++++++++++-----------------------------
>>>>  1 file changed, 35 insertions(+), 41 deletions(-)
>>>>
>>>> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
>>>> index 478a2de..64d8f0b 100644
>>>> --- a/arch/x86/xen/mmu.c
>>>> +++ b/arch/x86/xen/mmu.c
>>>> @@ -1551,41 +1551,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
>>>>  #endif
>>>>  }
>>>>  
>>>> -#ifdef CONFIG_X86_32
>>>> -static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
>>>> -{
>>>> -	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
>>>> -	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
>>>> -		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
>>>> -			       pte_val_ma(pte));
>>>> -
>>>> -	return pte;
>>>> -}
>>>> -#else /* CONFIG_X86_64 */
>>>> -static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
>>>> -{
>>>> -	unsigned long pfn;
>>>> -
>>>> -	if (xen_feature(XENFEAT_writable_page_tables) ||
>>>> -	    xen_feature(XENFEAT_auto_translated_physmap) ||
>>>> -	    xen_start_info->mfn_list >= __START_KERNEL_map)
>>>> -		return pte;
>>>> -
>>>> -	/*
>>>> -	 * Pages belonging to the initial p2m list mapped outside the default
>>>> -	 * address range must be mapped read-only. This region contains the
>>>> -	 * page tables for mapping the p2m list, too, and page tables MUST be
>>>> -	 * mapped read-only.
>>>> -	 */
>>>> -	pfn = pte_pfn(pte);
>>>> -	if (pfn >= xen_start_info->first_p2m_pfn &&
>>>> -	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
>>>> -		pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
>>>> -
>>>> -	return pte;
>>>> -}
>>>> -#endif /* CONFIG_X86_64 */
>>>> -
>>>>  /*
>>>>   * Init-time set_pte while constructing initial pagetables, which
>>>>   * doesn't allow RO page table pages to be remapped RW.
>>>> @@ -1600,13 +1565,41 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
>>>>   * so always write the PTE directly and rely on Xen trapping and
>>>>   * emulating any updates as necessary.
>>>>   */
>>>> -static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
>>>> +__visible pte_t xen_make_pte_init(pteval_t pte)
>>>>  {
>>>> -	if (pte_mfn(pte) != INVALID_P2M_ENTRY)
>>>> -		pte = mask_rw_pte(ptep, pte);
>>>> -	else
>>>> -		pte = __pte_ma(0);
>>>> +#ifdef CONFIG_X86_64
>>>> +	unsigned long pfn;
>>>> +
>>>> +	/*
>>>> +	 * Pages belonging to the initial p2m list mapped outside the default
>>>> +	 * address range must be mapped read-only. This region contains the
>>>> +	 * page tables for mapping the p2m list, too, and page tables MUST be
>>>> +	 * mapped read-only.
>>>> +	 */
>>>> +	pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
>>>> +	if (xen_start_info->mfn_list < __START_KERNEL_map &&
>>>> +	    pfn >= xen_start_info->first_p2m_pfn &&
>>>> +	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
>>>> +		pte &= ~_PAGE_RW;
>>>> +#endif
>>>> +	pte = pte_pfn_to_mfn(pte);
>>>>  
>>>> +	if ((pte & PTE_PFN_MASK) >> PAGE_SHIFT == INVALID_P2M_ENTRY)
>>>
>>> How can this ever be true? I know this is just the open coded
>>> variant from the original xen_set_pte_init(). Either the if isn't
>>> needed at all or it should be corrected.
>>
>> The frame might be ballooned out.
> 
> Okay. But how do you get the 64 bit value with all bits set by shifting
> an unsigned value to the right (I admit that this is possible on a 32
> bit dom0, but I doubt that was the intention of the test)?

pte_pfn_to_mfn() already does the right thing so this test is bogus.
I've removed it.

Can I add your acked-by or reviewed-by with this change?

David
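
To make the arithmetic point concrete: on 64-bit, (pte & PTE_PFN_MASK) >> PAGE_SHIFT has its high bits cleared by the mask, so it can never compare equal to INVALID_P2M_ENTRY (~0UL), and, as noted above, pte_pfn_to_mfn() already does the right thing for ballooned-out frames, hence the test is dropped. A standalone check, with simplified constants that are assumptions rather than the kernel's exact definitions:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT  12
#define DEMO_PFN_MASK    0x000ffffffffff000ULL  /* assumed frame bits 12..51 */
#define DEMO_INVALID_P2M (~0UL)

int main(void)
{
        uint64_t pte = ~0ULL;   /* even an all-ones PTE value... */
        uint64_t frame = (pte & DEMO_PFN_MASK) >> DEMO_PAGE_SHIFT;

        /* ...shifts down to at most 2^40 - 1, never ~0UL */
        printf("frame = %#llx, equal to INVALID? %d\n",
               (unsigned long long)frame,
               frame == (uint64_t)DEMO_INVALID_P2M);
        return 0;
}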
Jürgen Groß June 23, 2016, 1:41 p.m. UTC | #6
On 23/06/16 15:37, David Vrabel wrote:
> On 23/06/16 14:27, Juergen Gross wrote:
>> On 23/06/16 15:06, David Vrabel wrote:
>>> On 23/06/16 13:13, Juergen Gross wrote:
>>>> On 23/06/16 11:51, David Vrabel wrote:
>>>>> When page table entries are set using xen_set_pte_init() during early
>>>>> boot there is no page fault handler that could handle a fault when
>>>>> performing an M2P lookup.
>>>>>
>>>>> In 64 bit guests (usually dom0) early_ioremap() would fault in
>>>>> xen_set_pte_init() because an M2P lookup faults because the MFN is in
>>>>> MMIO space and not mapped in the M2P.  This lookup is done to see if
>>>>> the PFN is in the range used for the initial page table pages, so that
>>>>> the PTE may be set as read-only.
>>>>>
>>>>> The M2P lookup can be avoided by moving the check (and clear of RW)
>>>>> earlier when the PFN is still available.
>>>>>
>>>>> Signed-off-by: David Vrabel <david.vrabel@citrix.com>
>>>>> ---
>>>>> Cc: Kevin Moraga <kmoragas@riseup.net>
>>>>>
>>>>> v3:
>>>>> - fold mask_rw_pte()/mask_rw_pteval() into their callers.
>>>>>
>>>>> v2:
>>>>> - Remove __init annotation from xen_make_pte_init() since
>>>>>   PV_CALLEE_SAVE_REGS_THUNK always puts the thunk in .text.
>>>>>
>>>>> - mask_rw_pte() -> mask_rw_pteval() for x86-64.
>>>>> ---
>>>>>  arch/x86/xen/mmu.c | 76 +++++++++++++++++++++++++-----------------------------
>>>>>  1 file changed, 35 insertions(+), 41 deletions(-)
>>>>>
>>>>> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
>>>>> index 478a2de..64d8f0b 100644
>>>>> --- a/arch/x86/xen/mmu.c
>>>>> +++ b/arch/x86/xen/mmu.c
>>>>> @@ -1551,41 +1551,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
>>>>>  #endif
>>>>>  }
>>>>>  
>>>>> -#ifdef CONFIG_X86_32
>>>>> -static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
>>>>> -{
>>>>> -	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
>>>>> -	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
>>>>> -		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
>>>>> -			       pte_val_ma(pte));
>>>>> -
>>>>> -	return pte;
>>>>> -}
>>>>> -#else /* CONFIG_X86_64 */
>>>>> -static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
>>>>> -{
>>>>> -	unsigned long pfn;
>>>>> -
>>>>> -	if (xen_feature(XENFEAT_writable_page_tables) ||
>>>>> -	    xen_feature(XENFEAT_auto_translated_physmap) ||
>>>>> -	    xen_start_info->mfn_list >= __START_KERNEL_map)
>>>>> -		return pte;
>>>>> -
>>>>> -	/*
>>>>> -	 * Pages belonging to the initial p2m list mapped outside the default
>>>>> -	 * address range must be mapped read-only. This region contains the
>>>>> -	 * page tables for mapping the p2m list, too, and page tables MUST be
>>>>> -	 * mapped read-only.
>>>>> -	 */
>>>>> -	pfn = pte_pfn(pte);
>>>>> -	if (pfn >= xen_start_info->first_p2m_pfn &&
>>>>> -	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
>>>>> -		pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
>>>>> -
>>>>> -	return pte;
>>>>> -}
>>>>> -#endif /* CONFIG_X86_64 */
>>>>> -
>>>>>  /*
>>>>>   * Init-time set_pte while constructing initial pagetables, which
>>>>>   * doesn't allow RO page table pages to be remapped RW.
>>>>> @@ -1600,13 +1565,41 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
>>>>>   * so always write the PTE directly and rely on Xen trapping and
>>>>>   * emulating any updates as necessary.
>>>>>   */
>>>>> -static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
>>>>> +__visible pte_t xen_make_pte_init(pteval_t pte)
>>>>>  {
>>>>> -	if (pte_mfn(pte) != INVALID_P2M_ENTRY)
>>>>> -		pte = mask_rw_pte(ptep, pte);
>>>>> -	else
>>>>> -		pte = __pte_ma(0);
>>>>> +#ifdef CONFIG_X86_64
>>>>> +	unsigned long pfn;
>>>>> +
>>>>> +	/*
>>>>> +	 * Pages belonging to the initial p2m list mapped outside the default
>>>>> +	 * address range must be mapped read-only. This region contains the
>>>>> +	 * page tables for mapping the p2m list, too, and page tables MUST be
>>>>> +	 * mapped read-only.
>>>>> +	 */
>>>>> +	pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
>>>>> +	if (xen_start_info->mfn_list < __START_KERNEL_map &&
>>>>> +	    pfn >= xen_start_info->first_p2m_pfn &&
>>>>> +	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
>>>>> +		pte &= ~_PAGE_RW;
>>>>> +#endif
>>>>> +	pte = pte_pfn_to_mfn(pte);
>>>>>  
>>>>> +	if ((pte & PTE_PFN_MASK) >> PAGE_SHIFT == INVALID_P2M_ENTRY)
>>>>
>>>> How can this ever be true? I know this is just the open coded
>>>> variant from the original xen_set_pte_init(). Either the if isn't
>>>> needed at all or it should be corrected.
>>>
>>> The frame might be ballooned out.
>>
>> Okay. But how do you get the 64 bit value with all bits set by shifting
>> an unsigned value to the right (I admit that this is possible on a 32
>> bit dom0, but I doubt that was the intention of the test)?
> 
> pte_pfn_to_mfn() already does the right thing so this test is bogus.
> I've removed it.
> 
> Can I add your acked-by or reviewed-by with this change?

Yes, with the test removed: Reviewed-by: Juergen Gross <jgross@suse.com>


Juergen

Patch

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 478a2de..64d8f0b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1551,41 +1551,6 @@  static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 #endif
 }
 
-#ifdef CONFIG_X86_32
-static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
-{
-	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
-	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
-		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
-			       pte_val_ma(pte));
-
-	return pte;
-}
-#else /* CONFIG_X86_64 */
-static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
-{
-	unsigned long pfn;
-
-	if (xen_feature(XENFEAT_writable_page_tables) ||
-	    xen_feature(XENFEAT_auto_translated_physmap) ||
-	    xen_start_info->mfn_list >= __START_KERNEL_map)
-		return pte;
-
-	/*
-	 * Pages belonging to the initial p2m list mapped outside the default
-	 * address range must be mapped read-only. This region contains the
-	 * page tables for mapping the p2m list, too, and page tables MUST be
-	 * mapped read-only.
-	 */
-	pfn = pte_pfn(pte);
-	if (pfn >= xen_start_info->first_p2m_pfn &&
-	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
-		pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
-
-	return pte;
-}
-#endif /* CONFIG_X86_64 */
-
 /*
  * Init-time set_pte while constructing initial pagetables, which
  * doesn't allow RO page table pages to be remapped RW.
@@ -1600,13 +1565,41 @@  static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
  * so always write the PTE directly and rely on Xen trapping and
  * emulating any updates as necessary.
  */
-static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
+__visible pte_t xen_make_pte_init(pteval_t pte)
 {
-	if (pte_mfn(pte) != INVALID_P2M_ENTRY)
-		pte = mask_rw_pte(ptep, pte);
-	else
-		pte = __pte_ma(0);
+#ifdef CONFIG_X86_64
+	unsigned long pfn;
+
+	/*
+	 * Pages belonging to the initial p2m list mapped outside the default
+	 * address range must be mapped read-only. This region contains the
+	 * page tables for mapping the p2m list, too, and page tables MUST be
+	 * mapped read-only.
+	 */
+	pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+	if (xen_start_info->mfn_list < __START_KERNEL_map &&
+	    pfn >= xen_start_info->first_p2m_pfn &&
+	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
+		pte &= ~_PAGE_RW;
+#endif
+	pte = pte_pfn_to_mfn(pte);
 
+	if ((pte & PTE_PFN_MASK) >> PAGE_SHIFT == INVALID_P2M_ENTRY)
+		pte = 0;
+
+	return native_make_pte(pte);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
+
+static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+#ifdef CONFIG_X86_32
+	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
+	if (pte_mfn(pte) != INVALID_P2M_ENTRY
+	    && pte_val_ma(*ptep) & _PAGE_PRESENT)
+		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+			       pte_val_ma(pte));
+#endif
 	native_set_pte(ptep, pte);
 }
 
@@ -2407,6 +2400,7 @@  static void __init xen_post_allocator_init(void)
 	pv_mmu_ops.alloc_pud = xen_alloc_pud;
 	pv_mmu_ops.release_pud = xen_release_pud;
 #endif
+	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte);
 
 #ifdef CONFIG_X86_64
 	pv_mmu_ops.write_cr3 = &xen_write_cr3;
@@ -2455,7 +2449,7 @@  static const struct pv_mmu_ops xen_mmu_ops __initconst = {
 	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
 	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
 
-	.make_pte = PV_CALLEE_SAVE(xen_make_pte),
+	.make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
 	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
 
 #ifdef CONFIG_X86_PAE