
[Part2,RFC,v4,10/40] x86/fault: Add support to handle the RMP fault for user address

Message ID 20210707183616.5620-11-brijesh.singh@amd.com (mailing list archive)
State New, archived
Series Add AMD Secure Nested Paging (SEV-SNP) Hypervisor Support

Commit Message

Brijesh Singh July 7, 2021, 6:35 p.m. UTC
When SEV-SNP is enabled globally, a write from the host goes through the
RMP check. When the host writes to pages, hardware checks the following
conditions at the end of page walk:

1. Assigned bit in the RMP table is zero (i.e page is shared).
2. If the page table entry that gives the sPA indicates that the target
   page size is a large page, then all RMP entries for the 4KB
   constituting pages of the target must have the assigned bit 0.
3. Immutable bit in the RMP table is not zero.

The hardware will raise page fault if one of the above conditions is not
met. Try resolving the fault instead of taking fault again and again. If
the host attempts to write to the guest private memory then send the
SIGBUG signal to kill the process. If the page level between the host and
RMP entry does not match, then split the address to keep the RMP and host
page levels in sync.

Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/mm/fault.c | 69 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h  |  6 +++-
 mm/memory.c         | 13 +++++++++
 3 files changed, 87 insertions(+), 1 deletion(-)

Comments

Dave Hansen July 8, 2021, 4:16 p.m. UTC | #1
Oh, here's the THP code.  The subject just changed.

On 7/7/21 11:35 AM, Brijesh Singh wrote:
> When SEV-SNP is enabled globally, a write from the host goes through the
> RMP check. When the host writes to pages, hardware checks the following
> conditions at the end of page walk:
> 
> 1. Assigned bit in the RMP table is zero (i.e page is shared).
> 2. If the page table entry that gives the sPA indicates that the target
>    page size is a large page, then all RMP entries for the 4KB
>    constituting pages of the target must have the assigned bit 0.
> 3. Immutable bit in the RMP table is not zero.
> 
> The hardware will raise page fault if one of the above conditions is not
> met. Try resolving the fault instead of taking fault again and again. If
> the host attempts to write to the guest private memory then send the
> SIGBUG signal to kill the process. If the page level between the host and

"SIGBUG"?

> RMP entry does not match, then split the address to keep the RMP and host
> page levels in sync.


> ---
>  arch/x86/mm/fault.c | 69 +++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/mm.h  |  6 +++-
>  mm/memory.c         | 13 +++++++++
>  3 files changed, 87 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 195149eae9b6..cdf48019c1a7 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -1281,6 +1281,58 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
>  }
>  NOKPROBE_SYMBOL(do_kern_addr_fault);
>  
> +#define RMP_FAULT_RETRY		0
> +#define RMP_FAULT_KILL		1
> +#define RMP_FAULT_PAGE_SPLIT	2
> +
> +static inline size_t pages_per_hpage(int level)
> +{
> +	return page_level_size(level) / PAGE_SIZE;
> +}
> +
> +static int handle_user_rmp_page_fault(unsigned long hw_error_code, unsigned long address)
> +{
> +	unsigned long pfn, mask;
> +	int rmp_level, level;
> +	struct rmpentry *e;
> +	pte_t *pte;
> +
> +	if (unlikely(!cpu_feature_enabled(X86_FEATURE_SEV_SNP)))
> +		return RMP_FAULT_KILL;

Shouldn't this be a WARN_ON_ONCE()?  How can we get RMP faults without
SEV-SNP?

> +	/* Get the native page level */
> +	pte = lookup_address_in_mm(current->mm, address, &level);
> +	if (unlikely(!pte))
> +		return RMP_FAULT_KILL;

What would this mean?  There was an RMP fault on a non-present page?
How could that happen?  What if there was a race between an unmapping
event and the RMP fault delivery?

> +	pfn = pte_pfn(*pte);
> +	if (level > PG_LEVEL_4K) {
> +		mask = pages_per_hpage(level) - pages_per_hpage(level - 1);
> +		pfn |= (address >> PAGE_SHIFT) & mask;
> +	}

This looks inherently racy.  What happens if there are two parallel RMP
faults on the same 2M page.  One of them splits the page tables, the
other gets a fault for an already-split page table.

Is that handled here somehow?

> +	/* Get the page level from the RMP entry. */
> +	e = snp_lookup_page_in_rmptable(pfn_to_page(pfn), &rmp_level);
> +	if (!e)
> +		return RMP_FAULT_KILL;

The snp_lookup_page_in_rmptable() failure case looks WARN-worthy.
Either you're doing a lookup for something not *IN* the RMP table, or
you don't support SEV-SNP, in which case you shouldn't be in this code
in the first place.

> +	/*
> +	 * Check if the RMP violation is due to the guest private page access.
> +	 * We can not resolve this RMP fault, ask to kill the guest.
> +	 */
> +	if (rmpentry_assigned(e))
> +		return RMP_FAULT_KILL;

No "We's", please.  Speak in imperative voice.

> +	/*
> +	 * The backing page level is higher than the RMP page level, request
> +	 * to split the page.
> +	 */
> +	if (level > rmp_level)
> +		return RMP_FAULT_PAGE_SPLIT;

This can theoretically trigger on a hugetlbfs page.  Right?

I thought I asked about this before... more below...

> +	return RMP_FAULT_RETRY;
> +}
> +
>  /*
>   * Handle faults in the user portion of the address space.  Nothing in here
>   * should check X86_PF_USER without a specific justification: for almost
> @@ -1298,6 +1350,7 @@ void do_user_addr_fault(struct pt_regs *regs,
>  	struct task_struct *tsk;
>  	struct mm_struct *mm;
>  	vm_fault_t fault;
> +	int ret;
>  	unsigned int flags = FAULT_FLAG_DEFAULT;
>  
>  	tsk = current;
> @@ -1378,6 +1431,22 @@ void do_user_addr_fault(struct pt_regs *regs,
>  	if (error_code & X86_PF_INSTR)
>  		flags |= FAULT_FLAG_INSTRUCTION;
>  
> +	/*
> +	 * If its an RMP violation, try resolving it.
> +	 */
> +	if (error_code & X86_PF_RMP) {
> +		ret = handle_user_rmp_page_fault(error_code, address);
> +		if (ret == RMP_FAULT_PAGE_SPLIT) {
> +			flags |= FAULT_FLAG_PAGE_SPLIT;
> +		} else if (ret == RMP_FAULT_KILL) {
> +			fault |= VM_FAULT_SIGBUS;
> +			do_sigbus(regs, error_code, address, fault);
> +			return;
> +		} else {
> +			return;
> +		}
> +	}

Why not just have handle_user_rmp_page_fault() return a VM_FAULT_* code
directly?

I also suspect you can just set VM_FAULT_SIGBUS and let the do_sigbus()
call later on in the function do its work.

> +
>  #ifdef CONFIG_X86_64
>  	/*
>  	 * Faults in the vsyscall page might need emulation.  The
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 322ec61d0da7..211dfe5d3b1d 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -450,6 +450,8 @@ extern pgprot_t protection_map[16];
>   * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
>   * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
>   * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
> + * @FAULT_FLAG_PAGE_SPLIT: The fault was due page size mismatch, split the
> + *  region to smaller page size and retry.
>   *
>   * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
>   * whether we would allow page faults to retry by specifying these two
> @@ -481,6 +483,7 @@ enum fault_flag {
>  	FAULT_FLAG_REMOTE =		1 << 7,
>  	FAULT_FLAG_INSTRUCTION =	1 << 8,
>  	FAULT_FLAG_INTERRUPTIBLE =	1 << 9,
> +	FAULT_FLAG_PAGE_SPLIT =		1 << 10,
>  };
>  
>  /*
> @@ -520,7 +523,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
>  	{ FAULT_FLAG_USER,		"USER" }, \
>  	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
>  	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }, \
> -	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }
> +	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }, \
> +	{ FAULT_FLAG_PAGE_SPLIT,	"PAGESPLIT" }
>  
>  /*
>   * vm_fault is filled by the pagefault handler and passed to the vma's
> diff --git a/mm/memory.c b/mm/memory.c
> index 730daa00952b..aef261d94e33 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4407,6 +4407,15 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
>  	return 0;
>  }
>  
> +static int handle_split_page_fault(struct vm_fault *vmf)
> +{
> +	if (!IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
> +		return VM_FAULT_SIGBUS;
> +
> +	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
> +	return 0;
> +}

What will this do when you hand it a hugetlbfs page?
Brijesh Singh July 12, 2021, 3:43 p.m. UTC | #2
Hi Dave,


On 7/8/21 11:16 AM, Dave Hansen wrote:
> 
> "SIGBUG"?

It's a typo; it should be SIGBUS.

>> +
>> +	if (unlikely(!cpu_feature_enabled(X86_FEATURE_SEV_SNP)))
>> +		return RMP_FAULT_KILL;
> 
> Shouldn't this be a WARN_ON_ONCE()?  How can we get RMP faults without
> SEV-SNP?

Yes, we should *not* get an RMP fault if SEV-SNP is not enabled. I can use
WARN_ON_ONCE().
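
i.e., something along these lines (untested sketch, not the actual next
revision):

	/* RMP faults should be impossible without SEV-SNP; warn loudly if one shows up. */
	if (WARN_ON_ONCE(!cpu_feature_enabled(X86_FEATURE_SEV_SNP)))
		return RMP_FAULT_KILL;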


> 
>> +	/* Get the native page level */
>> +	pte = lookup_address_in_mm(current->mm, address, &level);
>> +	if (unlikely(!pte))
>> +		return RMP_FAULT_KILL;
> 
> What would this mean?  There was an RMP fault on a non-present page?
> How could that happen?  What if there was a race between an unmapping
> event and the RMP fault delivery?

We should not get an RMP fault for non-present pages. But you have a good
point that there may be a race between the unmap event and the RMP fault
delivery. Instead of terminating the process, we should simply retry.
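
i.e., roughly (sketch only):

	/* Get the native page level */
	pte = lookup_address_in_mm(current->mm, address, &level);
	if (unlikely(!pte)) {
		/*
		 * The mapping may have been torn down after the RMP fault was
		 * delivered; retry the access instead of killing the process.
		 */
		return RMP_FAULT_RETRY;
	}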


> 
>> +	pfn = pte_pfn(*pte);
>> +	if (level > PG_LEVEL_4K) {
>> +		mask = pages_per_hpage(level) - pages_per_hpage(level - 1);
>> +		pfn |= (address >> PAGE_SHIFT) & mask;
>> +	}
> 
> This looks inherently racy.  What happens if there are two parallel RMP
> faults on the same 2M page.  One of them splits the page tables, the
> other gets a fault for an already-split page table.
> 
> Is that handled here somehow?

Yes, in this particular case we simply retry and the hardware should
re-evaluate the page level and take corrective action.


> 
>> +	/* Get the page level from the RMP entry. */
>> +	e = snp_lookup_page_in_rmptable(pfn_to_page(pfn), &rmp_level);
>> +	if (!e)
>> +		return RMP_FAULT_KILL;
> 
> The snp_lookup_page_in_rmptable() failure case looks WARN-worthy.
> Either you're doing a lookup for something not *IN* the RMP table, or
> you don't support SEV-SNP, in which case you shouldn't be in this code
> in the first place.

Noted.

> 
>> +	/*
>> +	 * Check if the RMP violation is due to the guest private page access.
>> +	 * We can not resolve this RMP fault, ask to kill the guest.
>> +	 */
>> +	if (rmpentry_assigned(e))
>> +		return RMP_FAULT_KILL;
> 
> No "We's", please.  Speak in imperative voice.

Noted.

> 
>> +	/*
>> +	 * The backing page level is higher than the RMP page level, request
>> +	 * to split the page.
>> +	 */
>> +	if (level > rmp_level)
>> +		return RMP_FAULT_PAGE_SPLIT;
> 
> This can theoretically trigger on a hugetlbfs page.  Right?
> 

Yes, theoretically.

In the current implementation, the VMM is enlightened not to use
hugetlbfs for the backing pages when creating SEV-SNP guests.


> I thought I asked about this before... more below...
> 
>> +	return RMP_FAULT_RETRY;
>> +}
>> +
>>   /*
>>    * Handle faults in the user portion of the address space.  Nothing in here
>>    * should check X86_PF_USER without a specific justification: for almost
>> @@ -1298,6 +1350,7 @@ void do_user_addr_fault(struct pt_regs *regs,
>>   	struct task_struct *tsk;
>>   	struct mm_struct *mm;
>>   	vm_fault_t fault;
>> +	int ret;
>>   	unsigned int flags = FAULT_FLAG_DEFAULT;
>>   
>>   	tsk = current;
>> @@ -1378,6 +1431,22 @@ void do_user_addr_fault(struct pt_regs *regs,
>>   	if (error_code & X86_PF_INSTR)
>>   		flags |= FAULT_FLAG_INSTRUCTION;
>>   
>> +	/*
>> +	 * If its an RMP violation, try resolving it.
>> +	 */
>> +	if (error_code & X86_PF_RMP) {
>> +		ret = handle_user_rmp_page_fault(error_code, address);
>> +		if (ret == RMP_FAULT_PAGE_SPLIT) {
>> +			flags |= FAULT_FLAG_PAGE_SPLIT;
>> +		} else if (ret == RMP_FAULT_KILL) {
>> +			fault |= VM_FAULT_SIGBUS;
>> +			do_sigbus(regs, error_code, address, fault);
>> +			return;
>> +		} else {
>> +			return;
>> +		}
>> +	}
> 
> Why not just have handle_user_rmp_page_fault() return a VM_FAULT_* code
> directly?
> 

I don't have any strong reason against it. In the next rev, I can update it
to use the VM_FAULT_* codes and call do_sigbus(), etc.
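
Roughly along these lines at the call site (sketch only; having the helper
return a vm_fault_t and request the split through &flags is just one option):

	if (error_code & X86_PF_RMP) {
		fault = handle_user_rmp_page_fault(error_code, address, &flags);
		if (fault & VM_FAULT_SIGBUS) {
			do_sigbus(regs, error_code, address, fault);
			return;
		}
		/* No split requested: nothing more to do here, just retry the access. */
		if (!(flags & FAULT_FLAG_PAGE_SPLIT))
			return;
	}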

> I also suspect you can just set VM_FAULT_SIGBUS and let the do_sigbus()
> call later on in the function do its work.
>>   
>> +static int handle_split_page_fault(struct vm_fault *vmf)
>> +{
>> +	if (!IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
>> +		return VM_FAULT_SIGBUS;
>> +
>> +	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
>> +	return 0;
>> +}
> 
> What will this do when you hand it a hugetlbfs page?
> 

The VMM is updated not to use hugetlbfs when creating SEV-SNP guests,
so we should not run into it.

-Brijesh
Dave Hansen July 12, 2021, 4 p.m. UTC | #3
On 7/12/21 8:43 AM, Brijesh Singh wrote:
>>> +    /*
>>> +     * The backing page level is higher than the RMP page level,
>>> request
>>> +     * to split the page.
>>> +     */
>>> +    if (level > rmp_level)
>>> +        return RMP_FAULT_PAGE_SPLIT;
>>
>> This can theoretically trigger on a hugetlbfs page.  Right?
> 
> Yes, theoretically.
> 
> In the current implementation, the VMM is enlightened to not use the
> hugetlbfs for backing page when creating the SEV-SNP guests.

"The VMM"?

We try to write kernel code so that it "works" and doesn't do unexpected
things with whatever userspace might throw at it.  This seems to be
written with an assumption that no VMM will ever use hugetlbfs with SEV-SNP.

That worries me.  Not only because someone is sure to try it, but it's
the kind of assumption that an attacker or a fuzzer might try.

Could you please test this kernel code in practice with hugetlbfs?

>> I also suspect you can just set VM_FAULT_SIGBUS and let the do_sigbus()
>> call later on in the function do its work.
>>>   +static int handle_split_page_fault(struct vm_fault *vmf)
>>> +{
>>> +    if (!IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
>>> +        return VM_FAULT_SIGBUS;
>>> +
>>> +    __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
>>> +    return 0;
>>> +}
>>
>> What will this do when you hand it a hugetlbfs page?
> 
> VMM is updated to not use the hugetlbfs when creating SEV-SNP guests.
> So, we should not run into it.

Please fix this code to handle hugetlbfs along with any other non-THP
source of level>0 mappings.  DAX comes to mind.  "Handle" can mean
rejecting these.  You don't have to find some way to split them and make
the VM work, just fail safely, ideally as early as possible.
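
For instance, something like this in handle_split_page_fault() (just a
sketch; is_vm_hugetlb_page() and vma_is_dax() are the existing helpers)
would at least fail safely:

static int handle_split_page_fault(struct vm_fault *vmf)
{
	if (!IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
		return VM_FAULT_SIGBUS;

	/*
	 * Only a THP pmd can be split here; refuse hugetlbfs, DAX and any
	 * other source of level>0 mappings that cannot be split.
	 */
	if (is_vm_hugetlb_page(vmf->vma) || vma_is_dax(vmf->vma))
		return VM_FAULT_SIGBUS;

	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
	return 0;
}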

To me, this is a fundamental requirement before this code can be accepted.

How many more parts of this series are predicated on the behavior of the
VMM like this?
Brijesh Singh July 12, 2021, 4:11 p.m. UTC | #4
On 7/12/21 11:00 AM, Dave Hansen wrote:
> On 7/12/21 8:43 AM, Brijesh Singh wrote:
>>>> +    /*
>>>> +     * The backing page level is higher than the RMP page level,
>>>> request
>>>> +     * to split the page.
>>>> +     */
>>>> +    if (level > rmp_level)
>>>> +        return RMP_FAULT_PAGE_SPLIT;
>>>
>>> This can theoretically trigger on a hugetlbfs page.  Right?
>>
>> Yes, theoretically.
>>
>> In the current implementation, the VMM is enlightened to not use the
>> hugetlbfs for backing page when creating the SEV-SNP guests.
> 
> "The VMM"?

I meant userspace QEMU.

> 
> We try to write kernel code so that it "works" and doesn't do unexpected
> things with whatever userspace might throw at it.  This seems to be
> written with an assumption that no VMM will ever use hugetlbfs with SEV-SNP.
> 
> That worries me.  Not only because someone is sure to try it, but it's
> the kind of assumption that an attacker or a fuzzer might try.
> 
> Could you please test this kernel code in practice with hugetlbfs?

Yes, I will make sure that the hugetlbfs path is tested in the non-RFC version.


> 
>>> I also suspect you can just set VM_FAULT_SIGBUS and let the do_sigbus()
>>> call later on in the function do its work.
>>>>    +static int handle_split_page_fault(struct vm_fault *vmf)
>>>> +{
>>>> +    if (!IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
>>>> +        return VM_FAULT_SIGBUS;
>>>> +
>>>> +    __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
>>>> +    return 0;
>>>> +}
>>>
>>> What will this do when you hand it a hugetlbfs page?
>>
>> VMM is updated to not use the hugetlbfs when creating SEV-SNP guests.
>> So, we should not run into it.
> 
> Please fix this code to handle hugetlbfs along with any other non-THP
> source of level>0 mappings.  DAX comes to mind.  "Handle" can mean
> rejecting these.  You don't have to find some way to split them and make
> the VM work, just fail safely, ideally as early as possible.
> 
> To me, this is a fundamental requirement before this code can be accepted.

Understood. If userspace decides to use hugetlbfs backing pages, then I
believe the earliest we can detect it is when we go about adding the pages
to the RMP table. I'll add a check and fail the page state change.

-Brijesh
Dave Hansen July 12, 2021, 4:15 p.m. UTC | #5
On 7/12/21 9:11 AM, Brijesh Singh wrote:
>> Please fix this code to handle hugetlbfs along with any other non-THP
>> source of level>0 mappings.  DAX comes to mind.  "Handle" can mean
>> rejecting these.  You don't have to find some way to split them and make
>> the VM work, just fail safely, ideally as early as possible.
>>
>> To me, this is a fundamental requirement before this code can be
>> accepted.
> 
> Understood, if userspace decided to use the hugetlbfs backing pages then
> I believe earliest we can detect is when we go about adding the pages in
> the RMP table. I'll add a check, and fail the page state change.

Really?  You had to feed the RMP entries from *some* mapping in the
first place.  Is there a reason the originating mapping can't be checked
at that point instead of waiting for the fault?
Brijesh Singh July 12, 2021, 4:24 p.m. UTC | #6
On 7/12/21 11:15 AM, Dave Hansen wrote:
> On 7/12/21 9:11 AM, Brijesh Singh wrote:
>>> Please fix this code to handle hugetlbfs along with any other non-THP
>>> source of level>0 mappings.  DAX comes to mind.  "Handle" can mean
>>> rejecting these.  You don't have to find some way to split them and make
>>> the VM work, just fail safely, ideally as early as possible.
>>>
>>> To me, this is a fundamental requirement before this code can be
>>> accepted.
>>
>> Understood, if userspace decided to use the hugetlbfs backing pages then
>> I believe earliest we can detect is when we go about adding the pages in
>> the RMP table. I'll add a check, and fail the page state change.
> 
> Really?  You had to feed the RMP entries from *some* mapping in the
> first place.  Is there a reason the originating mapping can't be checked
> at that point instead of waiting for the fault?
> 

Apologies if I was not clear in the messaging; that's exactly what I
meant: RMP entries are only fed during the page state change.

The sequence of the operation is:

1. Guest issues a VMGEXIT (page state change) to add a page in the RMP
2. Hypervisor adds the page in the RMP table.

The check will be inside the hypervisor (#2): query the backing page
type, and if the backing page is from hugetlbfs, then don't add the page
to the RMP and fail the page state change VMGEXIT.

-Brijesh
Dave Hansen July 12, 2021, 4:29 p.m. UTC | #7
On 7/12/21 9:24 AM, Brijesh Singh wrote:
> Apologies if I was not clear in the messaging, that's exactly what I
> mean that we don't feed RMP entries during the page state change.
> 
> The sequence of the operation is:
> 
> 1. Guest issues a VMGEXIT (page state change) to add a page in the RMP
> 2. Hyperivosr adds the page in the RMP table.
> 
> The check will be inside the hypervisor (#2), to query the backing page
> type, if the backing page is from the hugetlbfs, then don't add the page
> in the RMP, and fail the page state change VMGEXIT.

Right, but *LOOOOOONG* before that, something walked the page tables and
stuffed the PFN into the NPT (that's the AMD equivalent of EPT, right?).
You could also avoid this whole mess by refusing to allow hugetlbfs to
be mapped into the guest in the first place.
Brijesh Singh July 12, 2021, 4:49 p.m. UTC | #8
On 7/12/21 11:29 AM, Dave Hansen wrote:
> On 7/12/21 9:24 AM, Brijesh Singh wrote:
>> Apologies if I was not clear in the messaging, that's exactly what I
>> mean that we don't feed RMP entries during the page state change.
>>
>> The sequence of the operation is:
>>
>> 1. Guest issues a VMGEXIT (page state change) to add a page in the RMP
>> 2. Hypervisor adds the page in the RMP table.
>>
>> The check will be inside the hypervisor (#2), to query the backing page
>> type, if the backing page is from the hugetlbfs, then don't add the page
>> in the RMP, and fail the page state change VMGEXIT.
> 
> Right, but *LOOOOOONG* before that, something walked the page tables and
> stuffed the PFN into the NPT (that's the AMD equivalent of EPT, right?).
>   You could also avoid this whole mess by refusing to allow hugetlbfs to
> be mapped into the guest in the first place.
> 

Ah, that should be doable. For SEV stuff, we require the VMM to register
the memory region with the hypervisor at VM creation time. I can check for
hugetlbfs while registering the memory region and fail much earlier.
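
Roughly like this in the region-registration path (illustrative sketch only;
the 'uaddr'/'size' variables and the exact placement are assumptions, not the
actual KVM code):

	struct vm_area_struct *vma;

	mmap_read_lock(current->mm);
	/* A real version would walk every VMA covering [uaddr, uaddr + size). */
	vma = find_vma(current->mm, uaddr);
	if (!vma || is_vm_hugetlb_page(vma) || vma_is_dax(vma)) {
		mmap_read_unlock(current->mm);
		return -EINVAL;
	}
	mmap_read_unlock(current->mm);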

thanks
Sean Christopherson July 15, 2021, 9:53 p.m. UTC | #9
On Mon, Jul 12, 2021, Brijesh Singh wrote:
> 
> 
> On 7/12/21 11:29 AM, Dave Hansen wrote:
> > On 7/12/21 9:24 AM, Brijesh Singh wrote:
> > > Apologies if I was not clear in the messaging, that's exactly what I
> > > mean that we don't feed RMP entries during the page state change.
> > > 
> > > The sequence of the operation is:
> > > 
> > > 1. Guest issues a VMGEXIT (page state change) to add a page in the RMP
> > > 2. Hypervisor adds the page in the RMP table.
> > > 
> > > The check will be inside the hypervisor (#2), to query the backing page
> > > type, if the backing page is from the hugetlbfs, then don't add the page
> > > in the RMP, and fail the page state change VMGEXIT.
> > 
> > Right, but *LOOOOOONG* before that, something walked the page tables and
> > stuffed the PFN into the NPT (that's the AMD equivalent of EPT, right?).
> >   You could also avoid this whole mess by refusing to allow hugetlbfs to
> > be mapped into the guest in the first place.
> > 
> 
> Ah, that should be doable. For SEV stuff, we require the VMM to register the
> memory region to the hypervisor during the VM creation time. I can check the
> hugetlbfs while registering the memory region and fail much earlier.

That's technically unnecessary, because this patch is working on the wrong set of
page tables when handling faults from KVM.

The host page tables constrain KVM's NPT, but the two are not mirrors of each
other.  Specifically, KVM cannot exceed the size of the host page tables because
that would give the guest access to memory it does not own, but KVM isn't required
to use the same size as the host.  E.g. a 1gb page in the host can be 1gb, 2mb, or
4kb in the NPT.

The code "works" because the size contraints mean it can't get false negatives,
only false positives, false positives will never be fatal, e.g. the fault handler
may unnecessarily demote a 1gb, and demoting a host page will further constrain
KVM's NPT.

The distinction matters because it changes our options.  For RMP violations on
NPT due to page size mismatches, KVM can and should handle the fault without
consulting the primary MMU, i.e. by demoting the NPT entry.  That means KVM does
not need to care about hugetlbfs or any other backing type that cannot be split
since KVM will never initiate a host page split in response to a #NPT RMP violation.

That doesn't mean that hugetlbfs will magically work since e.g. get/put_user()
will fault and fail, but that's a generic non-KVM problem since nothing prevents
remapping and/or accessing the page(s) outside of KVM context.

The other reason to not disallow hugetlbfs and co. is that a guest that's
enlightened to operate at 2mb granularity, e.g. always do page state changes on
2mb chunks, can play nice with hugetlbfs without ever hitting an RMP violation.

Last thought, have we taken care in the guest side of things to work at 2mb
granularity when possible?  AFAICT, PSMASH is effectively a one-way street since
RMPUPDATE to restore a 2mb RMP is destructive, i.e. requires PVALIDATE on the
entire 2mb chunk, and the guest can't safely do that without reinitializing the
whole page, e.g. would either lose data or have to save/init/restore.
Vlastimil Babka July 30, 2021, 4 p.m. UTC | #10
On 7/7/21 8:35 PM, Brijesh Singh wrote:
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4407,6 +4407,15 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
>  	return 0;
>  }
>  
> +static int handle_split_page_fault(struct vm_fault *vmf)
> +{
> +	if (!IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
> +		return VM_FAULT_SIGBUS;
> +
> +	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
> +	return 0;
> +}
> +

I think back in v1 Dave asked if khugepaged will just coalesce this back, and it
wasn't ever answered AFAICS.

I've checked the code and I think the answer is: no. Khugepaged isn't designed
to coalesce a pte-mapped hugepage back to a pmd in place. And the usual way (copy
to a new huge page) I think will not succeed because IIRC the page is also
FOLL_PIN pinned and khugepaged_scan_pmd() will see the elevated refcounts via
is_refcount_suitable() and give up.

So the lack of coalescing (in case the sub-page leading to split becomes guest
private again later) is somewhat suboptimal, but not critical.
Dave Hansen July 30, 2021, 4:31 p.m. UTC | #11
On 7/30/21 9:00 AM, Vlastimil Babka wrote:
> On 7/7/21 8:35 PM, Brijesh Singh wrote:
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4407,6 +4407,15 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
>>  	return 0;
>>  }
>>  
>> +static int handle_split_page_fault(struct vm_fault *vmf)
>> +{
>> +	if (!IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
>> +		return VM_FAULT_SIGBUS;
>> +
>> +	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
>> +	return 0;
>> +}
>> +
> I think back in v1 Dave asked if khugepaged will just coalesce this back, and it
> wasn't ever answered AFAICS.
> 
> I've checked the code and I think the answer is: no. Khugepaged isn't designed
> to coalesce a pte-mapped hugepage back to pmd in place. And the usual way (copy
> to a new huge page) I think will not succeed because IIRC the page is also
> FOLL_PIN pinned and  khugepaged_scan_pmd() will see the elevated refcounts via
> is_refcount_suitable() and give up.

I _thought_ this was the whole "PTE mapped THP" bit of code, like
collapse_pte_mapped_thp().  But, looking at it again, I think that code
is just for the huge tmpfs flavor of THP.

Either way, I'm kinda surprised that we don't collapse things in place.
 Especially in the early days, there were lots of crazy things that
split THPs.  I think even things like /proc/$pid/smaps split them.

In any case, it sounds like SEV-SNP users should probably be advised to
use MADV_NOHUGEPAGE to avoid any future surprises.  At least until the
hardware folks get their act together and teach the TLB how to fracture
2M entries properly. :)
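
For completeness, the userspace side of that advice is a one-liner after the
guest memory is mapped (sketch; not part of this series):

#include <err.h>
#include <stddef.h>
#include <sys/mman.h>

/* Sketch: opt a guest RAM region out of THP. 'len' is the region size. */
static void *alloc_guest_ram(size_t len)
{
	void *mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (mem == MAP_FAILED)
		err(1, "mmap");

	/* Ask the kernel not to back this range with transparent huge pages. */
	if (madvise(mem, len, MADV_NOHUGEPAGE))
		err(1, "madvise(MADV_NOHUGEPAGE)");

	return mem;
}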

Patch

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 195149eae9b6..cdf48019c1a7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1281,6 +1281,58 @@  do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
 }
 NOKPROBE_SYMBOL(do_kern_addr_fault);
 
+#define RMP_FAULT_RETRY		0
+#define RMP_FAULT_KILL		1
+#define RMP_FAULT_PAGE_SPLIT	2
+
+static inline size_t pages_per_hpage(int level)
+{
+	return page_level_size(level) / PAGE_SIZE;
+}
+
+static int handle_user_rmp_page_fault(unsigned long hw_error_code, unsigned long address)
+{
+	unsigned long pfn, mask;
+	int rmp_level, level;
+	struct rmpentry *e;
+	pte_t *pte;
+
+	if (unlikely(!cpu_feature_enabled(X86_FEATURE_SEV_SNP)))
+		return RMP_FAULT_KILL;
+
+	/* Get the native page level */
+	pte = lookup_address_in_mm(current->mm, address, &level);
+	if (unlikely(!pte))
+		return RMP_FAULT_KILL;
+
+	pfn = pte_pfn(*pte);
+	if (level > PG_LEVEL_4K) {
+		mask = pages_per_hpage(level) - pages_per_hpage(level - 1);
+		pfn |= (address >> PAGE_SHIFT) & mask;
+	}
+
+	/* Get the page level from the RMP entry. */
+	e = snp_lookup_page_in_rmptable(pfn_to_page(pfn), &rmp_level);
+	if (!e)
+		return RMP_FAULT_KILL;
+
+	/*
+	 * Check if the RMP violation is due to the guest private page access.
+	 * We can not resolve this RMP fault, ask to kill the guest.
+	 */
+	if (rmpentry_assigned(e))
+		return RMP_FAULT_KILL;
+
+	/*
+	 * The backing page level is higher than the RMP page level, request
+	 * to split the page.
+	 */
+	if (level > rmp_level)
+		return RMP_FAULT_PAGE_SPLIT;
+
+	return RMP_FAULT_RETRY;
+}
+
 /*
  * Handle faults in the user portion of the address space.  Nothing in here
  * should check X86_PF_USER without a specific justification: for almost
@@ -1298,6 +1350,7 @@  void do_user_addr_fault(struct pt_regs *regs,
 	struct task_struct *tsk;
 	struct mm_struct *mm;
 	vm_fault_t fault;
+	int ret;
 	unsigned int flags = FAULT_FLAG_DEFAULT;
 
 	tsk = current;
@@ -1378,6 +1431,22 @@  void do_user_addr_fault(struct pt_regs *regs,
 	if (error_code & X86_PF_INSTR)
 		flags |= FAULT_FLAG_INSTRUCTION;
 
+	/*
+	 * If its an RMP violation, try resolving it.
+	 */
+	if (error_code & X86_PF_RMP) {
+		ret = handle_user_rmp_page_fault(error_code, address);
+		if (ret == RMP_FAULT_PAGE_SPLIT) {
+			flags |= FAULT_FLAG_PAGE_SPLIT;
+		} else if (ret == RMP_FAULT_KILL) {
+			fault |= VM_FAULT_SIGBUS;
+			do_sigbus(regs, error_code, address, fault);
+			return;
+		} else {
+			return;
+		}
+	}
+
 #ifdef CONFIG_X86_64
 	/*
 	 * Faults in the vsyscall page might need emulation.  The
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 322ec61d0da7..211dfe5d3b1d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -450,6 +450,8 @@  extern pgprot_t protection_map[16];
  * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
  * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
  * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
+ * @FAULT_FLAG_PAGE_SPLIT: The fault was due page size mismatch, split the
+ *  region to smaller page size and retry.
  *
  * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
  * whether we would allow page faults to retry by specifying these two
@@ -481,6 +483,7 @@  enum fault_flag {
 	FAULT_FLAG_REMOTE =		1 << 7,
 	FAULT_FLAG_INSTRUCTION =	1 << 8,
 	FAULT_FLAG_INTERRUPTIBLE =	1 << 9,
+	FAULT_FLAG_PAGE_SPLIT =		1 << 10,
 };
 
 /*
@@ -520,7 +523,8 @@  static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
 	{ FAULT_FLAG_USER,		"USER" }, \
 	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
 	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }, \
-	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }
+	{ FAULT_FLAG_INTERRUPTIBLE,	"INTERRUPTIBLE" }, \
+	{ FAULT_FLAG_PAGE_SPLIT,	"PAGESPLIT" }
 
 /*
  * vm_fault is filled by the pagefault handler and passed to the vma's
diff --git a/mm/memory.c b/mm/memory.c
index 730daa00952b..aef261d94e33 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4407,6 +4407,15 @@  static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	return 0;
 }
 
+static int handle_split_page_fault(struct vm_fault *vmf)
+{
+	if (!IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
+		return VM_FAULT_SIGBUS;
+
+	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
+	return 0;
+}
+
 /*
  * By the time we get here, we already hold the mm semaphore
  *
@@ -4484,6 +4493,10 @@  static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 				pmd_migration_entry_wait(mm, vmf.pmd);
 			return 0;
 		}
+
+		if (flags & FAULT_FLAG_PAGE_SPLIT)
+			return handle_split_page_fault(&vmf);
+
 		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
 			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
 				return do_huge_pmd_numa_page(&vmf, orig_pmd);