
[WIP,v2,04/14] KVM: x86: Add KVM_CAP_X86_MEMORY_FAULT_EXIT and associated kvm_run field

Message ID 20230315021738.1151386-5-amoorthy@google.com (mailing list archive)
State New, archived
Series Avoiding slow get-user-pages via memory fault exit

Commit Message

Anish Moorthy March 15, 2023, 2:17 a.m. UTC
Memory fault exits allow KVM to return useful information from
KVM_RUN instead of having to -EFAULT when a guest memory access goes
wrong. Document the intent and API of the new capability, and introduce
helper functions which will be useful in places where it needs to be
implemented.

Also allow the capability to be enabled, even though that won't
currently *do* anything: implementations at the relevant -EFAULT sites
will be performed in subsequent commits.
---
 Documentation/virt/kvm/api.rst | 37 ++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c             |  1 +
 include/linux/kvm_host.h       | 16 +++++++++++++++
 include/uapi/linux/kvm.h       | 16 +++++++++++++++
 tools/include/uapi/linux/kvm.h | 15 ++++++++++++++
 virt/kvm/kvm_main.c            | 28 +++++++++++++++++++++++++
 6 files changed, 113 insertions(+)
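
For orientation, a minimal sketch of the userspace flow this patch enables is
shown below. This is an editorial illustration rather than code from the
series: vm_fd/vcpu_fd are the usual KVM file descriptors, mmap_size comes from
KVM_GET_VCPU_MMAP_SIZE, handle_memory_fault() is a hypothetical VMM helper,
and error handling is omitted.

	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED, vcpu_fd, 0);
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_MEMORY_FAULT_EXIT,
		.args = { KVM_MEMFAULT_REASON_UNKNOWN },  /* per-reason opt-in via args[0] */
	};

	/* VM-wide capability, so enable it on the VM fd */
	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);

	ioctl(vcpu_fd, KVM_RUN, 0);
	if (run->exit_reason == KVM_EXIT_MEMORY_FAULT)
		/* [gpa, gpa + len) is the faulting range, flags the reason(s) */
		handle_memory_fault(run->memory_fault.gpa,
				    run->memory_fault.len,
				    run->memory_fault.flags);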

Comments

Isaku Yamahata March 17, 2023, 12:02 a.m. UTC | #1
On Wed, Mar 15, 2023 at 02:17:28AM +0000,
Anish Moorthy <amoorthy@google.com> wrote:

> Memory fault exits allow KVM to return useful information from
> KVM_RUN instead of having to -EFAULT when a guest memory access goes
> wrong. Document the intent and API of the new capability, and introduce
> helper functions which will be useful in places where it needs to be
> implemented.
> 
> Also allow the capability to be enabled, even though that won't
> currently *do* anything: implementations at the relevant -EFAULT sites
> will be performed in subsequent commits.
> ---
>  Documentation/virt/kvm/api.rst | 37 ++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/x86.c             |  1 +
>  include/linux/kvm_host.h       | 16 +++++++++++++++
>  include/uapi/linux/kvm.h       | 16 +++++++++++++++
>  tools/include/uapi/linux/kvm.h | 15 ++++++++++++++
>  virt/kvm/kvm_main.c            | 28 +++++++++++++++++++++++++
>  6 files changed, 113 insertions(+)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 62de0768d6aa5..f9ca18bbec879 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6636,6 +6636,19 @@ array field represents return values. The userspace should update the return
>  values of SBI call before resuming the VCPU. For more details on RISC-V SBI
>  spec refer, https://github.com/riscv/riscv-sbi-doc.
>  
> +::
> +
> +		/* KVM_EXIT_MEMORY_FAULT */
> +		struct {
> +			__u64 flags;
> +			__u64 gpa;
> +			__u64 len; /* in bytes */
> +		} memory_fault;
> +
> +Indicates a memory fault on the guest physical address range [gpa, gpa + len).
> +flags is a bitfield describing the reason(s) for the fault. See
> +KVM_CAP_X86_MEMORY_FAULT_EXIT for more details.
> +
>  ::
>  
>      /* KVM_EXIT_NOTIFY */
> @@ -7669,6 +7682,30 @@ This capability is aimed to mitigate the threat that malicious VMs can
>  cause CPU stuck (due to event windows don't open up) and make the CPU
>  unavailable to host or other VMs.
>  
> +7.34 KVM_CAP_X86_MEMORY_FAULT_EXIT
> +----------------------------------
> +
> +:Architectures: x86

Why x86 specific?

> +:Parameters: args[0] is a bitfield specifying what reasons to exit upon.
> +:Returns: 0 on success, -EINVAL if unsupported or if unrecognized exit reason
> +          specified.
> +
> +This capability transforms -EFAULTs returned by KVM_RUN in response to guest
> +memory accesses into VM exits (KVM_EXIT_MEMORY_FAULT), with 'gpa' and 'len'
> +describing the problematic range of memory and 'flags' describing the reason(s)
> +for the fault.
> +
> +The implementation is currently incomplete. Please notify the maintainers if you
> +come across a case where it needs to be implemented.
> +
> +Through args[0], the capability can be set on a per-exit-reason basis.
> +Currently, the only exit reasons supported are
> +
> +1. KVM_MEMFAULT_REASON_UNKNOWN (1 << 0)
> +
> +Memory fault exits with a reason of UNKNOWN should not be depended upon: they
> +may be added, removed, or reclassified under a stable reason.
> +
>  8. Other capabilities.
>  ======================
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index f706621c35b86..b3c1b2f57e680 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -4425,6 +4425,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_VAPIC:
>  	case KVM_CAP_ENABLE_CAP:
>  	case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
> +	case KVM_CAP_X86_MEMORY_FAULT_EXIT:
>  		r = 1;
>  		break;
>  	case KVM_CAP_EXIT_HYPERCALL:
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 8ada23756b0ec..d3ccfead73e42 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -805,6 +805,7 @@ struct kvm {
>  	struct notifier_block pm_notifier;
>  #endif
>  	char stats_id[KVM_STATS_NAME_SIZE];
> +	uint64_t memfault_exit_reasons;
>  };
>  
>  #define kvm_err(fmt, ...) \
> @@ -2278,4 +2279,19 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
>  /* Max number of entries allowed for each kvm dirty ring */
>  #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
>  
> +/*
> + * If memory fault exits are enabled for any of the reasons given in exit_flags
> + * then sets up a KVM_EXIT_MEMORY_FAULT for the given guest physical address,
> + * length, and flags and returns -1.
> + * Otherwise, returns -EFAULT
> + */
> +inline int kvm_memfault_exit_or_efault(
> +	struct kvm_vcpu *vcpu, uint64_t gpa, uint64_t len, uint64_t exit_flags);
> +
> +/*
> + * Checks that all of the bits specified in 'reasons' correspond to known
> + * memory fault exit reasons.
> + */
> +bool kvm_memfault_exit_flags_valid(uint64_t reasons);
> +
>  #endif
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index d77aef872a0a0..0ba1d7f01346e 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -264,6 +264,7 @@ struct kvm_xen_exit {
>  #define KVM_EXIT_RISCV_SBI        35
>  #define KVM_EXIT_RISCV_CSR        36
>  #define KVM_EXIT_NOTIFY           37
> +#define KVM_EXIT_MEMORY_FAULT     38
>  
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  /* Emulate instruction failed. */
> @@ -505,6 +506,17 @@ struct kvm_run {
>  #define KVM_NOTIFY_CONTEXT_INVALID	(1 << 0)
>  			__u32 flags;
>  		} notify;
> +		/* KVM_EXIT_MEMORY_FAULT */
> +		struct {
> +			/*
> +			 * Indicates a memory fault on the guest physical address range
> +			 * [gpa, gpa + len). flags is a bitfield describing the reason(s)
> +			 * for the fault.
> +			 */
> +			__u64 flags;
> +			__u64 gpa;
> +			__u64 len; /* in bytes */
> +		} memory_fault;
>  		/* Fix the size of the union. */
>  		char padding[256];
>  	};
> @@ -1184,6 +1196,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
>  #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
>  #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
> +#define KVM_CAP_X86_MEMORY_FAULT_EXIT 227
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -2237,4 +2250,7 @@ struct kvm_s390_zpci_op {
>  /* flags for kvm_s390_zpci_op->u.reg_aen.flags */
>  #define KVM_S390_ZPCIOP_REGAEN_HOST    (1 << 0)
>  
> +/* Exit reasons for KVM_EXIT_MEMORY_FAULT */
> +#define KVM_MEMFAULT_REASON_UNKNOWN (1 << 0)
> +
>  #endif /* __LINUX_KVM_H */
> diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
> index 55155e262646e..2b468345f25c3 100644
> --- a/tools/include/uapi/linux/kvm.h
> +++ b/tools/include/uapi/linux/kvm.h
> @@ -264,6 +264,7 @@ struct kvm_xen_exit {
>  #define KVM_EXIT_RISCV_SBI        35
>  #define KVM_EXIT_RISCV_CSR        36
>  #define KVM_EXIT_NOTIFY           37
> +#define KVM_EXIT_MEMORY_FAULT     38
>  
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  /* Emulate instruction failed. */
> @@ -505,6 +506,17 @@ struct kvm_run {
>  #define KVM_NOTIFY_CONTEXT_INVALID	(1 << 0)
>  			__u32 flags;
>  		} notify;
> +		/* KVM_EXIT_MEMORY_FAULT */
> +		struct {
> +			/*
> +			 * Indicates a memory fault on the guest physical address range
> +			 * [gpa, gpa + len). flags is a bitfield describing the reason(s)
> +			 * for the fault.
> +			 */
> +			__u64 flags;
> +			__u64 gpa;
> +			__u64 len; /* in bytes */
> +		} memory_fault;
>  		/* Fix the size of the union. */
>  		char padding[256];
>  	};
> @@ -2228,4 +2240,7 @@ struct kvm_s390_zpci_op {
>  /* flags for kvm_s390_zpci_op->u.reg_aen.flags */
>  #define KVM_S390_ZPCIOP_REGAEN_HOST    (1 << 0)
>  
> +/* Exit reasons for KVM_EXIT_MEMORY_FAULT */
> +#define KVM_MEMFAULT_REASON_UNKNOWN (1 << 0)
> +
>  #endif /* __LINUX_KVM_H */
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index e38ddda05b261..00aec43860ff1 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1142,6 +1142,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
>  	spin_lock_init(&kvm->mn_invalidate_lock);
>  	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
>  	xa_init(&kvm->vcpu_array);
> +	kvm->memfault_exit_reasons = 0;
>  
>  	INIT_LIST_HEAD(&kvm->gpc_list);
>  	spin_lock_init(&kvm->gpc_lock);
> @@ -4671,6 +4672,14 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
>  
>  		return r;
>  	}
> +	case KVM_CAP_X86_MEMORY_FAULT_EXIT: {
> +		if (!kvm_vm_ioctl_check_extension(kvm, KVM_CAP_X86_MEMORY_FAULT_EXIT))
> +			return -EINVAL;
> +		else if (!kvm_memfault_exit_flags_valid(cap->args[0]))
> +			return -EINVAL;
> +		kvm->memfault_exit_reasons = cap->args[0];
> +		return 0;
> +	}

Is KVM_CAP_X86_MEMORY_FAULT_EXIT really specific to x86?
If so, this should go to kvm_vm_ioctl_enable_cap() in arch/x86/kvm/x86.c.
(Or make it non-arch specific.)


>  	default:
>  		return kvm_vm_ioctl_enable_cap(kvm, cap);
>  	}
> @@ -6172,3 +6181,22 @@ int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
>  
>  	return init_context.err;
>  }
> +
> +inline int kvm_memfault_exit_or_efault(
> +	struct kvm_vcpu *vcpu, uint64_t gpa, uint64_t len, uint64_t exit_flags)
> +{
> +	if (!(vcpu->kvm->memfault_exit_reasons & exit_flags))
> +		return -EFAULT;
> +	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> +	vcpu->run->memory_fault.gpa = gpa;
> +	vcpu->run->memory_fault.len = len;
> +	vcpu->run->memory_fault.flags = exit_flags;
> +	return -1;

Why -1? 0? Anyway enum exit_fastpath_completion is x86 kvm mmu internal
convention. As WIP, it's okay for now, though.


> +}
> +
> +bool kvm_memfault_exit_flags_valid(uint64_t reasons)
> +{
> +	uint64_t valid_flags = KVM_MEMFAULT_REASON_UNKNOWN;
> +
> +	return !(reasons & ~valid_flags);
> +}
> -- 
> 2.40.0.rc1.284.g88254d51c5-goog
>
Anish Moorthy March 17, 2023, 6:33 p.m. UTC | #2
On Thu, Mar 16, 2023 at 5:02 PM Isaku Yamahata <isaku.yamahata@gmail.com> wrote:

> > +7.34 KVM_CAP_X86_MEMORY_FAULT_EXIT
> > +----------------------------------
> > +
> > +:Architectures: x86
>
> Why x86 specific?

Sean was the only one to bring this functionality up and originally
did so in the context of some x86-specific functions, so I assumed
that x86 was the only ask and that maybe the other architectures had
alternative solutions. Admittedly I also wanted to avoid wading
through another big set of -EFAULT references :/

Those are the only reasons though. Marc, Oliver, should I bring this
capability to Arm as well?

> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index e38ddda05b261..00aec43860ff1 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -1142,6 +1142,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
> >       spin_lock_init(&kvm->mn_invalidate_lock);
> >       rcuwait_init(&kvm->mn_memslots_update_rcuwait);
> >       xa_init(&kvm->vcpu_array);
> > +     kvm->memfault_exit_reasons = 0;
> >
> >       INIT_LIST_HEAD(&kvm->gpc_list);
> >       spin_lock_init(&kvm->gpc_lock);
> > @@ -4671,6 +4672,14 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
> >
> >               return r;
> >       }
> > +     case KVM_CAP_X86_MEMORY_FAULT_EXIT: {
> > +             if (!kvm_vm_ioctl_check_extension(kvm, KVM_CAP_X86_MEMORY_FAULT_EXIT))
> > +                     return -EINVAL;
> > +             else if (!kvm_memfault_exit_flags_valid(cap->args[0]))
> > +                     return -EINVAL;
> > +             kvm->memfault_exit_reasons = cap->args[0];
> > +             return 0;
> > +     }
>
> Is KVM_CAP_X86_MEMORY_FAULT_EXIT really specific to x86?
> If so, this should go to kvm_vm_ioctl_enable_cap() in arch/x86/kvm/x86.c.
> (Or make it non-arch specific.)

Ah, thanks for the catch: I renamed my old non-x86 specific
capability, and forgot to move this block.

> > +inline int kvm_memfault_exit_or_efault(
> > +     struct kvm_vcpu *vcpu, uint64_t gpa, uint64_t len, uint64_t exit_flags)
> > +{
> > +     if (!(vcpu->kvm->memfault_exit_reasons & exit_flags))
> > +             return -EFAULT;
> > +     vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> > +     vcpu->run->memory_fault.gpa = gpa;
> > +     vcpu->run->memory_fault.len = len;
> > +     vcpu->run->memory_fault.flags = exit_flags;
> > +     return -1;
>
> Why -1? 0? Anyway enum exit_fastpath_completion is x86 kvm mmu internal
> convention. As WIP, it's okay for now, though.

The -1 isn't to indicate a failure in this function itself, but to
allow callers to substitute this for "return -EFAULT." A return code
of zero would mask errors and cause KVM to proceed in ways that it
shouldn't. For instance, "setup_vmgexit_scratch" uses it like this

if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
    ...
-  return -EFAULT;
+ return kvm_memfault_exit_or_efault(...);
}

and looking at one of its callers (sev_handle_vmgexit) shows how a
return code of zero would cause a different control flow

case SVM_VMGEXIT_MMIO_READ:
ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
if (ret)
    break;

ret = kvm_sev_es_mmio_read(vcpu,
Oliver Upton March 17, 2023, 6:35 p.m. UTC | #3
On Wed, Mar 15, 2023 at 02:17:28AM +0000, Anish Moorthy wrote:

[...]

> @@ -6172,3 +6181,22 @@ int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
>  
>  	return init_context.err;
>  }
> +
> +inline int kvm_memfault_exit_or_efault(
> +	struct kvm_vcpu *vcpu, uint64_t gpa, uint64_t len, uint64_t exit_flags)
> +{
> +	if (!(vcpu->kvm->memfault_exit_reasons & exit_flags))
> +		return -EFAULT;

<snip>

> +	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> +	vcpu->run->memory_fault.gpa = gpa;
> +	vcpu->run->memory_fault.len = len;
> +	vcpu->run->memory_fault.flags = exit_flags;

</snip>

Please spin this off into a helper and make use of it on the arm64 side.

> +	return -1;
> +}
> +
> +bool kvm_memfault_exit_flags_valid(uint64_t reasons)
> +{
> +	uint64_t valid_flags = KVM_MEMFAULT_REASON_UNKNOWN;
> +
> +	return !(reasons & ~valid_flags);
> +}
> -- 
> 2.40.0.rc1.284.g88254d51c5-goog
> 
>
Oliver Upton March 17, 2023, 7:30 p.m. UTC | #4
On Fri, Mar 17, 2023 at 11:33:38AM -0700, Anish Moorthy wrote:
> On Thu, Mar 16, 2023 at 5:02 PM Isaku Yamahata <isaku.yamahata@gmail.com> wrote:
> 
> > > +7.34 KVM_CAP_X86_MEMORY_FAULT_EXIT
> > > +----------------------------------
> > > +
> > > +:Architectures: x86
> >
> > Why x86 specific?
> 
> Sean was the only one to bring this functionality up and originally
> did so in the context of some x86-specific functions, so I assumed
> that x86 was the only ask and that maybe the other architectures had
> alternative solutions. Admittedly I also wanted to avoid wading
> through another big set of -EFAULT references :/

There isn't much :) Sanity checks in mmu.c and some currently unhandled
failures to write guest memory in pvtime.c

> Those are the only reasons though. Marc, Oliver, should I bring this
> capability to Arm as well?

The x86 implementation shouldn't preclude UAPI reuse, but I'm not strongly
motivated in either direction on this. A clear use case where the exit
information is actionable rather than just informational would make the change
more desirable.
Sean Christopherson March 17, 2023, 9:50 p.m. UTC | #5
On Fri, Mar 17, 2023, Anish Moorthy wrote:
> On Thu, Mar 16, 2023 at 5:02 PM Isaku Yamahata <isaku.yamahata@gmail.com> wrote:
> > > +inline int kvm_memfault_exit_or_efault(
> > > +     struct kvm_vcpu *vcpu, uint64_t gpa, uint64_t len, uint64_t exit_flags)
> > > +{
> > > +     if (!(vcpu->kvm->memfault_exit_reasons & exit_flags))
> > > +             return -EFAULT;
> > > +     vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> > > +     vcpu->run->memory_fault.gpa = gpa;
> > > +     vcpu->run->memory_fault.len = len;
> > > +     vcpu->run->memory_fault.flags = exit_flags;
> > > +     return -1;
> >
> > Why -1? 0? Anyway enum exit_fastpath_completion is x86 kvm mmu internal
> > convention. As WIP, it's okay for now, though.
> 
> The -1 isn't to indicate a failure in this function itself, but to
> allow callers to substitute this for "return -EFAULT." A return code
> of zero would mask errors and cause KVM to proceed in ways that it
> shouldn't. For instance, "setup_vmgexit_scratch" uses it like this
> 
> if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
>     ...
> -  return -EFAULT;
> + return kvm_memfault_exit_or_efault(...);
> }
> 
> and looking at one of its callers (sev_handle_vmgexit) shows how a
> return code of zero would cause a different control flow
> 
> case SVM_VMGEXIT_MMIO_READ:
> ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
> if (ret)
>     break;
> 
> ret = ret = kvm_sev_es_mmio_read(vcpu,

Hmm, I generally agree with Isaku, the helper should really return 0.  Returning
-1 might work, but it'll likely confuse userspace, and will definitely confuse
KVM developers.

The "0 means exit to userspace" behavior is definitely a pain though, and is likely
going to make this all extremely fragile.

I wonder if we can get away with returning -EFAULT, but still filling vcpu->run
with KVM_EXIT_MEMORY_FAULT and all the other metadata.  That would likely simplify
the implementation greatly, and would let KVM fill vcpu->run unconditionally.  KVM
would still need a capability to advertise support to userspace, but userspace
wouldn't need to opt in.  I think this may have been my very original thought, and
I just never actually wrote it down...
Anish Moorthy March 17, 2023, 10:44 p.m. UTC | #6
On Fri, Mar 17, 2023 at 2:50 PM Sean Christopherson <seanjc@google.com> wrote:
> I wonder if we can get away with returning -EFAULT, but still filling vcpu->run
> with KVM_EXIT_MEMORY_FAULT and all the other metadata.  That would likely simplify
> the implementation greatly, and would let KVM fill vcpu->run unconditionally.  KVM
> would still need a capability to advertise support to userspace, but userspace
> wouldn't need to opt in.  I think this may have been my very original thought, and
> I just never actually wrote it down...

Oh, good to know that's actually an option. I thought of that too, but
assumed that returning a negative error code was a no-go for a proper
vCPU exit. But if that's not true then I think it's the obvious
solution because it precludes any uncaught behavior-change bugs.

A couple of notes
1. Since we'll likely miss some -EFAULT returns, we'll need to make
sure that the user can check for / doesn't see a stale
kvm_run::memory_fault field when a missed -EFAULT makes it to
userspace. It's a small and easy-to-fix detail, but I thought I'd
point it out.
2. I don't think this would simplify the series that much, since we
still need to find the call sites returning -EFAULT to userspace and
populate memory_fault only in those spots to avoid populating it for
-EFAULTs which don't make it to userspace. We *could* relax that
condition and just document that memory_fault should be ignored when
KVM_RUN does not return -EFAULT... but I don't think that's a good
solution from a coder/maintainer perspective.
Sean Christopherson March 20, 2023, 3:53 p.m. UTC | #7
On Fri, Mar 17, 2023, Anish Moorthy wrote:
> On Fri, Mar 17, 2023 at 2:50 PM Sean Christopherson <seanjc@google.com> wrote:
> > I wonder if we can get away with returning -EFAULT, but still filling vcpu->run
> > with KVM_EXIT_MEMORY_FAULT and all the other metadata.  That would likely simplify
> > the implementation greatly, and would let KVM fill vcpu->run unconditionally.  KVM
> > would still need a capability to advertise support to userspace, but userspace
> > wouldn't need to opt in.  I think this may have been my very original thought, and
> > I just never actually wrote it down...
> 
> Oh, good to know that's actually an option. I thought of that too, but
> assumed that returning a negative error code was a no-go for a proper
> vCPU exit. But if that's not true then I think it's the obvious
> solution because it precludes any uncaught behavior-change bugs.
> 
> A couple of notes
> 1. Since we'll likely miss some -EFAULT returns, we'll need to make
> sure that the user can check for / doesn't see a stale
> kvm_run::memory_fault field when a missed -EFAULT makes it to
> userspace. It's a small and easy-to-fix detail, but I thought I'd
> point it out.

Ya, this is the main concern for me as well.  I'm not as confident that it's
easy-to-fix/avoid though.

> 2. I don't think this would simplify the series that much, since we
> still need to find the call sites returning -EFAULT to userspace and
> populate memory_fault only in those spots to avoid populating it for
> -EFAULTs which don't make it to userspace.

Filling kvm_run::memory_fault even if KVM never exits to userspace is perfectly
ok.  It's not ideal, but it's ok.

> We *could* relax that condition and just document that memory_fault should be
> ignored when KVM_RUN does not return -EFAULT... but I don't think that's a
> good solution from a coder/maintainer perspective.

You've got things backward.  memory_fault _must_ be ignored if KVM doesn't return
the associated "magic combo", where the magic value is either "0+KVM_EXIT_MEMORY_FAULT"
or "-EFAULT+KVM_EXIT_MEMORY_FAULT".

Filling kvm_run::memory_fault but not exiting to userspace is ok because userspace
never sees the data, i.e. userspace is completely unaware.  This behavior is not
ideal from a KVM perspective as allowing KVM to fill the kvm_run union without
exiting to userspace can lead to other bugs, e.g. effective corruption of the
kvm_run union, but at least from a uABI perspective, the behavior is acceptable.

The reverse, userspace consuming kvm_run::memory_fault without being explicitly
told the data is valid, is not ok/safe.  KVM's contract is that fields contained
in kvm_run's big union are valid if and only if KVM returns '0' and the associated
exit reason is set in kvm_run::exit_reason.

From an ABI perspective, I don't see anything fundamentally wrong with bending
that rule slightly by saying that kvm_run::memory_fault is valid if KVM returns
-EFAULT+KVM_EXIT_MEMORY_FAULT.  It won't break existing userspace that is unaware
of KVM_EXIT_MEMORY_FAULT, and userspace can precisely check for the combination.

My big concern with piggybacking -EFAULT is that userspace will be fed stale data if
KVM exits with -EFAULT in a patch that _doesn't_ fill kvm_run::memory_fault.
Returning a negative error code isn't hazardous in and of itself, e.g. KVM has
had bugs in the past where KVM returns '0' but doesn't fill kvm_run::exit_reason.
The big danger is that KVM has existing paths that return -EFAULT, i.e. we can
introduce bugs simply by doing nothing, whereas returning '0' would largely be
limited to new code.

The counter-argument is that propagating '0' correctly up the stack carries its
own risk due to plenty of code correctly treating '0' as "success" and not "exit
to userspace".

And we can mitigate the risk of using -EFAULT.  E.g. fill in kvm_run::memory_fault
even if we are 99.9999% confident the -EFAULT can't get out to userspace in the
context of KVM_RUN, and set kvm_run::exit_reason to some arbitrary value at the
start of KVM_RUN to prevent reusing memory_fault from a previous userspace exit.
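
[Editorial sketch, not from the thread: what the above could look like in
practice. The kernel side clobbers the exit reason at the top of KVM_RUN so a
stale memory_fault can never be consumed; the userspace side acts on
memory_fault only for the exact -EFAULT + KVM_EXIT_MEMORY_FAULT combination.
resolve_fault() is a hypothetical VMM helper, and the placement and constant
are illustrative only.]

	/* kernel: at the very start of the KVM_RUN ioctl */
	vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;

	/* userspace: after KVM_RUN returns */
	ret = ioctl(vcpu_fd, KVM_RUN, 0);
	if (ret == -1 && errno == EFAULT &&
	    run->exit_reason == KVM_EXIT_MEMORY_FAULT)
		/* annotated fault: try to resolve [gpa, gpa + len) and re-enter */
		resolve_fault(run->memory_fault.gpa, run->memory_fault.len);
	else if (ret == -1 && errno == EFAULT)
		/* legacy, unannotated -EFAULT: memory_fault must be ignored */
		abort();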
Anish Moorthy March 20, 2023, 6:19 p.m. UTC | #8
On Mon, Mar 20, 2023 at 8:53 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Mar 17, 2023, Anish Moorthy wrote:
> > On Fri, Mar 17, 2023 at 2:50 PM Sean Christopherson <seanjc@google.com> wrote:
> > > I wonder if we can get away with returning -EFAULT, but still filling vcpu->run
> > > with KVM_EXIT_MEMORY_FAULT and all the other metadata.  That would likely simplify
> > > the implementation greatly, and would let KVM fill vcpu->run unconditionally.  KVM
> > > would still need a capability to advertise support to userspace, but userspace
> > > wouldn't need to opt in.  I think this may have been my very original thought, and
> > > I just never actually wrote it down...
> >
> > Oh, good to know that's actually an option. I thought of that too, but
> > assumed that returning a negative error code was a no-go for a proper
> > vCPU exit. But if that's not true then I think it's the obvious
> > solution because it precludes any uncaught behavior-change bugs.
> >
> > A couple of notes
> > 1. Since we'll likely miss some -EFAULT returns, we'll need to make
> > sure that the user can check for / doesn't see a stale
> > kvm_run::memory_fault field when a missed -EFAULT makes it to
> > userspace. It's a small and easy-to-fix detail, but I thought I'd
> > point it out.
>
> Ya, this is the main concern for me as well.  I'm not as confident that it's
> easy-to-fix/avoid though.
>
> > 2. I don't think this would simplify the series that much, since we
> > still need to find the call sites returning -EFAULT to userspace and
> > populate memory_fault only in those spots to avoid populating it for
> > -EFAULTs which don't make it to userspace.
>
> Filling kvm_run::memory_fault even if KVM never exits to userspace is perfectly
> ok.  It's not ideal, but it's ok.

Right- I was just pointing out that doing so could mislead readers of
the code if they assume that "kvm_run::memory_fault is populated iff it
was going to be associated w/ an exit to userspace," which I know I
would.

> > We *could* relax that condition and just document that memory_fault should be
> > ignored when KVM_RUN does not return -EFAULT... but I don't think that's a
> > good solution from a coder/maintainer perspective.
>
> You've got things backward.  memory_fault _must_ be ignored if KVM doesn't return
> the associated "magic combo", where the magic value is either "0+KVM_EXIT_MEMORY_FAULT"
> or "-EFAULT+KVM_EXIT_MEMORY_FAULT".

I think we're saying the same thing- I was using "should" to mean "must."

> Filling kvm_run::memory_fault but not exiting to userspace is ok because userspace
> never sees the data, i.e. userspace is completely unaware.  This behavior is not
> ideal from a KVM perspective as allowing KVM to fill the kvm_run union without
> exiting to userspace can lead to other bugs, e.g. effective corruption of the
> kvm_run union

Ooh, I didn't think of the corruption issue here: thanks for pointing it out.

> but at least from a uABI perspective, the behavior is acceptable.

This does complicate things for KVM implementation though, right? In
particular, we'd have to make sure that KVM_RUN never conditionally
modifies its return value/exit reason based on reads from kvm_run:
that seems like a slightly weird thing to do, but I don't want to
assume anything here.

Anyways, unless that's not (and never will be) a problem, allowing
corruption of kvm_run seems very risky.

> The reverse, userspace consuming kvm_run::memory_fault without being explicitly
> told the data is valid, is not ok/safe.  KVM's contract is that fields contained
> in kvm_run's big union are valid if and only if KVM returns '0' and the associated
> exit reason is set in kvm_run::exit_reason.
>
> From an ABI perspective, I don't see anything fundamentally wrong with bending
> that rule slightly by saying that kvm_run::memory_fault is valid if KVM returns
> -EFAULT+KVM_EXIT_MEMORY_FAULT.  It won't break existing userspace that is unaware
> of KVM_EXIT_MEMORY_FAULT, and userspace can precisely check for the combination.
>
> My big concern with piggybacking -EFAULT is that userspace will be fed stale data if
> KVM exits with -EFAULT in a patch that _doesn't_ fill kvm_run::memory_fault.
> Returning a negative error code isn't hazardous in and of itself, e.g. KVM has
> had bugs in the past where KVM returns '0' but doesn't fill kvm_run::exit_reason.
> The big danger is that KVM has existing paths that return -EFAULT, i.e. we can
> introduce bugs simply by doing nothing, whereas returning '0' would largely be
> limited to new code.
>
> The counter-argument is that propagating '0' correctly up the stack carries its
> own risk due to plenty of code correctly treating '0' as "success" and not "exit
> to userspace".
>
> And we can mitigate the risk of using -EFAULT.  E.g. fill in kvm_run::memory_fault
> even if we are 99.9999% confident the -EFAULT can't get out to userspace in the
> context of KVM_RUN, and set kvm_run::exit_reason to some arbitrary value at the
> start of KVM_RUN to prevent reusing memory_fault from a previous userspace exit.

Right, this is what I had in mind when I called this "small and
easy-to-fix." Piggybacking -EFAULT seems like the right thing to do to
me, but I'm still uneasy about possibly corrupting kvm_run for masked
-EFAULTS.
Anish Moorthy March 20, 2023, 10:11 p.m. UTC | #9
On Mon, Mar 20, 2023 at 8:53 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Mar 17, 2023, Anish Moorthy wrote:
> > On Fri, Mar 17, 2023 at 2:50 PM Sean Christopherson <seanjc@google.com> wrote:
> > > I wonder if we can get away with returning -EFAULT, but still filling vcpu->run
> > > with KVM_EXIT_MEMORY_FAULT and all the other metadata.  That would likely simplify
> > > the implementation greatly, and would let KVM fill vcpu->run unconditionally.  KVM
> > > would still need a capability to advertise support to userspace, but userspace
> > > wouldn't need to opt in.  I think this may have been my very original thought, and
> > > I just never actually wrote it down...
> >
> > Oh, good to know that's actually an option. I thought of that too, but
> > assumed that returning a negative error code was a no-go for a proper
> > vCPU exit. But if that's not true then I think it's the obvious
> > solution because it precludes any uncaught behavior-change bugs.
> >
> > A couple of notes
> > 1. Since we'll likely miss some -EFAULT returns, we'll need to make
> > sure that the user can check for / doesn't see a stale
> > kvm_run::memory_fault field when a missed -EFAULT makes it to
> > userspace. It's a small and easy-to-fix detail, but I thought I'd
> > point it out.
>
> Ya, this is the main concern for me as well.  I'm not as confident that it's
> easy-to-fix/avoid though.
>
> > 2. I don't think this would simplify the series that much, since we
> > still need to find the call sites returning -EFAULT to userspace and
> > populate memory_fault only in those spots to avoid populating it for
> > -EFAULTs which don't make it to userspace.
>
> Filling kvm_run::memory_fault even if KVM never exits to userspace is perfectly
> ok.  It's not ideal, but it's ok.
>
> > We *could* relax that condition and just document that memory_fault should be
> > ignored when KVM_RUN does not return -EFAULT... but I don't think that's a
> > good solution from a coder/maintainer perspective.
>
> You've got things backward.  memory_fault _must_ be ignored if KVM doesn't return
> the associated "magic combo", where the magic value is either "0+KVM_EXIT_MEMORY_FAULT"
> or "-EFAULT+KVM_EXIT_MEMORY_FAULT".
>
> Filling kvm_run::memory_fault but not exiting to userspace is ok because userspace
> never sees the data, i.e. userspace is completely unaware.  This behavior is not
> ideal from a KVM perspective as allowing KVM to fill the kvm_run union without
> exiting to userspace can lead to other bugs, e.g. effective corruption of the
> kvm_run union, but at least from a uABI perspective, the behavior is acceptable.

Actually, I don't think the idea of filling in kvm_run.memory_fault
for -EFAULTs which don't make it to userspace works at all. Consider
the direct_map function, which bubbles its -EFAULT to
kvm_mmu_do_page_fault. kvm_mmu_do_page_fault is called from both
kvm_arch_async_page_ready (which ignores the return value), and by
kvm_mmu_page_fault (where the return value does make it to userspace).
Populating kvm_run.memory_fault anywhere in or under
kvm_mmu_do_page_fault seems an immediate no-go, because a wayward
kvm_arch_async_page_ready could (presumably) overwrite/corrupt an
already-set kvm_run.memory_fault / other kvm_run field.

That in turn looks problematic for the
memory-fault-exit-on-fast-gup-failure part of this series, because
there are at least a couple of cases for which kvm_mmu_do_page_fault
will -EFAULT. One is the early-efault-on-fast-gup-failure case which
was the original purpose of this series. Another is a -EFAULT from
FNAME(fetch) (passed up through FNAME(page_fault)). There might be
other cases as well. But unless userspace can/should resolve *all*
such -EFAULTs in the same manner, a kvm_run.memory_fault populated in
"kvm_mmu_page_fault" wouldn't be actionable. At least, not without a
whole lot of plumbing code to make it so.

Sean, am I missing anything here?
Sean Christopherson March 21, 2023, 3:21 p.m. UTC | #10
On Mon, Mar 20, 2023, Anish Moorthy wrote:
> On Mon, Mar 20, 2023 at 8:53 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Fri, Mar 17, 2023, Anish Moorthy wrote:
> > > On Fri, Mar 17, 2023 at 2:50 PM Sean Christopherson <seanjc@google.com> wrote:
> > > > I wonder if we can get away with returning -EFAULT, but still filling vcpu->run
> > > > with KVM_EXIT_MEMORY_FAULT and all the other metadata.  That would likely simplify
> > > > the implementation greatly, and would let KVM fill vcpu->run unconditionally.  KVM
> > > > would still need a capability to advertise support to userspace, but userspace
> > > > wouldn't need to opt in.  I think this may have been my very original thought, and
> > > > I just never actually wrote it down...
> > >
> > > Oh, good to know that's actually an option. I thought of that too, but
> > > assumed that returning a negative error code was a no-go for a proper
> > > vCPU exit. But if that's not true then I think it's the obvious
> > > solution because it precludes any uncaught behavior-change bugs.
> > >
> > > A couple of notes
> > > 1. Since we'll likely miss some -EFAULT returns, we'll need to make
> > > sure that the user can check for / doesn't see a stale
> > > kvm_run::memory_fault field when a missed -EFAULT makes it to
> > > userspace. It's a small and easy-to-fix detail, but I thought I'd
> > > point it out.
> >
> > Ya, this is the main concern for me as well.  I'm not as confident that it's
> > easy-to-fix/avoid though.
> >
> > > 2. I don't think this would simplify the series that much, since we
> > > still need to find the call sites returning -EFAULT to userspace and
> > > populate memory_fault only in those spots to avoid populating it for
> > > -EFAULTs which don't make it to userspace.
> >
> > Filling kvm_run::memory_fault even if KVM never exits to userspace is perfectly
> > ok.  It's not ideal, but it's ok.
> >
> > > We *could* relax that condition and just document that memory_fault should be
> > > ignored when KVM_RUN does not return -EFAULT... but I don't think that's a
> > > good solution from a coder/maintainer perspective.
> >
> > You've got things backward.  memory_fault _must_ be ignored if KVM doesn't return
> > the associated "magic combo", where the magic value is either "0+KVM_EXIT_MEMORY_FAULT"
> > or "-EFAULT+KVM_EXIT_MEMORY_FAULT".
> >
> > Filling kvm_run::memory_fault but not exiting to userspace is ok because userspace
> > never sees the data, i.e. userspace is completely unaware.  This behavior is not
> > ideal from a KVM perspective as allowing KVM to fill the kvm_run union without
> > exiting to userspace can lead to other bugs, e.g. effective corruption of the
> > kvm_run union, but at least from a uABI perspective, the behavior is acceptable.
> 
> Actually, I don't think the idea of filling in kvm_run.memory_fault
> for -EFAULTs which don't make it to userspace works at all. Consider
> the direct_map function, which bubbles its -EFAULT to
> kvm_mmu_do_page_fault. kvm_mmu_do_page_fault is called from both
> kvm_arch_async_page_ready (which ignores the return value), and by
> kvm_mmu_page_fault (where the return value does make it to userspace).
> Populating kvm_run.memory_fault anywhere in or under
> kvm_mmu_do_page_fault seems an immediate no-go, because a wayward
> kvm_arch_async_page_ready could (presumably) overwrite/corrupt an
> already-set kvm_run.memory_fault / other kvm_run field.

This particular case is a non-issue.  kvm_check_async_pf_completion() is called
only when the current task has control of the vCPU, i.e. is the current "running"
vCPU.  That's not a coincidence either, invoking kvm_mmu_do_page_fault() without
having control of the vCPU would be fraught with races, e.g. the entire KVM MMU
context would be unstable.

That will hold true for all cases.  Using a vCPU that is not loaded (not the
current "running" vCPU in KVM's misleading terminology) to access guest memory is
simply not safe, as the vCPU state is non-deterministic.  There are paths where
KVM accesses, and even modifies, vCPU state asynchronously, e.g. for IRQ delivery
and making requests, but those are very controlled flows with dedicated machinery
to make them SMP safe.

That said, I agree that there's a risk that KVM could clobber vcpu->run by
hitting an -EFAULT without the vCPU loaded, but that's a solvable problem, e.g.
the helper to fill KVM_EXIT_MEMORY_FAULT could be hardened to yell if called
without the target vCPU being loaded:

	int kvm_handle_efault(struct kvm_vcpu *vcpu, ...)
	{
		preempt_disable();
		if (WARN_ON_ONCE(vcpu != __this_cpu_read(kvm_running_vcpu)))
			goto out;

		vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
		...
	out:
		preempt_enable();
		return -EFAULT;
	}

FWIW, I completely agree that filling KVM_EXIT_MEMORY_FAULT without guaranteeing
that KVM "immediately" exits to userspace isn't ideal, but given the amount of
historical code that we need to deal with, it seems like the lesser of all evils.
Unless I'm misunderstanding the use cases, unnecessarily filling kvm_run is a far
better failure mode than KVM not filling kvm_run when it should, i.e. false
positives are ok, false negatives are fatal.

> That in turn looks problematic for the
> memory-fault-exit-on-fast-gup-failure part of this series, because
> there are at least a couple of cases for which kvm_mmu_do_page_fault
> will -EFAULT. One is the early-efault-on-fast-gup-failure case which
> was the original purpose of this series. Another is a -EFAULT from
> FNAME(fetch) (passed up through FNAME(page_fault)). There might be
> other cases as well. But unless userspace can/should resolve *all*
> such -EFAULTs in the same manner, a kvm_run.memory_fault populated in
> "kvm_mmu_page_fault" wouldn't be actionable.

Killing the VM, which is what all VMMs do today in response to -EFAULT, is an
action.  As I've pointed out elsewhere in this thread, userspace needs to be able
to identify "faults" that it (userspace) can resolve without a hint from KVM.

In other words, KVM is still returning -EFAULT (or a variant thereof), the _only_
difference, for all intents and purposes, is that userspace is given a bit more
information about the source of the -EFAULT.

> At least, not without a whole lot of plumbing code to make it so.

Plumbing where?
Anish Moorthy March 21, 2023, 6:01 p.m. UTC | #11
On Tue, Mar 21, 2023 at 8:21 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Mon, Mar 20, 2023, Anish Moorthy wrote:
> > On Mon, Mar 20, 2023 at 8:53 AM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Fri, Mar 17, 2023, Anish Moorthy wrote:
> > > > On Fri, Mar 17, 2023 at 2:50 PM Sean Christopherson <seanjc@google.com> wrote:
> > > > > I wonder if we can get away with returning -EFAULT, but still filling vcpu->run
> > > > > with KVM_EXIT_MEMORY_FAULT and all the other metadata.  That would likely simplify
> > > > > the implementation greatly, and would let KVM fill vcpu->run unconditionally.  KVM
> > > > > would still need a capability to advertise support to userspace, but userspace
> > > > > wouldn't need to opt in.  I think this may have been my very original thought, and
> > > > > I just never actually wrote it down...
> > > >
> > > > Oh, good to know that's actually an option. I thought of that too, but
> > > > assumed that returning a negative error code was a no-go for a proper
> > > > vCPU exit. But if that's not true then I think it's the obvious
> > > > solution because it precludes any uncaught behavior-change bugs.
> > > >
> > > > A couple of notes
> > > > 1. Since we'll likely miss some -EFAULT returns, we'll need to make
> > > > sure that the user can check for / doesn't see a stale
> > > > kvm_run::memory_fault field when a missed -EFAULT makes it to
> > > > userspace. It's a small and easy-to-fix detail, but I thought I'd
> > > > point it out.
> > >
> > > Ya, this is the main concern for me as well.  I'm not as confident that it's
> > > easy-to-fix/avoid though.
> > >
> > > > 2. I don't think this would simplify the series that much, since we
> > > > still need to find the call sites returning -EFAULT to userspace and
> > > > populate memory_fault only in those spots to avoid populating it for
> > > > -EFAULTs which don't make it to userspace.
> > >
> > > Filling kvm_run::memory_fault even if KVM never exits to userspace is perfectly
> > > ok.  It's not ideal, but it's ok.
> > >
> > > > We *could* relax that condition and just document that memory_fault should be
> > > > ignored when KVM_RUN does not return -EFAULT... but I don't think that's a
> > > > good solution from a coder/maintainer perspective.
> > >
> > > You've got things backward.  memory_fault _must_ be ignored if KVM doesn't return
> > > the associated "magic combo", where the magic value is either "0+KVM_EXIT_MEMORY_FAULT"
> > > or "-EFAULT+KVM_EXIT_MEMORY_FAULT".
> > >
> > > Filling kvm_run::memory_fault but not exiting to userspace is ok because userspace
> > > never sees the data, i.e. userspace is completely unaware.  This behavior is not
> > > ideal from a KVM perspective as allowing KVM to fill the kvm_run union without
> > > exiting to userspace can lead to other bugs, e.g. effective corruption of the
> > > kvm_run union, but at least from a uABI perspective, the behavior is acceptable.
> >
> > Actually, I don't think the idea of filling in kvm_run.memory_fault
> > for -EFAULTs which don't make it to userspace works at all. Consider
> > the direct_map function, which bubbles its -EFAULT to
> > kvm_mmu_do_page_fault. kvm_mmu_do_page_fault is called from both
> > kvm_arch_async_page_ready (which ignores the return value), and by
> > kvm_mmu_page_fault (where the return value does make it to userspace).
> > Populating kvm_run.memory_fault anywhere in or under
> > kvm_mmu_do_page_fault seems an immediate no-go, because a wayward
> > kvm_arch_async_page_ready could (presumably) overwrite/corrupt an
> > already-set kvm_run.memory_fault / other kvm_run field.
>
> This particular case is a non-issue.  kvm_check_async_pf_completion() is called
> only when the current task has control of the vCPU, i.e. is the current "running"
> vCPU.  That's not a coincidence either, invoking kvm_mmu_do_page_fault() without
> having control of the vCPU would be fraught with races, e.g. the entire KVM MMU
> context would be unstable.
>
> That will hold true for all cases.  Using a vCPU that is not loaded (not the
> current "running" vCPU in KVM's misleading terminology) to access guest memory is
> simply not safe, as the vCPU state is non-deterministic.  There are paths where
> KVM accesses, and even modifies, vCPU state asynchronously, e.g. for IRQ delivery
> and making requests, but those are very controlled flows with dedicated machinery
> to make them SMP safe.
>
> That said, I agree that there's a risk that KVM could clobber vcpu->run by
> hitting an -EFAULT without the vCPU loaded, but that's a solvable problem, e.g.
> the helper to fill KVM_EXIT_MEMORY_FAULT could be hardened to yell if called
> without the target vCPU being loaded:
>
>         int kvm_handle_efault(struct kvm_vcpu *vcpu, ...)
>         {
>                 preempt_disable();
>                 if (WARN_ON_ONCE(vcpu != __this_cpu_read(kvm_running_vcpu)))
>                         goto out;
>
>                 vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
>                 ...
>         out:
>                 preempt_enable();
>                 return -EFAULT;
>         }
>
> FWIW, I completely agree that filling KVM_EXIT_MEMORY_FAULT without guaranteeing
> that KVM "immediately" exits to userspace isn't ideal, but given the amount of
> historical code that we need to deal with, it seems like the lesser of all evils.
> Unless I'm misunderstanding the use cases, unnecessarily filling kvm_run is a far
> better failure mode than KVM not filling kvm_run when it should, i.e. false
> positives are ok, false negatives are fatal.

Don't you have this in reverse? False negatives will just result in
userspace not having useful extra information for the -EFAULT it
receives from KVM_RUN, in which case userspace can do what you
mentioned all VMMs do today and just terminate the VM. Whereas a false
positive might cause a double-write to the KVM_RUN struct, either
putting incorrect information in kvm_run.memory_fault or corrupting
another member of the union.

> > That in turn looks problematic for the
> > memory-fault-exit-on-fast-gup-failure part of this series, because
> > there are at least a couple of cases for which kvm_mmu_do_page_fault
> > will -EFAULT. One is the early-efault-on-fast-gup-failure case which
> > was the original purpose of this series. Another is a -EFAULT from
> > FNAME(fetch) (passed up through FNAME(page_fault)). There might be
> > other cases as well. But unless userspace can/should resolve *all*
> > such -EFAULTs in the same manner, a kvm_run.memory_fault populated in
> > "kvm_mmu_page_fault" wouldn't be actionable.
>
> Killing the VM, which is what all VMMs do today in response to -EFAULT, is an
> action.  As I've pointed out elsewhere in this thread, userspace needs to be able
> to identify "faults" that it (userspace) can resolve without a hint from KVM.
>
> In other words, KVM is still returning -EFAULT (or a variant thereof), the _only_
> difference, for all intents and purposes, is that userspace is given a bit more
> information about the source of the -EFAULT.
>
> > At least, not without a whole lot of plumbing code to make it so.
>
> Plumbing where?

In this example, I meant plumbing code to get a
kvm_run.memory_fault.flags which is more specific than (eg)
MEMFAULT_REASON_PAGE_FAULT_FAILURE from the -EFAULT paths under
kvm_mmu_page_fault. My idea for how userspace would distinguish
fast-gup failures was that kvm_faultin_pfn would set a special bit in
kvm_run.memory_fault.flags to indicate its failure. But (still
assuming that we shouldn't have false-positive kvm_run.memory_fault
fills) if the memory_fault can only be populated from
kvm_mmu_page_fault then either failures from FNAME(page_fault) and
kvm_faultin_pfn will be indistinguishable to userspace, or those
functions will need to plumb more specific exit reasons all the way up
to kvm_mmu_page_fault.

But, since you've made this point elsewhere, my guess is that your
answer is that it's actually userspace's job to detect the "specific"
reason for the fault and resolve it.
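
[Editorial sketch of the idea described above, expressed in terms of the
kvm_memfault_exit_or_efault() helper added by this patch. The flag name and
the call site are hypothetical and not part of the series as posted.]

	/* hypothetical new reason bit alongside KVM_MEMFAULT_REASON_UNKNOWN */
	#define KVM_MEMFAULT_REASON_FAST_GUP_FAILURE	(1 << 1)

	/* on the fast-gup failure path, instead of returning a bare -EFAULT */
	return kvm_memfault_exit_or_efault(vcpu, gpa, len,
					   KVM_MEMFAULT_REASON_FAST_GUP_FAILURE);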
Sean Christopherson March 21, 2023, 7:43 p.m. UTC | #12
On Tue, Mar 21, 2023, Anish Moorthy wrote:
> On Tue, Mar 21, 2023 at 8:21 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Mon, Mar 20, 2023, Anish Moorthy wrote:
> > > On Mon, Mar 20, 2023 at 8:53 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > Filling kvm_run::memory_fault but not exiting to userspace is ok because userspace
> > > > never sees the data, i.e. userspace is completely unaware.  This behavior is not
> > > > ideal from a KVM perspective as allowing KVM to fill the kvm_run union without
> > > > exiting to userspace can lead to other bugs, e.g. effective corruption of the
> > > > kvm_run union, but at least from a uABI perspective, the behavior is acceptable.
> > >
> > > Actually, I don't think the idea of filling in kvm_run.memory_fault
> > > for -EFAULTs which don't make it to userspace works at all. Consider
> > > the direct_map function, which bubbles its -EFAULT to
> > > kvm_mmu_do_page_fault. kvm_mmu_do_page_fault is called from both
> > > kvm_arch_async_page_ready (which ignores the return value), and by
> > > kvm_mmu_page_fault (where the return value does make it to userspace).
> > > Populating kvm_run.memory_fault anywhere in or under
> > > kvm_mmu_do_page_fault seems an immediate no-go, because a wayward
> > > kvm_arch_async_page_ready could (presumably) overwrite/corrupt an
> > > already-set kvm_run.memory_fault / other kvm_run field.
> >
> > This particular case is a non-issue.  kvm_check_async_pf_completion() is called
> > only when the current task has control of the vCPU, i.e. is the current "running"
> > vCPU.  That's not a coincidence either, invoking kvm_mmu_do_page_fault() without
> > having control of the vCPU would be fraught with races, e.g. the entire KVM MMU
> > context would be unstable.
> >
> > That will hold true for all cases.  Using a vCPU that is not loaded (not the
> > current "running" vCPU in KVM's misleading terminology) to access guest memory is
> > simply not safe, as the vCPU state is non-deterministic.  There are paths where
> > KVM accesses, and even modifies, vCPU state asynchronously, e.g. for IRQ delivery
> > and making requests, but those are very controlled flows with dedicated machinery
> > to make them SMP safe.
> >
> > That said, I agree that there's a risk that KVM could clobber vcpu->run by
> > hitting an -EFAULT without the vCPU loaded, but that's a solvable problem, e.g.
> > the helper to fill KVM_EXIT_MEMORY_FAULT could be hardened to yell if called
> > without the target vCPU being loaded:
> >
> >         int kvm_handle_efault(struct kvm_vcpu *vcpu, ...)
> >         {
> >                 preempt_disable();
> >                 if (WARN_ON_ONCE(vcpu != __this_cpu_read(kvm_running_vcpu)))
> >                         goto out;
> >
> >                 vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> >                 ...
> >         out:
> >                 preempt_enable();
> >                 return -EFAULT;
> >         }
> >
> > FWIW, I completely agree that filling KVM_EXIT_MEMORY_FAULT without guaranteeing
> > that KVM "immediately" exits to userspace isn't ideal, but given the amount of
> > historical code that we need to deal with, it seems like the lesser of all evils.
> > Unless I'm misunderstanding the use cases, unnecessarily filling kvm_run is a far
> > better failure mode than KVM not filling kvm_run when it should, i.e. false
> > positives are ok, false negatives are fatal.
> 
> Don't you have this in reverse?

No, I don't think so.

> False negatives will just result in userspace not having useful extra
> information for the -EFAULT it receives from KVM_RUN, in which case userspace
> can do what you mentioned all VMMs do today and just terminate the VM.

And that is _really_ bad behavior if we have any hope of userspace actually being
able to rely on this functionality.  E.g. any false negative when userspace is
trying to do postcopy demand paging will be fatal to the VM.

> Whereas a false positive might cause a double-write to the KVM_RUN struct,
> either putting incorrect information in kvm_run.memory_fault or

Recording unused information on -EFAULT in kvm_run doesn't make the information
incorrect.

> corrupting another member of the union.

Only if KVM accesses guest memory after initiating an exit to userspace, which
would be a KVM bug irrespective of kvm_run.memory_fault.  We actually have exactly
this type of bug today in the trainwreck that is KVM's MMIO emulation[1], but
KVM gets away with the shoddy behavior by virtue of the scenario simply not
being triggered by any real-world code.

And if we're really concerned about clobbering state, we could add hardening/auditing
code to ensure that KVM actually exits when kvm_run.exit_reason is set (though there
are a non-zero number of exceptions, e.g. the aforementioned MMIO mess, nested SVM/VMX
pages, and probably a few others).

Prior to cleanups a few years back[2], emulation failures had issues similar to
what we are discussing, where KVM would fail to exit to userspace, not fill kvm_run,
etc.  Those are the types of bugs I want to avoid here.

[1] https://lkml.kernel.org/r/ZBNrWZQhMX8AHzWM%40google.com
[2] https://lore.kernel.org/kvm/20190823010709.24879-1-sean.j.christopherson@intel.com

> > > That in turn looks problematic for the
> > > memory-fault-exit-on-fast-gup-failure part of this series, because
> > > there are at least a couple of cases for which kvm_mmu_do_page_fault
> > > will -EFAULT. One is the early-efault-on-fast-gup-failure case which
> > > was the original purpose of this series. Another is a -EFAULT from
> > > FNAME(fetch) (passed up through FNAME(page_fault)). There might be
> > > other cases as well. But unless userspace can/should resolve *all*
> > > such -EFAULTs in the same manner, a kvm_run.memory_fault populated in
> > > "kvm_mmu_page_fault" wouldn't be actionable.
> >
> > Killing the VM, which is what all VMMs do today in response to -EFAULT, is an
> > action.  As I've pointed out elsewhere in this thread, userspace needs to be able
> > to identify "faults" that it (userspace) can resolve without a hint from KVM.
> >
> > In other words, KVM is still returning -EFAULT (or a variant thereof), the _only_
> > difference, for all intents and purposes, is that userspace is given a bit more
> > information about the source of the -EFAULT.
> >
> > > At least, not without a whole lot of plumbing code to make it so.
> >
> > Plumbing where?
> 
> In this example, I meant plumbing code to get a kvm_run.memory_fault.flags
> which is more specific than (eg) MEMFAULT_REASON_PAGE_FAULT_FAILURE from the
> -EFAULT paths under kvm_mmu_page_fault. My idea for how userspace would
> distinguish fast-gup failures was that kvm_faultin_pfn would set a special
> bit in kvm_run.memory_fault.flags to indicate its failure. But (still
> assuming that we shouldn't have false-positive kvm_run.memory_fault fills) if
> the memory_fault can only be populated from kvm_mmu_page_fault then either
> failures from FNAME(page_fault) and kvm_faultin_pfn will be indistinguishable
> to userspace, or those functions will need to plumb more specific exit
> reasons all the way up to kvm_mmu_page_fault.

Setting a flag that essentially says "failure when handling a guest page fault"
is problematic on multiple fronts.  Tying the ABI to KVM's internal implementation
is not an option, i.e. the ABI would need to be defined as "on page faults from
the guest".  And then the resulting behavior would be non-deterministic, e.g.
userspace would see different behavior if KVM accessed a "bad" gfn via emulation
instead of in response to a guest page fault.  And because of hardware TLBs, it
would even be possible for the behavior to be non-deterministic on the same
platform running the same guest code (though this would be extremely unlikely
in practice).

And even if userspace is ok with only handling guest page faults _today_, I highly
doubt that will hold forever.  I.e. at some point there will be a use case that
wants to react to uaccess failures on fast-only memslots.

Ignoring all of those issues, simply flagging "this -EFAULT occurred when
handling a guest page fault" isn't precise enough for userspace to blindly resolve
the failure.  Even if KVM went through the trouble of setting information if and
only if get_user_page_fast_only() failed while handling a guest page fault,
userspace would still need/want a way to verify that the failure was expected and
can be resolved, e.g. to guard against userspace bugs due to wrongly unmapping
or mprotecting a page.

> But, since you've made this point elsewhere, my guess is that your answer is
> that it's actually userspace's job to detect the "specific" reason for the
> fault and resolve it.

Yes, it's userspace's responsibity.  I simply don't see how KVM can provide
information that userspace doesn't already have without creating an unmaintainable
uABI, at least not without some deep, deep plumbing into gup().  I.e. unless gup()
were changed to explicitly communicate that it failed because of a uffd equivalent,
at best a flag in kvm_run would be a hint that userspace _might_ be able to resolve
the fault.  And even if we modified gup(), we'd still have all the open questions
about what to do when KVM encounters a fault on a uaccess.
Anish Moorthy March 22, 2023, 9:06 p.m. UTC | #13
On Tue, Mar 21, 2023 at 12:43 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Mar 21, 2023, Anish Moorthy wrote:
> > On Tue, Mar 21, 2023 at 8:21 AM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Mon, Mar 20, 2023, Anish Moorthy wrote:
> > > > On Mon, Mar 20, 2023 at 8:53 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > > Filling kvm_run::memory_fault but not exiting to userspace is ok because userspace
> > > > > never sees the data, i.e. userspace is completely unaware.  This behavior is not
> > > > > ideal from a KVM perspective as allowing KVM to fill the kvm_run union without
> > > > > exiting to userspace can lead to other bugs, e.g. effective corruption of the
> > > > > kvm_run union, but at least from a uABI perspective, the behavior is acceptable.
> > > >
> > > > Actually, I don't think the idea of filling in kvm_run.memory_fault
> > > > for -EFAULTs which don't make it to userspace works at all. Consider
> > > > the direct_map function, which bubbles its -EFAULT to
> > > > kvm_mmu_do_page_fault. kvm_mmu_do_page_fault is called from both
> > > > kvm_arch_async_page_ready (which ignores the return value), and by
> > > > kvm_mmu_page_fault (where the return value does make it to userspace).
> > > > Populating kvm_run.memory_fault anywhere in or under
> > > > kvm_mmu_do_page_fault seems an immediate no-go, because a wayward
> > > > kvm_arch_async_page_ready could (presumably) overwrite/corrupt an
> > > > already-set kvm_run.memory_fault / other kvm_run field.
> > >
> > > This particular case is a non-issue.  kvm_check_async_pf_completion() is called
> > > only when the current task has control of the vCPU, i.e. is the current "running"
> > > vCPU.  That's not a coincidence either, invoking kvm_mmu_do_page_fault() without
> > > having control of the vCPU would be fraught with races, e.g. the entire KVM MMU
> > > context would be unstable.
> > >
> > > That will hold true for all cases.  Using a vCPU that is not loaded (not the
> > > current "running" vCPU in KVM's misleading terminology) to access guest memory is
> > > simply not safe, as the vCPU state is non-deterministic.  There are paths where
> > > KVM accesses, and even modifies, vCPU state asynchronously, e.g. for IRQ delivery
> > > and making requests, but those are very controlled flows with dedicated machinery
> > > to make them SMP safe.
> > >
> > > That said, I agree that there's a risk that KVM could clobber vcpu->run by
> > > hitting an -EFAULT without the vCPU loaded, but that's a solvable problem, e.g.
> > > the helper to fill KVM_EXIT_MEMORY_FAULT could be hardened to yell if called
> > > without the target vCPU being loaded:
> > >
> > >         int kvm_handle_efault(struct kvm_vcpu *vcpu, ...)
> > >         {
> > >                 preempt_disable();
> > >                 if (WARN_ON_ONCE(vcpu != __this_cpu_read(kvm_running_vcpu)))
> > >                         goto out;
> > >
> > >                 vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> > >                 ...
> > >         out:
> > >                 preempt_enable();
> > >                 return -EFAULT;
> > >         }
> > >
> > > FWIW, I completely agree that filling KVM_EXIT_MEMORY_FAULT without guaranteeing
> > > that KVM "immediately" exits to userspace isn't ideal, but given the amount of
> > > historical code that we need to deal with, it seems like the lesser of all evils.
> > > Unless I'm misunderstanding the use cases, unnecessarily filling kvm_run is a far
> > > better failure mode than KVM not filling kvm_run when it should, i.e. false
> > > positives are ok, false negatives are fatal.
> >
> > Don't you have this in reverse?
>
> No, I don't think so.
>
> > False negatives will just result in userspace not having useful extra
> > information for the -EFAULT it receives from KVM_RUN, in which case userspace
> > can do what you mentioned all VMMs do today and just terminate the VM.
>
> And that is _really_ bad behavior if we have any hope of userspace actually being
> able to rely on this functionality.  E.g. any false negative when userspace is
> trying to do postcopy demand paging will be fatal to the VM.

But since -EFAULTs from KVM_RUN today are already fatal, there's no
new failure introduced by an -EFAULT w/o a populated memory_fault
field, right? Obviously that's of no real use to userspace, but that
seems like part of the point of starting with a partial conversion: to
allow for filling holes in the implementation in the future.

It seems like what you're really concerned about here is the
interaction with the memslot fast-gup-only flag. Obviously, failing to
populate kvm_run.memory_fault for new userspace-visible -EFAULTs
caused by that flag would cause new fatal failures for the guest,
which would make the feature actually harmful. But as far as I know
(and please lmk if I'm wrong), the memslot flag only needs to be used
by the kvm_handle_error_pfn (x86) and user_mem_abort (arm64)
functions, meaning that those are the only places where we need to
check/populate kvm_run.memory_fault for new userspace-visible
-EFAULTs.
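
(To be concrete about what I have in mind -- this is just a sketch, not code
from this series, and memslot_is_fast_gup_only() is a made-up placeholder for
however the new memslot flag ends up being queried:)

	/*
	 * Hypothetical: at the spot where kvm_handle_error_pfn() would
	 * return -EFAULT for a fast-gup-only slot, record the fault so
	 * userspace can resolve it instead of killing the VM.
	 */
	if (memslot_is_fast_gup_only(fault->slot))
		return kvm_memfault_exit_or_efault(vcpu,
						   fault->gfn << PAGE_SHIFT,
						   PAGE_SIZE,
						   KVM_MEMFAULT_REASON_UNKNOWN);
	return -EFAULT;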

> > Whereas a false positive might cause a double-write to the KVM_RUN struct,
> > either putting incorrect information in kvm_run.memory_fault or
>
> Recording unused information on -EFAULT in kvm_run doesn't make the information
> incorrect.
>
> > corrupting another member of the union.
>
> Only if KVM accesses guest memory after initiating an exit to userspace, which
> would be a KVM bug irrespective of kvm_run.memory_fault.

Ah good: I was concerned that this was a valid set of code paths in
KVM. Although I'm assuming that "initiating an exit to userspace"
includes the "returning -EFAULT from KVM_RUN" cases, because we
wouldn't want EFAULTs to stomp on each other as well (the
kvm_mmu_do_page_fault usages were supposed to be one such example,
though I'm glad to know that they're not a problem).

> And if we're really concerned about clobbering state, we could add hardening/auditing
> code to ensure that KVM actually exits when kvm_run.exit_reason is set (though there
> are a non-zero number of exceptions, e.g. the aforementioned MMIO mess, nested SVM/VMX
> pages, and probably a few others).
>
> Prior to cleanups a few years back[2], emulation failures had issues similar to
> what we are discussing, where KVM would fail to exit to userspace, not fill kvm_run,
> etc.  Those are the types of bugs I want to avoid here.
>
> [1] https://lkml.kernel.org/r/ZBNrWZQhMX8AHzWM%40google.com
> [2] https://lore.kernel.org/kvm/20190823010709.24879-1-sean.j.christopherson@intel.com
>
> > > > That in turn looks problematic for the
> > > > memory-fault-exit-on-fast-gup-failure part of this series, because
> > > > there are at least a couple of cases for which kvm_mmu_do_page_fault
> > > > will -EFAULT. One is the early-efault-on-fast-gup-failure case which
> > > > was the original purpose of this series. Another is a -EFAULT from
> > > > FNAME(fetch) (passed up through FNAME(page_fault)). There might be
> > > > other cases as well. But unless userspace can/should resolve *all*
> > > > such -EFAULTs in the same manner, a kvm_run.memory_fault populated in
> > > > "kvm_mmu_page_fault" wouldn't be actionable.
> > >
> > > Killing the VM, which is what all VMMs do today in response to -EFAULT, is an
> > > action.  As I've pointed out elsewhere in this thread, userspace needs to be able
> > > to identify "faults" that it (userspace) can resolve without a hint from KVM.
> > >
> > > In other words, KVM is still returning -EFAULT (or a variant thereof), the _only_
> > > difference, for all intents and purposes, is that userspace is given a bit more
> > > information about the source of the -EFAULT.
> > >
> > > > At least, not without a whole lot of plumbing code to make it so.
> > >
> > > Plumbing where?
> >
> > In this example, I meant plumbing code to get a kvm_run.memory_fault.flags
> > which is more specific than (eg) MEMFAULT_REASON_PAGE_FAULT_FAILURE from the
> > -EFAULT paths under kvm_mmu_page_fault. My idea for how userspace would
> > distinguish fast-gup failures was that kvm_faultin_pfn would set a special
> > bit in kvm_run.memory_fault.flags to indicate its failure. But (still
> > assuming that we shouldn't have false-positive kvm_run.memory_fault fills) if
> > the memory_fault can only be populated from kvm_mmu_page_fault then either
> > failures from FNAME(page_fault) and kvm_faultin_pfn will be indistinguishable
> > to userspace, or those functions will need to plumb more specific exit
> > reasons all the way up to kvm_mmu_page_fault.
>
> Setting a flag that essentially says "failure when handling a guest page fault"
> is problematic on multiple fronts.  Tying the ABI to KVM's internal implementation
> is not an option, i.e. the ABI would need to be defined as "on page faults from
> the guest".  And then the resulting behavior would be non-deterministic, e.g.
> userspace would see different behavior if KVM accessed a "bad" gfn via emulation
> instead of in response to a guest page fault.  And because of hardware TLBs, it
> would even be possible for the behavior to be non-deterministic on the same
> platform running the same guest code (though this would be extremely unlikely
> in practice).
>
> And even if userspace is ok with only handling guest page faults _today_, I highly
> doubt that will hold forever.  I.e. at some point there will be a use case that
> wants to react to uaccess failures on fast-only memslots.
>
> Ignoring all of those issues, simply flagging "this -EFAULT occurred when
> handling a guest page fault" isn't precise enough for userspace to blindly resolve
> the failure.  Even if KVM went through the trouble of setting information if and
> only if get_user_page_fast_only() failed while handling a guest page fault,
> userspace would still need/want a way to verify that the failure was expected and
> can be resolved, e.g. to guard against userspace bugs due to wrongly unmapping
> or mprotecting a page.
>
> > But, since you've made this point elsewhere, my guess is that your answer is
> > that it's actually userspace's job to detect the "specific" reason for the
> > fault and resolve it.
>
> Yes, it's userspace's responsibility.  I simply don't see how KVM can provide
> information that userspace doesn't already have without creating an unmaintainable
> uABI, at least not without some deep, deep plumbing into gup().  I.e. unless gup()
> were changed to explicitly communicate that it failed because of a uffd equivalent,
> at best a flag in kvm_run would be a hint that userspace _might_ be able to resolve
> the fault.  And even if we modified gup(), we'd still have all the open questions
> about what to do when KVM encounters a fault on a uaccess.
Sean Christopherson March 22, 2023, 11:17 p.m. UTC | #14
On Wed, Mar 22, 2023, Anish Moorthy wrote:
> On Tue, Mar 21, 2023 at 12:43 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Tue, Mar 21, 2023, Anish Moorthy wrote:
> > > > FWIW, I completely agree that filling KVM_EXIT_MEMORY_FAULT without guaranteeing
> > > > that KVM "immediately" exits to userspace isn't ideal, but given the amount of
> > > > historical code that we need to deal with, it seems like the lesser of all evils.
> > > > Unless I'm misunderstanding the use cases, unnecessarily filling kvm_run is a far
> > > > better failure mode than KVM not filling kvm_run when it should, i.e. false
> > > > positives are ok, false negatives are fatal.
> > >
> > > Don't you have this in reverse?
> >
> > No, I don't think so.
> >
> > > False negatives will just result in userspace not having useful extra
> > > information for the -EFAULT it receives from KVM_RUN, in which case userspace
> > > can do what you mentioned all VMMs do today and just terminate the VM.
> >
> > And that is _really_ bad behavior if we have any hope of userspace actually being
> > able to rely on this functionality.  E.g. any false negative when userspace is
> > trying to do postcopy demand paging will be fatal to the VM.
> 
> But since -EFAULTs from KVM_RUN today are already fatal, there's no
> new failure introduced by an -EFAULT w/o a populated memory_fault
> field, right?

Yes, but it's a bit of a moot point since the goal of the feature is to avoid
killing the VM.

> Obviously that's of no real use to userspace, but that seems like part of the
> point of starting with a partial conversion: to allow for filling holes in
> the implementation in the future.

Yes, but I want a forcing function to reveal any holes we missed sooner than
later, otherwise the feature will languish since it won't be useful beyond the
fast-gup-only use case.

> It seems like what you're really concerned about here is the interaction with
> the memslot fast-gup-only flag. Obviously, failing to populate
> kvm_run.memory_fault for new userspace-visible -EFAULTs caused by that flag
> would cause new fatal failures for the guest, which would make the feature
> actually harmful. But as far as I know (and please lmk if I'm wrong), the
> memslot flag only needs to be used by the kvm_handle_error_pfn (x86) and
> user_mem_abort (arm64) functions, meaning that those are the only places
> where we need to check/populate kvm_run.memory_fault for new
> userspace-visible -EFAULTs.

No.  As you point out, the fast-gup-only case should be pretty easy to get correct,
i.e. this should all work just fine for _GCE's current_ use case.  I'm more concerned
with setting KVM up for success when future use cases come along that might not be ok
with unhandled faults in random guest accesses killing the VM.

To be clear, I do not expect us to get this 100% correct on the first attempt,
but I do want to have mechanisms in place that will detect any bugs/misses so
that we can fix the issues _before_ a use case comes along that needs 100%
accuracy.

> > > Whereas a false positive might cause a double-write to the KVM_RUN struct,
> > > either putting incorrect information in kvm_run.memory_fault or
> >
> > Recording unused information on -EFAULT in kvm_run doesn't make the information
> > incorrect.
> >
> > > corrupting another member of the union.
> >
> > Only if KVM accesses guest memory after initiating an exit to userspace, which
> > would be a KVM bug irrespective of kvm_run.memory_fault.
> 
> Ah good: I was concerned that this was a valid set of code paths in
> KVM. Although I'm assuming that "initiating an exit to userspace"
> includes the "returning -EFAULT from KVM_RUN" cases, because we
> wouldn't want EFAULTs to stomp on each other as well (the
> kvm_mmu_do_page_fault usages were supposed to be one such example,
> though I'm glad to know that they're not a problem).

This one gets into a bit of a grey area.  The "rule" is really about the intent,
i.e. once KVM intends to exit to userspace, it's a bug if KVM encounters something
else and runs into the weeds.

In no small part because of the myriad paths where KVM ignores what would be fatal
errors in most flows, e.g. record_steal_time(), simply returning -EFAULT from some
low-level helper doesn't necessarily signal an intent to exit all the way to userspace.

To be honest, I don't have a clear idea of how difficult it will be to detect bugs.
In most cases, failure to exit to userspace leads to a fatal error fairly quickly.
With userspace faults, it's entirely possible that an exit could be missed and
nothing bad would happen.

Hmm, one idea would be to have the initial -EFAULT detection fill kvm_run.memory_fault,
but set kvm_run.exit_reason to some magic number, e.g. zero it out.  Then KVM could
WARN if something tries to overwrite kvm_run.exit_reason.  The WARN would need to
be buried by a Kconfig or something since kvm_run can be modified by userspace,
but other than that I think it would work.
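
Very roughly, something like this (names and the exact poison/arming scheme are
made up on the spot; the sentinel would also need to be distinguishable from a
freshly-created vCPU's zeroed kvm_run):

	/* At -EFAULT detection: record the fault and "poison" exit_reason. */
	static void kvm_memfault_record(struct kvm_vcpu *vcpu, u64 gpa,
					u64 len, u64 flags)
	{
		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;	/* 0 == magic */
		vcpu->run->memory_fault.flags = flags;
		vcpu->run->memory_fault.gpa = gpa;
		vcpu->run->memory_fault.len = len;
	}

	/* At exit-to-userspace sites: yell if a recorded fault gets clobbered. */
	static void kvm_set_exit_reason(struct kvm_vcpu *vcpu, u32 reason)
	{
		WARN_ON_ONCE(vcpu->run->exit_reason == KVM_EXIT_UNKNOWN &&
			     reason != KVM_EXIT_MEMORY_FAULT);
		vcpu->run->exit_reason = reason;
	}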
Anish Moorthy March 28, 2023, 10:19 p.m. UTC | #15
On Tue, Mar 21, 2023 at 12:43 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Mar 21, 2023, Anish Moorthy wrote:
> > On Tue, Mar 21, 2023 at 8:21 AM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Mon, Mar 20, 2023, Anish Moorthy wrote:
> > > > On Mon, Mar 20, 2023 at 8:53 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > > Filling kvm_run::memory_fault but not exiting to userspace is ok because userspace
> > > > > never sees the data, i.e. userspace is completely unaware.  This behavior is not
> > > > > ideal from a KVM perspective as allowing KVM to fill the kvm_run union without
> > > > > exiting to userspace can lead to other bugs, e.g. effective corruption of the
> > > > > kvm_run union, but at least from a uABI perspective, the behavior is acceptable.
> > > >
> > > > Actually, I don't think the idea of filling in kvm_run.memory_fault
> > > > for -EFAULTs which don't make it to userspace works at all. Consider
> > > > the direct_map function, which bubbles its -EFAULT to
> > > > kvm_mmu_do_page_fault. kvm_mmu_do_page_fault is called from both
> > > > kvm_arch_async_page_ready (which ignores the return value), and by
> > > > kvm_mmu_page_fault (where the return value does make it to userspace).
> > > > Populating kvm_run.memory_fault anywhere in or under
> > > > kvm_mmu_do_page_fault seems an immediate no-go, because a wayward
> > > > kvm_arch_async_page_ready could (presumably) overwrite/corrupt an
> > > > already-set kvm_run.memory_fault / other kvm_run field.
> > >
> > > This particular case is a non-issue.  kvm_check_async_pf_completion() is called
> > > only when the current task has control of the vCPU, i.e. is the current "running"
> > > vCPU.  That's not a coincidence either, invoking kvm_mmu_do_page_fault() without
> > > having control of the vCPU would be fraught with races, e.g. the entire KVM MMU
> > > context would be unstable.
> > >
> > > That will hold true for all cases.  Using a vCPU that is not loaded (not the
> > > current "running" vCPU in KVM's misleading terminology) to access guest memory is
> > > simply not safe, as the vCPU state is non-deterministic.  There are paths where
> > > KVM accesses, and even modifies, vCPU state asynchronously, e.g. for IRQ delivery
> > > and making requests, but those are very controlled flows with dedicated machinery
> > > to make them SMP safe.
> > >
> > > That said, I agree that there's a risk that KVM could clobber vcpu->run by
> > > hitting an -EFAULT without the vCPU loaded, but that's a solvable problem, e.g.
> > > the helper to fill KVM_EXIT_MEMORY_FAULT could be hardened to yell if called
> > > without the target vCPU being loaded:
> > >
> > >         int kvm_handle_efault(struct kvm_vcpu *vcpu, ...)
> > >         {
> > >                 preempt_disable();
> > >                 if (WARN_ON_ONCE(vcpu != __this_cpu_read(kvm_running_vcpu)))
> > >                         goto out;
> > >
> > >                 vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> > >                 ...
> > >         out:
> > >                 preempt_enable();
> > >                 return -EFAULT;
> > >         }
> > >
> > > FWIW, I completely agree that filling KVM_EXIT_MEMORY_FAULT without guaranteeing
> > > that KVM "immediately" exits to userspace isn't ideal, but given the amount of
> > > historical code that we need to deal with, it seems like the lesser of all evils.
> > > Unless I'm misunderstanding the use cases, unnecessarily filling kvm_run is a far
> > > better failure mode than KVM not filling kvm_run when it should, i.e. false
> > > positives are ok, false negatives are fatal.
> >
> > Don't you have this in reverse?
>
> No, I don't think so.
>
> > False negatives will just result in userspace not having useful extra
> > information for the -EFAULT it receives from KVM_RUN, in which case userspace
> > can do what you mentioned all VMMs do today and just terminate the VM.
>
> And that is _really_ bad behavior if we have any hope of userspace actually being
> able to rely on this functionality.  E.g. any false negative when userspace is
> trying to do postcopy demand paging will be fatal to the VM.
>
> > Whereas a false positive might cause a double-write to the KVM_RUN struct,
> > either putting incorrect information in kvm_run.memory_fault or
>
> Recording unused information on -EFAULT in kvm_run doesn't make the information
> incorrect.

Let's say that some function (converted to annotate its EFAULTs) fills
in kvm_run.memory_fault, but the EFAULT is suppressed from being
returned from kvm_run. What if, later within the same kvm_run call,
some other function (which we've completely overlooked) EFAULTs and
that return value actually does make it out to kvm_run? Userspace
would get stale information, which could be catastrophic.

Actually even performing the annotations only in functions that
currently always bubble EFAULTs to userspace still seems brittle: if
new callers are ever added which don't bubble the EFAULTs, then we end
up in the same situation.
Sean Christopherson April 4, 2023, 7:34 p.m. UTC | #16
On Tue, Mar 28, 2023, Anish Moorthy wrote:
> On Tue, Mar 21, 2023 at 12:43 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Tue, Mar 21, 2023, Anish Moorthy wrote:
> > > On Tue, Mar 21, 2023 at 8:21 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > FWIW, I completely agree that filling KVM_EXIT_MEMORY_FAULT without guaranteeing
> > > > that KVM "immediately" exits to userspace isn't ideal, but given the amount of
> > > > historical code that we need to deal with, it seems like the lesser of all evils.
> > > > Unless I'm misunderstanding the use cases, unnecessarily filling kvm_run is a far
> > > > better failure mode than KVM not filling kvm_run when it should, i.e. false
> > > > positives are ok, false negatives are fatal.
> > >
> > > Don't you have this in reverse?
> >
> > No, I don't think so.
> >
> > > False negatives will just result in userspace not having useful extra
> > > information for the -EFAULT it receives from KVM_RUN, in which case userspace
> > > can do what you mentioned all VMMs do today and just terminate the VM.
> >
> > And that is _really_ bad behavior if we have any hope of userspace actually being
> > able to rely on this functionality.  E.g. any false negative when userspace is
> > trying to do postcopy demand paging will be fatal to the VM.
> >
> > > Whereas a false positive might cause a double-write to the KVM_RUN struct,
> > > either putting incorrect information in kvm_run.memory_fault or
> >
> > Recording unused information on -EFAULT in kvm_run doesn't make the information
> > incorrect.
> 
> Let's say that some function (converted to annotate its EFAULTs) fills
> in kvm_run.memory_fault, but the EFAULT is suppressed from being
> returned from kvm_run. What if, later within the same kvm_run call,
> some other function (which we've completely overlooked) EFAULTs and
> that return value actually does make it out to kvm_run? Userspace
> would get stale information, which could be catastrophic.

"catastrophic" is a bit hyperbolic.  Yes, it would be bad, but at _worst_ userspace
will kill the VM, which is the status quo today.

> Actually even performing the annotations only in functions that
> currently always bubble EFAULTs to userspace still seems brittle: if
> new callers are ever added which don't bubble the EFAULTs, then we end
> up in the same situation.

Because of KVM's semi-magical '1 == resume, -errno/0 == exit' "design", that's
true for literally every exit to userspace in KVM and every VM-Exit handler.
E.g. see commit 2368048bf5c2 ("KVM: x86: Signal #GP, not -EPERM, on bad
WRMSR(MCi_CTL/STATUS)"), where KVM returned '-1' instead of '1' when rejecting
MSR accesses and inadvertently killed the VM.  A similar bug would be if KVM
returned EFAULT instead of -EFAULT, in which case vcpu_run() would resume the
guest instead of exiting to userspace and likely put the vCPU into an infinite
loop.
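
(For anyone unfamiliar with the convention, the dispatch loop in x86's vcpu_run()
boils down to roughly the below: a positive return re-enters the guest, zero or a
negative errno breaks out to userspace.)

	for (;;) {
		...
		r = vcpu_enter_guest(vcpu);
		...
		if (r <= 0)
			break;
		...
	}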

Do I want to harden KVM to make things like this less brittle?  Absolutely.  Do I
think we should hold up this functionality just because it doesn't solve all of the
pre-existing flaws in the related KVM code?  No.
Anish Moorthy April 4, 2023, 8:40 p.m. UTC | #17
On Tue, Apr 4, 2023 at 12:35 PM Sean Christopherson <seanjc@google.com> wrote:
> > Let's say that some function (converted to annotate its EFAULTs) fills
> > in kvm_run.memory_fault, but the EFAULT is suppressed from being
> > returned from kvm_run. What if, later within the same kvm_run call,
> > some other function (which we've completely overlooked) EFAULTs and
> > that return value actually does make it out to kvm_run? Userspace
> > would get stale information, which could be catastrophic.
>
> "catastrophic" is a bit hyperbolic.  Yes, it would be bad, but at _worst_ userspace
> will kill the VM, which is the status quo today.

Well what I'm saying is that in these cases userspace *wouldn't know*
that kvm_run.memory_fault contains incorrect information for the
-EFAULT it actually got (do you disagree?), which could presumably
cause it to do bad things like "resolve" faults on incorrect pages
and/or infinite-loop on KVM_RUN, etc.

Annotating the efault information as valid only from the call sites
which return directly to userspace prevents this class of problem, at
the cost of allowing un-annotated EFAULTs to make it to userspace. But
to me, paying that cost to make sure the EFAULT information is always
correct seems by far preferable to not paying it and allowing
userspace to get silently incorrect information.

> > Actually even performing the annotations only in functions that
> > currently always bubble EFAULTs to userspace still seems brittle: if
> > new callers are ever added which don't bubble the EFAULTs, then we end
> > up in the same situation.
>
> Because of KVM's semi-magical '1 == resume, -errno/0 == exit' "design", that's
> true for literally every exit to userspace in KVM and every VM-Exit handler.
> E.g. see commit 2368048bf5c2 ("KVM: x86: Signal #GP, not -EPERM, on bad
> WRMSR(MCi_CTL/STATUS)"), where KVM returned '-1' instead of '1' when rejecting
> MSR accesses and inadvertently killed the VM.  A similar bug would be if KVM
> returned EFAULT instead of -EFAULT, in which case vcpu_run() would resume the
> guest instead of exiting to userspace and likely put the vCPU into an infinite
> loop.

Right, good point.
Sean Christopherson April 4, 2023, 10:07 p.m. UTC | #18
On Tue, Apr 04, 2023, Anish Moorthy wrote:
> On Tue, Apr 4, 2023 at 12:35 PM Sean Christopherson <seanjc@google.com> wrote:
> > > Let's say that some function (converted to annotate its EFAULTs) fills
> > > in kvm_run.memory_fault, but the EFAULT is suppressed from being
> > > returned from kvm_run. What if, later within the same kvm_run call,
> > > some other function (which we've completely overlooked) EFAULTs and
> > > that return value actually does make it out to kvm_run? Userspace
> > > would get stale information, which could be catastrophic.
> >
> > "catastrophic" is a bit hyperbolic.  Yes, it would be bad, but at _worst_ userspace
> > will kill the VM, which is the status quo today.
> 
> Well what I'm saying is that in these cases userspace *wouldn't know*
> that kvm_run.memory_fault contains incorrect information for the
> -EFAULT it actually got (do you disagree?),

I disagree in the sense that if the stale information causes a problem, then by
definition userspace has to know.  It's the whole "if a tree falls in a forest"
thing.  If KVM reports stale information and literally nothing bad happens, ever,
then is the superfluous exit really a problem?  Not saying it wouldn't be treated
as a bug, just that it might not even warrant a stable backport if the worst case
scenario is a spurious exit to userspace (for example).

> which could presumably cause it to do bad things like "resolve" faults on
> incorrect pages and/or infinite-loop on KVM_RUN, etc.

Putting the vCPU into an infinite loop is _very_ visible, e.g. see the entire
mess surrounding commit 31c25585695a ("Revert "KVM: SVM: avoid infinite loop on
NPF from bad address"").

As above, fixing pages that don't need to be fixed isn't itself a major problem.
If the extra exits lead to a performance issue, then _that_ is a problem, but
again _something_ has to detect the problem and thus it becomes a known thing.

> Annotating the efault information as valid only from the call sites
> which return directly to userspace prevents this class of problem, at
> the cost of allowing un-annotated EFAULTs to make it to userspace. But
> to me, paying that cost to make sure the EFAULT information is always
> correct seems by far preferable to not paying it and allowing
> userspace to get silently incorrect information.

I don't think that's a maintainable approach.  Filling kvm_run if and only if the
-EFAULT has a direct path to userspace is (a) going to require a significant amount
of code churn and (b) falls apart the instant code further up the stack changes.
E.g. the relatively straightforward page fault case requires bouncing through 7+
functions to get from kvm_handle_error_pfn() to kvm_arch_vcpu_ioctl_run(), and not
all of those are obviously "direct":

	if (IS_ENABLED(CONFIG_RETPOLINE) && fault.is_tdp)
		r = kvm_tdp_page_fault(vcpu, &fault);
	else
		r = vcpu->arch.mmu->page_fault(vcpu, &fault);

	if (fault.write_fault_to_shadow_pgtable && emulation_type)
		*emulation_type |= EMULTYPE_WRITE_PF_TO_SP;

	/*
	 * Similar to above, prefetch faults aren't truly spurious, and the
	 * async #PF path doesn't do emulation.  Do count faults that are fixed
	 * by the async #PF handler though, otherwise they'll never be counted.
	 */
	if (r == RET_PF_FIXED)
		vcpu->stat.pf_fixed++;
	else if (prefetch)
		;
	else if (r == RET_PF_EMULATE)
		vcpu->stat.pf_emulate++;
	else if (r == RET_PF_SPURIOUS)
		vcpu->stat.pf_spurious++;
	return r;


...

	if (r == RET_PF_INVALID) {
		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
					  lower_32_bits(error_code), false,
					  &emulation_type);
		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
			return -EIO;
	}

	if (r < 0)
		return r;
	if (r != RET_PF_EMULATE)
		return 1;

In other words, the "only if it's direct" rule requires visually auditing changes,
i.e. catching "violations" via code review, not only to code that adds a new -EFAULT
return, but to all code throughout rather large swaths of KVM.  The odds of us (or
whoever the future maintainers/reviewers are) remembering to enforce the "rule", let
alone actually having 100% accuracy, are basically nil.

On the flip side, if we add a helper to fill kvm_run and return -EFAULT, then we can
add a rule that the only time KVM is allowed to return a bare -EFAULT is immediately after
a uaccess, i.e. after copy_to/from_user() and the many variants.  And _that_ can be
enforced through static checkers, e.g. someone with more (read: any) awk/sed skills
than me could bang something out in a matter of minutes.  Such a static checker won't
catch everything, but there would be very, very few bare non-uaccess -EFAULTS left,
and those could be filtered out with an allowlist, e.g. similar to how the folks that
run smatch and whatnot deal with false positives.
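
To make the helper half of that concrete, a conversion at a non-uaccess call
site would look something like the below (reusing the helper from this patch
purely for illustration, whatever it ends up being named):

	/* Before: a bare -EFAULT that userspace can't do anything with. */
	if (kvm_read_guest(vcpu->kvm, gpa, &data, len))
		return -EFAULT;

	/*
	 * After: the helper fills kvm_run.memory_fault and returns -EFAULT
	 * (or forces the exit when the capability is enabled), so the
	 * annotation can't be silently forgotten at this site.
	 */
	if (kvm_read_guest(vcpu->kvm, gpa, &data, len))
		return kvm_memfault_exit_or_efault(vcpu, gpa, len,
						   KVM_MEMFAULT_REASON_UNKNOWN);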
Anish Moorthy April 5, 2023, 8:21 p.m. UTC | #19
Ok. I'm still concerned about the implications of the "annotate
everywhere" approach, but I spoke with James and he shares your
opinion on the severity of the potential issues. I'll put the patches
together and send up a proper v3.
diff mbox series

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 62de0768d6aa5..f9ca18bbec879 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6636,6 +6636,19 @@  array field represents return values. The userspace should update the return
 values of SBI call before resuming the VCPU. For more details on RISC-V SBI
 spec refer, https://github.com/riscv/riscv-sbi-doc.
 
+::
+
+		/* KVM_EXIT_MEMORY_FAULT */
+		struct {
+			__u64 flags;
+			__u64 gpa;
+			__u64 len; /* in bytes */
+		} memory_fault;
+
+Indicates a memory fault on the guest physical address range [gpa, gpa + len).
+flags is a bitfield describing the reason(s) for the fault. See
+KVM_CAP_X86_MEMORY_FAULT_EXIT for more details.
+
 ::
 
     /* KVM_EXIT_NOTIFY */
@@ -7669,6 +7682,30 @@  This capability is aimed to mitigate the threat that malicious VMs can
 cause CPU stuck (due to event windows don't open up) and make the CPU
 unavailable to host or other VMs.
 
+7.34 KVM_CAP_X86_MEMORY_FAULT_EXIT
+----------------------------------
+
+:Architectures: x86
+:Parameters: args[0] is a bitfield specifying what reasons to exit upon.
+:Returns: 0 on success, -EINVAL if unsupported or if unrecognized exit reason
+          specified.
+
+This capability transforms -EFAULTs returned by KVM_RUN in response to guest
+memory accesses into VM exits (KVM_EXIT_MEMORY_FAULT), with 'gpa' and 'len'
+describing the problematic range of memory and 'flags' describing the reason(s)
+for the fault.
+
+The implementation is currently incomplete. Please notify the maintainers if you
+come across a case where it needs to be implemented.
+
+Through args[0], the capability can be set on a per-exit-reason basis.
+Currently, the only exit reasons supported are
+
+1. KVM_MEMFAULT_REASON_UNKNOWN (1 << 0)
+
+Memory fault exits with a reason of UNKNOWN should not be depended upon: they
+may be added, removed, or reclassified under a stable reason.
+
 8. Other capabilities.
 ======================
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f706621c35b86..b3c1b2f57e680 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4425,6 +4425,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_VAPIC:
 	case KVM_CAP_ENABLE_CAP:
 	case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
+	case KVM_CAP_X86_MEMORY_FAULT_EXIT:
 		r = 1;
 		break;
 	case KVM_CAP_EXIT_HYPERCALL:
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8ada23756b0ec..d3ccfead73e42 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -805,6 +805,7 @@  struct kvm {
 	struct notifier_block pm_notifier;
 #endif
 	char stats_id[KVM_STATS_NAME_SIZE];
+	uint64_t memfault_exit_reasons;
 };
 
 #define kvm_err(fmt, ...) \
@@ -2278,4 +2279,19 @@  static inline void kvm_account_pgtable_pages(void *virt, int nr)
 /* Max number of entries allowed for each kvm dirty ring */
 #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
 
+/*
+ * If memory fault exits are enabled for any of the reasons given in exit_flags
+ * then sets up a KVM_EXIT_MEMORY_FAULT for the given guest physical address,
+ * length, and flags and returns -1.
+ * Otherwise, returns -EFAULT
+ */
+inline int kvm_memfault_exit_or_efault(
+	struct kvm_vcpu *vcpu, uint64_t gpa, uint64_t len, uint64_t exit_flags);
+
+/*
+ * Checks that all of the bits specified in 'reasons' correspond to known
+ * memory fault exit reasons.
+ */
+bool kvm_memfault_exit_flags_valid(uint64_t reasons);
+
 #endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d77aef872a0a0..0ba1d7f01346e 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -264,6 +264,7 @@  struct kvm_xen_exit {
 #define KVM_EXIT_RISCV_SBI        35
 #define KVM_EXIT_RISCV_CSR        36
 #define KVM_EXIT_NOTIFY           37
+#define KVM_EXIT_MEMORY_FAULT     38
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -505,6 +506,17 @@  struct kvm_run {
 #define KVM_NOTIFY_CONTEXT_INVALID	(1 << 0)
 			__u32 flags;
 		} notify;
+		/* KVM_EXIT_MEMORY_FAULT */
+		struct {
+			/*
+			 * Indicates a memory fault on the guest physical address range
+			 * [gpa, gpa + len). flags is a bitfield describing the reason(s)
+			 * for the fault.
+			 */
+			__u64 flags;
+			__u64 gpa;
+			__u64 len; /* in bytes */
+		} memory_fault;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -1184,6 +1196,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
 #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
 #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
+#define KVM_CAP_X86_MEMORY_FAULT_EXIT 227
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -2237,4 +2250,7 @@  struct kvm_s390_zpci_op {
 /* flags for kvm_s390_zpci_op->u.reg_aen.flags */
 #define KVM_S390_ZPCIOP_REGAEN_HOST    (1 << 0)
 
+/* Exit reasons for KVM_EXIT_MEMORY_FAULT */
+#define KVM_MEMFAULT_REASON_UNKNOWN (1 << 0)
+
 #endif /* __LINUX_KVM_H */
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 55155e262646e..2b468345f25c3 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -264,6 +264,7 @@  struct kvm_xen_exit {
 #define KVM_EXIT_RISCV_SBI        35
 #define KVM_EXIT_RISCV_CSR        36
 #define KVM_EXIT_NOTIFY           37
+#define KVM_EXIT_MEMORY_FAULT     38
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -505,6 +506,17 @@  struct kvm_run {
 #define KVM_NOTIFY_CONTEXT_INVALID	(1 << 0)
 			__u32 flags;
 		} notify;
+		/* KVM_EXIT_MEMORY_FAULT */
+		struct {
+			/*
+			 * Indicates a memory fault on the guest physical address range
+			 * [gpa, gpa + len). flags is a bitfield describing the reason(s)
+			 * for the fault.
+			 */
+			__u64 flags;
+			__u64 gpa;
+			__u64 len; /* in bytes */
+		} memory_fault;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -2228,4 +2240,7 @@  struct kvm_s390_zpci_op {
 /* flags for kvm_s390_zpci_op->u.reg_aen.flags */
 #define KVM_S390_ZPCIOP_REGAEN_HOST    (1 << 0)
 
+/* Exit reasons for KVM_EXIT_MEMORY_FAULT */
+#define KVM_MEMFAULT_REASON_UNKNOWN (1 << 0)
+
 #endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e38ddda05b261..00aec43860ff1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1142,6 +1142,7 @@  static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 	spin_lock_init(&kvm->mn_invalidate_lock);
 	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
 	xa_init(&kvm->vcpu_array);
+	kvm->memfault_exit_reasons = 0;
 
 	INIT_LIST_HEAD(&kvm->gpc_list);
 	spin_lock_init(&kvm->gpc_lock);
@@ -4671,6 +4672,14 @@  static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
 
 		return r;
 	}
+	case KVM_CAP_X86_MEMORY_FAULT_EXIT: {
+		if (!kvm_vm_ioctl_check_extension(kvm, KVM_CAP_X86_MEMORY_FAULT_EXIT))
+			return -EINVAL;
+		else if (!kvm_memfault_exit_flags_valid(cap->args[0]))
+			return -EINVAL;
+		kvm->memfault_exit_reasons = cap->args[0];
+		return 0;
+	}
 	default:
 		return kvm_vm_ioctl_enable_cap(kvm, cap);
 	}
@@ -6172,3 +6181,22 @@  int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
 
 	return init_context.err;
 }
+
+inline int kvm_memfault_exit_or_efault(
+	struct kvm_vcpu *vcpu, uint64_t gpa, uint64_t len, uint64_t exit_flags)
+{
+	if (!(vcpu->kvm->memfault_exit_reasons & exit_flags))
+		return -EFAULT;
+	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
+	vcpu->run->memory_fault.gpa = gpa;
+	vcpu->run->memory_fault.len = len;
+	vcpu->run->memory_fault.flags = exit_flags;
+	return -1;
+}
+
+bool kvm_memfault_exit_flags_valid(uint64_t reasons)
+{
+	uint64_t valid_flags = KVM_MEMFAULT_REASON_UNKNOWN;
+
+	return !(reasons & ~valid_flags);
+}
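
For reference, a rough sketch of how userspace might consume the capability as
posted in this revision (resolve_guest_range() is a placeholder for the VMM's
own demand-paging logic; error handling omitted):

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_MEMORY_FAULT_EXIT,
		.args = { KVM_MEMFAULT_REASON_UNKNOWN },
	};

	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	...
	/* run points at the vCPU's mmap()ed struct kvm_run. */
	if (ioctl(vcpu_fd, KVM_RUN, NULL) < 0 &&
	    run->exit_reason == KVM_EXIT_MEMORY_FAULT) {
		/* Fault in [gpa, gpa + len), then re-enter the guest. */
		resolve_guest_range(run->memory_fault.gpa,
				    run->memory_fault.len,
				    run->memory_fault.flags);
	}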