diff mbox series

[v5,07/17] KVM: x86: add fields to struct kvm_arch for CoCo features

Message ID 20240404121327.3107131-8-pbonzini@redhat.com (mailing list archive)
State New
Headers show
Series KVM: SEV: allow customizing VMSA features | expand

Commit Message

Paolo Bonzini April 4, 2024, 12:13 p.m. UTC
Some VM types have characteristics in common; in fact, the only use
of VM types right now is kvm_arch_has_private_mem and it assumes that
_all_ nonzero VM types have private memory.

We will soon introduce a VM type for SEV and SEV-ES VMs, and at that
point we will have two special characteristics of confidential VMs
that depend on the VM type: not just whether memory is private, but
also whether guest state is protected.  For the latter we have
kvm->arch.guest_state_protected, which is only set on a fully initialized
VM.

For VM types with protected guest state, we can actually fix a problem in
the SEV-ES implementation, where ioctls to set registers do not cause an
error even if the VM has been initialized and the guest state encrypted.
Make sure that this becomes an error when the new VM types are used.
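
As an illustration of the new behavior (not part of the patch): on one of the
new VM types, once the launch flow has encrypted the guest state, a register
ioctl is expected to fail instead of silently succeeding.  A minimal userspace
sketch, with VM/vCPU setup omitted and vcpu_fd assumed to refer to such a vCPU:

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vcpu_fd: a vCPU of a protected-state VM type whose guest state has
 * already been encrypted (guest_state_protected is set).
 */
static void demo_get_regs(int vcpu_fd)
{
	struct kvm_regs regs;

	if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0 && errno == EINVAL)
		printf("KVM_GET_REGS rejected, as expected for protected state\n");
	else
		printf("KVM_GET_REGS unexpectedly succeeded\n");
}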

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20240209183743.22030-7-pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  7 ++-
 arch/x86/kvm/x86.c              | 93 ++++++++++++++++++++++++++-------
 2 files changed, 79 insertions(+), 21 deletions(-)

Comments

Isaku Yamahata April 4, 2024, 9:39 p.m. UTC | #1
On Thu, Apr 04, 2024 at 08:13:17AM -0400,
Paolo Bonzini <pbonzini@redhat.com> wrote:

> Some VM types have characteristics in common; in fact, the only use
> of VM types right now is kvm_arch_has_private_mem and it assumes that
> _all_ nonzero VM types have private memory.
> 
> We will soon introduce a VM type for SEV and SEV-ES VMs, and at that
> point we will have two special characteristics of confidential VMs
> that depend on the VM type: not just whether memory is private, but
> also whether guest state is protected.  For the latter we have
> kvm->arch.guest_state_protected, which is only set on a fully initialized
> VM.
> 
> For VM types with protected guest state, we can actually fix a problem in
> the SEV-ES implementation, where ioctls to set registers do not cause an
> error even if the VM has been initialized and the guest state encrypted.
> Make sure that this becomes an error when the new VM types are used.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Message-Id: <20240209183743.22030-7-pbonzini@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  7 ++-
>  arch/x86/kvm/x86.c              | 93 ++++++++++++++++++++++++++-------
>  2 files changed, 79 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 04c430eb25cf..3d56b5bb10e9 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1279,12 +1279,14 @@ enum kvm_apicv_inhibit {
>  };
>  
>  struct kvm_arch {
> -	unsigned long vm_type;
>  	unsigned long n_used_mmu_pages;
>  	unsigned long n_requested_mmu_pages;
>  	unsigned long n_max_mmu_pages;
>  	unsigned int indirect_shadow_pages;
>  	u8 mmu_valid_gen;
> +	u8 vm_type;
> +	bool has_private_mem;
> +	bool has_protected_state;
>  	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
>  	struct list_head active_mmu_pages;
>  	struct list_head zapped_obsolete_pages;
> @@ -2153,8 +2155,9 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
>  void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
>  		       int tdp_max_root_level, int tdp_huge_page_level);
>  
> +
>  #ifdef CONFIG_KVM_PRIVATE_MEM
> -#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM)
> +#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
>  #else
>  #define kvm_arch_has_private_mem(kvm) false
>  #endif
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 3934e7682734..d4a8d896798f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5555,11 +5555,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
>  	return 0;
>  }
>  
> -static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
> -					     struct kvm_debugregs *dbgregs)
> +static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
> +					    struct kvm_debugregs *dbgregs)
>  {
>  	unsigned int i;
>  
> +	if (vcpu->kvm->arch.has_protected_state &&
> +	    vcpu->arch.guest_state_protected)
> +		return -EINVAL;
> +
>  	memset(dbgregs, 0, sizeof(*dbgregs));
>  
>  	BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
> @@ -5568,6 +5572,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
>  
>  	dbgregs->dr6 = vcpu->arch.dr6;
>  	dbgregs->dr7 = vcpu->arch.dr7;
> +	return 0;
>  }
>  
>  static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
> @@ -5575,6 +5580,10 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
>  {
>  	unsigned int i;
>  
> +	if (vcpu->kvm->arch.has_protected_state &&
> +	    vcpu->arch.guest_state_protected)
> +		return -EINVAL;
> +
>  	if (dbgregs->flags)
>  		return -EINVAL;
>  
> @@ -5595,8 +5604,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
>  }
>  
>  
> -static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
> -					  u8 *state, unsigned int size)
> +static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
> +					 u8 *state, unsigned int size)
>  {
>  	/*
>  	 * Only copy state for features that are enabled for the guest.  The
> @@ -5614,24 +5623,25 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
>  			     XFEATURE_MASK_FPSSE;
>  
>  	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
> -		return;
> +		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
>  
>  	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
>  				       supported_xcr0, vcpu->arch.pkru);
> +	return 0;
>  }
>  
> -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
> -					 struct kvm_xsave *guest_xsave)
> +static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
> +					struct kvm_xsave *guest_xsave)
>  {
> -	kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
> -				      sizeof(guest_xsave->region));
> +	return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
> +					     sizeof(guest_xsave->region));
>  }
>  
>  static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
>  					struct kvm_xsave *guest_xsave)
>  {
>  	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
> -		return 0;
> +		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
>  
>  	return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
>  					      guest_xsave->region,
> @@ -5639,18 +5649,23 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
>  					      &vcpu->arch.pkru);
>  }
>  
> -static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
> -					struct kvm_xcrs *guest_xcrs)
> +static int kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
> +				       struct kvm_xcrs *guest_xcrs)
>  {
> +	if (vcpu->kvm->arch.has_protected_state &&
> +	    vcpu->arch.guest_state_protected)
> +		return -EINVAL;
> +
>  	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
>  		guest_xcrs->nr_xcrs = 0;
> -		return;
> +		return 0;
>  	}
>  
>  	guest_xcrs->nr_xcrs = 1;
>  	guest_xcrs->flags = 0;
>  	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
>  	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
> +	return 0;
>  }
>  
>  static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
> @@ -5658,6 +5673,10 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
>  {
>  	int i, r = 0;
>  
> +	if (vcpu->kvm->arch.has_protected_state &&
> +	    vcpu->arch.guest_state_protected)
> +		return -EINVAL;
> +
>  	if (!boot_cpu_has(X86_FEATURE_XSAVE))
>  		return -EINVAL;
>  
> @@ -6040,7 +6059,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  	case KVM_GET_DEBUGREGS: {
>  		struct kvm_debugregs dbgregs;
>  
> -		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
> +		r = kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
> +		if (r < 0)
> +			break;
>  
>  		r = -EFAULT;
>  		if (copy_to_user(argp, &dbgregs,
> @@ -6070,7 +6091,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  		if (!u.xsave)
>  			break;
>  
> -		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
> +		r = kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
> +		if (r < 0)
> +			break;
>  
>  		r = -EFAULT;
>  		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
> @@ -6099,7 +6122,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  		if (!u.xsave)
>  			break;
>  
> -		kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
> +		r = kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
> +		if (r < 0)
> +			break;
>  
>  		r = -EFAULT;
>  		if (copy_to_user(argp, u.xsave, size))
> @@ -6115,7 +6140,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  		if (!u.xcrs)
>  			break;
>  
> -		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
> +		r = kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
> +		if (r < 0)
> +			break;
>  
>  		r = -EFAULT;
>  		if (copy_to_user(argp, u.xcrs,
> @@ -6259,6 +6286,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  	}
>  #endif
>  	case KVM_GET_SREGS2: {
> +		r = -EINVAL;
> +		if (vcpu->kvm->arch.has_protected_state &&
> +		    vcpu->arch.guest_state_protected)
> +			goto out;
> +
>  		u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
>  		r = -ENOMEM;
>  		if (!u.sregs2)
> @@ -6271,6 +6303,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  		break;
>  	}
>  	case KVM_SET_SREGS2: {
> +		r = -EINVAL;
> +		if (vcpu->kvm->arch.has_protected_state &&
> +		    vcpu->arch.guest_state_protected)
> +			goto out;
> +
>  		u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
>  		if (IS_ERR(u.sregs2)) {
>  			r = PTR_ERR(u.sregs2);
> @@ -11478,6 +11515,10 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
>  
>  int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
>  {
> +	if (vcpu->kvm->arch.has_protected_state &&
> +	    vcpu->arch.guest_state_protected)
> +		return -EINVAL;
> +
>  	vcpu_load(vcpu);
>  	__get_regs(vcpu, regs);
>  	vcpu_put(vcpu);
> @@ -11519,6 +11560,10 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
>  
>  int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
>  {
> +	if (vcpu->kvm->arch.has_protected_state &&
> +	    vcpu->arch.guest_state_protected)
> +		return -EINVAL;
> +
>  	vcpu_load(vcpu);
>  	__set_regs(vcpu, regs);
>  	vcpu_put(vcpu);
> @@ -11591,6 +11636,10 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
>  int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
>  				  struct kvm_sregs *sregs)
>  {
> +	if (vcpu->kvm->arch.has_protected_state &&
> +	    vcpu->arch.guest_state_protected)
> +		return -EINVAL;
> +
>  	vcpu_load(vcpu);
>  	__get_sregs(vcpu, sregs);
>  	vcpu_put(vcpu);
> @@ -11858,6 +11907,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
>  {
>  	int ret;
>  
> +	if (vcpu->kvm->arch.has_protected_state &&
> +	    vcpu->arch.guest_state_protected)
> +		return -EINVAL;
> +
>  	vcpu_load(vcpu);
>  	ret = __set_sregs(vcpu, sregs);
>  	vcpu_put(vcpu);
> @@ -11975,7 +12028,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
>  	struct fxregs_state *fxsave;
>  
>  	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
> -		return 0;
> +		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
>  
>  	vcpu_load(vcpu);
>  
> @@ -11998,7 +12051,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
>  	struct fxregs_state *fxsave;
>  
>  	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
> -		return 0;
> +		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
>  
>  	vcpu_load(vcpu);
>  
> @@ -12524,6 +12577,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>  		return -EINVAL;
>  
>  	kvm->arch.vm_type = type;
> +	kvm->arch.has_private_mem =
> +		(type == KVM_X86_SW_PROTECTED_VM);
>  
>  	ret = kvm_page_track_init(kvm);
>  	if (ret)
> -- 
> 2.43.0

This works well with the TDX KVM patch series.

Reviewed-by: Isaku Yamahata <isaku.yamahata@intel.com>
Edgecombe, Rick P April 5, 2024, 11:01 p.m. UTC | #2
On Thu, 2024-04-04 at 08:13 -0400, Paolo Bonzini wrote:
>  
>  struct kvm_arch {
> -       unsigned long vm_type;
>         unsigned long n_used_mmu_pages;
>         unsigned long n_requested_mmu_pages;
>         unsigned long n_max_mmu_pages;
>         unsigned int indirect_shadow_pages;
>         u8 mmu_valid_gen;
> +       u8 vm_type;
> +       bool has_private_mem;
> +       bool has_protected_state;

I'm a little late to this conversation, so hopefully not just complicating things. But why not
deduce has_private_mem and has_protected_state from the vm_type during runtime? Like if
kvm.arch.vm_type was instead a bit mask with the bit position of the KVM_X86_*_VM set,
kvm_arch_has_private_mem() could bitwise-and with a compile time mask of vm_types that have private
memory. This also prevents it from ever transitioning through nonsensical states like vm_type ==
KVM_X86_TDX_VM, but !has_private_memory, so would be a little more robust.
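
As a rough sketch of that idea (illustrative only, the mask name below does not
exist in any posted patch):

/* arch.vm_type would store BIT(type) and be tested against a compile-time mask. */
#define KVM_X86_PRIVATE_MEM_VM_TYPES \
	(BIT(KVM_X86_SW_PROTECTED_VM) | BIT(KVM_X86_TDX_VM))

#define kvm_arch_has_private_mem(kvm) \
	((kvm)->arch.vm_type & KVM_X86_PRIVATE_MEM_VM_TYPES)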

Partly why I ask is there is logic in the x86 MMU TDX changes that tries to be generic but still
needs special handling for it. The current solution is to look at kvm_gfn_shared_mask() as TDX is
the only vm type that sets it, but Isaku and I were discussing whether we should check something
else, something that doesn't tie two unrelated concepts together:
https://lore.kernel.org/kvm/20240319235654.GC1994522@ls.amr.corp.intel.com/

Since it's down the mail, the relevant snippet:
"
> >  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
> >                                    struct kvm_memory_slot *slot)
> >  {
> > -       kvm_mmu_zap_all_fast(kvm);
> > +       if (kvm_gfn_shared_mask(kvm))
> 
> There seems to be an attempt to abstract away the existence of Secure-
> EPT in mmu.c, that is not fully successful. In this case the code
> checks kvm_gfn_shared_mask() to see if it needs to handle the zapping
> in a way specific needed by S-EPT. It ends up being a little confusing
> because the actual check is about whether there is a shared bit. It
> only works because only S-EPT is the only thing that has a
> kvm_gfn_shared_mask().
> 
> Doing something like (kvm->arch.vm_type == KVM_X86_TDX_VM) looks wrong,
> but is more honest about what we are getting up to here. I'm not sure
> though, what do you think?

Right, I attempted and failed in zapping case.  This is due to the restriction
that the Secure-EPT pages must be removed from the leaves.  the VMX case (also
NPT, even SNP) heavily depends on zapping root entry as optimization.

I can think of
- add TDX check. Looks wrong
- Use kvm_gfn_shared_mask(kvm). confusing
- Give other name for this check like zap_from_leafs (or better name?)
  The implementation is same to kvm_gfn_shared_mask() with comment.
  - Or we can add a boolean variable to struct kvm
"

With this patch, the convention would seem to be to add and check a "zap_leafs_only" bool. But it
starts to become a lot of bools. If instead we added an arch_zap_leafs_only(struct kvm *kvm) that
checked whether the vm_type is KVM_X86_TDX_VM, it could make the calling code clearer. But then I
wonder why not do the same for has_private_mem and has_protected_state?

Of course TDX can adjust for any format of the state. Just seems cleaner to me.
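
For concreteness, the arch_zap_leafs_only() idea above might look roughly like
the sketch below (not proposed code; the leaf-zapping helper name is assumed
from the TDX series, not from this patch):

static inline bool arch_zap_leafs_only(struct kvm *kvm)
{
	return kvm->arch.vm_type == KVM_X86_TDX_VM;
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	if (arch_zap_leafs_only(kvm))
		kvm_mmu_zap_memslot_leafs(kvm, slot);	/* assumed TDX-series helper */
	else
		kvm_mmu_zap_all_fast(kvm);
}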
Sean Christopherson April 9, 2024, 1:21 a.m. UTC | #3
On Fri, Apr 05, 2024, Rick P Edgecombe wrote:
> On Thu, 2024-04-04 at 08:13 -0400, Paolo Bonzini wrote:
> >  
> >  struct kvm_arch {
> > -       unsigned long vm_type;
> >         unsigned long n_used_mmu_pages;
> >         unsigned long n_requested_mmu_pages;
> >         unsigned long n_max_mmu_pages;
> >         unsigned int indirect_shadow_pages;
> >         u8 mmu_valid_gen;
> > +       u8 vm_type;
> > +       bool has_private_mem;
> > +       bool has_protected_state;
> 
> I'm a little late to this conversation, so hopefully not just complicating
> things. But why not deduce has_private_mem and has_protected_state from the
> vm_type during runtime? Like if kvm.arch.vm_type was instead a bit mask with
> the bit position of the KVM_X86_*_VM set, kvm_arch_has_private_mem() could
> bitwise-and with a compile time mask of vm_types that have private memory.
> This also prevents it from ever transitioning through nonsensical states
> like vm_type == KVM_X86_TDX_VM, but !has_private_memory, so would be a little
> more robust.

LOL, time is a circle, or something like that.  Paolo actually did this in v2[*],
and I objected, vociferously.

KVM advertises VM types to userspace via a 32-bit field, one bit per type.  So
without more uAPI changes, the VM type needs to be <=31.  KVM could embed the
"has private memory" information into the type, but then we cut down on the number
of possible VM types *and* bleed has_private_memory into KVM's ABI.
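
For reference, userspace consumes that 32-bit field roughly as below (a sketch
that assumes the KVM_CAP_VM_TYPES capability added elsewhere in this series):

#include <stdbool.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Check whether a given KVM_X86_*_VM value is advertised by this kernel. */
static bool vm_type_supported(int kvm_fd, unsigned int type)
{
	int types = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TYPES);

	return types > 0 && type <= 31 && (types & (1u << type));
}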

While it's unlikely KVM will ever support TDX without has_private_memory, it's
entirely possible that KVM could add support for an existing VM "base" type that
doesn't currently support private memory.  E.g. with some massaging, KVM could
support private memory for SEV and SEV-ES.  And then we need to add an entirely
new VM type just so that KVM can let it use private memory.

Obviously KVM could shove in bits after the fact, e.g. store vm_type as a u64
instead of u32 (or u8 as in this patch), but then what's the point?  Burning a
byte instead of a bit for a per-VM flag is a complete non-issue, and booleans tend
to yield code that's easier to read and easier to maintain.

[*] https://lore.kernel.org/all/ZdjL783FazB6V6Cy@google.com

> Partly why I ask is there is logic in the x86 MMU TDX changes that tries to
> be generic but still needs special handling for it. The current solution is
> to look at kvm_gfn_shared_mask() as TDX is the only vm type that sets it, but
> Isaku and I were discussing if we should check something else, that didn't
> appear to be tying together to unrelated concepts:
> https://lore.kernel.org/kvm/20240319235654.GC1994522@ls.amr.corp.intel.com/
> 
> Since it's down the mail, the relevant snippet:
> "
> > >  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
> > >                                    struct kvm_memory_slot *slot)
> > >  {
> > > -       kvm_mmu_zap_all_fast(kvm);
> > > +       if (kvm_gfn_shared_mask(kvm))

Whatever you do that is TDX specific and an internal KVM thing is likely the wrong
thing :-)

The main reason KVM doesn't do a targeted zap on memslot removal is because of ABI
baggage that we _think_ is limited to interaction with VFIO.  Since KVM doesn't
have any ABI for TDX *or* SNP, I want to at least entertain the option of doing
a target zap for SNP as well as TDX, even though it's only truly "necessary" for
TDX, in quotes because it's not strictly necessary, e.g. KVM could BLOCK the S-EPT
entries without fully removing the mappings.

Whether or not targeted zapping is optimal for SNP (or any VM type) is very much
TBD, and likely highly dependent on use case, but at the same time it would be
nice to not rule it out completely.

E.g. ChromeOS currently has a use case where they frequently delete and recreate
a 2GiB (give or take) memslot.  For that use case, zapping _just_ that memslot is
likely far superior to blasting and rebuilding the entire VM.  But if userspace
deletes a 1TiB memslot for some reason, e.g. for memory unplug, then the fast zap is
probably better, even though it requires rebuilding all SPTEs.

> > There seems to be an attempt to abstract away the existence of Secure-
> > EPT in mmu.c, that is not fully successful. In this case the code
> > checks kvm_gfn_shared_mask() to see if it needs to handle the zapping
> > in a way specific needed by S-EPT. It ends up being a little confusing
> > because the actual check is about whether there is a shared bit. It
> > only works because only S-EPT is the only thing that has a
> > kvm_gfn_shared_mask().
> > 
> > Doing something like (kvm->arch.vm_type == KVM_X86_TDX_VM) looks wrong,
> > but is more honest about what we are getting up to here. I'm not sure
> > though, what do you think?
> 
> Right, I attempted and failed in zapping case.  This is due to the restriction
> that the Secure-EPT pages must be removed from the leaves.  the VMX case (also
> NPT, even SNP) heavily depends on zapping root entry as optimization.

As above, it's more nuanced than that.  KVM has come to depend on the fast zap,
but it got that way *because* KVM has historically zapped everything, and userspace
has (unknowingly) relied on that behavior.

> I can think of
> - add TDX check. Looks wrong
> - Use kvm_gfn_shared_mask(kvm). confusing

Ya, even if we end up making it a hardcoded TDX thing, dress it up a bit.  E.g.
even if KVM checks for a shared mask under the hood, add a helper to capture the
logic, e.g. kvm_zap_all_sptes_on_memslot_deletion(kvm).
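
i.e. something like this sketch (kvm_gfn_shared_mask() comes from the TDX
series, not from this patch):

static inline bool kvm_zap_all_sptes_on_memslot_deletion(struct kvm *kvm)
{
	/* S-EPT must zap leaf SPTEs, so don't blast the whole MMU. */
	return !kvm_gfn_shared_mask(kvm);
}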

> - Give other name for this check like zap_from_leafs (or better name?)
>   The implementation is same to kvm_gfn_shared_mask() with comment.
>   - Or we can add a boolean variable to struct kvm

If we _don't_ hardcode the behavior, a per-memslot flag or a per-VM capability
(and thus boolean) is likely the way to go.  My off-the-cuff vote is probably for
a per-memslot flag.
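
A per-memslot flag might look roughly like the below (purely hypothetical, the
flag name is made up and no such uAPI exists in this series):

/* Hypothetical uAPI flag, illustrative only. */
#define KVM_MEM_ZAP_LEAFS_ONLY	(1UL << 3)

static inline bool kvm_zap_all_sptes_on_memslot_deletion(struct kvm *kvm,
							 struct kvm_memory_slot *slot)
{
	return !(slot->flags & KVM_MEM_ZAP_LEAFS_ONLY);
}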
Edgecombe, Rick P April 9, 2024, 2:01 p.m. UTC | #4
On Mon, 2024-04-08 at 18:21 -0700, Sean Christopherson wrote:
> 
> Whatever you do that is TDX specific and an internal KVM thing is likely
> the wrong thing :-)
> 
> The main reason KVM doesn't do a targeted zap on memslot removal is because
> of ABI baggage that we _think_ is limited to interaction with VFIO.  Since
> KVM doesn't have any ABI for TDX *or* SNP, I want to at least entertain the
> option of doing a target zap for SNP as well as TDX, even though it's only
> truly "necessary" for TDX, in quotes because it's not strictly necessary,
> e.g. KVM could BLOCK the S-EPT entries without fully removing the mappings.
> 
> Whether or not targeted zapping is optimal for SNP (or any VM type) is very
> much TBD, and likely highly dependent on use case, but at the same time it
> would be nice to not rule it out completely.
> 
> E.g. ChromeOS currently has a use case where they frequently delete and
> recreate a 2GiB (give or take) memslot.  For that use case, zapping _just_
> that memslot is likely far superior to blasting and rebuilding the entire
> VM.  But if userspace deletes a 1TiB memslot for some reason, e.g. for
> memory unplug, then the fast zap is probably better, even though it requires
> rebuilding all SPTEs.

Interesting, thanks for the history.

> 
> > > There seems to be an attempt to abstract away the existence of Secure-
> > > EPT in mmu.c, that is not fully successful. In this case the code
> > > checks kvm_gfn_shared_mask() to see if it needs to handle the zapping
> > > in a way specific needed by S-EPT. It ends up being a little confusing
> > > because the actual check is about whether there is a shared bit. It
> > > only works because only S-EPT is the only thing that has a
> > > kvm_gfn_shared_mask().
> > > 
> > > Doing something like (kvm->arch.vm_type == KVM_X86_TDX_VM) looks wrong,
> > > but is more honest about what we are getting up to here. I'm not sure
> > > though, what do you think?
> > 
> > Right, I attempted and failed in zapping case.  This is due to the
> > restriction that the Secure-EPT pages must be removed from the leaves.
> > the VMX case (also NPT, even SNP) heavily depends on zapping root entry
> > as optimization.
> 
> As above, it's more nuanced than that.  KVM has come to depend on the fast
> zap, but it got that way *because* KVM has historically zapped everything,
> and userspace has (unknowingly) relied on that behavior.
> 
> > I can think of
> > - add TDX check. Looks wrong
> > - Use kvm_gfn_shared_mask(kvm). confusing
> 
> Ya, even if we end up making it a hardcoded TDX thing, dress it up a bit.
> E.g. even if KVM checks for a shared mask under the hood, add a helper to
> capture the logic, e.g. kvm_zap_all_sptes_on_memslot_deletion(kvm).
> 
> > - Give other name for this check like zap_from_leafs (or better name?)
> >    The implementation is same to kvm_gfn_shared_mask() with comment.
> >    - Or we can add a boolean variable to struct kvm
> 
> If we _don't_ hardcode the behavior, a per-memslot flag or a per-VM
> capability (and thus boolean) is likely the way to go.  My off-the-cuff vote
> is probably for a per-memslot flag.

The per-memslot flag is interesting. If we had a per-memslot flag it might be
nice for that 2GB memslot. For TDX, making userspace have to know about zapping
requirements is not ideal. If TDX somehow loses the restriction someday, then
userspace would have to manage that as well. I think the decision belongs inside
KVM, for TDX at least.

We'll have to take a look at how they would come together in the code.
Paolo Bonzini April 9, 2024, 2:43 p.m. UTC | #5
On Tue, Apr 9, 2024 at 3:21 AM Sean Christopherson <seanjc@google.com> wrote:
> > I'm a little late to this conversation, so hopefully not just complicating
> > things. But why not deduce has_private_mem and has_protected_state from the
> > vm_type during runtime? Like if kvm.arch.vm_type was instead a bit mask with
> > the bit position of the KVM_X86_*_VM set, kvm_arch_has_private_mem() could
> > bitwise-and with a compile time mask of vm_types that have private memory.
> > This also prevents it from ever transitioning through nonsensical states
> > like vm_type == KVM_X86_TDX_VM, but !has_private_memory, so would be a little
> > more robust.
>
> LOL, time is a circle, or something like that.  Paolo actually did this in v2[*],
> and I objected, vociferously.

To be fair, Rick is asking for something much less hideous - just set

 kvm->arch.vm_type = (1 << vm_type);

and then define kvm_has_*(kvm) as !!(kvm->arch.vm_type & SOME_BIT_MASK).
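
i.e. roughly (a sketch of this alternative; the mask and VM type names here
are illustrative only):

#define KVM_X86_PROTECTED_STATE_VM_TYPES \
	(BIT(KVM_X86_SEV_ES_VM) | BIT(KVM_X86_TDX_VM))

static inline bool kvm_has_protected_state(struct kvm *kvm)
{
	return kvm->arch.vm_type & KVM_X86_PROTECTED_STATE_VM_TYPES;
}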

And indeed it makes sense as an alternative. It also feels a little
bit more restrictive and the benefit is small, so I think I'm going to
go with this version.

Paolo
Sean Christopherson April 9, 2024, 3:26 p.m. UTC | #6
On Tue, Apr 09, 2024, Paolo Bonzini wrote:
> On Tue, Apr 9, 2024 at 3:21 AM Sean Christopherson <seanjc@google.com> wrote:
> > > I'm a little late to this conversation, so hopefully not just complicating
> > > things. But why not deduce has_private_mem and has_protected_state from the
> > > vm_type during runtime? Like if kvm.arch.vm_type was instead a bit mask with
> > > the bit position of the KVM_X86_*_VM set, kvm_arch_has_private_mem() could
> > > bitwise-and with a compile time mask of vm_types that have private memory.
> > > This also prevents it from ever transitioning through nonsensical states
> > > like vm_type == KVM_X86_TDX_VM, but !has_private_memory, so would be a little
> > > more robust.
> >
> > LOL, time is a circle, or something like that.  Paolo actually did this in v2[*],
> > and I objected, vociferously.
> 
> To be fair, Rick is asking for something much less hideous - just set
> 
>  kvm->arch.vm_type = (1 << vm_type);
> 
> and then define kvm_has_*(kvm) as !!(kvm->arch.vm_type & SOME_BIT_MASK).
> 
> And indeed it makes sense as an alternative.

Ah, yeah, I'd be fine with that. 

> It also feels a little bit more restrictive and the benefit is small, so I
> think I'm going to go with this version.

+1
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 04c430eb25cf..3d56b5bb10e9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1279,12 +1279,14 @@  enum kvm_apicv_inhibit {
 };
 
 struct kvm_arch {
-	unsigned long vm_type;
 	unsigned long n_used_mmu_pages;
 	unsigned long n_requested_mmu_pages;
 	unsigned long n_max_mmu_pages;
 	unsigned int indirect_shadow_pages;
 	u8 mmu_valid_gen;
+	u8 vm_type;
+	bool has_private_mem;
+	bool has_protected_state;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	struct list_head active_mmu_pages;
 	struct list_head zapped_obsolete_pages;
@@ -2153,8 +2155,9 @@  void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
 void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
 		       int tdp_max_root_level, int tdp_huge_page_level);
 
+
 #ifdef CONFIG_KVM_PRIVATE_MEM
-#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM)
+#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
 #else
 #define kvm_arch_has_private_mem(kvm) false
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3934e7682734..d4a8d896798f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5555,11 +5555,15 @@  static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
-					     struct kvm_debugregs *dbgregs)
+static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+					    struct kvm_debugregs *dbgregs)
 {
 	unsigned int i;
 
+	if (vcpu->kvm->arch.has_protected_state &&
+	    vcpu->arch.guest_state_protected)
+		return -EINVAL;
+
 	memset(dbgregs, 0, sizeof(*dbgregs));
 
 	BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
@@ -5568,6 +5572,7 @@  static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
 
 	dbgregs->dr6 = vcpu->arch.dr6;
 	dbgregs->dr7 = vcpu->arch.dr7;
+	return 0;
 }
 
 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -5575,6 +5580,10 @@  static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 {
 	unsigned int i;
 
+	if (vcpu->kvm->arch.has_protected_state &&
+	    vcpu->arch.guest_state_protected)
+		return -EINVAL;
+
 	if (dbgregs->flags)
 		return -EINVAL;
 
@@ -5595,8 +5604,8 @@  static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 }
 
 
-static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
-					  u8 *state, unsigned int size)
+static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+					 u8 *state, unsigned int size)
 {
 	/*
 	 * Only copy state for features that are enabled for the guest.  The
@@ -5614,24 +5623,25 @@  static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
 			     XFEATURE_MASK_FPSSE;
 
 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
-		return;
+		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
 
 	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
 				       supported_xcr0, vcpu->arch.pkru);
+	return 0;
 }
 
-static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
-					 struct kvm_xsave *guest_xsave)
+static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+					struct kvm_xsave *guest_xsave)
 {
-	kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
-				      sizeof(guest_xsave->region));
+	return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
+					     sizeof(guest_xsave->region));
 }
 
 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 					struct kvm_xsave *guest_xsave)
 {
 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
-		return 0;
+		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
 
 	return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
 					      guest_xsave->region,
@@ -5639,18 +5649,23 @@  static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 					      &vcpu->arch.pkru);
 }
 
-static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
-					struct kvm_xcrs *guest_xcrs)
+static int kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+				       struct kvm_xcrs *guest_xcrs)
 {
+	if (vcpu->kvm->arch.has_protected_state &&
+	    vcpu->arch.guest_state_protected)
+		return -EINVAL;
+
 	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
 		guest_xcrs->nr_xcrs = 0;
-		return;
+		return 0;
 	}
 
 	guest_xcrs->nr_xcrs = 1;
 	guest_xcrs->flags = 0;
 	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
 	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+	return 0;
 }
 
 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
@@ -5658,6 +5673,10 @@  static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
 {
 	int i, r = 0;
 
+	if (vcpu->kvm->arch.has_protected_state &&
+	    vcpu->arch.guest_state_protected)
+		return -EINVAL;
+
 	if (!boot_cpu_has(X86_FEATURE_XSAVE))
 		return -EINVAL;
 
@@ -6040,7 +6059,9 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 	case KVM_GET_DEBUGREGS: {
 		struct kvm_debugregs dbgregs;
 
-		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+		r = kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+		if (r < 0)
+			break;
 
 		r = -EFAULT;
 		if (copy_to_user(argp, &dbgregs,
@@ -6070,7 +6091,9 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (!u.xsave)
 			break;
 
-		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+		r = kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+		if (r < 0)
+			break;
 
 		r = -EFAULT;
 		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
@@ -6099,7 +6122,9 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (!u.xsave)
 			break;
 
-		kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+		r = kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+		if (r < 0)
+			break;
 
 		r = -EFAULT;
 		if (copy_to_user(argp, u.xsave, size))
@@ -6115,7 +6140,9 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (!u.xcrs)
 			break;
 
-		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+		r = kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+		if (r < 0)
+			break;
 
 		r = -EFAULT;
 		if (copy_to_user(argp, u.xcrs,
@@ -6259,6 +6286,11 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 #endif
 	case KVM_GET_SREGS2: {
+		r = -EINVAL;
+		if (vcpu->kvm->arch.has_protected_state &&
+		    vcpu->arch.guest_state_protected)
+			goto out;
+
 		u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
 		r = -ENOMEM;
 		if (!u.sregs2)
@@ -6271,6 +6303,11 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_SREGS2: {
+		r = -EINVAL;
+		if (vcpu->kvm->arch.has_protected_state &&
+		    vcpu->arch.guest_state_protected)
+			goto out;
+
 		u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
 		if (IS_ERR(u.sregs2)) {
 			r = PTR_ERR(u.sregs2);
@@ -11478,6 +11515,10 @@  static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
+	if (vcpu->kvm->arch.has_protected_state &&
+	    vcpu->arch.guest_state_protected)
+		return -EINVAL;
+
 	vcpu_load(vcpu);
 	__get_regs(vcpu, regs);
 	vcpu_put(vcpu);
@@ -11519,6 +11560,10 @@  static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
+	if (vcpu->kvm->arch.has_protected_state &&
+	    vcpu->arch.guest_state_protected)
+		return -EINVAL;
+
 	vcpu_load(vcpu);
 	__set_regs(vcpu, regs);
 	vcpu_put(vcpu);
@@ -11591,6 +11636,10 @@  static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
+	if (vcpu->kvm->arch.has_protected_state &&
+	    vcpu->arch.guest_state_protected)
+		return -EINVAL;
+
 	vcpu_load(vcpu);
 	__get_sregs(vcpu, sregs);
 	vcpu_put(vcpu);
@@ -11858,6 +11907,10 @@  int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 {
 	int ret;
 
+	if (vcpu->kvm->arch.has_protected_state &&
+	    vcpu->arch.guest_state_protected)
+		return -EINVAL;
+
 	vcpu_load(vcpu);
 	ret = __set_sregs(vcpu, sregs);
 	vcpu_put(vcpu);
@@ -11975,7 +12028,7 @@  int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	struct fxregs_state *fxsave;
 
 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
-		return 0;
+		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
 
 	vcpu_load(vcpu);
 
@@ -11998,7 +12051,7 @@  int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	struct fxregs_state *fxsave;
 
 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
-		return 0;
+		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
 
 	vcpu_load(vcpu);
 
@@ -12524,6 +12577,8 @@  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 		return -EINVAL;
 
 	kvm->arch.vm_type = type;
+	kvm->arch.has_private_mem =
+		(type == KVM_X86_SW_PROTECTED_VM);
 
 	ret = kvm_page_track_init(kvm);
 	if (ret)