
[v7,11/14] KVM: Register/unregister the guest private memory regions

Message ID 20220706082016.2603916-12-chao.p.peng@linux.intel.com
State New, archived
Series KVM: mm: fd-based approach for supporting KVM guest private memory

Commit Message

Chao Peng July 6, 2022, 8:20 a.m. UTC
If CONFIG_HAVE_KVM_PRIVATE_MEM=y, userspace can register/unregister the
guest private memory regions through the KVM_MEMORY_ENCRYPT_{UN,}REG_REGION
ioctls. The patch reuses the existing SEV ioctls, but differs in that the
address in the region is a guest physical address (GPA) for private memory,
whereas in the SEV case it is a host virtual address (HVA).

The private memory regions are stored in an xarray in KVM for memory
efficiency in normal usage, and zapping the existing memory mappings is also
a side effect of these two ioctls.
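
For illustration, a minimal userspace sketch of the intended usage (vm_fd
and the GPA range are assumptions for the example; error handling omitted):

	struct kvm_enc_region region = {
		.addr = 0x100000000,	/* a GPA, not an HVA, in the private memory case */
		.size = 0x200000,	/* both fields must be page-aligned */
	};

	/* Mark the GPA range as private; zaps any existing mappings in KVM. */
	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_REG_REGION, &region);

	/* Convert the same range back to shared. */
	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_UNREG_REGION, &region);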

Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 Documentation/virt/kvm/api.rst  | 17 +++++++---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/Kconfig            |  1 +
 arch/x86/kvm/mmu.h              |  2 --
 include/linux/kvm_host.h        |  8 +++++
 virt/kvm/kvm_main.c             | 57 +++++++++++++++++++++++++++++++++
 6 files changed, 80 insertions(+), 6 deletions(-)

Comments

Gupta, Pankaj July 19, 2022, 8 a.m. UTC | #1
Hi Chao,

Some comments below:

> If CONFIG_HAVE_KVM_PRIVATE_MEM=y, userspace can register/unregister the
> guest private memory regions through the KVM_MEMORY_ENCRYPT_{UN,}REG_REGION
> ioctls. The patch reuses the existing SEV ioctls, but differs in that the
> address in the region is a guest physical address (GPA) for private memory,
> whereas in the SEV case it is a host virtual address (HVA).
> 
> The private memory regions are stored in an xarray in KVM for memory
> efficiency in normal usage, and zapping the existing memory mappings is also
> a side effect of these two ioctls.
> 
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---
>   Documentation/virt/kvm/api.rst  | 17 +++++++---
>   arch/x86/include/asm/kvm_host.h |  1 +
>   arch/x86/kvm/Kconfig            |  1 +
>   arch/x86/kvm/mmu.h              |  2 --
>   include/linux/kvm_host.h        |  8 +++++
>   virt/kvm/kvm_main.c             | 57 +++++++++++++++++++++++++++++++++
>   6 files changed, 80 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 5ecfc7fbe0ee..dfb4caecab73 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -4715,10 +4715,19 @@ Documentation/virt/kvm/amd-memory-encryption.rst.
>   This ioctl can be used to register a guest memory region which may
>   contain encrypted data (e.g. guest RAM, SMRAM etc).
>   
> -It is used in the SEV-enabled guest. When encryption is enabled, a guest
> -memory region may contain encrypted data. The SEV memory encryption
> -engine uses a tweak such that two identical plaintext pages, each at
> -different locations will have differing ciphertexts. So swapping or
> +Currently this ioctl supports registering memory regions for two usages:
> +private memory and SEV-encrypted memory.
> +
> +When private memory is enabled, this ioctl is used to register a guest private
> +memory region and the addr/size of kvm_enc_region represent a guest physical
> +address (GPA) range. In this usage, this ioctl also zaps the existing guest
> +memory mappings in KVM that fall into the region.
> +
> +When SEV-encrypted memory is enabled, this ioctl is used to register a guest
> +memory region which may contain encrypted data for a SEV-enabled guest. The
> +addr/size of kvm_enc_region represent a userspace address (HVA) range. The SEV
> +memory encryption engine uses a tweak such that two identical plaintext pages,
> +each at different locations will have differing ciphertexts. So swapping or
>   moving ciphertext of those pages will not result in plaintext being
>   swapped. So relocating (or migrating) physical backing pages for the SEV
>   guest will require some additional steps.
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index dae190e19fce..92120e3a224e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -37,6 +37,7 @@
>   #include <asm/hyperv-tlfs.h>
>   
>   #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
> +#define __KVM_HAVE_ZAP_GFN_RANGE
>   
>   #define KVM_MAX_VCPUS 1024
>   
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 1f160801e2a7..05861b9656a4 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -50,6 +50,7 @@ config KVM
>   	select HAVE_KVM_PM_NOTIFIER if PM
>   	select HAVE_KVM_PRIVATE_MEM if X86_64
>   	select MEMFILE_NOTIFIER if HAVE_KVM_PRIVATE_MEM
> +	select XARRAY_MULTI if HAVE_KVM_PRIVATE_MEM
>   	help
>   	  Support hosting fully virtualized guest machines using hardware
>   	  virtualization extensions.  You will need a fairly recent
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index a99acec925eb..428cd2e88cbd 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -209,8 +209,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>   	return -(u32)fault & errcode;
>   }
>   
> -void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
> -
>   int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>   
>   int kvm_mmu_post_init_vm(struct kvm *kvm);
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 1b203c8aa696..da33f8828456 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -260,6 +260,10 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
>   bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
>   #endif
>   
> +#ifdef __KVM_HAVE_ZAP_GFN_RANGE
> +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
> +#endif
> +
>   enum {
>   	OUTSIDE_GUEST_MODE,
>   	IN_GUEST_MODE,
> @@ -795,6 +799,9 @@ struct kvm {
>   	struct notifier_block pm_notifier;
>   #endif
>   	char stats_id[KVM_STATS_NAME_SIZE];
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> +	struct xarray mem_attr_array;
> +#endif
>   };
>   
>   #define kvm_err(fmt, ...) \
> @@ -1459,6 +1466,7 @@ bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
>   int kvm_arch_post_init_vm(struct kvm *kvm);
>   void kvm_arch_pre_destroy_vm(struct kvm *kvm);
>   int kvm_arch_create_vm_debugfs(struct kvm *kvm);
> +bool kvm_arch_private_mem_supported(struct kvm *kvm);
>   
>   #ifndef __KVM_HAVE_ARCH_VM_ALLOC
>   /*
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 230c8ff9659c..bb714c2a4b06 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -914,6 +914,35 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
>   
>   #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
>   
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> +#define KVM_MEM_ATTR_PRIVATE	0x0001
> +static int kvm_vm_ioctl_set_encrypted_region(struct kvm *kvm, unsigned int ioctl,
> +					     struct kvm_enc_region *region)
> +{
> +	unsigned long start, end;
> +	void *entry;
> +	int r;
> +
> +	if (region->size == 0 || region->addr + region->size < region->addr)
> +		return -EINVAL;
> +	if (region->addr & (PAGE_SIZE - 1) || region->size & (PAGE_SIZE - 1))
> +		return -EINVAL;
> +
> +	start = region->addr >> PAGE_SHIFT;
> +	end = (region->addr + region->size - 1) >> PAGE_SHIFT;
> +
> +	entry = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION ?
> +				xa_mk_value(KVM_MEM_ATTR_PRIVATE) : NULL;
> +
> +	r = xa_err(xa_store_range(&kvm->mem_attr_array, start, end,
> +					entry, GFP_KERNEL_ACCOUNT));
> +
> +	kvm_zap_gfn_range(kvm, start, end + 1);
> +
> +	return r;
> +}
> +#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
> +
>   #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
>   static int kvm_pm_notifier_call(struct notifier_block *bl,
>   				unsigned long state,
> @@ -1138,6 +1167,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
>   	spin_lock_init(&kvm->mn_invalidate_lock);
>   	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
>   	xa_init(&kvm->vcpu_array);
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> +	xa_init(&kvm->mem_attr_array);
> +#endif
>   
>   	INIT_LIST_HEAD(&kvm->gpc_list);
>   	spin_lock_init(&kvm->gpc_lock);
> @@ -1305,6 +1337,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
>   		kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
>   		kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
>   	}
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> +	xa_destroy(&kvm->mem_attr_array);
> +#endif
>   	cleanup_srcu_struct(&kvm->irq_srcu);
>   	cleanup_srcu_struct(&kvm->srcu);
>   	kvm_arch_free_vm(kvm);
> @@ -1508,6 +1543,11 @@ static void kvm_replace_memslot(struct kvm *kvm,
>   	}
>   }
>   
> +bool __weak kvm_arch_private_mem_supported(struct kvm *kvm)
> +{
> +	return false;
> +}

Does this function have to be overridden by SEV and TDX to support the
private regions?

> +
>   static int check_memory_region_flags(const struct kvm_user_mem_region *mem)
>   {
>   	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
> @@ -4689,6 +4729,22 @@ static long kvm_vm_ioctl(struct file *filp,
>   		r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
>   		break;
>   	}
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> +	case KVM_MEMORY_ENCRYPT_REG_REGION:
> +	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
> +		struct kvm_enc_region region;
> +
> +		if (!kvm_arch_private_mem_supported(kvm))
> +			goto arch_vm_ioctl;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&region, argp, sizeof(region)))
> +			goto out;
> +
> +		r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, &region);

this is to store private region metadata not only the encrypted region?

Also, seems same ioctl can be used to put other regions (e.g firmware, 
later maybe DAX backend etc) into private memory?

> +		break;
> +	}
> +#endif
>   	case KVM_GET_DIRTY_LOG: {
>   		struct kvm_dirty_log log;
>   
> @@ -4842,6 +4898,7 @@ static long kvm_vm_ioctl(struct file *filp,
>   		r = kvm_vm_ioctl_get_stats_fd(kvm);
>   		break;
>   	default:
> +arch_vm_ioctl:
>   		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>   	}
>   out:
Chao Peng July 19, 2022, 2:08 p.m. UTC | #2
On Tue, Jul 19, 2022 at 10:00:23AM +0200, Gupta, Pankaj wrote:

...

> > +bool __weak kvm_arch_private_mem_supported(struct kvm *kvm)
> > +{
> > +	return false;
> > +}
> 
> Does this function have to be overridden by SEV and TDX to support the private
> regions?

Yes it should be overridden by architectures which want to support it.
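
For example, an override could look like the sketch below (the vm_type
field and KVM_X86_PROTECTED_VM value are hypothetical stand-ins for
whatever gating an architecture actually uses):

	/* x86 sketch: overrides the __weak default in kvm_main.c. */
	bool kvm_arch_private_mem_supported(struct kvm *kvm)
	{
		/* Hypothetical check: only protected VM types opt in. */
		return kvm->arch.vm_type == KVM_X86_PROTECTED_VM;
	}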

> 
> > +
> >   static int check_memory_region_flags(const struct kvm_user_mem_region *mem)
> >   {
> >   	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
> > @@ -4689,6 +4729,22 @@ static long kvm_vm_ioctl(struct file *filp,
> >   		r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
> >   		break;
> >   	}
> > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> > +	case KVM_MEMORY_ENCRYPT_REG_REGION:
> > +	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
> > +		struct kvm_enc_region region;
> > +
> > +		if (!kvm_arch_private_mem_supported(kvm))
> > +			goto arch_vm_ioctl;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&region, argp, sizeof(region)))
> > +			goto out;
> > +
> > +		r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, &region);
> 
> this is to store private region metadata not only the encrypted region?

Correct.

> 
> Also, seems same ioctl can be used to put other regions (e.g firmware, later
> maybe DAX backend etc) into private memory?

Possibly. It depends on what exactly the semantics is. If we just want to
set those regions as private, the current code already supports that.

Chao
> 
> > +		break;
> > +	}
> > +#endif
> >   	case KVM_GET_DIRTY_LOG: {
> >   		struct kvm_dirty_log log;
> > @@ -4842,6 +4898,7 @@ static long kvm_vm_ioctl(struct file *filp,
> >   		r = kvm_vm_ioctl_get_stats_fd(kvm);
> >   		break;
> >   	default:
> > +arch_vm_ioctl:
> >   		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> >   	}
> >   out:
>
Gupta, Pankaj July 19, 2022, 2:23 p.m. UTC | #3
>>> +bool __weak kvm_arch_private_mem_supported(struct kvm *kvm)
>>> +{
>>> +	return false;
>>> +}
>>
>> Does this function have to be overridden by SEV and TDX to support the private
>> regions?
> 
> Yes it should be overridden by architectures which want to support it.

o.k
> 
>>
>>> +
>>>    static int check_memory_region_flags(const struct kvm_user_mem_region *mem)
>>>    {
>>>    	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
>>> @@ -4689,6 +4729,22 @@ static long kvm_vm_ioctl(struct file *filp,
>>>    		r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
>>>    		break;
>>>    	}
>>> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
>>> +	case KVM_MEMORY_ENCRYPT_REG_REGION:
>>> +	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
>>> +		struct kvm_enc_region region;
>>> +
>>> +		if (!kvm_arch_private_mem_supported(kvm))
>>> +			goto arch_vm_ioctl;
>>> +
>>> +		r = -EFAULT;
>>> +		if (copy_from_user(&region, argp, sizeof(region)))
>>> +			goto out;
>>> +
>>> +		r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, &region);
>>
>> this is to store private region metadata not only the encrypted region?
> 
> Correct.

Sorry for not being clear, was suggesting name change of this function from:
"kvm_vm_ioctl_set_encrypted_region" to "kvm_vm_ioctl_set_private_region"

> 
>>
>> Also, seems same ioctl can be used to put other regions (e.g firmware, later
>> maybe DAX backend etc) into private memory?
> 
> Possibly. It depends on what exactly the semantics is. If we just want to
> set those regions as private, the current code already supports that.

Agree. Sure!


Thanks,
Pankaj
Chao Peng July 20, 2022, 3:07 p.m. UTC | #4
On Tue, Jul 19, 2022 at 04:23:52PM +0200, Gupta, Pankaj wrote:
> 
> > > > +bool __weak kvm_arch_private_mem_supported(struct kvm *kvm)
> > > > +{
> > > > +	return false;
> > > > +}
> > > 
> > > Does this function have to be overridden by SEV and TDX to support the private
> > > regions?
> > 
> > Yes it should be overridden by architectures which want to support it.
> 
> o.k
> > 
> > > 
> > > > +
> > > >    static int check_memory_region_flags(const struct kvm_user_mem_region *mem)
> > > >    {
> > > >    	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
> > > > @@ -4689,6 +4729,22 @@ static long kvm_vm_ioctl(struct file *filp,
> > > >    		r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
> > > >    		break;
> > > >    	}
> > > > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> > > > +	case KVM_MEMORY_ENCRYPT_REG_REGION:
> > > > +	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
> > > > +		struct kvm_enc_region region;
> > > > +
> > > > +		if (!kvm_arch_private_mem_supported(kvm))
> > > > +			goto arch_vm_ioctl;
> > > > +
> > > > +		r = -EFAULT;
> > > > +		if (copy_from_user(&region, argp, sizeof(region)))
> > > > +			goto out;
> > > > +
> > > > +		r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, &region);
> > > 
> > > this is to store private region metadata not only the encrypted region?
> > 
> > Correct.
> 
> Sorry for not being clear, was suggesting name change of this function from:
> "kvm_vm_ioctl_set_encrypted_region" to "kvm_vm_ioctl_set_private_region"

Though I don't have strong reason to change it, I'm fine with this and
this name matches the above kvm_arch_private_mem_supported perfectly.

Thanks,
Chao
> 
> > 
> > > 
> > > Also, seems same ioctl can be used to put other regions (e.g firmware, later
> > > maybe DAX backend etc) into private memory?
> > 
> > Possibly. It depends on what exactly the semantics is. If we just want to
> > set those regions as private, the current code already supports that.
> 
> Agree. Sure!
> 
> 
> Thanks,
> Pankaj
Gupta, Pankaj July 20, 2022, 3:31 p.m. UTC | #5
>>>>> +bool __weak kvm_arch_private_mem_supported(struct kvm *kvm)
>>>>> +{
>>>>> +	return false;
>>>>> +}
>>>>
>>>> Does this function have to be overridden by SEV and TDX to support the private
>>>> regions?
>>>
>>> Yes it should be overridden by architectures which want to support it.
>>
>> o.k
>>>
>>>>
>>>>> +
>>>>>     static int check_memory_region_flags(const struct kvm_user_mem_region *mem)
>>>>>     {
>>>>>     	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
>>>>> @@ -4689,6 +4729,22 @@ static long kvm_vm_ioctl(struct file *filp,
>>>>>     		r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
>>>>>     		break;
>>>>>     	}
>>>>> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
>>>>> +	case KVM_MEMORY_ENCRYPT_REG_REGION:
>>>>> +	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
>>>>> +		struct kvm_enc_region region;
>>>>> +
>>>>> +		if (!kvm_arch_private_mem_supported(kvm))
>>>>> +			goto arch_vm_ioctl;
>>>>> +
>>>>> +		r = -EFAULT;
>>>>> +		if (copy_from_user(&region, argp, sizeof(region)))
>>>>> +			goto out;
>>>>> +
>>>>> +		r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, &region);
>>>>
>>>> this is to store private region metadata not only the encrypted region?
>>>
>>> Correct.
>>
>> Sorry for not being clear, was suggesting name change of this function from:
>> "kvm_vm_ioctl_set_encrypted_region" to "kvm_vm_ioctl_set_private_region"
> 
> Though I don't have strong reason to change it, I'm fine with this and

Yes, no strong reason, just thought "kvm_vm_ioctl_set_private_region" 
would depict the actual functionality :)

> this name matches the above kvm_arch_private_mem_supported perfectly.
BTW, I could not understand this: how does "kvm_vm_ioctl_set_encrypted_region"
match "kvm_arch_private_mem_supported"?

Thanks,
Pankaj
Sean Christopherson July 20, 2022, 4:21 p.m. UTC | #6
On Wed, Jul 20, 2022, Gupta, Pankaj wrote:
> 
> > > > > > +bool __weak kvm_arch_private_mem_supported(struct kvm *kvm)

Use kvm_arch_has_private_mem(), both because "has" makes it obvious this is checking
a flag of sorts, and to align with other helpers of this nature (and with
CONFIG_HAVE_KVM_PRIVATE_MEM).

  $ git grep kvm_arch | grep supported | wc -l
  0
  $ git grep kvm_arch | grep has | wc -l
  26

> > > > > > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> > > > > > +	case KVM_MEMORY_ENCRYPT_REG_REGION:
> > > > > > +	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
> > > > > > +		struct kvm_enc_region region;
> > > > > > +
> > > > > > +		if (!kvm_arch_private_mem_supported(kvm))
> > > > > > +			goto arch_vm_ioctl;
> > > > > > +
> > > > > > +		r = -EFAULT;
> > > > > > +		if (copy_from_user(&region, argp, sizeof(region)))
> > > > > > +			goto out;
> > > > > > +
> > > > > > +		r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, &region);
> > > > > 
> > > > > this is to store private region metadata not only the encrypted region?
> > > > 
> > > > Correct.
> > > 
> > > Sorry for not being clear, was suggesting name change of this function from:
> > > "kvm_vm_ioctl_set_encrypted_region" to "kvm_vm_ioctl_set_private_region"
> > 
> > Though I don't have strong reason to change it, I'm fine with this and
> 
> Yes, no strong reason, just thought "kvm_vm_ioctl_set_private_region" would
> depict the actual functionality :)
> 
> > this name matches the above kvm_arch_private_mem_supported perfectly.
> BTW, I could not understand this: how does "kvm_vm_ioctl_set_encrypted_region"
> match "kvm_arch_private_mem_supported"?

Chao is saying that kvm_vm_ioctl_set_private_region() pairs nicely with
kvm_arch_private_mem_supported(), not that the "encrypted" variant pairs nicely.

I also like using "private" instead of "encrypted", though we should probably
find a different verb than "set", because calling "set_private" when making the
region shared is confusing.  I'm struggling to come up with a good alternative
though.

kvm_vm_ioctl_set_memory_region() is already taken by KVM_SET_USER_MEMORY_REGION,
and that also means that anything with "memory_region" in the name is bound to be
confusing.

Hmm, and if we move away from "encrypted", it probably makes sense to pass in
addr+size instead of a kvm_enc_region.

Maybe this?

static int kvm_vm_ioctl_set_or_clear_mem_private(struct kvm *kvm, gpa_t gpa,
					         gpa_t size, bool set_private)

and then:

#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
	case KVM_MEMORY_ENCRYPT_REG_REGION:
	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
		bool set = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION;
		struct kvm_enc_region region;

		if (!kvm_arch_private_mem_supported(kvm))
			goto arch_vm_ioctl;

		r = -EFAULT;
		if (copy_from_user(&region, argp, sizeof(region)))
			goto out;

		r = kvm_vm_ioctl_set_or_clear_mem_private(kvm, region.addr,
							  region.size, set);
		break;
	}
#endif

I don't love it, so if someone has a better idea...
Sean Christopherson July 20, 2022, 4:44 p.m. UTC | #7
On Wed, Jul 06, 2022, Chao Peng wrote:
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 230c8ff9659c..bb714c2a4b06 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -914,6 +914,35 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
>  
>  #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
>  
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> +#define KVM_MEM_ATTR_PRIVATE	0x0001
> +static int kvm_vm_ioctl_set_encrypted_region(struct kvm *kvm, unsigned int ioctl,
> +					     struct kvm_enc_region *region)
> +{
> +	unsigned long start, end;

As alluded to in a different reply, because this will track GPAs instead of HVAs,
the type needs to be "gpa_t", not "unsigned long".  Oh, actually, they need to
be gfn_t, since those are what gets shoved into the xarray.
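
E.g., a sketch of the type change using the existing gpa_to_gfn() helper:

	gfn_t start = gpa_to_gfn(region->addr);
	gfn_t end = gpa_to_gfn(region->addr + region->size - 1);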

> +	void *entry;
> +	int r;
> +
> +	if (region->size == 0 || region->addr + region->size < region->addr)
> +		return -EINVAL;
> +	if (region->addr & (PAGE_SIZE - 1) || region->size & (PAGE_SIZE - 1))
> +		return -EINVAL;
> +
> +	start = region->addr >> PAGE_SHIFT;
> +	end = (region->addr + region->size - 1) >> PAGE_SHIFT;
> +
> +	entry = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION ?
> +				xa_mk_value(KVM_MEM_ATTR_PRIVATE) : NULL;
> +
> +	r = xa_err(xa_store_range(&kvm->mem_attr_array, start, end,
> +					entry, GFP_KERNEL_ACCOUNT));

IIUC, this series treats memory as shared by default.  I think we should invert
that and have KVM's ABI be that all guest memory is private by default, i.e.
require the guest to opt into sharing memory instead of opt out of sharing memory.

And then the xarray would track which regions are shared.
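
A sketch of the inverted tracking (KVM_MEM_ATTR_SHARED is a hypothetical
value, mirroring KVM_MEM_ATTR_PRIVATE in the patch):

	/* With private as the default, record which ranges have been shared. */
	entry = ioctl == KVM_MEMORY_ENCRYPT_UNREG_REGION ?
			xa_mk_value(KVM_MEM_ATTR_SHARED) : NULL;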

Regarding mem_attr_array, it probably makes sense to explicitly include what it's
tracking in the name, i.e. name it {private,shared}_mem_array depending on whether
it's used to track private vs. shared memory.  If we ever need to track metadata
beyond shared/private then we can tweak the name as needed, e.g. if hardware ever
supports secondary non-ephemeral encryption keys.
Gupta, Pankaj July 20, 2022, 5:41 p.m. UTC | #8
> Use kvm_arch_has_private_mem(), both because "has" makes it obvious this is checking
> a flag of sorts, and to align with other helpers of this nature (and with
> CONFIG_HAVE_KVM_PRIVATE_MEM).
> 
>    $ git grep kvm_arch | grep supported | wc -l
>    0
>    $ git grep kvm_arch | grep has | wc -l
>    26
> 
>>>>>>> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
>>>>>>> +	case KVM_MEMORY_ENCRYPT_REG_REGION:
>>>>>>> +	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
>>>>>>> +		struct kvm_enc_region region;
>>>>>>> +
>>>>>>> +		if (!kvm_arch_private_mem_supported(kvm))
>>>>>>> +			goto arch_vm_ioctl;
>>>>>>> +
>>>>>>> +		r = -EFAULT;
>>>>>>> +		if (copy_from_user(&region, argp, sizeof(region)))
>>>>>>> +			goto out;
>>>>>>> +
>>>>>>> +		r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, &region);
>>>>>>
>>>>>> this is to store private region metadata not only the encrypted region?
>>>>>
>>>>> Correct.
>>>>
>>>> Sorry for not being clear, was suggesting name change of this function from:
>>>> "kvm_vm_ioctl_set_encrypted_region" to "kvm_vm_ioctl_set_private_region"
>>>
>>> Though I don't have strong reason to change it, I'm fine with this and
>>
>> Yes, no strong reason, just thought "kvm_vm_ioctl_set_private_region" would
>> depict the actual functionality :)
>>
>>> this name matches the above kvm_arch_private_mem_supported perfectly.
>> BTW, I could not understand this: how does "kvm_vm_ioctl_set_encrypted_region"
>> match "kvm_arch_private_mem_supported"?
> 
> Chao is saying that kvm_vm_ioctl_set_private_region() pairs nicely with
> kvm_arch_private_mem_supported(), not that the "encrypted" variant pairs nicely.
> 
> I also like using "private" instead of "encrypted", though we should probably
> find a different verb than "set", because calling "set_private" when making the
> region shared is confusing.  I'm struggling to come up with a good alternative
> though.
> 
> kvm_vm_ioctl_set_memory_region() is already taken by KVM_SET_USER_MEMORY_REGION,
> and that also means that anything with "memory_region" in the name is bound to be
> confusing.
> 
> Hmm, and if we move away from "encrypted", it probably makes sense to pass in
> addr+size instead of a kvm_enc_region.
> 
> Maybe this?
> 
> static int kvm_vm_ioctl_set_or_clear_mem_private(struct kvm *kvm, gpa_t gpa,
> 					         gpa_t size, bool set_private)
> 
> and then:
> 
> #ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> 	case KVM_MEMORY_ENCRYPT_REG_REGION:
> 	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
> 		bool set = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION;
> 		struct kvm_enc_region region;
> 
> 		if (!kvm_arch_private_mem_supported(kvm))
> 			goto arch_vm_ioctl;
> 
> 		r = -EFAULT;
> 		if (copy_from_user(&region, argp, sizeof(region)))
> 			goto out;
> 
> 		r = kvm_vm_ioctl_set_or_clear_mem_private(kvm, region.addr,
> 							  region.size, set);
> 		break;
> 	}
> #endif
> 
> I don't love it, so if someone has a better idea...

Both the suggestions look good to me; they bring more clarity.

Thanks,
Pankaj
Wei Wang July 21, 2022, 7:34 a.m. UTC | #9
On 7/21/22 00:21, Sean Christopherson wrote:
> On Wed, Jul 20, 2022, Gupta, Pankaj wrote:
>>>>>>> +bool __weak kvm_arch_private_mem_supported(struct kvm *kvm)
> Use kvm_arch_has_private_mem(), both because "has" makes it obvious this is checking
> a flag of sorts, and to align with other helpers of this nature (and with
> CONFIG_HAVE_KVM_PRIVATE_MEM).
>
>    $ git grep kvm_arch | grep supported | wc -l
>    0
>    $ git grep kvm_arch | grep has | wc -l
>    26
>
>>>>>>> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
>>>>>>> +	case KVM_MEMORY_ENCRYPT_REG_REGION:
>>>>>>> +	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
>>>>>>> +		struct kvm_enc_region region;
>>>>>>> +
>>>>>>> +		if (!kvm_arch_private_mem_supported(kvm))
>>>>>>> +			goto arch_vm_ioctl;
>>>>>>> +
>>>>>>> +		r = -EFAULT;
>>>>>>> +		if (copy_from_user(&region, argp, sizeof(region)))
>>>>>>> +			goto out;
>>>>>>> +
>>>>>>> +		r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, &region);
>>>>>> this is to store private region metadata not only the encrypted region?
>>>>> Correct.
>>>> Sorry for not being clear, was suggesting name change of this function from:
>>>> "kvm_vm_ioctl_set_encrypted_region" to "kvm_vm_ioctl_set_private_region"
>>> Though I don't have strong reason to change it, I'm fine with this and
>> Yes, no strong reason, just thought "kvm_vm_ioctl_set_private_region" would
>> depict the actual functionality :)
>>
>>> this name matches the above kvm_arch_private_mem_supported perfectly.
>> BTW, I could not understand this: how does "kvm_vm_ioctl_set_encrypted_region"
>> match "kvm_arch_private_mem_supported"?
> Chao is saying that kvm_vm_ioctl_set_private_region() pairs nicely with
> kvm_arch_private_mem_supported(), not that the "encrypted" variant pairs nicely.
>
> I also like using "private" instead of "encrypted", though we should probably
> find a different verb than "set", because calling "set_private" when making the
> region shared is confusing.  I'm struggling to come up with a good alternative
> though.
>
> kvm_vm_ioctl_set_memory_region() is already taken by KVM_SET_USER_MEMORY_REGION,
> and that also means that anything with "memory_region" in the name is bound to be
> confusing.
>
> Hmm, and if we move away from "encrypted", it probably makes sense to pass in
> addr+size instead of a kvm_enc_region.
>
> Maybe this?
>
> static int kvm_vm_ioctl_set_or_clear_mem_private(struct kvm *kvm, gpa_t gpa,
> 					         gpa_t size, bool set_private)
>
> and then:
>
> #ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> 	case KVM_MEMORY_ENCRYPT_REG_REGION:
> 	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
> 		bool set = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION;
> 		struct kvm_enc_region region;
>
> 		if (!kvm_arch_private_mem_supported(kvm))
> 			goto arch_vm_ioctl;
>
> 		r = -EFAULT;
> 		if (copy_from_user(&region, argp, sizeof(region)))
> 			goto out;
>
> 		r = kvm_vm_ioctl_set_or_clear_mem_private(kvm, region.addr,
> 							  region.size, set);
> 		break;
> 	}
> #endif
>
> I don't love it, so if someone has a better idea...
>
Maybe you could tag it with cgs for all the confidential guest support 
related stuff:
e.g. kvm_vm_ioctl_set_cgs_mem()

bool is_private = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION;
...
kvm_vm_ioctl_set_cgs_mem(, is_private)
Chao Peng July 21, 2022, 9:29 a.m. UTC | #10
On Thu, Jul 21, 2022 at 03:34:59PM +0800, Wei Wang wrote:
> 
> 
> On 7/21/22 00:21, Sean Christopherson wrote:
> > On Wed, Jul 20, 2022, Gupta, Pankaj wrote:
> > > > > > > > +bool __weak kvm_arch_private_mem_supported(struct kvm *kvm)
> > Use kvm_arch_has_private_mem(), both because "has" makes it obvious this is checking
> > a flag of sorts, and to align with other helpers of this nature (and with
> > CONFIG_HAVE_KVM_PRIVATE_MEM).
> > 
> >    $ git grep kvm_arch | grep supported | wc -l
> >    0
> >    $ git grep kvm_arch | grep has | wc -l
> >    26

Makes sense. kvm_arch_has_private_mem is actually better.

> > 
> > > > > > > > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> > > > > > > > +	case KVM_MEMORY_ENCRYPT_REG_REGION:
> > > > > > > > +	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
> > > > > > > > +		struct kvm_enc_region region;
> > > > > > > > +
> > > > > > > > +		if (!kvm_arch_private_mem_supported(kvm))
> > > > > > > > +			goto arch_vm_ioctl;
> > > > > > > > +
> > > > > > > > +		r = -EFAULT;
> > > > > > > > +		if (copy_from_user(&region, argp, sizeof(region)))
> > > > > > > > +			goto out;
> > > > > > > > +
> > > > > > > > +		r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, &region);
> > > > > > > this is to store private region metadata not only the encrypted region?
> > > > > > Correct.
> > > > > Sorry for not being clear, was suggesting name change of this function from:
> > > > > "kvm_vm_ioctl_set_encrypted_region" to "kvm_vm_ioctl_set_private_region"
> > > > Though I don't have strong reason to change it, I'm fine with this and
> > > Yes, no strong reason, just thought "kvm_vm_ioctl_set_private_region" would
> > > depict the actual functionality :)
> > > 
> > > > this name matches the above kvm_arch_private_mem_supported perfectly.
> > > BTW, I could not understand this: how does "kvm_vm_ioctl_set_encrypted_region"
> > > match "kvm_arch_private_mem_supported"?
> > Chao is saying that kvm_vm_ioctl_set_private_region() pairs nicely with
> > kvm_arch_private_mem_supported(), not that the "encrypted" variant pairs nicely.
> > 
> > I also like using "private" instead of "encrypted", though we should probably
> > find a different verb than "set", because calling "set_private" when making the
> > region shared is confusing.  I'm struggling to come up with a good alternative
> > though.
> > 
> > kvm_vm_ioctl_set_memory_region() is already taken by KVM_SET_USER_MEMORY_REGION,
> > and that also means that anything with "memory_region" in the name is bound to be
> > confusing.
> > 
> > Hmm, and if we move away from "encrypted", it probably makes sense to pass in
> > addr+size instead of a kvm_enc_region.

This makes sense.

> > 
> > Maybe this?
> > 
> > static int kvm_vm_ioctl_set_or_clear_mem_private(struct kvm *kvm, gpa_t gpa,
> > 					         gpa_t size, bool set_private)

Currently this should work.

> > 
> > and then:
> > 
> > #ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> > 	case KVM_MEMORY_ENCRYPT_REG_REGION:
> > 	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
> > 		bool set = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION;
> > 		struct kvm_enc_region region;
> > 
> > 		if (!kvm_arch_private_mem_supported(kvm))
> > 			goto arch_vm_ioctl;
> > 
> > 		r = -EFAULT;
> > 		if (copy_from_user(&region, argp, sizeof(region)))
> > 			goto out;
> > 
> > 		r = kvm_vm_ioctl_set_or_clear_mem_private(kvm, region.addr,
> > 							  region.size, set);
> > 		break;
> > 	}
> > #endif
> > 
> > I don't love it, so if someone has a better idea...
> > 
> Maybe you could tag it with cgs for all the confidential guest support
> related stuff:
> e.g. kvm_vm_ioctl_set_cgs_mem()
> 
> bool is_private = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION;
> ...
> kvm_vm_ioctl_set_cgs_mem(, is_private)

If we plan to widely use such abbr. through KVM (e.g. it's well known),
I'm fine.

I actually use mem_attr in patch: https://lkml.org/lkml/2022/7/20/610
But I also don't quite like it, it's so generic and says nothing.

But I do want a name that can cover future usages other than just
private/shared (pKVM for example may have a third state).

Thanks,
Chao
Chao Peng July 21, 2022, 9:37 a.m. UTC | #11
On Wed, Jul 20, 2022 at 04:44:32PM +0000, Sean Christopherson wrote:
> On Wed, Jul 06, 2022, Chao Peng wrote:
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 230c8ff9659c..bb714c2a4b06 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -914,6 +914,35 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
> >  
> >  #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
> >  
> > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> > +#define KVM_MEM_ATTR_PRIVATE	0x0001
> > +static int kvm_vm_ioctl_set_encrypted_region(struct kvm *kvm, unsigned int ioctl,
> > +					     struct kvm_enc_region *region)
> > +{
> > +	unsigned long start, end;
> 
> As alluded to in a different reply, because this will track GPAs instead of HVAs,
> the type needs to be "gpa_t", not "unsigned long".  Oh, actually, they need to
> be gfn_t, since those are what gets shoved into the xarray.

It's gfn_t actually. My original purpose for this was that 32-bit
architectures (if any) could also work with it, since the xarray index is
32-bit on those architectures. But kvm_enc_region is u64, so that's not
even possible anyway.

> 
> > +	void *entry;
> > +	int r;
> > +
> > +	if (region->size == 0 || region->addr + region->size < region->addr)
> > +		return -EINVAL;
> > +	if (region->addr & (PAGE_SIZE - 1) || region->size & (PAGE_SIZE - 1))
> > +		return -EINVAL;
> > +
> > +	start = region->addr >> PAGE_SHIFT;
> > +	end = (region->addr + region->size - 1) >> PAGE_SHIFT;
> > +
> > +	entry = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION ?
> > +				xa_mk_value(KVM_MEM_ATTR_PRIVATE) : NULL;
> > +
> > +	r = xa_err(xa_store_range(&kvm->mem_attr_array, start, end,
> > +					entry, GFP_KERNEL_ACCOUNT));
> 
> IIUC, this series treats memory as shared by default.  I think we should invert
> that and have KVM's ABI be that all guest memory is private by default, i.e.
> require the guest to opt into sharing memory instead of opt out of sharing memory.
> 
> And then the xarray would track which regions are shared.

Maybe I missed some information discussed elsewhere? I followed
https://lkml.org/lkml/2022/5/23/772. KVM treats memory as shared by
default, but userspace should set all guest memory to private before the
guest launches, so the guest then sees all memory as private. While
defaulting to private also sounds good, if we only talk about
private/shared in the private memory context (I think so), then there is
no ambiguity.

> 
> Regarding mem_attr_array, it probably makes sense to explicitly include what it's
> tracking in the name, i.e. name it {private,shared}_mem_array depending on whether
> it's used to track private vs. shared memory.  If we ever need to track metadata
> beyond shared/private then we can tweak the name as needed, e.g. if hardware ever
> supports secondary non-ephemeral encryption keys.

I think there may be other states beyond that. But I'm fine with only
considering private/shared for now, and it also sounds reasonable for
people who want to support more states to change this later.

Chao
Sean Christopherson July 21, 2022, 5:58 p.m. UTC | #12
On Thu, Jul 21, 2022, Chao Peng wrote:
> On Thu, Jul 21, 2022 at 03:34:59PM +0800, Wei Wang wrote:
> > 
> > 
> > On 7/21/22 00:21, Sean Christopherson wrote:
> > Maybe you could tag it with cgs for all the confidential guest support
> > related stuff: e.g. kvm_vm_ioctl_set_cgs_mem()
> > 
> > bool is_private = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION;
> > ...
> > kvm_vm_ioctl_set_cgs_mem(, is_private)
> 
> If we plan to widely use such abbr. through KVM (e.g. it's well known),
> I'm fine.

I'd prefer to stay away from "confidential guest", and away from any VM-scoped
name for that matter.  User-unmappable memory has use cases beyond hiding guest
state from the host, e.g. userspace could use inaccessible/unmappable memory to
harden itself against unintentional access to guest memory.

> I actually use mem_attr in patch: https://lkml.org/lkml/2022/7/20/610
> But I also don't quite like it, it's so generic and says nothing.
> 
> But I do want a name that can cover future usages other than just 
> private/shared (pKVM for example may have a third state).

I don't think there can be a third top-level state.  Memory is either private to
the guest or it's not.  There can be sub-states, e.g. memory could be selectively
shared or encrypted with a different key, in which case we'd need metadata to
track that state.

Though that begs the question of whether or not private_fd is the correct
terminology.  E.g. if guest memory is backed by a memfd that can't be mapped by
userspace (currently F_SEAL_INACCESSIBLE), but something else in the kernel plugs
that memory into a device or another VM, then arguably that memory is shared,
especially the multi-VM scenario.

For TDX and SNP "private vs. shared" is likely the correct terminology given the
current specs, but for generic KVM it's probably better to align with whatever
terminology is used for memfd.  "inaccessible_fd" and "user_inaccessible_fd" are
a bit odd since the fd itself is accessible.

What about "user_unmappable"?  E.g.

  F_SEAL_USER_UNMAPPABLE, MFD_USER_UNMAPPABLE, KVM_HAS_USER_UNMAPPABLE_MEMORY,
  MEMFILE_F_USER_INACCESSIBLE, user_unmappable_fd, etc...

that gives us flexibility to map the memory from within the kernel, e.g. into
other VMs or devices.

Hmm, and then keep your original "mem_attr_array" name?  And probably 

 int kvm_vm_ioctl_set_mem_attr(struct kvm *kvm, gpa_t gpa, gpa_t size,
 			       bool is_user_mappable)

Then the x86/mmu code for TDX/SNP private faults could be:

	is_private = !kvm_is_gpa_user_mappable();

	if (fault->is_private != is_private) {

or if we want to avoid mixing up "user_mappable" and "user_unmappable":

	is_private = kvm_is_gpa_user_unmappable();

	if (fault->is_private != is_private) {

though a helper that returns a negative (not mappable) feels kludgy.  And I like
kvm_is_gpa_user_mappable() because then when there's not "special" memory, it
defaults to true, which is more intuitive IMO.

And then if the future needs more precision, e.g. user-unmappable memory isn't
necessarily guest-exclusive, the uAPI names still work even though KVM internals
will need to be reworked, but that's unavoidable.  E.g. piggybacking
KVM_MEMORY_ENCRYPT_(UN)REG_REGION doesn't allow for further differentiation,
so we'd need to _extend_ the uAPI, but the _existing_ uAPI would still be sane.
Chao Peng July 25, 2022, 1:04 p.m. UTC | #13
On Thu, Jul 21, 2022 at 05:58:50PM +0000, Sean Christopherson wrote:
> On Thu, Jul 21, 2022, Chao Peng wrote:
> > On Thu, Jul 21, 2022 at 03:34:59PM +0800, Wei Wang wrote:
> > > 
> > > 
> > > On 7/21/22 00:21, Sean Christopherson wrote:
> > > Maybe you could tag it with cgs for all the confidential guest support
> > > related stuff: e.g. kvm_vm_ioctl_set_cgs_mem()
> > > 
> > > bool is_private = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION;
> > > ...
> > > kvm_vm_ioctl_set_cgs_mem(, is_private)
> > 
> > If we plan to widely use such abbr. through KVM (e.g. it's well known),
> > I'm fine.
> 
> I'd prefer to stay away from "confidential guest", and away from any VM-scoped
> name for that matter.  User-unmappable memory has use cases beyond hiding guest
> state from the host, e.g. userspace could use inaccessible/unmappable memory to
> harden itself against unintentional access to guest memory.
> 
> > I actually use mem_attr in patch: https://lkml.org/lkml/2022/7/20/610
> > But I also don't quite like it, it's so generic and says nothing.
> > 
> > But I do want a name that can cover future usages other than just 
> > private/shared (pKVM for example may have a third state).
> 
> I don't think there can be a third top-level state.  Memory is either private to
> the guest or it's not.  There can be sub-states, e.g. memory could be selectively
> shared or encrypted with a different key, in which case we'd need metadata to
> track that state.
> 
> Though that begs the question of whether or not private_fd is the correct
> terminology.  E.g. if guest memory is backed by a memfd that can't be mapped by
> userspace (currently F_SEAL_INACCESSIBLE), but something else in the kernel plugs
> that memory into a device or another VM, then arguably that memory is shared,
> especially the multi-VM scenario.
> 
> For TDX and SNP "private vs. shared" is likely the correct terminology given the
> current specs, but for generic KVM it's probably better to align with whatever
> terminology is used for memfd.  "inaccessible_fd" and "user_inaccessible_fd" are
> a bit odd since the fd itself is accessible.
> 
> What about "user_unmappable"?  E.g.
> 
>   F_SEAL_USER_UNMAPPABLE, MFD_USER_UNMAPPABLE, KVM_HAS_USER_UNMAPPABLE_MEMORY,
>   MEMFILE_F_USER_INACCESSIBLE, user_unmappable_fd, etc...

For KVM I also think user_unmappable looks better than 'private', e.g.
user_unmappable_fd/KVM_HAS_USER_UNMAPPABLE_MEMORY sound like more
appropriate names. For memfd however, I don't feel that strongly about
changing it from the current 'inaccessible' to 'user_unmappable'; one
reason is that it's not just about being unmappable, the memory is
actually also inaccessible through direct syscalls like read()/write().
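
For reference, a sketch of the corresponding userspace flow
(F_SEAL_INACCESSIBLE comes from earlier patches in this series, not from
upstream fcntl(); treat the flag and flow as illustrative):

	int fd = memfd_create("guest-mem", MFD_ALLOW_SEALING);

	/* After this, mmap()/read()/write() on the fd are refused. */
	fcntl(fd, F_ADD_SEALS, F_SEAL_INACCESSIBLE);

	/* The fd is then handed to KVM via the private memslot extension. */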

> 
> that gives us flexibility to map the memory from within the kernel, e.g. into
> other VMs or devices.
> 
> Hmm, and then keep your original "mem_attr_array" name?  And probably 
> 
>  int kvm_vm_ioctl_set_mem_attr(struct kvm *kvm, gpa_t gpa, gpa_t size,
>  			       bool is_user_mappable)
> 
> Then the x86/mmu code for TDX/SNP private faults could be:
> 
> 	is_private = !kvm_is_gpa_user_mappable();
> 
> 	if (fault->is_private != is_private) {
> 
> or if we want to avoid mixing up "user_mappable" and "user_unmappable":
> 
> 	is_private = kvm_is_gpa_user_unmappable();
> 
> 	if (fault->is_private != is_private) {
> 
> though a helper that returns a negative (not mappable) feels kludgy.  And I like
> kvm_is_gpa_user_mappable() because then when there's not "special" memory, it
> defaults to true, which is more intuitive IMO.

yes.

> 
> And then if the future needs more precision, e.g. user-unmappable memory isn't
> necessarily guest-exclusive, the uAPI names still work even though KVM internals
> will need to be reworked, but that's unavoidable.  E.g. piggybacking
> KVM_MEMORY_ENCRYPT_(UN)REG_REGION doesn't allow for further differentiation,
> so we'd need to _extend_ the uAPI, but the _existing_ uAPI would still be sane.

Right, that has to be extended.

Chao
Sean Christopherson July 29, 2022, 7:54 p.m. UTC | #14
On Mon, Jul 25, 2022, Chao Peng wrote:
> On Thu, Jul 21, 2022 at 05:58:50PM +0000, Sean Christopherson wrote:
> > On Thu, Jul 21, 2022, Chao Peng wrote:
> > > On Thu, Jul 21, 2022 at 03:34:59PM +0800, Wei Wang wrote:
> > > > 
> > > > 
> > > > On 7/21/22 00:21, Sean Christopherson wrote:
> > > > Maybe you could tag it with cgs for all the confidential guest support
> > > > related stuff: e.g. kvm_vm_ioctl_set_cgs_mem()
> > > > 
> > > > bool is_private = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION;
> > > > ...
> > > > kvm_vm_ioctl_set_cgs_mem(, is_private)
> > > 
> > > If we plan to widely use such abbr. through KVM (e.g. it's well known),
> > > I'm fine.
> > 
> > I'd prefer to stay away from "confidential guest", and away from any VM-scoped
> > name for that matter.  User-unmappable memory has use cases beyond hiding guest
> > state from the host, e.g. userspace could use inaccessible/unmappable memory to
> > harden itself against unintentional access to guest memory.
> > 
> > > I actually use mem_attr in patch: https://lkml.org/lkml/2022/7/20/610
> > > But I also don't quite like it, it's so generic and says nothing.
> > > 
> > > But I do want a name that can cover future usages other than just 
> > > private/shared (pKVM for example may have a third state).
> > 
> > I don't think there can be a third top-level state.  Memory is either private to
> > the guest or it's not.  There can be sub-states, e.g. memory could be selectively
> > shared or encrypted with a different key, in which case we'd need metadata to
> > track that state.
> > 
> > Though that begs the question of whether or not private_fd is the correct
> > terminology.  E.g. if guest memory is backed by a memfd that can't be mapped by
> > userspace (currently F_SEAL_INACCESSIBLE), but something else in the kernel plugs
> > that memory into a device or another VM, then arguably that memory is shared,
> > especially the multi-VM scenario.
> > 
> > For TDX and SNP "private vs. shared" is likely the correct terminology given the
> > current specs, but for generic KVM it's probably better to align with whatever
> > terminology is used for memfd.  "inaccessible_fd" and "user_inaccessible_fd" are
> > a bit odd since the fd itself is accessible.
> > 
> > What about "user_unmappable"?  E.g.
> > 
> >   F_SEAL_USER_UNMAPPABLE, MFD_USER_UNMAPPABLE, KVM_HAS_USER_UNMAPPABLE_MEMORY,
> >   MEMFILE_F_USER_INACCESSIBLE, user_unmappable_fd, etc...
> 
> For KVM I also think user_unmappable looks better than 'private', e.g.
> user_unmappable_fd/KVM_HAS_USER_UNMAPPABLE_MEMORY sound like more
> appropriate names. For memfd however, I don't feel that strongly about
> changing it from the current 'inaccessible' to 'user_unmappable'; one
> reason is that it's not just about being unmappable, the memory is
> actually also inaccessible through direct syscalls like read()/write().

Heh, I _knew_ there had to be a catch.  I agree that INACCESSIBLE is better for
memfd.
Sean Christopherson Aug. 2, 2022, 12:49 a.m. UTC | #15
On Fri, Jul 29, 2022, Sean Christopherson wrote:
> On Mon, Jul 25, 2022, Chao Peng wrote:
> > On Thu, Jul 21, 2022 at 05:58:50PM +0000, Sean Christopherson wrote:
> > > On Thu, Jul 21, 2022, Chao Peng wrote:
> > > > On Thu, Jul 21, 2022 at 03:34:59PM +0800, Wei Wang wrote:
> > > > > 
> > > > > 
> > > > > On 7/21/22 00:21, Sean Christopherson wrote:
> > > > > Maybe you could tag it with cgs for all the confidential guest support
> > > > > related stuff: e.g. kvm_vm_ioctl_set_cgs_mem()
> > > > > 
> > > > > bool is_private = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION;
> > > > > ...
> > > > > kvm_vm_ioctl_set_cgs_mem(, is_private)
> > > > 
> > > > If we plan to widely use such abbr. through KVM (e.g. it's well known),
> > > > I'm fine.
> > > 
> > > I'd prefer to stay away from "confidential guest", and away from any VM-scoped
> > > name for that matter.  User-unmappable memory has use cases beyond hiding guest
> > > state from the host, e.g. userspace could use inaccessible/unmappable memory to
> > > harden itself against unintentional access to guest memory.
> > > 
> > > > I actually use mem_attr in patch: https://lkml.org/lkml/2022/7/20/610
> > > > But I also don't quite like it, it's so generic and says nothing.
> > > > 
> > > > But I do want a name that can cover future usages other than just 
> > > > private/shared (pKVM for example may have a third state).
> > > 
> > > I don't think there can be a third top-level state.  Memory is either private to
> > > the guest or it's not.  There can be sub-states, e.g. memory could be selectively
> > > shared or encrypted with a different key, in which case we'd need metadata to
> > > track that state.
> > > 
> > > Though that begs the question of whether or not private_fd is the correct
> > > terminology.  E.g. if guest memory is backed by a memfd that can't be mapped by
> > > userspace (currently F_SEAL_INACCESSIBLE), but something else in the kernel plugs
> > > that memory into a device or another VM, then arguably that memory is shared,
> > > especially the multi-VM scenario.
> > > 
> > > For TDX and SNP "private vs. shared" is likely the correct terminology given the
> > > current specs, but for generic KVM it's probably better to align with whatever
> > > terminology is used for memfd.  "inaccessible_fd" and "user_inaccessible_fd" are
> > > a bit odd since the fd itself is accessible.
> > > 
> > > What about "user_unmappable"?  E.g.
> > > 
> > >   F_SEAL_USER_UNMAPPABLE, MFD_USER_UNMAPPABLE, KVM_HAS_USER_UNMAPPABLE_MEMORY,
> > >   MEMFILE_F_USER_INACCESSIBLE, user_unmappable_fd, etc...
> > 
> > For KVM I also think user_unmappable looks better than 'private', e.g.
> > user_unmappable_fd/KVM_HAS_USER_UNMAPPABLE_MEMORY sound like more
> > appropriate names. For memfd however, I don't feel that strongly about
> > changing it from the current 'inaccessible' to 'user_unmappable'; one
> > reason is that it's not just about being unmappable, the memory is
> > actually also inaccessible through direct syscalls like read()/write().
> 
> Heh, I _knew_ there had to be a catch.  I agree that INACCESSIBLE is better for
> memfd.

Thought about this some more...

I think we should avoid UNMAPPABLE even on the KVM side of things for the core
memslots functionality and instead be very literal, e.g.

	KVM_HAS_FD_BASED_MEMSLOTS
	KVM_MEM_FD_VALID

We'll still need KVM_HAS_USER_UNMAPPABLE_MEMORY, but it won't be tied directly to
the memslot.  Decoupling the two things will require a bit of extra work, but the
code impact should be quite small, e.g. explicitly query and propagate
MEMFILE_F_USER_INACCESSIBLE to kvm_memory_slot to track if a memslot can be private.
And unless I'm missing something, it won't require an additional memslot flag.
The biggest oddity (if we don't also add KVM_MEM_PRIVATE) is that KVM would
effectively ignore the hva for fd-based memslots for VM types that don't support
private memory, i.e. userspace can't opt out of using the fd-based backing, but that
doesn't seem like a deal breaker.
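
A sketch of that propagation (can_be_private and the way the backing
store's flags are obtained are hypothetical):

	/* At memslot creation: remember if the backing fd can be private. */
	slot->can_be_private = !!(memfile_flags & MEMFILE_F_USER_INACCESSIBLE);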

Decoupling private memory from fd-based memslots will allow using fd-based memslots
for backing VMs even if the memory is user mappable, which opens up potentially
interesting use cases.  It would also allow testing some parts of fd-based memslots
with existing VMs.

The big advantage of KVM's hva-based memslots is that KVM doesn't care what's backing
a memslot, and so (in theory) enabling new backing stores for KVM is free.  It's not
always free, but at this point I think we've eliminated most of the hiccups, e.g. x86's
MMU should no longer require additional enlightenment to support huge pages for new
backing types.

On the flip-side, a big disadvantage of hva-based memslots is that KVM doesn't
_know_ what's backing a memslot.  This is one of the major reasons, if not _the_
main reason at this point, why KVM binds a VM to a single virtual address space.
Running with different hva=>pfn mappings would either be completely unsafe or
prohibitively expensive (nearly impossible?) to ensure.

With fd-based memslots, KVM essentially binds a memslot directly to the backing
store.  This allows KVM to do a "deep" comparison of a memslot between two address
spaces simply by checking that the backing store is the same.  For intra-host/copyless
migration (to upgrade the userspace VMM), being able to do a deep comparison would
theoretically allow transferring KVM's page tables between VMs instead of forcing
the target VM to rebuild the page tables.  There are memcg complications (and probably
many others) for transferring page tables, but I'm pretty sure it could work.

I don't have a concrete use case (this is a recent idea on my end), but since we're
already adding fd-based memory, I can't think of a good reason not to make it more generic
for not much extra cost.  And there are definitely classes of VMs for which fd-based
memory would Just Work, e.g. large VMs that are never oversubscribed on memory don't
need to support reclaim, so the fact that fd-based memslots won't support page aging
(among other things) right away is a non-issue.
Sean Christopherson Aug. 2, 2022, 4:38 p.m. UTC | #16
On Tue, Aug 02, 2022, Sean Christopherson wrote:
> I think we should avoid UNMAPPABLE even on the KVM side of things for the core
> memslots functionality and instead be very literal, e.g.
> 
> 	KVM_HAS_FD_BASED_MEMSLOTS
> 	KVM_MEM_FD_VALID
> 
> We'll still need KVM_HAS_USER_UNMAPPABLE_MEMORY, but it won't be tied directly to
> the memslot.  Decoupling the two things will require a bit of extra work, but the
> code impact should be quite small, e.g. explicitly query and propagate
> MEMFILE_F_USER_INACCESSIBLE to kvm_memory_slot to track if a memslot can be private.
> And unless I'm missing something, it won't require an additional memslot flag.
> The biggest oddity (if we don't also add KVM_MEM_PRIVATE) is that KVM would
> effectively ignore the hva for fd-based memslots for VM types that don't support
> private memory, i.e. userspace can't opt out of using the fd-based backing, but that
> doesn't seem like a deal breaker.

Hrm, but basing private memory on top of a generic FD_VALID would effectively require
shared memory to use hva-based memslots for confidential VMs.  That'd yield a very
weird API, e.g. non-confidential VMs could be backed entirely by fd-based memslots,
but confidential VMs would be forced to use hva-based memslots.

Ignore this idea for now.  If there's an actual use case for generic fd-based memory
then we'll want a separate flag, fd, and offset, i.e. that support could be added
independent of KVM_MEM_PRIVATE.
Chao Peng Aug. 3, 2022, 9:48 a.m. UTC | #17
On Tue, Aug 02, 2022 at 04:38:55PM +0000, Sean Christopherson wrote:
> On Tue, Aug 02, 2022, Sean Christopherson wrote:
> > I think we should avoid UNMAPPABLE even on the KVM side of things for the core
> > memslots functionality and instead be very literal, e.g.
> > 
> > 	KVM_HAS_FD_BASED_MEMSLOTS
> > 	KVM_MEM_FD_VALID
> > 
> > We'll still need KVM_HAS_USER_UNMAPPABLE_MEMORY, but it won't be tied directly to
> > the memslot.  Decoupling the two things will require a bit of extra work, but the
> > code impact should be quite small, e.g. explicitly query and propagate
> > MEMFILE_F_USER_INACCESSIBLE to kvm_memory_slot to track if a memslot can be private.
> > And unless I'm missing something, it won't require an additional memslot flag.
> > The biggest oddity (if we don't also add KVM_MEM_PRIVATE) is that KVM would
> > effectively ignore the hva for fd-based memslots for VM types that don't support
> > private memory, i.e. userspace can't opt out of using the fd-based backing, but that
> > doesn't seem like a deal breaker.

I actually love this idea. I don't mind adding extra code for potential
usage other than confidential VMs if we can have a workable solution for
it.

> 
> Hrm, but basing private memory on top of a generic FD_VALID would effectively require
> shared memory to use hva-based memslots for confidential VMs.  That'd yield a very
> weird API, e.g. non-confidential VMs could be backed entirely by fd-based memslots,
> but confidential VMs would be forced to use hva-based memslots.

It would work if we can treat userspace_addr as optional for
KVM_MEM_FD_VALID, e.g. userspace can opt in to decide whether it needs
the mappable part or not for a regular VM, and KVM can enforce fd-only
for confidential VMs. But the u64 type of userspace_addr doesn't allow
us to express a 'null' value, so it sounds like we will end up needing
another flag anyway.

In concept, we could have three configurations here:
  1. hva-only: without any flag and use userspace_addr;
  2. fd-only:  another new flag is needed and use fd/offset;
  3. hva/fd mixed: both userspace_addr and fd/offset is effective.
     KVM_MEM_PRIVATE is a subset of it for confidential VMs. Not sure
     regular VM also wants this.

There is no direct relationship between unmappable and fd-based since
even fd-based can also be mappable for regular VM?

> 
> Ignore this idea for now.  If there's an actual use case for generic fd-based memory
> then we'll want a separate flag, fd, and offset, i.e. that support could be added
> independent of KVM_MEM_PRIVATE.

If we ignore this idea for now (which I'm also fine with), do you still think
we need to change KVM_MEM_PRIVATE to KVM_MEM_USER_UNMAPPABLE?

Thanks,
Chao
Sean Christopherson Aug. 3, 2022, 3:51 p.m. UTC | #18
On Wed, Aug 03, 2022, Chao Peng wrote:
> On Tue, Aug 02, 2022 at 04:38:55PM +0000, Sean Christopherson wrote:
> > On Tue, Aug 02, 2022, Sean Christopherson wrote:
> > > I think we should avoid UNMAPPABLE even on the KVM side of things for the core
> > > memslots functionality and instead be very literal, e.g.
> > > 
> > > 	KVM_HAS_FD_BASED_MEMSLOTS
> > > 	KVM_MEM_FD_VALID
> > > 
> > > We'll still need KVM_HAS_USER_UNMAPPABLE_MEMORY, but it won't be tied directly to
> > > the memslot.  Decoupling the two things will require a bit of extra work, but the
> > > code impact should be quite small, e.g. explicitly query and propagate
> > > MEMFILE_F_USER_INACCESSIBLE to kvm_memory_slot to track if a memslot can be private.
> > > And unless I'm missing something, it won't require an additional memslot flag.
> > > The biggest oddity (if we don't also add KVM_MEM_PRIVATE) is that KVM would
> > > effectively ignore the hva for fd-based memslots for VM types that don't support
> > > private memory, i.e. userspace can't opt out of using the fd-based backing, but that
> > > doesn't seem like a deal breaker.
> 
> I actually love this idea. I don't mind adding extra code for potential
> usage other than confidential VMs if we can have a workable solution for
> it.
> 
> > 
> > Hrm, but basing private memory on top of a generic FD_VALID would effectively require
> > shared memory to use hva-based memslots for confidential VMs.  That'd yield a very
> > weird API, e.g. non-confidential VMs could be backed entirely by fd-based memslots,
> > but confidential VMs would be forced to use hva-based memslots.
> 
> It would work if we could treat userspace_addr as optional for
> KVM_MEM_FD_VALID, e.g. userspace can opt in to decide whether it needs
> the mappable part for a regular VM, and KVM can enforce fd-only for
> confidential VMs. But the u64 type of userspace_addr doesn't allow us to
> express a 'null' value (zero is itself a valid hva), so it sounds like
> we will end up needing another flag anyway.
> 
> In concept, we could have three configurations here:
>   1. hva-only: without any flag, using userspace_addr;
>   2. fd-only:  another new flag is needed, using fd/offset;
>   3. hva/fd mixed: both userspace_addr and fd/offset are effective.
>      KVM_MEM_PRIVATE is a subset of this for confidential VMs. Not sure
>      a regular VM also wants this.

My mental model breaks things down slightly differently, though the end result is
more or less the same. 

After this series, there will be two types of memory: private and "regular" (I'm
trying to avoid "shared").  "Regular" memory is always hva-based (userspace_addr),
and private always fd-based (fd+offset).

In the future, if we want to support fd-based memory for "regular" memory, then
as you said we'd need to add a new flag, and a new fd+offset pair.

At that point, we'd have two new (relative to current) flags:

  KVM_MEM_PRIVATE_FD_VALID
  KVM_MEM_FD_VALID

along with two new pairs of fd+offset (private_* and "regular").  Mapping those
to your above list:
  
  1.  Neither *_FD_VALID flag set.
  2a. Both PRIVATE_FD_VALID and FD_VALID are set
  2b. FD_VALID is set and the VM doesn't support private memory
  3.  Only PRIVATE_FD_VALID is set (with private memory support in the VM).

Thus, "regular" VMs can't have a mix in a single memslot because they can't use
private memory.

> There is no direct relationship between unmappable and fd-based, since
> even fd-based memory can also be mappable for a regular VM?

Yep.

> > Ignore this idea for now.  If there's an actual use case for generic fd-based memory
> > then we'll want a separate flag, fd, and offset, i.e. that support could be added
> > independent of KVM_MEM_PRIVATE.
> 
> If we ignore this idea for now (which I'm also fine with), do you still
> think we need to change KVM_MEM_PRIVATE to KVM_MEM_USER_UNMAPPABLE?

Hmm, no.  After working through this, I think it's safe to say KVM_MEM_USER_UNMAPPABLE
is a bad name because we could end up with "regular" memory that's backed by an
inaccessible (unmappable) file.

One alternative would be to call it KVM_MEM_PROTECTED.  That shouldn't cause
problems for the known use of "private" (TDX and SNP), and it gives us a little
wiggle room, e.g. if we ever get a use case where VMs can share memory that is
otherwise protected.

That's a pretty big "if" though, and odds are good we'd need more memslot flags and
fd+offset pairs to allow differentiating "private" vs. "protected-shared" without
forcing userspace to punch holes in memslots, so I don't know that hedging now will
buy us anything.

So I'd say that if people think KVM_MEM_PRIVATE brings additional and meaningful
clarity over KVM_MEM_PROTECTED, then let's go with PRIVATE.  But if PROTECTED is
just as good, go with PROTECTED as it gives us a wee bit of wiggle room for the
future.

Note, regardless of what name we settle on, I think it makes sense to do the
KVM_PRIVATE_MEM_SLOTS => KVM_INTERNAL_MEM_SLOTS rename.
Chao Peng Aug. 4, 2022, 7:58 a.m. UTC | #19
On Wed, Aug 03, 2022 at 03:51:24PM +0000, Sean Christopherson wrote:
> On Wed, Aug 03, 2022, Chao Peng wrote:
> > On Tue, Aug 02, 2022 at 04:38:55PM +0000, Sean Christopherson wrote:
> > > On Tue, Aug 02, 2022, Sean Christopherson wrote:
> > > > I think we should avoid UNMAPPABLE even on the KVM side of things for the core
> > > > memslots functionality and instead be very literal, e.g.
> > > > 
> > > > 	KVM_HAS_FD_BASED_MEMSLOTS
> > > > 	KVM_MEM_FD_VALID
> > > > 
> > > > We'll still need KVM_HAS_USER_UNMAPPABLE_MEMORY, but it won't be tied directly to
> > > > the memslot.  Decoupling the two things will require a bit of extra work, but the
> > > > code impact should be quite small, e.g. explicitly query and propagate
> > > > MEMFILE_F_USER_INACCESSIBLE to kvm_memory_slot to track if a memslot can be private.
> > > > And unless I'm missing something, it won't require an additional memslot flag.
> > > > The biggest oddity (if we don't also add KVM_MEM_PRIVATE) is that KVM would
> > > > effectively ignore the hva for fd-based memslots for VM types that don't support
> > > > private memory, i.e. userspace can't opt out of using the fd-based backing, but that
> > > > doesn't seem like a deal breaker.
> > 
> > I actually love this idea. I don't mind adding extra code for potential
> > usage other than confidential VMs if we can have a workable solution for
> > it.
> > 
> > > 
> > > Hrm, but basing private memory on top of a generic FD_VALID would effectively require
> > > shared memory to use hva-based memslots for confidential VMs.  That'd yield a very
> > > weird API, e.g. non-confidential VMs could be backed entirely by fd-based memslots,
> > > but confidential VMs would be forced to use hva-based memslots.
> > 
> > It would work if we could treat userspace_addr as optional for
> > KVM_MEM_FD_VALID, e.g. userspace can opt in to decide whether it needs
> > the mappable part for a regular VM, and KVM can enforce fd-only for
> > confidential VMs. But the u64 type of userspace_addr doesn't allow us to
> > express a 'null' value (zero is itself a valid hva), so it sounds like
> > we will end up needing another flag anyway.
> > 
> > In concept, we could have three configurations here:
> >   1. hva-only: without any flag, using userspace_addr;
> >   2. fd-only:  another new flag is needed, using fd/offset;
> >   3. hva/fd mixed: both userspace_addr and fd/offset are effective.
> >      KVM_MEM_PRIVATE is a subset of this for confidential VMs. Not sure
> >      a regular VM also wants this.
> 
> My mental model breaks things down slightly differently, though the end result is
> more or less the same. 
> 
> After this series, there will be two types of memory: private and "regular" (I'm
> trying to avoid "shared").  "Regular" memory is always hva-based (userspace_addr),
> and private always fd-based (fd+offset).
> 
> In the future, if we want to support fd-based memory for "regular" memory, then
> as you said we'd need to add a new flag, and a new fd+offset pair.
> 
> At that point, we'd have two new (relative to current) flags:
> 
>   KVM_MEM_PRIVATE_FD_VALID
>   KVM_MEM_FD_VALID
> 
> along with two new pairs of fd+offset (private_* and "regular").  Mapping those
> to your above list:

I previously thought we could reuse the private_fd (the name should be
changed) for a regular VM as well, so we'd only need one pair of
fd+offset, with the meaning of the fd decided by the flag. But
introducing two pairs may support extra usages, like one fd for regular
memory and another private_fd for private memory, though I'm unsure
that is a useful configuration.
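
For concreteness, a sketch of a memslot layout carrying two pairs (the
private_fd/private_offset pair is what this series adds; the plain
fd/offset pair is the hypothetical future addition, so those names are
illustrative):

	struct kvm_userspace_memory_region_ext {
		struct kvm_userspace_memory_region region;
		__u64 private_offset;
		__u32 private_fd;	/* used with KVM_MEM_PRIVATE_FD_VALID */
		__u32 fd;		/* used with a future KVM_MEM_FD_VALID */
		__u64 offset;
	};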

>   
>   1.  Neither *_FD_VALID flag set.
>   2a. Both PRIVATE_FD_VALID and FD_VALID are set
>   2b. FD_VALID is set and the VM doesn't support private memory
>   3.  Only PRIVATE_FD_VALID is set (with private memory support in the VM).
> 
> Thus, "regular" VMs can't have a mix in a single memslot because they can't use
> private memory.
> 
> > There is no direct relationship between unmappable and fd-based, since
> > even fd-based memory can also be mappable for a regular VM?

Hmm, yes, for private memory we have special treatment in the page fault
handler, and that is not applied to regular VMs.
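
Concretely, that treatment boils down to consulting the xarray from this
patch on the fault path; a minimal sketch (the helper name is
illustrative, not from the posted series):

	static bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
	{
		/* non-NULL value entry set by KVM_MEMORY_ENCRYPT_REG_REGION */
		return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)) &
		       KVM_MEM_ATTR_PRIVATE;
	}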

> 
> Yep.
> 
> > > Ignore this idea for now.  If there's an actual use case for generic fd-based memory
> > > then we'll want a separate flag, fd, and offset, i.e. that support could be added
> > > independent of KVM_MEM_PRIVATE.
> > 
> > If we ignore this idea for now (which I'm also fine with), do you still
> > think we need to change KVM_MEM_PRIVATE to KVM_MEM_USER_UNMAPPABLE?
> 
> Hmm, no.  After working through this, I think it's safe to say KVM_MEM_USER_UNMAPPABLE
> is a bad name because we could end up with "regular" memory that's backed by an
> inaccessible (unmappable) file.
> 
> One alternative would be to call it KVM_MEM_PROTECTED.  That shouldn't cause
> problems for the known use of "private" (TDX and SNP), and it gives us a little
> wiggle room, e.g. if we ever get a use case where VMs can share memory that is
> otherwise protected.
> 
> That's a pretty big "if" though, and odds are good we'd need more memslot flags and
> fd+offset pairs to allow differentiating "private" vs. "protected-shared" without
> forcing userspace to punch holes in memslots, so I don't know that hedging now will
> buy us anything.
> 
> So I'd say that if people think KVM_MEM_PRIVATE brings additional and meaningful
> clarity over KVM_MEM_PROTECTED, then let's go with PRIVATE.  But if PROTECTED is
> just as good, go with PROTECTED as it gives us a wee bit of wiggle room for the
> future.

Then I'd stay with PRIVATE.

> 
> Note, regardless of what name we settle on, I think it makes sense to do the
> KVM_PRIVATE_MEM_SLOTS => KVM_INTERNAL_MEM_SLOTS rename.

Agreed.

Chao
Vishal Annapurve Aug. 19, 2022, 7:37 p.m. UTC | #20
> ...
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 230c8ff9659c..bb714c2a4b06 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -914,6 +914,35 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
>
>  #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
>
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> +#define KVM_MEM_ATTR_PRIVATE   0x0001
> +static int kvm_vm_ioctl_set_encrypted_region(struct kvm *kvm, unsigned int ioctl,
> +                                            struct kvm_enc_region *region)
> +{
> +       unsigned long start, end;
> +       void *entry;
> +       int r;
> +
> +       if (region->size == 0 || region->addr + region->size < region->addr)
> +               return -EINVAL;
> +       if (region->addr & (PAGE_SIZE - 1) || region->size & (PAGE_SIZE - 1))
> +               return -EINVAL;
> +
> +       start = region->addr >> PAGE_SHIFT;
> +       end = (region->addr + region->size - 1) >> PAGE_SHIFT;
> +
> +       entry = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION ?
> +                               xa_mk_value(KVM_MEM_ATTR_PRIVATE) : NULL;
> +
> +       r = xa_err(xa_store_range(&kvm->mem_attr_array, start, end,
> +                                       entry, GFP_KERNEL_ACCOUNT));

xa_store_range seems to create multi-index entries by default.
A subsequent xa_store_range call changes all the entries stored
previously.
xa_store needs to be used here instead of xa_store_range to achieve
the intended behavior.
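
A sketch of the per-index alternative (the helper name is illustrative;
same bounds and entry as in the patch):

	static int kvm_set_mem_attr_range(struct kvm *kvm, unsigned long start,
					  unsigned long end, void *entry)
	{
		unsigned long i;
		int r;

		/*
		 * One value entry per gfn, so a later store over a
		 * sub-range only affects the indices it actually covers.
		 */
		for (i = start; i <= end; i++) {
			r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
					    GFP_KERNEL_ACCOUNT));
			if (r)
				return r;
		}
		return 0;
	}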

> +
> +       kvm_zap_gfn_range(kvm, start, end + 1);
> +
> +       return r;
> +}
> +#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
> +
> ...
Chao Peng Aug. 24, 2022, 10:37 a.m. UTC | #21
On Fri, Aug 19, 2022 at 12:37:42PM -0700, Vishal Annapurve wrote:
> > ...
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 230c8ff9659c..bb714c2a4b06 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -914,6 +914,35 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
> >
> >  #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
> >
> > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> > +#define KVM_MEM_ATTR_PRIVATE   0x0001
> > +static int kvm_vm_ioctl_set_encrypted_region(struct kvm *kvm, unsigned int ioctl,
> > +                                            struct kvm_enc_region *region)
> > +{
> > +       unsigned long start, end;
> > +       void *entry;
> > +       int r;
> > +
> > +       if (region->size == 0 || region->addr + region->size < region->addr)
> > +               return -EINVAL;
> > +       if (region->addr & (PAGE_SIZE - 1) || region->size & (PAGE_SIZE - 1))
> > +               return -EINVAL;
> > +
> > +       start = region->addr >> PAGE_SHIFT;
> > +       end = (region->addr + region->size - 1) >> PAGE_SHIFT;
> > +
> > +       entry = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION ?
> > +                               xa_mk_value(KVM_MEM_ATTR_PRIVATE) : NULL;
> > +
> > +       r = xa_err(xa_store_range(&kvm->mem_attr_array, start, end,
> > +                                       entry, GFP_KERNEL_ACCOUNT));
> 
> xa_store_range seems to create multi-index entries by default.
> A subsequent xa_store_range call changes all the entries stored
> previously.

By using xa_store_range and storing them as multi-index entries I
originally expected to save some memory for contiguous pages.

But it sounds like the current multi-index store behaviour isn't quite
ready for our usage.

Chao
> xa_store needs to be used here instead of xa_store_range to achieve
> the intended behavior.
> 
> > +
> > +       kvm_zap_gfn_range(kvm, start, end + 1);
> > +
> > +       return r;
> > +}
> > +#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
> > +
> > ...
Fuad Tabba Aug. 26, 2022, 3:19 p.m. UTC | #22
Hi Chao,

On Wed, Jul 6, 2022 at 9:27 AM Chao Peng <chao.p.peng@linux.intel.com> wrote:
>
> If CONFIG_HAVE_KVM_PRIVATE_MEM=y, userspace can register/unregister the
> guest private memory regions through KVM_MEMORY_ENCRYPT_{UN,}REG_REGION
> ioctls. The patch reuses existing SEV ioctl but differs that the
> address in the region for private memory is gpa while SEV case it's hva.
>
> The private memory region is stored as xarray in KVM for memory
> efficiency in normal usages and zapping existing memory mappings is also
> a side effect of these two ioctls.
>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---
>  Documentation/virt/kvm/api.rst  | 17 +++++++---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/Kconfig            |  1 +
>  arch/x86/kvm/mmu.h              |  2 --
>  include/linux/kvm_host.h        |  8 +++++
>  virt/kvm/kvm_main.c             | 57 +++++++++++++++++++++++++++++++++
>  6 files changed, 80 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 5ecfc7fbe0ee..dfb4caecab73 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -4715,10 +4715,19 @@ Documentation/virt/kvm/amd-memory-encryption.rst.
>  This ioctl can be used to register a guest memory region which may
>  contain encrypted data (e.g. guest RAM, SMRAM etc).
>
> -It is used in the SEV-enabled guest. When encryption is enabled, a guest
> -memory region may contain encrypted data. The SEV memory encryption
> -engine uses a tweak such that two identical plaintext pages, each at
> -different locations will have differing ciphertexts. So swapping or
> +Currently this ioctl supports registering memory regions for two usages:
> +private memory and SEV-encrypted memory.
> +
> +When private memory is enabled, this ioctl is used to register guest private
> +memory region and the addr/size of kvm_enc_region represents guest physical
> +address (GPA). In this usage, this ioctl zaps the existing guest memory
> +mappings in KVM that fallen into the region.
> +
> +When SEV-encrypted memory is enabled, this ioctl is used to register guest
> +memory region which may contain encrypted data for a SEV-enabled guest. The
> +addr/size of kvm_enc_region represents userspace address (HVA). The SEV
> +memory encryption engine uses a tweak such that two identical plaintext pages,
> +each at different locations will have differing ciphertexts. So swapping or
>  moving ciphertext of those pages will not result in plaintext being
>  swapped. So relocating (or migrating) physical backing pages for the SEV
>  guest will require some additional steps.
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index dae190e19fce..92120e3a224e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -37,6 +37,7 @@
>  #include <asm/hyperv-tlfs.h>
>
>  #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
> +#define __KVM_HAVE_ZAP_GFN_RANGE
>
>  #define KVM_MAX_VCPUS 1024
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 1f160801e2a7..05861b9656a4 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -50,6 +50,7 @@ config KVM
>         select HAVE_KVM_PM_NOTIFIER if PM
>         select HAVE_KVM_PRIVATE_MEM if X86_64
>         select MEMFILE_NOTIFIER if HAVE_KVM_PRIVATE_MEM
> +       select XARRAY_MULTI if HAVE_KVM_PRIVATE_MEM
>         help
>           Support hosting fully virtualized guest machines using hardware
>           virtualization extensions.  You will need a fairly recent
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index a99acec925eb..428cd2e88cbd 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -209,8 +209,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>         return -(u32)fault & errcode;
>  }
>
> -void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
> -
>  int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>
>  int kvm_mmu_post_init_vm(struct kvm *kvm);
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 1b203c8aa696..da33f8828456 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -260,6 +260,10 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
>  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
>  #endif
>
> +#ifdef __KVM_HAVE_ZAP_GFN_RANGE
> +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
> +#endif
> +
>  enum {
>         OUTSIDE_GUEST_MODE,
>         IN_GUEST_MODE,
> @@ -795,6 +799,9 @@ struct kvm {
>         struct notifier_block pm_notifier;
>  #endif
>         char stats_id[KVM_STATS_NAME_SIZE];
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> +       struct xarray mem_attr_array;
> +#endif
>  };
>
>  #define kvm_err(fmt, ...) \
> @@ -1459,6 +1466,7 @@ bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
>  int kvm_arch_post_init_vm(struct kvm *kvm);
>  void kvm_arch_pre_destroy_vm(struct kvm *kvm);
>  int kvm_arch_create_vm_debugfs(struct kvm *kvm);
> +bool kvm_arch_private_mem_supported(struct kvm *kvm);
>
>  #ifndef __KVM_HAVE_ARCH_VM_ALLOC
>  /*
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 230c8ff9659c..bb714c2a4b06 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -914,6 +914,35 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
>
>  #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
>
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> +#define KVM_MEM_ATTR_PRIVATE   0x0001
> +static int kvm_vm_ioctl_set_encrypted_region(struct kvm *kvm, unsigned int ioctl,
> +                                            struct kvm_enc_region *region)
> +{
> +       unsigned long start, end;
> +       void *entry;
> +       int r;
> +
> +       if (region->size == 0 || region->addr + region->size < region->addr)
> +               return -EINVAL;
> +       if (region->addr & (PAGE_SIZE - 1) || region->size & (PAGE_SIZE - 1))
> +               return -EINVAL;
> +
> +       start = region->addr >> PAGE_SHIFT;
> +       end = (region->addr + region->size - 1) >> PAGE_SHIFT;
> +
> +       entry = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION ?
> +                               xa_mk_value(KVM_MEM_ATTR_PRIVATE) : NULL;
> +
> +       r = xa_err(xa_store_range(&kvm->mem_attr_array, start, end,
> +                                       entry, GFP_KERNEL_ACCOUNT));
> +
> +       kvm_zap_gfn_range(kvm, start, end + 1);
> +
> +       return r;
> +}
> +#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
> +
>  #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
>  static int kvm_pm_notifier_call(struct notifier_block *bl,
>                                 unsigned long state,
> @@ -1138,6 +1167,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
>         spin_lock_init(&kvm->mn_invalidate_lock);
>         rcuwait_init(&kvm->mn_memslots_update_rcuwait);
>         xa_init(&kvm->vcpu_array);
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> +       xa_init(&kvm->mem_attr_array);
> +#endif
>
>         INIT_LIST_HEAD(&kvm->gpc_list);
>         spin_lock_init(&kvm->gpc_lock);
> @@ -1305,6 +1337,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
>                 kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
>                 kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
>         }
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> +       xa_destroy(&kvm->mem_attr_array);
> +#endif
>         cleanup_srcu_struct(&kvm->irq_srcu);
>         cleanup_srcu_struct(&kvm->srcu);
>         kvm_arch_free_vm(kvm);
> @@ -1508,6 +1543,11 @@ static void kvm_replace_memslot(struct kvm *kvm,
>         }
>  }
>
> +bool __weak kvm_arch_private_mem_supported(struct kvm *kvm)
> +{
> +       return false;
> +}
> +
>  static int check_memory_region_flags(const struct kvm_user_mem_region *mem)
>  {
>         u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
> @@ -4689,6 +4729,22 @@ static long kvm_vm_ioctl(struct file *filp,
>                 r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
>                 break;
>         }
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> +       case KVM_MEMORY_ENCRYPT_REG_REGION:
> +       case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
> +               struct kvm_enc_region region;
> +
> +               if (!kvm_arch_private_mem_supported(kvm))
> +                       goto arch_vm_ioctl;
> +
> +               r = -EFAULT;
> +               if (copy_from_user(&region, argp, sizeof(region)))
> +                       goto out;
> +
> +               r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, &region);
> +               break;
> +       }
> +#endif
>         case KVM_GET_DIRTY_LOG: {
>                 struct kvm_dirty_log log;
>
> @@ -4842,6 +4898,7 @@ static long kvm_vm_ioctl(struct file *filp,
>                 r = kvm_vm_ioctl_get_stats_fd(kvm);
>                 break;
>         default:
> +arch_vm_ioctl:

It might be good to make this label conditional on
CONFIG_HAVE_KVM_PRIVATE_MEM, otherwise you get an unused-label warning
if CONFIG_HAVE_KVM_PRIVATE_MEM isn't defined.

+#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
 arch_vm_ioctl:
+#endif
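
With that, the default case would read:

	default:
#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
arch_vm_ioctl:
#endif
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);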

Cheers,
/fuad

>                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>         }
>  out:
> --
> 2.25.1
>
Chao Peng Aug. 29, 2022, 3:21 p.m. UTC | #23
On Fri, Aug 26, 2022 at 04:19:43PM +0100, Fuad Tabba wrote:
> > +bool __weak kvm_arch_private_mem_supported(struct kvm *kvm)
> > +{
> > +       return false;
> > +}
> > +
> >  static int check_memory_region_flags(const struct kvm_user_mem_region *mem)
> >  {
> >         u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
> > @@ -4689,6 +4729,22 @@ static long kvm_vm_ioctl(struct file *filp,
> >                 r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
> >                 break;
> >         }
> > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> > +       case KVM_MEMORY_ENCRYPT_REG_REGION:
> > +       case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
> > +               struct kvm_enc_region region;
> > +
> > +               if (!kvm_arch_private_mem_supported(kvm))
> > +                       goto arch_vm_ioctl;
> > +
> > +               r = -EFAULT;
> > +               if (copy_from_user(&region, argp, sizeof(region)))
> > +                       goto out;
> > +
> > +               r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, &region);
> > +               break;
> > +       }
> > +#endif
> >         case KVM_GET_DIRTY_LOG: {
> >                 struct kvm_dirty_log log;
> >
> > @@ -4842,6 +4898,7 @@ static long kvm_vm_ioctl(struct file *filp,
> >                 r = kvm_vm_ioctl_get_stats_fd(kvm);
> >                 break;
> >         default:
> > +arch_vm_ioctl:
> 
> It might be good to make this label conditional on
> CONFIG_HAVE_KVM_PRIVATE_MEM, otherwise you get an unused-label warning
> if CONFIG_HAVE_KVM_PRIVATE_MEM isn't defined.
> 
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
>  arch_vm_ioctl:
> +#endif

Right, as the bot already complained.

Chao
> 
> Cheers,
> /fuad
> 
> >                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> >         }
> >  out:
> > --
> > 2.25.1
> >
diff mbox series

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 5ecfc7fbe0ee..dfb4caecab73 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4715,10 +4715,19 @@  Documentation/virt/kvm/amd-memory-encryption.rst.
 This ioctl can be used to register a guest memory region which may
 contain encrypted data (e.g. guest RAM, SMRAM etc).
 
-It is used in the SEV-enabled guest. When encryption is enabled, a guest
-memory region may contain encrypted data. The SEV memory encryption
-engine uses a tweak such that two identical plaintext pages, each at
-different locations will have differing ciphertexts. So swapping or
+Currently this ioctl supports registering memory regions for two usages:
+private memory and SEV-encrypted memory.
+
+When private memory is enabled, this ioctl is used to register guest private
+memory region and the addr/size of kvm_enc_region represents guest physical
+address (GPA). In this usage, this ioctl zaps the existing guest memory
+mappings in KVM that fallen into the region.
+
+When SEV-encrypted memory is enabled, this ioctl is used to register guest
+memory region which may contain encrypted data for a SEV-enabled guest. The
+addr/size of kvm_enc_region represents userspace address (HVA). The SEV
+memory encryption engine uses a tweak such that two identical plaintext pages,
+each at different locations will have differing ciphertexts. So swapping or
 moving ciphertext of those pages will not result in plaintext being
 swapped. So relocating (or migrating) physical backing pages for the SEV
 guest will require some additional steps.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dae190e19fce..92120e3a224e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -37,6 +37,7 @@ 
 #include <asm/hyperv-tlfs.h>
 
 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
+#define __KVM_HAVE_ZAP_GFN_RANGE
 
 #define KVM_MAX_VCPUS 1024
 
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 1f160801e2a7..05861b9656a4 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -50,6 +50,7 @@  config KVM
 	select HAVE_KVM_PM_NOTIFIER if PM
 	select HAVE_KVM_PRIVATE_MEM if X86_64
 	select MEMFILE_NOTIFIER if HAVE_KVM_PRIVATE_MEM
+	select XARRAY_MULTI if HAVE_KVM_PRIVATE_MEM
 	help
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index a99acec925eb..428cd2e88cbd 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -209,8 +209,6 @@  static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	return -(u32)fault & errcode;
 }
 
-void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
-
 int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_post_init_vm(struct kvm *kvm);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1b203c8aa696..da33f8828456 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -260,6 +260,10 @@  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 #endif
 
+#ifdef __KVM_HAVE_ZAP_GFN_RANGE
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
+#endif
+
 enum {
 	OUTSIDE_GUEST_MODE,
 	IN_GUEST_MODE,
@@ -795,6 +799,9 @@  struct kvm {
 	struct notifier_block pm_notifier;
 #endif
 	char stats_id[KVM_STATS_NAME_SIZE];
+#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
+	struct xarray mem_attr_array;
+#endif
 };
 
 #define kvm_err(fmt, ...) \
@@ -1459,6 +1466,7 @@  bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_post_init_vm(struct kvm *kvm);
 void kvm_arch_pre_destroy_vm(struct kvm *kvm);
 int kvm_arch_create_vm_debugfs(struct kvm *kvm);
+bool kvm_arch_private_mem_supported(struct kvm *kvm);
 
 #ifndef __KVM_HAVE_ARCH_VM_ALLOC
 /*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 230c8ff9659c..bb714c2a4b06 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -914,6 +914,35 @@  static int kvm_init_mmu_notifier(struct kvm *kvm)
 
 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 
+#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
+#define KVM_MEM_ATTR_PRIVATE	0x0001
+static int kvm_vm_ioctl_set_encrypted_region(struct kvm *kvm, unsigned int ioctl,
+					     struct kvm_enc_region *region)
+{
+	unsigned long start, end;
+	void *entry;
+	int r;
+
+	if (region->size == 0 || region->addr + region->size < region->addr)
+		return -EINVAL;
+	if (region->addr & (PAGE_SIZE - 1) || region->size & (PAGE_SIZE - 1))
+		return -EINVAL;
+
+	start = region->addr >> PAGE_SHIFT;
+	end = (region->addr + region->size - 1) >> PAGE_SHIFT;
+
+	entry = ioctl == KVM_MEMORY_ENCRYPT_REG_REGION ?
+				xa_mk_value(KVM_MEM_ATTR_PRIVATE) : NULL;
+
+	r = xa_err(xa_store_range(&kvm->mem_attr_array, start, end,
+					entry, GFP_KERNEL_ACCOUNT));
+
+	kvm_zap_gfn_range(kvm, start, end + 1);
+
+	return r;
+}
+#endif /* CONFIG_HAVE_KVM_PRIVATE_MEM */
+
 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 static int kvm_pm_notifier_call(struct notifier_block *bl,
 				unsigned long state,
@@ -1138,6 +1167,9 @@  static struct kvm *kvm_create_vm(unsigned long type)
 	spin_lock_init(&kvm->mn_invalidate_lock);
 	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
 	xa_init(&kvm->vcpu_array);
+#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
+	xa_init(&kvm->mem_attr_array);
+#endif
 
 	INIT_LIST_HEAD(&kvm->gpc_list);
 	spin_lock_init(&kvm->gpc_lock);
@@ -1305,6 +1337,9 @@  static void kvm_destroy_vm(struct kvm *kvm)
 		kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
 		kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
 	}
+#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
+	xa_destroy(&kvm->mem_attr_array);
+#endif
 	cleanup_srcu_struct(&kvm->irq_srcu);
 	cleanup_srcu_struct(&kvm->srcu);
 	kvm_arch_free_vm(kvm);
@@ -1508,6 +1543,11 @@  static void kvm_replace_memslot(struct kvm *kvm,
 	}
 }
 
+bool __weak kvm_arch_private_mem_supported(struct kvm *kvm)
+{
+	return false;
+}
+
 static int check_memory_region_flags(const struct kvm_user_mem_region *mem)
 {
 	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
@@ -4689,6 +4729,22 @@  static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
 		break;
 	}
+#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
+	case KVM_MEMORY_ENCRYPT_REG_REGION:
+	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
+		struct kvm_enc_region region;
+
+		if (!kvm_arch_private_mem_supported(kvm))
+			goto arch_vm_ioctl;
+
+		r = -EFAULT;
+		if (copy_from_user(&region, argp, sizeof(region)))
+			goto out;
+
+		r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, &region);
+		break;
+	}
+#endif
 	case KVM_GET_DIRTY_LOG: {
 		struct kvm_dirty_log log;
 
@@ -4842,6 +4898,7 @@  static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_get_stats_fd(kvm);
 		break;
 	default:
+arch_vm_ioctl:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}
 out:
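
For reference, a minimal userspace sketch of exercising the registration
path added by this patch (the GPA and size passed in are illustrative;
with CONFIG_HAVE_KVM_PRIVATE_MEM and a VM type supporting private memory,
addr/size are interpreted as GPA rather than HVA):

	#include <err.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static void register_private_region(int vm_fd, __u64 gpa, __u64 size)
	{
		struct kvm_enc_region region = {
			.addr = gpa,	/* guest physical address */
			.size = size,	/* both must be page-aligned */
		};

		if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_REG_REGION, &region) < 0)
			err(1, "KVM_MEMORY_ENCRYPT_REG_REGION");
	}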