
[v18,064/121] KVM: TDX: Create initial guest memory

Message ID 97bb1f2996d8a7b828cd9e3309380d1a86ca681b.1705965635.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Series KVM TDX basic feature support

Commit Message

Isaku Yamahata Jan. 22, 2024, 11:53 p.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

Because guest memory is protected in TDX, creating the initial guest
memory requires a dedicated TDX module API, tdh_mem_page_add, instead of
directly copying the memory contents into guest memory as is done for the
default VM type.  The KVM MMU page fault handler callback,
private_page_add, handles it.

Define a new subcommand, KVM_TDX_INIT_MEM_REGION, of the VM-scoped
KVM_MEMORY_ENCRYPT_OP.  It assigns the guest page, copies the initial
memory contents into the guest memory, and encrypts the guest memory.
Optionally, it also extends the memory measurement of the TDX guest.  It
calls the KVM MMU page fault (EPT-violation) handler to trigger the
callbacks for it.
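
Userspace is expected to drive the subcommand roughly as follows.  This is
only a sketch: struct kvm_tdx_cmd (with id/flags/data fields) comes from the
earlier patches in this series, and error handling beyond the -EAGAIN retry
is omitted.

	#include <errno.h>
	#include <stdbool.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Populate and optionally measure one region of initial TD memory. */
	static int tdx_add_init_memory(int vm_fd, void *src, __u64 gpa,
				       __u64 nr_pages, bool measure)
	{
		struct kvm_tdx_init_mem_region region = {
			.source_addr = (__u64)(unsigned long)src,
			.gpa = gpa,
			.nr_pages = nr_pages,
		};
		struct kvm_tdx_cmd cmd = {
			.id = KVM_TDX_INIT_MEM_REGION,
			.flags = measure ? KVM_TDX_MEASURE_MEMORY_REGION : 0,
			.data = (__u64)(unsigned long)&region,
		};
		int ret;

		/*
		 * The kernel returns -EAGAIN on partial population and updates
		 * 'region' in place, so retry until done or a real error hits.
		 */
		do {
			ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
		} while (ret < 0 && errno == EAGAIN);

		return ret;
	}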

Reported-by: gkirkpatrick@google.com
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>

---
v18:
- rename tdx_sept_page_add() -> tdx_mem_page_add().
- open code tdx_measure_page() into tdx_mem_page_add().
- remove the change of tools/arch/x86/include/uapi/asm/kvm.h.

v15 -> v16:
- add a check that nr_pages isn't too large with
  (nr_pages << PAGE_SHIFT) >> PAGE_SHIFT

v14 -> v15:
- add a check if TD is finalized or not to tdx_init_mem_region()
- return -EAGAIN when partial population
---
 arch/x86/include/uapi/asm/kvm.h |   9 ++
 arch/x86/kvm/mmu/mmu.c          |   1 +
 arch/x86/kvm/vmx/tdx.c          | 160 +++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/tdx.h          |   2 +
 4 files changed, 169 insertions(+), 3 deletions(-)

Comments

Sean Christopherson Feb. 1, 2024, 12:20 a.m. UTC | #1
On Mon, Jan 22, 2024, isaku.yamahata@intel.com wrote:
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 4cbcedff4f16..1a5a91b99de9 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -591,6 +591,69 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
>  	return 0;
>  }
>  
> +static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn,
> +			    enum pg_level level, kvm_pfn_t pfn)
> +{
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	hpa_t hpa = pfn_to_hpa(pfn);
> +	gpa_t gpa = gfn_to_gpa(gfn);
> +	struct tdx_module_args out;
> +	hpa_t source_pa;
> +	bool measure;
> +	u64 err;
> +	int i;
> +
> +	/*
> +	 * KVM_INIT_MEM_REGION, tdx_init_mem_region(), supports only 4K page
> +	 * because tdh_mem_page_add() supports only 4K page.
> +	 */
> +	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
> +		return -EINVAL;
> +
> +	/*
> +	 * In case of TDP MMU, fault handler can run concurrently.  Note
> +	 * 'source_pa' is a TD scope variable, meaning if there are multiple
> +	 * threads reaching here with all needing to access 'source_pa', it
> +	 * will break.  However fortunately this won't happen, because below
> +	 * TDH_MEM_PAGE_ADD code path is only used when VM is being created
> +	 * before it is running, using KVM_TDX_INIT_MEM_REGION ioctl (which
> +	 * always uses vcpu 0's page table and protected by vcpu->mutex).
> +	 */

Most of the above is superfluous.  tdx_mem_page_add() is called if and only if
the TD is not yet finalized, and the TDX module disallows running vCPUs before
the TD is finalized.  That's it.  And maybe throw in a lockdep to assert that
kvm->lock is held.
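
E.g. something like the below (untested sketch; exactly which lock to assert
depends on how the ioctl path ends up being serialized):

	/*
	 * TDH.MEM.PAGE.ADD runs only while the initial image is being built
	 * via KVM_TDX_INIT_MEM_REGION, i.e. before the TD is finalized and
	 * thus before any vCPU can run.  Document the serialization with an
	 * assertion instead of the long comment.
	 */
	lockdep_assert_held(&kvm->lock);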

> +	if (KVM_BUG_ON(kvm_tdx->source_pa == INVALID_PAGE, kvm)) {
> +		tdx_unpin(kvm, pfn);
> +		return -EINVAL;
> +	}
> +
> +	source_pa = kvm_tdx->source_pa & ~KVM_TDX_MEASURE_MEMORY_REGION;
> +	measure = kvm_tdx->source_pa & KVM_TDX_MEASURE_MEMORY_REGION;
> +	kvm_tdx->source_pa = INVALID_PAGE;
> +
> +	do {
> +		err = tdh_mem_page_add(kvm_tdx->tdr_pa, gpa, hpa, source_pa,
> +				       &out);
> +		/*
> +		 * This path is executed during populating initial guest memory
> +		 * image. i.e. before running any vcpu.  Race is rare.

How are races possible at all?

> +		 */
> +	} while (unlikely(err == TDX_ERROR_SEPT_BUSY));
> +	if (KVM_BUG_ON(err, kvm)) {
> +		pr_tdx_error(TDH_MEM_PAGE_ADD, err, &out);
> +		tdx_unpin(kvm, pfn);
> +		return -EIO;
> +	} else if (measure) {
> +		for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
> +			err = tdh_mr_extend(kvm_tdx->tdr_pa, gpa + i, &out);
> +			if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
> +				pr_tdx_error(TDH_MR_EXTEND, err, &out);
> +				break;
> +			}
> +		}

Why is measurement done deep within the MMU?  At a glance, I don't see why this
can't be done up in the ioctl, outside of a spinlock.

And IIRC, the order affects the measurement but doesn't truly matter, e.g. KVM
could choose to completely separate tdh_mr_extend() from tdh_mem_page_add(), no?
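
Completely untested, but reusing the helpers from this patch (with i/err/out
as new locals, and a bail-out of the outer loop on error), tdx_init_mem_region()
could do the MR.EXTEND loop itself after kvm_mmu_map_tdp_page() succeeds and
before region.gpa is advanced, e.g.:

	/* Sketch: extend the measurement from the ioctl, not from the MMU. */
	if (cmd->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
		for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
			err = tdh_mr_extend(kvm_tdx->tdr_pa, region.gpa + i, &out);
			if (KVM_BUG_ON(err, kvm)) {
				pr_tdx_error(TDH_MR_EXTEND, err, &out);
				ret = -EIO;
				break;
			}
		}
	}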

> +static int tdx_init_mem_region(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> +{
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	struct kvm_tdx_init_mem_region region;
> +	struct kvm_vcpu *vcpu;
> +	struct page *page;
> +	int idx, ret = 0;
> +	bool added = false;
> +
> +	/* Once TD is finalized, the initial guest memory is fixed. */
> +	if (is_td_finalized(kvm_tdx))
> +		return -EINVAL;
> +
> +	/* The BSP vCPU must be created before initializing memory regions. */
> +	if (!atomic_read(&kvm->online_vcpus))
> +		return -EINVAL;
> +
> +	if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
> +		return -EINVAL;
> +
> +	if (copy_from_user(&region, (void __user *)cmd->data, sizeof(region)))
> +		return -EFAULT;
> +
> +	/* Sanity check */
> +	if (!IS_ALIGNED(region.source_addr, PAGE_SIZE) ||
> +	    !IS_ALIGNED(region.gpa, PAGE_SIZE) ||
> +	    !region.nr_pages ||
> +	    region.nr_pages & GENMASK_ULL(63, 63 - PAGE_SHIFT) ||
> +	    region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
> +	    !kvm_is_private_gpa(kvm, region.gpa) ||
> +	    !kvm_is_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT)))
> +		return -EINVAL;
> +
> +	vcpu = kvm_get_vcpu(kvm, 0);
> +	if (mutex_lock_killable(&vcpu->mutex))
> +		return -EINTR;

The real reason for this drive-by pseudo-review is that I am hoping/wishing we
can turn this into a generic KVM ioctl() to allow userspace to pre-map guest
memory[*].

If we're going to carry non-trivial code, we might as well squeeze as much use
out of it as we can.

Beyond wanting to shove this into KVM_MEMORY_ENCRYPT_OP, is there any reason why
this is a VM ioctl() and not a vCPU ioctl()?  Very roughly, couldn't we use a
struct like this as input to a vCPU ioctl() that maps memory, and optionally
initializes memory from @source?

	struct kvm_memory_mapping {
		__u64 base_gfn;
		__u64 nr_pages;
		__u64 flags;
		__u64 source;
	}

TDX would need to do special things for copying the source, but beyond that most
of the code in this function is generic.
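
Hand-waving away the source copy with a made-up kvm_arch_vcpu_map_memory_page()
hook, the generic part would be roughly (completely untested):

	static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu,
				       struct kvm_memory_mapping *mapping)
	{
		int ret = 0;

		while (mapping->nr_pages) {
			if (signal_pending(current))
				return -ERESTARTSYS;
			if (need_resched())
				cond_resched();

			/*
			 * Hypothetical arch hook; for TDX it would stash the
			 * source PA and fault in the page via the TDP MMU.
			 */
			ret = kvm_arch_vcpu_map_memory_page(vcpu, mapping);
			if (ret)
				break;

			mapping->base_gfn++;
			mapping->nr_pages--;
			if (mapping->source)
				mapping->source += PAGE_SIZE;
		}

		return ret;
	}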

[*] https://lore.kernel.org/all/65262e67-7885-971a-896d-ad9c0a760907@polito.it
David Matlack Feb. 1, 2024, 11:06 p.m. UTC | #2
+Vipin Sharma

On Wed, Jan 31, 2024 at 4:21 PM Sean Christopherson <seanjc@google.com> wrote:
> On Mon, Jan 22, 2024, isaku.yamahata@intel.com wrote:
>
> The real reason for this drive-by pseudo-review is that I am hoping/wishing we
> can turn this into a generic KVM ioctl() to allow userspace to pre-map guest
> memory[*].
>
> If we're going to carry non-trivial code, we might as well squeeze as much use
> out of it as we can.
>
> Beyond wanting to shove this into KVM_MEMORY_ENCRYPT_OP, is there any reason why
> this is a VM ioctl() and not a vCPU ioctl()?  Very roughly, couldn't we use a
> struct like this as input to a vCPU ioctl() that maps memory, and optionally
> initializes memory from @source?
>
>         struct kvm_memory_mapping {
>                 __u64 base_gfn;
>                 __u64 nr_pages;
>                 __u64 flags;
>                 __u64 source;
>         }
>
> TDX would need to do special things for copying the source, but beyond that most
> of the code in this function is generic.
>
> [*] https://lore.kernel.org/all/65262e67-7885-971a-896d-ad9c0a760907@polito.it

We would also be interested in such an API to reduce the guest
performance impact of intra-host migration.
Binbin Wu Feb. 19, 2024, 8:54 a.m. UTC | #3
On 1/23/2024 7:53 AM, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
>
> Because the guest memory is protected in TDX, the creation of the initial
> guest memory requires a dedicated TDX module API, tdh_mem_page_add, instead
> of directly copying the memory contents into the guest memory in the case
> of the default VM type.  KVM MMU page fault handler callback,
> private_page_add, handles it.

The changelog is stale?  Do you mean "set_private_spte"?

>
> Define new subcommand, KVM_TDX_INIT_MEM_REGION, of VM-scoped
> KVM_MEMORY_ENCRYPT_OP.  It assigns the guest page, copies the initial
> memory contents into the guest memory, encrypts the guest memory.  At the
> same time, optionally it extends memory measurement of the TDX guest.  It
> calls the KVM MMU page fault(EPT-violation) handler to trigger the
> callbacks for it.
>
> Reported-by: gkirkpatrick@google.com
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
>
> ---
> v18:
> - rename tdx_sept_page_add() -> tdx_mem_page_add().
> - open code tdx_measure_page() into tdx_mem_page_add().
> - remove the change of tools/arch/x86/include/uapi/asm/kvm.h.
>
> v15 -> v16:
> - add check if nr_pages isn't large with
>    (nr_page << PAGE_SHIFT) >> PAGE_SHIFT
>
> v14 -> v15:
> - add a check if TD is finalized or not to tdx_init_mem_region()
> - return -EAGAIN when partial population
> ---
>   arch/x86/include/uapi/asm/kvm.h |   9 ++
>   arch/x86/kvm/mmu/mmu.c          |   1 +
>   arch/x86/kvm/vmx/tdx.c          | 160 +++++++++++++++++++++++++++++++-
>   arch/x86/kvm/vmx/tdx.h          |   2 +
>   4 files changed, 169 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 4000a2e087a8..9fda7c90b7b5 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -572,6 +572,7 @@ enum kvm_tdx_cmd_id {
>   	KVM_TDX_CAPABILITIES = 0,
>   	KVM_TDX_INIT_VM,
>   	KVM_TDX_INIT_VCPU,
> +	KVM_TDX_INIT_MEM_REGION,
>   
>   	KVM_TDX_CMD_NR_MAX,
>   };
> @@ -649,4 +650,12 @@ struct kvm_tdx_init_vm {
>   	struct kvm_cpuid2 cpuid;
>   };
>   
> +#define KVM_TDX_MEASURE_MEMORY_REGION	(1UL << 0)
> +
> +struct kvm_tdx_init_mem_region {
> +	__u64 source_addr;
> +	__u64 gpa;
> +	__u64 nr_pages;
> +};
> +
>   #endif /* _ASM_X86_KVM_H */
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 26d215e85b76..fc258f112e73 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5663,6 +5663,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
>   out:
>   	return r;
>   }
> +EXPORT_SYMBOL(kvm_mmu_load);
>   
>   void kvm_mmu_unload(struct kvm_vcpu *vcpu)
>   {
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 4cbcedff4f16..1a5a91b99de9 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -591,6 +591,69 @@ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
>   	return 0;
>   }
>   
> +static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn,
> +			    enum pg_level level, kvm_pfn_t pfn)
> +{
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	hpa_t hpa = pfn_to_hpa(pfn);
> +	gpa_t gpa = gfn_to_gpa(gfn);
> +	struct tdx_module_args out;
> +	hpa_t source_pa;
> +	bool measure;
> +	u64 err;
> +	int i;
> +
> +	/*
> +	 * KVM_INIT_MEM_REGION, tdx_init_mem_region(), supports only 4K page
> +	 * because tdh_mem_page_add() supports only 4K page.
> +	 */
> +	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
> +		return -EINVAL;
> +
> +	/*
> +	 * In case of TDP MMU, fault handler can run concurrently.  Note
> +	 * 'source_pa' is a TD scope variable, meaning if there are multiple
> +	 * threads reaching here with all needing to access 'source_pa', it
> +	 * will break.  However fortunately this won't happen, because below
> +	 * TDH_MEM_PAGE_ADD code path is only used when VM is being created
> +	 * before it is running, using KVM_TDX_INIT_MEM_REGION ioctl (which
> +	 * always uses vcpu 0's page table and protected by vcpu->mutex).
> +	 */
> +	if (KVM_BUG_ON(kvm_tdx->source_pa == INVALID_PAGE, kvm)) {
> +		tdx_unpin(kvm, pfn);
> +		return -EINVAL;
> +	}
> +
> +	source_pa = kvm_tdx->source_pa & ~KVM_TDX_MEASURE_MEMORY_REGION;
> +	measure = kvm_tdx->source_pa & KVM_TDX_MEASURE_MEMORY_REGION;
> +	kvm_tdx->source_pa = INVALID_PAGE;
> +
> +	do {
> +		err = tdh_mem_page_add(kvm_tdx->tdr_pa, gpa, hpa, source_pa,
> +				       &out);
> +		/*
> +		 * This path is executed during populating initial guest memory
> +		 * image. i.e. before running any vcpu.  Race is rare.
> +		 */
> +	} while (unlikely(err == TDX_ERROR_SEPT_BUSY));

For page add, since pages are added one by one, there should be no such
error, right?

> +	if (KVM_BUG_ON(err, kvm)) {
> +		pr_tdx_error(TDH_MEM_PAGE_ADD, err, &out);
> +		tdx_unpin(kvm, pfn);
> +		return -EIO;
> +	} else if (measure) {
> +		for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
> +			err = tdh_mr_extend(kvm_tdx->tdr_pa, gpa + i, &out);
> +			if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
> +				pr_tdx_error(TDH_MR_EXTEND, err, &out);
> +				break;
> +			}
> +		}
> +	}
> +
> +	return 0;
> +
> +}
> +
>   static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
>   				     enum pg_level level, kvm_pfn_t pfn)
>   {
> @@ -613,9 +676,7 @@ static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
>   	if (likely(is_td_finalized(kvm_tdx)))
>   		return tdx_mem_page_aug(kvm, gfn, level, pfn);
>   
> -	/* TODO: tdh_mem_page_add() comes here for the initial memory. */
> -
> -	return 0;
> +	return tdx_mem_page_add(kvm, gfn, level, pfn);
>   }
>   
>   static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
> @@ -1322,6 +1383,96 @@ void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
>   	tdx_track(vcpu->kvm);
>   }
>   
> +#define TDX_SEPT_PFERR	(PFERR_WRITE_MASK | PFERR_GUEST_ENC_MASK)
> +
> +static int tdx_init_mem_region(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> +{
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	struct kvm_tdx_init_mem_region region;
> +	struct kvm_vcpu *vcpu;
> +	struct page *page;
> +	int idx, ret = 0;
> +	bool added = false;
> +
> +	/* Once TD is finalized, the initial guest memory is fixed. */
> +	if (is_td_finalized(kvm_tdx))
> +		return -EINVAL;
> +
> +	/* The BSP vCPU must be created before initializing memory regions. */
> +	if (!atomic_read(&kvm->online_vcpus))
> +		return -EINVAL;
> +
> +	if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
> +		return -EINVAL;
> +
> +	if (copy_from_user(&region, (void __user *)cmd->data, sizeof(region)))
> +		return -EFAULT;
> +
> +	/* Sanity check */
> +	if (!IS_ALIGNED(region.source_addr, PAGE_SIZE) ||
> +	    !IS_ALIGNED(region.gpa, PAGE_SIZE) ||
> +	    !region.nr_pages ||
> +	    region.nr_pages & GENMASK_ULL(63, 63 - PAGE_SHIFT) ||
> +	    region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
> +	    !kvm_is_private_gpa(kvm, region.gpa) ||
> +	    !kvm_is_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT)))
> +		return -EINVAL;
> +
> +	vcpu = kvm_get_vcpu(kvm, 0);
> +	if (mutex_lock_killable(&vcpu->mutex))
> +		return -EINTR;
> +
> +	vcpu_load(vcpu);
> +	idx = srcu_read_lock(&kvm->srcu);
> +
> +	kvm_mmu_reload(vcpu);
> +
> +	while (region.nr_pages) {
> +		if (signal_pending(current)) {
> +			ret = -ERESTARTSYS;
> +			break;
> +		}
> +
> +		if (need_resched())
> +			cond_resched();
> +
> +		/* Pin the source page. */
> +		ret = get_user_pages_fast(region.source_addr, 1, 0, &page);
> +		if (ret < 0)
> +			break;
> +		if (ret != 1) {
> +			ret = -ENOMEM;
> +			break;
> +		}
> +
> +		kvm_tdx->source_pa = pfn_to_hpa(page_to_pfn(page)) |
> +				     (cmd->flags & KVM_TDX_MEASURE_MEMORY_REGION);
> +
> +		ret = kvm_mmu_map_tdp_page(vcpu, region.gpa, TDX_SEPT_PFERR,
> +					   PG_LEVEL_4K);
> +		put_page(page);
> +		if (ret)
> +			break;
> +
> +		region.source_addr += PAGE_SIZE;
> +		region.gpa += PAGE_SIZE;
> +		region.nr_pages--;
> +		added = true;
> +	}
> +
> +	srcu_read_unlock(&kvm->srcu, idx);
> +	vcpu_put(vcpu);
> +
> +	mutex_unlock(&vcpu->mutex);
> +
> +	if (added && region.nr_pages > 0)
> +		ret = -EAGAIN;
> +	if (copy_to_user((void __user *)cmd->data, &region, sizeof(region)))
> +		ret = -EFAULT;
> +
> +	return ret;
> +}
> +
>   int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>   {
>   	struct kvm_tdx_cmd tdx_cmd;
> @@ -1341,6 +1492,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>   	case KVM_TDX_INIT_VM:
>   		r = tdx_td_init(kvm, &tdx_cmd);
>   		break;
> +	case KVM_TDX_INIT_MEM_REGION:
> +		r = tdx_init_mem_region(kvm, &tdx_cmd);
> +		break;
>   	default:
>   		r = -EINVAL;
>   		goto out;
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 783ce329d7da..d589a2caedfb 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -17,6 +17,8 @@ struct kvm_tdx {
>   	u64 xfam;
>   	int hkid;
>   
> +	hpa_t source_pa;
> +
>   	bool finalized;
>   	atomic_t tdh_mem_track;
>
Isaku Yamahata Feb. 26, 2024, 6:07 p.m. UTC | #4
On Thu, Feb 01, 2024 at 03:06:46PM -0800,
David Matlack <dmatlack@google.com> wrote:

> +Vipin Sharma
> 
> On Wed, Jan 31, 2024 at 4:21 PM Sean Christopherson <seanjc@google.com> wrote:
> > On Mon, Jan 22, 2024, isaku.yamahata@intel.com wrote:
> >
> > The real reason for this drive-by pseudo-review is that I am hoping/wishing we
> > can turn this into a generic KVM ioctl() to allow userspace to pre-map guest
> > memory[*].
> >
> > If we're going to carry non-trivial code, we might as well squeeze as much use
> > out of it as we can.
> >
> > Beyond wanting to shove this into KVM_MEMORY_ENCRYPT_OP, is there any reason why
> > this is a VM ioctl() and not a vCPU ioctl()?  Very roughly, couldn't we use a
> > struct like this as input to a vCPU ioctl() that maps memory, and optionally
> > initializes memory from @source?
> >
> >         struct kvm_memory_mapping {
> >                 __u64 base_gfn;
> >                 __u64 nr_pages;
> >                 __u64 flags;
> >                 __u64 source;
> >         }
> >
> > TDX would need to do special things for copying the source, but beyond that most
> > of the code in this function is generic.
> >
> > [*] https://lore.kernel.org/all/65262e67-7885-971a-896d-ad9c0a760907@polito.it
> 
> We would also be interested in such an API to reduce the guest
> performance impact of intra-host migration.

I introduced KVM_MEMORY_MAPPING and KVM_CAP_MEMORY_MAPPING in v19.
We can continue the discussion there.
Sean Christopherson Feb. 26, 2024, 6:50 p.m. UTC | #5
On Mon, Feb 26, 2024, Isaku Yamahata wrote:
> On Thu, Feb 01, 2024 at 03:06:46PM -0800,
> David Matlack <dmatlack@google.com> wrote:
> 
> > +Vipin Sharma
> > 
> > On Wed, Jan 31, 2024 at 4:21 PM Sean Christopherson <seanjc@google.com> wrote:
> > > On Mon, Jan 22, 2024, isaku.yamahata@intel.com wrote:
> > >
> > > The real reason for this drive-by pseudo-review is that I am hoping/wishing we
> > > can turn this into a generic KVM ioctl() to allow userspace to pre-map guest
> > > memory[*].
> > >
> > > If we're going to carry non-trivial code, we might as well squeeze as much use
> > > out of it as we can.
> > >
> > > Beyond wanting to shove this into KVM_MEMORY_ENCRYPT_OP, is there any reason why
> > > this is a VM ioctl() and not a vCPU ioctl()?  Very roughly, couldn't we use a
> > > struct like this as input to a vCPU ioctl() that maps memory, and optionally
> > > initializes memory from @source?
> > >
> > >         struct kvm_memory_mapping {
> > >                 __u64 base_gfn;
> > >                 __u64 nr_pages;
> > >                 __u64 flags;
> > >                 __u64 source;
> > >         }
> > >
> > > TDX would need to do special things for copying the source, but beyond that most
> > > of the code in this function is generic.
> > >
> > > [*] https://lore.kernel.org/all/65262e67-7885-971a-896d-ad9c0a760907@polito.it
> > 
> > We would also be interested in such an API to reduce the guest
> > performance impact of intra-host migration.
> 
> I introduce KVM_MEMORY_MAPPING and KVM_CAP_MEMORY_MAPPING with v19.

KVM_MEMORY_MAPPING is not a good ioctl() name.  There needs to be a verb in there
somewhere, e.g. KVM_MAP_MEMORY, KVM_FAULTIN_MEMORY, etc.

> We can continue the discussion there.

No, we absolutely cannot continue the conversation there.  That is not how kernel
development works.

Enough is enough.  I am archiving v19 and not touching it.

Please post an RFC for _just_ this functionality, and follow-up in existing,
pre-v19 conversations for anything else that changed between v18 and v19 and might
need additional input/discussion.
Isaku Yamahata Feb. 27, 2024, 2:12 p.m. UTC | #6
On Mon, Feb 26, 2024 at 10:50:55AM -0800,
Sean Christopherson <seanjc@google.com> wrote:

> Please post an RFC for _just_ this functionality, and follow-up in existing,
> pre-v19 conversations for anything else that changed between v18 and v19 and might
> need additional input/discussion.

Sure, will post it.  My plan is as follows for input/discussion:
- Review SEV-SNP patches by Paolo for commonality
- RFC patch for KVM_MAP_MEMORY or KVM_FAULTIN_MEMORY
- RFC patch for uKVM for confidential VM
Sean Christopherson Feb. 27, 2024, 5:30 p.m. UTC | #7
On Tue, Feb 27, 2024, Isaku Yamahata wrote:
> On Mon, Feb 26, 2024 at 10:50:55AM -0800,
> Sean Christopherson <seanjc@google.com> wrote:
> 
> > Please post an RFC for _just_ this functionality, and follow-up in existing,
> > pre-v19 conversations for anything else that changed between v18 and v19 and might
> > need additional input/discussion.
> 
> Sure, will post it. My plan is as follow for input/discussion
> - Review SEV-SNP patches by Paolo for commonality 
> - RFC patch to KVM_MAP_MEMORY or KVM_FAULTIN_MEMORY
> - RFC patch for uKVM for confidential VM

uKVM?
Isaku Yamahata March 8, 2024, 1:07 a.m. UTC | #8
On Tue, Feb 27, 2024 at 09:30:11AM -0800,
Sean Christopherson <seanjc@google.com> wrote:

> On Tue, Feb 27, 2024, Isaku Yamahata wrote:
> > On Mon, Feb 26, 2024 at 10:50:55AM -0800,
> > Sean Christopherson <seanjc@google.com> wrote:
> > 
> > > Please post an RFC for _just_ this functionality, and follow-up in existing,
> > > pre-v19 conversations for anything else that changed between v18 and v19 and might
> > > need additional input/discussion.
> > 
> > Sure, will post it. My plan is as follow for input/discussion
> > - Review SEV-SNP patches by Paolo for commonality 
> > - RFC patch to KVM_MAP_MEMORY or KVM_FAULTIN_MEMORY
> > - RFC patch for uKVM for confidential VM
> 
> uKVM?

I meant uAPI, sorry for the typo.
Although I looked into a unified uAPI with SEV, the gain seems to be small or
none.  I'm currently planning to drop it based on the feedback at
https://lore.kernel.org/kvm/ZL%2Fr6Vca8WkFVaic@google.com/

Patch

diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 4000a2e087a8..9fda7c90b7b5 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -572,6 +572,7 @@  enum kvm_tdx_cmd_id {
 	KVM_TDX_CAPABILITIES = 0,
 	KVM_TDX_INIT_VM,
 	KVM_TDX_INIT_VCPU,
+	KVM_TDX_INIT_MEM_REGION,
 
 	KVM_TDX_CMD_NR_MAX,
 };
@@ -649,4 +650,12 @@  struct kvm_tdx_init_vm {
 	struct kvm_cpuid2 cpuid;
 };
 
+#define KVM_TDX_MEASURE_MEMORY_REGION	(1UL << 0)
+
+struct kvm_tdx_init_mem_region {
+	__u64 source_addr;
+	__u64 gpa;
+	__u64 nr_pages;
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 26d215e85b76..fc258f112e73 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5663,6 +5663,7 @@  int kvm_mmu_load(struct kvm_vcpu *vcpu)
 out:
 	return r;
 }
+EXPORT_SYMBOL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 4cbcedff4f16..1a5a91b99de9 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -591,6 +591,69 @@  static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
 	return 0;
 }
 
+static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn,
+			    enum pg_level level, kvm_pfn_t pfn)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	hpa_t hpa = pfn_to_hpa(pfn);
+	gpa_t gpa = gfn_to_gpa(gfn);
+	struct tdx_module_args out;
+	hpa_t source_pa;
+	bool measure;
+	u64 err;
+	int i;
+
+	/*
+	 * KVM_INIT_MEM_REGION, tdx_init_mem_region(), supports only 4K page
+	 * because tdh_mem_page_add() supports only 4K page.
+	 */
+	if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+		return -EINVAL;
+
+	/*
+	 * In case of TDP MMU, fault handler can run concurrently.  Note
+	 * 'source_pa' is a TD scope variable, meaning if there are multiple
+	 * threads reaching here with all needing to access 'source_pa', it
+	 * will break.  However fortunately this won't happen, because below
+	 * TDH_MEM_PAGE_ADD code path is only used when VM is being created
+	 * before it is running, using KVM_TDX_INIT_MEM_REGION ioctl (which
+	 * always uses vcpu 0's page table and protected by vcpu->mutex).
+	 */
+	if (KVM_BUG_ON(kvm_tdx->source_pa == INVALID_PAGE, kvm)) {
+		tdx_unpin(kvm, pfn);
+		return -EINVAL;
+	}
+
+	source_pa = kvm_tdx->source_pa & ~KVM_TDX_MEASURE_MEMORY_REGION;
+	measure = kvm_tdx->source_pa & KVM_TDX_MEASURE_MEMORY_REGION;
+	kvm_tdx->source_pa = INVALID_PAGE;
+
+	do {
+		err = tdh_mem_page_add(kvm_tdx->tdr_pa, gpa, hpa, source_pa,
+				       &out);
+		/*
+		 * This path is executed during populating initial guest memory
+		 * image. i.e. before running any vcpu.  Race is rare.
+		 */
+	} while (unlikely(err == TDX_ERROR_SEPT_BUSY));
+	if (KVM_BUG_ON(err, kvm)) {
+		pr_tdx_error(TDH_MEM_PAGE_ADD, err, &out);
+		tdx_unpin(kvm, pfn);
+		return -EIO;
+	} else if (measure) {
+		for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
+			err = tdh_mr_extend(kvm_tdx->tdr_pa, gpa + i, &out);
+			if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
+				pr_tdx_error(TDH_MR_EXTEND, err, &out);
+				break;
+			}
+		}
+	}
+
+	return 0;
+
+}
+
 static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
 				     enum pg_level level, kvm_pfn_t pfn)
 {
@@ -613,9 +676,7 @@  static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
 	if (likely(is_td_finalized(kvm_tdx)))
 		return tdx_mem_page_aug(kvm, gfn, level, pfn);
 
-	/* TODO: tdh_mem_page_add() comes here for the initial memory. */
-
-	return 0;
+	return tdx_mem_page_add(kvm, gfn, level, pfn);
 }
 
 static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
@@ -1322,6 +1383,96 @@  void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
 	tdx_track(vcpu->kvm);
 }
 
+#define TDX_SEPT_PFERR	(PFERR_WRITE_MASK | PFERR_GUEST_ENC_MASK)
+
+static int tdx_init_mem_region(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	struct kvm_tdx_init_mem_region region;
+	struct kvm_vcpu *vcpu;
+	struct page *page;
+	int idx, ret = 0;
+	bool added = false;
+
+	/* Once TD is finalized, the initial guest memory is fixed. */
+	if (is_td_finalized(kvm_tdx))
+		return -EINVAL;
+
+	/* The BSP vCPU must be created before initializing memory regions. */
+	if (!atomic_read(&kvm->online_vcpus))
+		return -EINVAL;
+
+	if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
+		return -EINVAL;
+
+	if (copy_from_user(&region, (void __user *)cmd->data, sizeof(region)))
+		return -EFAULT;
+
+	/* Sanity check */
+	if (!IS_ALIGNED(region.source_addr, PAGE_SIZE) ||
+	    !IS_ALIGNED(region.gpa, PAGE_SIZE) ||
+	    !region.nr_pages ||
+	    region.nr_pages & GENMASK_ULL(63, 63 - PAGE_SHIFT) ||
+	    region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
+	    !kvm_is_private_gpa(kvm, region.gpa) ||
+	    !kvm_is_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT)))
+		return -EINVAL;
+
+	vcpu = kvm_get_vcpu(kvm, 0);
+	if (mutex_lock_killable(&vcpu->mutex))
+		return -EINTR;
+
+	vcpu_load(vcpu);
+	idx = srcu_read_lock(&kvm->srcu);
+
+	kvm_mmu_reload(vcpu);
+
+	while (region.nr_pages) {
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (need_resched())
+			cond_resched();
+
+		/* Pin the source page. */
+		ret = get_user_pages_fast(region.source_addr, 1, 0, &page);
+		if (ret < 0)
+			break;
+		if (ret != 1) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		kvm_tdx->source_pa = pfn_to_hpa(page_to_pfn(page)) |
+				     (cmd->flags & KVM_TDX_MEASURE_MEMORY_REGION);
+
+		ret = kvm_mmu_map_tdp_page(vcpu, region.gpa, TDX_SEPT_PFERR,
+					   PG_LEVEL_4K);
+		put_page(page);
+		if (ret)
+			break;
+
+		region.source_addr += PAGE_SIZE;
+		region.gpa += PAGE_SIZE;
+		region.nr_pages--;
+		added = true;
+	}
+
+	srcu_read_unlock(&kvm->srcu, idx);
+	vcpu_put(vcpu);
+
+	mutex_unlock(&vcpu->mutex);
+
+	if (added && region.nr_pages > 0)
+		ret = -EAGAIN;
+	if (copy_to_user((void __user *)cmd->data, &region, sizeof(region)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_tdx_cmd tdx_cmd;
@@ -1341,6 +1492,9 @@  int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 	case KVM_TDX_INIT_VM:
 		r = tdx_td_init(kvm, &tdx_cmd);
 		break;
+	case KVM_TDX_INIT_MEM_REGION:
+		r = tdx_init_mem_region(kvm, &tdx_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 783ce329d7da..d589a2caedfb 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -17,6 +17,8 @@  struct kvm_tdx {
 	u64 xfam;
 	int hkid;
 
+	hpa_t source_pa;
+
 	bool finalized;
 	atomic_t tdh_mem_track;