
[RFC,6/8] KVM: x86: Implement kvm_arch_{, pre_}vcpu_map_memory()

Message ID 66a957f4ec4a8591d2ff2550686e361ec648b308.1709288671.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Series KVM: Prepopulate guest memory API

Commit Message

Isaku Yamahata March 1, 2024, 5:28 p.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

Wire KVM_MAP_MEMORY ioctl to kvm_mmu_map_tdp_page() to populate guest
memory.

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/kvm/x86.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

Comments

David Matlack March 7, 2024, 12:30 a.m. UTC | #1
On 2024-03-01 09:28 AM, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> Wire KVM_MAP_MEMORY ioctl to kvm_mmu_map_tdp_page() to populate guest
> memory.
> 
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/kvm/x86.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 49 insertions(+)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 3b8cb69b04fa..6025c0e12d89 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -4660,6 +4660,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
>  	case KVM_CAP_IRQFD_RESAMPLE:
>  	case KVM_CAP_MEMORY_FAULT_INFO:
> +	case KVM_CAP_MAP_MEMORY:
>  		r = 1;
>  		break;
>  	case KVM_CAP_EXIT_HYPERCALL:
> @@ -5805,6 +5806,54 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>  	}
>  }
>  
> +int kvm_arch_vcpu_pre_map_memory(struct kvm_vcpu *vcpu)
> +{
> +	return kvm_mmu_reload(vcpu);
> +}

Why is this here and not in kvm_arch_vcpu_map_memory()?

> +
> +int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu,
> +			     struct kvm_memory_mapping *mapping)
> +{
> +	u8 max_level, goal_level = PG_LEVEL_4K;
> +	u32 error_code;
> +	int r;
> +
> +	error_code = 0;
> +	if (mapping->flags & KVM_MEMORY_MAPPING_FLAG_WRITE)
> +		error_code |= PFERR_WRITE_MASK;
> +	if (mapping->flags & KVM_MEMORY_MAPPING_FLAG_EXEC)
> +		error_code |= PFERR_FETCH_MASK;
> +	if (mapping->flags & KVM_MEMORY_MAPPING_FLAG_USER)
> +		error_code |= PFERR_USER_MASK;
> +	if (mapping->flags & KVM_MEMORY_MAPPING_FLAG_PRIVATE) {
> +#ifdef PFERR_PRIVATE_ACCESS
> +		error_code |= PFERR_PRIVATE_ACCESS;
> +#else
> +		return -OPNOTSUPP;

-EOPNOTSUPP

> +#endif
> +	}
> +
> +	if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_1G)) &&
> +	    mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_1G))
> +		max_level = PG_LEVEL_1G;
> +	else if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_2M)) &&
> +		 mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_2M))
> +		max_level = PG_LEVEL_2M;
> +	else
> +		max_level = PG_LEVEL_4K;

Is there a requirement that KVM must not map memory outside of the
requested region?

> +
> +	r = kvm_mmu_map_page(vcpu, gfn_to_gpa(mapping->base_gfn), error_code,
> +			     max_level, &goal_level);
> +	if (r)
> +		return r;
> +
> +	if (mapping->source)
> +		mapping->source += KVM_HPAGE_SIZE(goal_level);
> +	mapping->base_gfn += KVM_PAGES_PER_HPAGE(goal_level);
> +	mapping->nr_pages -= KVM_PAGES_PER_HPAGE(goal_level);
> +	return r;
> +}
> +
>  long kvm_arch_vcpu_ioctl(struct file *filp,
>  			 unsigned int ioctl, unsigned long arg)
>  {
> -- 
> 2.25.1
>
David Matlack March 7, 2024, 12:36 a.m. UTC | #2
On Wed, Mar 6, 2024 at 4:31 PM David Matlack <dmatlack@google.com> wrote:
>
> On 2024-03-01 09:28 AM, isaku.yamahata@intel.com wrote:
> >
> > +     if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_1G)) &&
> > +         mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_1G))
> > +             max_level = PG_LEVEL_1G;
> > +     else if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_2M)) &&
> > +              mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_2M))
> > +             max_level = PG_LEVEL_2M;
> > +     else
> > +             max_level = PG_LEVEL_4K;
>
> Is there a requirement that KVM must not map memory outside of the
> requested region?

And if so, what if the requested region is already mapped with a larger page?
Isaku Yamahata March 7, 2024, 1:34 a.m. UTC | #3
On Wed, Mar 06, 2024 at 04:30:58PM -0800,
David Matlack <dmatlack@google.com> wrote:

> On 2024-03-01 09:28 AM, isaku.yamahata@intel.com wrote:
> > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > 
> > Wire KVM_MAP_MEMORY ioctl to kvm_mmu_map_tdp_page() to populate guest
> > memory.
> > 
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > ---
> >  arch/x86/kvm/x86.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 49 insertions(+)
> > 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 3b8cb69b04fa..6025c0e12d89 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -4660,6 +4660,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> >  	case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
> >  	case KVM_CAP_IRQFD_RESAMPLE:
> >  	case KVM_CAP_MEMORY_FAULT_INFO:
> > +	case KVM_CAP_MAP_MEMORY:
> >  		r = 1;
> >  		break;
> >  	case KVM_CAP_EXIT_HYPERCALL:
> > @@ -5805,6 +5806,54 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
> >  	}
> >  }
> >  
> > +int kvm_arch_vcpu_pre_map_memory(struct kvm_vcpu *vcpu)
> > +{
> > +	return kvm_mmu_reload(vcpu);
> > +}
> 
> Why is this here and not in kvm_arch_vcpu_map_memory()?

We can push kvm_mmu_reload() down into kvm_arch_vcpu_map_memory(), under the gpa
loop.  The inefficiency probably won't matter.

kvm_mmu_reload()
loop on gpa
  kvm_arch_vcpu_map_memory()

=>

loop on gpa
    kvm_arch_vcpu_map_memory()
      kvm_mmu_reload()
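
For illustration, a minimal sketch of that restructuring, paraphrasing the
generic loop from earlier in this series (names follow the RFC; the loop body
is illustrative, not verbatim):

/*
 * Sketch only: drop the pre-map hook and let the arch hook reload the MMU
 * itself.  kvm_mmu_reload() is a cheap no-op once the root is valid, so
 * calling it on every iteration costs little.
 */
static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu,
			       struct kvm_memory_mapping *mapping)
{
	int r = 0;

	while (mapping->nr_pages) {
		if (signal_pending(current))
			return -ERESTARTSYS;

		/* Advances base_gfn/nr_pages/source on success. */
		r = kvm_arch_vcpu_map_memory(vcpu, mapping);
		if (r)
			break;

		cond_resched();
	}
	return r;
}

int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu,
			     struct kvm_memory_mapping *mapping)
{
	int r = kvm_mmu_reload(vcpu);

	if (r)
		return r;

	/* ... the fault/walk logic from this patch ... */
	return 0;
}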
Isaku Yamahata March 7, 2024, 1:51 a.m. UTC | #4
On Wed, Mar 06, 2024 at 04:36:25PM -0800,
David Matlack <dmatlack@google.com> wrote:

> On Wed, Mar 6, 2024 at 4:31 PM David Matlack <dmatlack@google.com> wrote:
> >
> > On 2024-03-01 09:28 AM, isaku.yamahata@intel.com wrote:
> > >
> > > +     if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_1G)) &&
> > > +         mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_1G))
> > > +             max_level = PG_LEVEL_1G;
> > > +     else if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_2M)) &&
> > > +              mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_2M))
> > > +             max_level = PG_LEVEL_2M;
> > > +     else
> > > +             max_level = PG_LEVEL_4K;
> >
> > Is there a requirement that KVM must not map memory outside of the
> > requested region?
> 
> And if so, what if the requested region is already mapped with a larger page?

Yes. We'd like to map the exact GPA range for the SNP or TDX case; we don't
want to map zeroed pages into the surrounding range.  For SNP or TDX, mapping a
page to a GPA is a one-time operation, and it updates the measurement.

Say we'd like to populate GPA1 and GPA2 with the initial guest memory image,
and they fall within the same 2M range.  Map GPA1 first.  If mapping GPA1 with
a 2M page also maps GPA2 (as zeros), the subsequent mapping of GPA2 fails.
Even if the mapping of GPA2 succeeds, the measurement may already have been
updated when GPA1 was mapped.

It's the user space VMM's responsibility to map a GPA range at most once for
SNP or TDX.  Is that too strict a requirement for the default VM use case of
mitigating KVM page faults at guest boot?  If so, what about a flag like
EXACT_MAPPING or something?
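
To make the question concrete, a sketch of what such a flag could look like on
top of this patch (KVM_MEMORY_MAPPING_FLAG_EXACT is a hypothetical name, not
part of this series):

	/*
	 * Hypothetical KVM_MEMORY_MAPPING_FLAG_EXACT: clamp the mapping level
	 * so KVM never maps beyond [base_gfn, base_gfn + nr_pages), as needed
	 * when populating the initial SNP/TDX image; otherwise let KVM pick
	 * the largest page size it can.
	 */
	if (mapping->flags & KVM_MEMORY_MAPPING_FLAG_EXACT) {
		if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_1G)) &&
		    mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_1G))
			max_level = PG_LEVEL_1G;
		else if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_2M)) &&
			 mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_2M))
			max_level = PG_LEVEL_2M;
		else
			max_level = PG_LEVEL_4K;
	} else {
		max_level = KVM_MAX_HUGEPAGE_LEVEL;
	}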
Sean Christopherson March 11, 2024, 11:26 p.m. UTC | #5
On Fri, Mar 01, 2024, isaku.yamahata@intel.com wrote:
> +int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu,
> +			     struct kvm_memory_mapping *mapping)
> +{
> +	u8 max_level, goal_level = PG_LEVEL_4K;

goal_level is misleading.  kvm_page_fault.goal_level is appropriate because, for
the majority of the structure's lifetime, it is the level KVM is trying to map,
not the level that KVM has mapped.

For this variable, maybe "mapped_level" or just "level"


> +	u32 error_code;

u64 when this gets rebased...

> +	int r;
> +
> +	error_code = 0;
> +	if (mapping->flags & KVM_MEMORY_MAPPING_FLAG_WRITE)
> +		error_code |= PFERR_WRITE_MASK;
> +	if (mapping->flags & KVM_MEMORY_MAPPING_FLAG_EXEC)
> +		error_code |= PFERR_FETCH_MASK;
> +	if (mapping->flags & KVM_MEMORY_MAPPING_FLAG_USER)
> +		error_code |= PFERR_USER_MASK;
> +	if (mapping->flags & KVM_MEMORY_MAPPING_FLAG_PRIVATE) {
> +#ifdef PFERR_PRIVATE_ACCESS
> +		error_code |= PFERR_PRIVATE_ACCESS;

...because PFERR_PRIVATE_ACCESS is in bits 63:32.

> +#else
> +		return -OPNOTSUPP;

Broken code.  And I don't see any reason for this to exist, PFERR_PRIVATE_ACCESS
will be unconditionally #defined.

> +#endif
> +	}
> +
> +	if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_1G)) &&
> +	    mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_1G))
> +		max_level = PG_LEVEL_1G;
> +	else if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_2M)) &&
> +		 mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_2M))
> +		max_level = PG_LEVEL_2M;
> +	else
> +		max_level = PG_LEVEL_4K;

Hrm, I would much prefer to allow KVM to map more than what is requested, i.e.
not artificially restrict the hugepage size.  Restricting the mapping size is
just giving userspace another way to shoot themselves in the foot.  E.g. if
userspace prefaults the wrong size, KVM will likely either run with degraded
performance or immediately zap and rebuild the too-small SPTE.

Ugh, and TDX's S-EPT *must* be populated with 4KiB entries to start.  On one hand,
allowing KVM to map a larger page (but still bounded by the memslot(s)) won't affect
TDX's measurement, because KVM never uses a larger page.  On the other hand, TDX
would need a way to restrict the mapping.

Another complication is that TDH.MEM.PAGE.ADD affects the measurement.  We can
push the ordering requirements to userspace, but for all intents and purposes,
calls to this ioctl() would need to be serialized for doing PAGE.ADD, but could
run in parallel for every other use case, including PAGE.AUG and pre-mapping
shared memory for TDX.

Hrm.

Wait. KVM doesn't *need* to do PAGE.ADD from deep in the MMU.  The only inputs to
PAGE.ADD are the gfn, pfn, tdr (vm), and source.  The S-EPT structures need to be
pre-built, but when they are built is irrelevant, so long as they are in place
before PAGE.ADD.

Crazy idea.  For TDX S-EPT, what if KVM_MAP_MEMORY does all of the SEPT.ADD stuff,
which doesn't affect the measurement, and even fills in KVM's copy of the leaf EPTE, 
but tdx_sept_set_private_spte() doesn't do anything if the TD isn't finalized?

Then KVM provides a dedicated TDX ioctl(), i.e. what is/was KVM_TDX_INIT_MEM_REGION,
to do PAGE.ADD.  KVM_TDX_INIT_MEM_REGION wouldn't need to map anything, it would
simply need to verify that the pfn from guest_memfd() is the same as what's in
the TDP MMU.

Or if we want to make things more robust for userspace, set a software-available
flag in the leaf TDP MMU SPTE to indicate that the page is awaiting PAGE.ADD.
That way tdp_mmu_map_handle_target_level() wouldn't treat a fault as spurious
(KVM will see the SPTE as PRESENT, but the S-EPT entry will be !PRESENT).

Then KVM_MAP_MEMORY doesn't need to support @source, KVM_TDX_INIT_MEM_REGION
doesn't need to fake a page fault and doesn't need to temporarily stash the
source_pa in KVM, and KVM_MAP_MEMORY could be used to fully pre-map TDX memory.

I believe the only missing piece is a way for the TDX code to communicate that
hugepages are disallowed.
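
A rough sketch of how the KVM_TDX_INIT_MEM_REGION side of that could look
(purely illustrative; the SPTE-lookup and PAGE.ADD helpers below are
hypothetical stand-ins, not existing KVM or TDX-module APIs):

/*
 * Illustrative only: KVM_MAP_MEMORY pre-builds the S-EPT and fills in KVM's
 * mirror SPTE without touching the measurement; this ioctl later verifies
 * the pre-mapped pfn and issues the measured PAGE.ADD.
 */
static int tdx_init_mem_region_page(struct kvm *kvm, gfn_t gfn,
				    void __user *source)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
	kvm_pfn_t pfn, mapped_pfn;
	int max_order, r;

	r = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &max_order);
	if (r)
		return r;

	/* Hypothetical lookup of the leaf SPTE that KVM_MAP_MEMORY installed. */
	r = kvm_tdp_mmu_gfn_to_pfn(kvm, gfn, &mapped_pfn);
	if (r || mapped_pfn != pfn) {
		r = r ? r : -EIO;	/* region was not pre-populated */
		goto out_put;
	}

	/* Hypothetical wrapper around TDH.MEM.PAGE.ADD. */
	r = tdx_mem_page_add(kvm, gfn, pfn, source);
out_put:
	put_page(pfn_to_page(pfn));
	return r;
}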
Huang, Kai March 12, 2024, 12:38 p.m. UTC | #6
> 
> Wait. KVM doesn't *need* to do PAGE.ADD from deep in the MMU.  The only inputs to
> PAGE.ADD are the gfn, pfn, tdr (vm), and source.  The S-EPT structures need to be
> pre-built, but when they are built is irrelevant, so long as they are in place
> before PAGE.ADD.
> 
> Crazy idea.  For TDX S-EPT, what if KVM_MAP_MEMORY does all of the SEPT.ADD stuff,
> which doesn't affect the measurement, and even fills in KVM's copy of the leaf EPTE, 
> but tdx_sept_set_private_spte() doesn't do anything if the TD isn't finalized?
> 
> Then KVM provides a dedicated TDX ioctl(), i.e. what is/was KVM_TDX_INIT_MEM_REGION,
> to do PAGE.ADD.  KVM_TDX_INIT_MEM_REGION wouldn't need to map anything, it would
> simply need to verify that the pfn from guest_memfd() is the same as what's in
> the TDP MMU.

One small question:

What if the memory region passed to KVM_TDX_INIT_MEM_REGION hasn't been pre-
populated?  If we want to make KVM_TDX_INIT_MEM_REGION work with these regions,
then we still need to do the real map.  Or we can make KVM_TDX_INIT_MEM_REGION
return error when it finds the region hasn't been pre-populated?

> 
> Or if we want to make things more robust for userspace, set a software-available
> flag in the leaf TDP MMU SPTE to indicate that the page is awaiting PAGE.ADD.
> That way tdp_mmu_map_handle_target_level() wouldn't treat a fault as spurious
> (KVM will see the SPTE as PRESENT, but the S-EPT entry will be !PRESENT).
> 
> Then KVM_MAP_MEMORY doesn't need to support @source, KVM_TDX_INIT_MEM_REGION
> doesn't need to fake a page fault and doesn't need to temporarily stash the
> source_pa in KVM, and KVM_MAP_MEMORY could be used to fully pre-map TDX memory.
> 
> I believe the only missing piece is a way for the TDX code to communicate that
> hugepages are disallowed.
>
Sean Christopherson March 12, 2024, 2:20 p.m. UTC | #7
On Tue, Mar 12, 2024, Kai Huang wrote:
> > Wait. KVM doesn't *need* to do PAGE.ADD from deep in the MMU.  The only inputs to
> > PAGE.ADD are the gfn, pfn, tdr (vm), and source.  The S-EPT structures need to be
> > pre-built, but when they are built is irrelevant, so long as they are in place
> > before PAGE.ADD.
> > 
> > Crazy idea.  For TDX S-EPT, what if KVM_MAP_MEMORY does all of the SEPT.ADD stuff,
> > which doesn't affect the measurement, and even fills in KVM's copy of the leaf EPTE, 
> > but tdx_sept_set_private_spte() doesn't do anything if the TD isn't finalized?
> > 
> > Then KVM provides a dedicated TDX ioctl(), i.e. what is/was KVM_TDX_INIT_MEM_REGION,
> > to do PAGE.ADD.  KVM_TDX_INIT_MEM_REGION wouldn't need to map anything, it would
> > simply need to verify that the pfn from guest_memfd() is the same as what's in
> > the TDP MMU.
> 
> One small question:
> 
> What if the memory region passed to KVM_TDX_INIT_MEM_REGION hasn't been pre-
> populated?  If we want to make KVM_TDX_INIT_MEM_REGION work with these regions,
> then we still need to do the real map.  Or we can make KVM_TDX_INIT_MEM_REGION
> return error when it finds the region hasn't been pre-populated?

Return an error.  I don't love the idea of bleeding so many TDX details into
userspace, but I'm pretty sure that ship sailed a long, long time ago.
Huang, Kai March 12, 2024, 9:41 p.m. UTC | #8
On Tue, 2024-03-12 at 07:20 -0700, Sean Christopherson wrote:
> On Tue, Mar 12, 2024, Kai Huang wrote:
> > > Wait. KVM doesn't *need* to do PAGE.ADD from deep in the MMU.  The only inputs to
> > > PAGE.ADD are the gfn, pfn, tdr (vm), and source.  The S-EPT structures need to be
> > > pre-built, but when they are built is irrelevant, so long as they are in place
> > > before PAGE.ADD.
> > > 
> > > Crazy idea.  For TDX S-EPT, what if KVM_MAP_MEMORY does all of the SEPT.ADD stuff,
> > > which doesn't affect the measurement, and even fills in KVM's copy of the leaf EPTE, 
> > > but tdx_sept_set_private_spte() doesn't do anything if the TD isn't finalized?
> > > 
> > > Then KVM provides a dedicated TDX ioctl(), i.e. what is/was KVM_TDX_INIT_MEM_REGION,
> > > to do PAGE.ADD.  KVM_TDX_INIT_MEM_REGION wouldn't need to map anything, it would
> > > simply need to verify that the pfn from guest_memfd() is the same as what's in
> > > the TDP MMU.
> > 
> > One small question:
> > 
> > What if the memory region passed to KVM_TDX_INIT_MEM_REGION hasn't been pre-
> > populated?  If we want to make KVM_TDX_INIT_MEM_REGION work with these regions,
> > then we still need to do the real map.  Or we can make KVM_TDX_INIT_MEM_REGION
> > return error when it finds the region hasn't been pre-populated?
> 
> Return an error.  I don't love the idea of bleeding so many TDX details into
> userspace, but I'm pretty sure that ship sailed a long, long time ago.

In this case, IIUC the KVM_MAP_MEMORY ioctl() will be mandatory for TDX
(presumably also SNP) guests, but _optional_ for other VMs.  Not sure whether
this is ideal.

And just want to make sure I understand the background correctly:

The KVM_MAP_MEMORY ioctl() is supposed to be generic, and it should be able to
be used by any VM, not just CoCo VMs (including SW_PROTECTED ones)?

But it is only supposed to be used by the VMs which use guest_memfd()?  Because
IIUC for normal VMs using mmap() we already have MAP_POPULATE for this purpose.

Looking at [*], it doesn't say what kind of VM the sender was trying to use.

Therefore, can we interpret the KVM_MAP_MEMORY ioctl() as being effectively for
CoCo VMs?  SW_PROTECTED VMs can also use guest_memfd(), but I believe nobody is
going to use it seriously.

[*] https://lore.kernel.org/all/65262e67-7885-971a-896d-ad9c0a760907@polito.it/
Huang, Kai March 12, 2024, 9:46 p.m. UTC | #9
On Tue, 2024-03-12 at 21:41 +0000, Huang, Kai wrote:
> On Tue, 2024-03-12 at 07:20 -0700, Sean Christopherson wrote:
> > On Tue, Mar 12, 2024, Kai Huang wrote:
> > > > Wait. KVM doesn't *need* to do PAGE.ADD from deep in the MMU.  The only inputs to
> > > > PAGE.ADD are the gfn, pfn, tdr (vm), and source.  The S-EPT structures need to be
> > > > pre-built, but when they are built is irrelevant, so long as they are in place
> > > > before PAGE.ADD.
> > > > 
> > > > Crazy idea.  For TDX S-EPT, what if KVM_MAP_MEMORY does all of the SEPT.ADD stuff,
> > > > which doesn't affect the measurement, and even fills in KVM's copy of the leaf EPTE, 
> > > > but tdx_sept_set_private_spte() doesn't do anything if the TD isn't finalized?
> > > > 
> > > > Then KVM provides a dedicated TDX ioctl(), i.e. what is/was KVM_TDX_INIT_MEM_REGION,
> > > > to do PAGE.ADD.  KVM_TDX_INIT_MEM_REGION wouldn't need to map anything, it would
> > > > simply need to verify that the pfn from guest_memfd() is the same as what's in
> > > > the TDP MMU.
> > > 
> > > One small question:
> > > 
> > > What if the memory region passed to KVM_TDX_INIT_MEM_REGION hasn't been pre-
> > > populated?  If we want to make KVM_TDX_INIT_MEM_REGION work with these regions,
> > > then we still need to do the real map.  Or we can make KVM_TDX_INIT_MEM_REGION
> > > return error when it finds the region hasn't been pre-populated?
> > 
> > Return an error.  I don't love the idea of bleeding so many TDX details into
> > userspace, but I'm pretty sure that ship sailed a long, long time ago.
> 
> In this case, IIUC the KVM_MAP_MEMORY ioctl() will be mandatory for TDX
> (presumably also SNP) guests, but _optional_ for other VMs.  Not sure whether
> this is ideal.
> 
> And just want to make sure I understand the background correctly:
> 
> The KVM_MAP_MEMORY ioctl() is supposed to be generic, and it should be able to
> be used by any VM, not just CoCo VMs (including SW_PROTECTED ones)?
> 
> But it is only supposed to be used by the VMs which use guest_memfd()?  Because
> IIUC for normal VMs using mmap() we already have MAP_POPULATE for this purpose.
> 
> Looking at [*], it doesn't say what kind of VM the sender was trying to use.
> 
> Therefore, can we interpret the KVM_MAP_MEMORY ioctl() as being effectively for
> CoCo VMs?  SW_PROTECTED VMs can also use guest_memfd(), but I believe nobody is
> going to use it seriously.
> 
> [*] https://lore.kernel.org/all/65262e67-7885-971a-896d-ad9c0a760907@polito.it/
> 
> 

Hmm.. Just after sending I realized that MAP_POPULATE only pre-populates the
page table on the host side, not the EPT...

So KVM_MAP_MEMORY can indeed be used by _ALL_ VMs.  My bad :-(
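
For reference, a small userspace sketch of that distinction (illustrative
only; mem_size is a placeholder):

	/*
	 * MAP_POPULATE pre-faults the host (stage-1) page tables backing the
	 * VMA; the stage-2 (EPT/NPT) tables are still built lazily on guest
	 * access, which is the gap KVM_MAP_MEMORY is meant to fill.
	 */
	void *hva = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
	/*
	 * ...register hva via KVM_SET_USER_MEMORY_REGION, then issue
	 * KVM_MAP_MEMORY on a vCPU to pre-build the stage-2 mappings...
	 */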
Sean Christopherson March 12, 2024, 11:03 p.m. UTC | #10
On Tue, Mar 12, 2024, Kai Huang wrote:
> On Tue, 2024-03-12 at 21:41 +0000, Huang, Kai wrote:
> > On Tue, 2024-03-12 at 07:20 -0700, Sean Christopherson wrote:
> > > On Tue, Mar 12, 2024, Kai Huang wrote:
> > > > > Wait. KVM doesn't *need* to do PAGE.ADD from deep in the MMU.  The only inputs to
> > > > > PAGE.ADD are the gfn, pfn, tdr (vm), and source.  The S-EPT structures need to be
> > > > > pre-built, but when they are built is irrelevant, so long as they are in place
> > > > > before PAGE.ADD.
> > > > > 
> > > > > Crazy idea.  For TDX S-EPT, what if KVM_MAP_MEMORY does all of the SEPT.ADD stuff,
> > > > > which doesn't affect the measurement, and even fills in KVM's copy of the leaf EPTE, 
> > > > > but tdx_sept_set_private_spte() doesn't do anything if the TD isn't finalized?
> > > > > 
> > > > > Then KVM provides a dedicated TDX ioctl(), i.e. what is/was KVM_TDX_INIT_MEM_REGION,
> > > > > to do PAGE.ADD.  KVM_TDX_INIT_MEM_REGION wouldn't need to map anything, it would
> > > > > simply need to verify that the pfn from guest_memfd() is the same as what's in
> > > > > the TDP MMU.
> > > > 
> > > > One small question:
> > > > 
> > > > What if the memory region passed to KVM_TDX_INIT_MEM_REGION hasn't been pre-
> > > > populated?  If we want to make KVM_TDX_INIT_MEM_REGION work with these regions,
> > > > then we still need to do the real map.  Or we can make KVM_TDX_INIT_MEM_REGION
> > > > return error when it finds the region hasn't been pre-populated?
> > > 
> > > Return an error.  I don't love the idea of bleeding so many TDX details into
> > > userspace, but I'm pretty sure that ship sailed a long, long time ago.
> > 
> > In this case, IIUC the KVM_MAP_MEMORY ioctl() will be mandatory for TDX
> > (presumably also SNP) guests, but _optional_ for other VMs.  Not sure whether
> > this is ideal.

No, just TDX.  SNP's RMP purely works with pfns, i.e. the enforcement layer comes
into play *after* the stage-2 page table walks.  KVM can zap NPTs for SNP VMs at
will.

> > And just want to make sure I understand the background correctly:
> > 
> > The KVM_MAP_MEMORY ioctl() is supposed to be generic, and it should be able to
> > be used by any VM, not just CoCo VMs (including SW_PROTECTED ones)?
> > 
> > But it is only supposed to be used by the VMs which use guest_memfd()?  Because
> > IIUC for normal VMs using mmap() we already have MAP_POPULATE for this purpose.
> > 
> > Looking at [*], it doesn't say what kind of VM the sender was trying to use.
> > 
> > Therefore, can we interpret the KVM_MAP_MEMORY ioctl() as being effectively for
> > CoCo VMs?  SW_PROTECTED VMs can also use guest_memfd(), but I believe nobody is
> > going to use it seriously.
> > 
> > [*] https://lore.kernel.org/all/65262e67-7885-971a-896d-ad9c0a760907@polito.it/
> > 
> > 
> 
> Hmm.. Just after sending I realized that MAP_POPULATE only pre-populates the
> page table on the host side, not the EPT...

Yep, exactly.

> So KVM_MAP_MEMORY can indeed be used by _ALL_ VMs.  My bad :-(
Isaku Yamahata March 19, 2024, 4:26 p.m. UTC | #11
On Wed, Mar 06, 2024 at 05:51:51PM -0800,
Isaku Yamahata <isaku.yamahata@linux.intel.com> wrote:

> On Wed, Mar 06, 2024 at 04:36:25PM -0800,
> David Matlack <dmatlack@google.com> wrote:
> 
> > On Wed, Mar 6, 2024 at 4:31 PM David Matlack <dmatlack@google.com> wrote:
> > >
> > > On 2024-03-01 09:28 AM, isaku.yamahata@intel.com wrote:
> > > >
> > > > +     if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_1G)) &&
> > > > +         mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_1G))
> > > > +             max_level = PG_LEVEL_1G;
> > > > +     else if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_2M)) &&
> > > > +              mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_2M))
> > > > +             max_level = PG_LEVEL_2M;
> > > > +     else
> > > > +             max_level = PG_LEVEL_4K;
> > >
> > > Is there a requirement that KVM must not map memory outside of the
> > > requested region?
> > 
> > And if so, what if the requested region is already mapped with a larger page?
> 
> Yes. We'd like to map the exact GPA range for the SNP or TDX case; we don't
> want to map zeroed pages into the surrounding range.  For SNP or TDX, mapping a
> page to a GPA is a one-time operation, and it updates the measurement.
> 
> Say we'd like to populate GPA1 and GPA2 with the initial guest memory image,
> and they fall within the same 2M range.  Map GPA1 first.  If mapping GPA1 with
> a 2M page also maps GPA2 (as zeros), the subsequent mapping of GPA2 fails.
> Even if the mapping of GPA2 succeeds, the measurement may already have been
> updated when GPA1 was mapped.
> 
> It's the user space VMM's responsibility to map a GPA range at most once for
> SNP or TDX.  Is that too strict a requirement for the default VM use case of
> mitigating KVM page faults at guest boot?  If so, what about a flag like
> EXACT_MAPPING or something?

I'm thinking as follows. What do you think?

- Allow mapping larger than requested with gmem_max_level hook:
  Depends on the following patch. [1]
  The gmem_max_level hook allows the vendor backend to determine the max level.
  By default (for default VM or sw-protected), it allows KVM_MAX_HUGEPAGE_LEVEL
  mapping.  TDX allows only 4KB mapping.

  [1] https://lore.kernel.org/kvm/20231230172351.574091-31-michael.roth@amd.com/
  [PATCH v11 30/35] KVM: x86: Add gmem hook for determining max NPT mapping level

- Pure mapping without coco operation:
  As Sean suggested at [2], make KVM_MAP_MEMORY a pure mapping without coco
  operations.  In the case of TDX, the API doesn't issue TDX-specific operations
  like TDH.PAGE.ADD() and TDH.EXTEND.MR().  We need a TDX-specific API.

  [2] https://lore.kernel.org/kvm/Ze-XW-EbT9vXaagC@google.com/

- KVM_MAP_MEMORY on already mapped area potentially with large page:
  It succeeds, not an error.  It doesn't care whether the GPA is backed by a large
  page or not.  Because the use case is pre-population before the guest runs, it
  doesn't matter whether the given GPA was mapped or not, or what large page level
  backs it.

  Do you want an error like -EEXIST?
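
A loose sketch of what the first bullet could look like (the hook name and
signature are guesses based on [1], not the actual patch):

/*
 * Hypothetical kvm_x86_ops.gmem_max_level hook, invoked from the fault path
 * to clamp fault->max_level for gmem-backed private memory.
 */

/* Default VM / sw-protected: no restriction beyond KVM_MAX_HUGEPAGE_LEVEL. */
static int vt_gmem_max_level(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn,
			     bool is_private, u8 *max_level)
{
	return 0;
}

/* TDX: the S-EPT must initially be populated with 4KB mappings only. */
static int tdx_gmem_max_level(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn,
			      bool is_private, u8 *max_level)
{
	if (is_private)
		*max_level = PG_LEVEL_4K;
	return 0;
}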
Sean Christopherson April 3, 2024, 11:15 p.m. UTC | #12
On Tue, Mar 19, 2024, Isaku Yamahata wrote:
> On Wed, Mar 06, 2024 at 05:51:51PM -0800,
> > Yes. We'd like to map the exact GPA range for the SNP or TDX case; we don't
> > want to map zeroed pages into the surrounding range.  For SNP or TDX, mapping a
> > page to a GPA is a one-time operation, and it updates the measurement.
> > 
> > Say we'd like to populate GPA1 and GPA2 with the initial guest memory image,
> > and they fall within the same 2M range.  Map GPA1 first.  If mapping GPA1 with
> > a 2M page also maps GPA2 (as zeros), the subsequent mapping of GPA2 fails.
> > Even if the mapping of GPA2 succeeds, the measurement may already have been
> > updated when GPA1 was mapped.
> > 
> > It's the user space VMM's responsibility to map a GPA range at most once for
> > SNP or TDX.  Is that too strict a requirement for the default VM use case of
> > mitigating KVM page faults at guest boot?  If so, what about a flag like
> > EXACT_MAPPING or something?
> 
> I'm thinking as follows. What do you think?
> 
> - Allow mapping larger than requested with gmem_max_level hook:

I don't see any reason to allow userspace to request a mapping level.  If the
prefetch is defined to have read fault semantics, KVM has all the wiggle room it
needs to do the optimal/sane thing, without having to worry about reconciling
userspace's desired mapping level.

>   Depends on the following patch. [1]
>   The gmem_max_level hook allows the vendor backend to determine the max level.
>   By default (for default VM or sw-protected), it allows KVM_MAX_HUGEPAGE_LEVEL
>   mapping.  TDX allows only 4KB mapping.
> 
>   [1] https://lore.kernel.org/kvm/20231230172351.574091-31-michael.roth@amd.com/
>   [PATCH v11 30/35] KVM: x86: Add gmem hook for determining max NPT mapping level
> 
> - Pure mapping without coco operation:
>   As Sean suggested at [2], make KVM_MAP_MEMORY a pure mapping without coco
>   operations.  In the case of TDX, the API doesn't issue TDX-specific operations
>   like TDH.PAGE.ADD() and TDH.EXTEND.MR().  We need a TDX-specific API.
> 
>   [2] https://lore.kernel.org/kvm/Ze-XW-EbT9vXaagC@google.com/
> 
> - KVM_MAP_MEMORY on already mapped area potentially with large page:
>   It succeeds, not an error.  It doesn't care whether the GPA is backed by a large
>   page or not.  Because the use case is pre-population before the guest runs, it
>   doesn't matter whether the given GPA was mapped or not, or what large page level
>   backs it.
> 
>   Do you want an error like -EEXIST?

No error.  As above, I think the ioctl() should behave like a read fault, i.e.
be an expensive nop if there's nothing to be done.

For VMA-based memory, userspace can operate on the userspace address.  E.g. if
userspace wants to break CoW, it can do that by writing from userspace.  And if
userspace wants to "request" a certain mapping level, it can do that by MADV_*.

For guest_memfd, there are no protections (everything is RWX, for now), and when
hugepage support comes along, userspace can simply manipulate the guest_memfd
instance as needed.
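
A small userspace sketch of that flow for VMA-based memory (illustrative; the
kvm_memory_mapping layout follows this RFC's uapi proposal, and hva, gpa,
mem_size and vcpu_fd are placeholders):

	/* "Request" 2M mappings for the backing VMA, per the MADV_* point. */
	madvise(hva, mem_size, MADV_HUGEPAGE);

	/* Then KVM_MAP_MEMORY only needs to pre-build the stage-2 mappings. */
	struct kvm_memory_mapping mapping = {
		.base_gfn = gpa >> 12,
		.nr_pages = mem_size >> 12,
		.flags    = KVM_MEMORY_MAPPING_FLAG_WRITE,
	};

	while (mapping.nr_pages &&
	       ioctl(vcpu_fd, KVM_MAP_MEMORY, &mapping) == 0)
		;	/* the ioctl advances base_gfn/nr_pages as it maps */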

Patch

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3b8cb69b04fa..6025c0e12d89 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4660,6 +4660,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
 	case KVM_CAP_IRQFD_RESAMPLE:
 	case KVM_CAP_MEMORY_FAULT_INFO:
+	case KVM_CAP_MAP_MEMORY:
 		r = 1;
 		break;
 	case KVM_CAP_EXIT_HYPERCALL:
@@ -5805,6 +5806,54 @@  static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 	}
 }
 
+int kvm_arch_vcpu_pre_map_memory(struct kvm_vcpu *vcpu)
+{
+	return kvm_mmu_reload(vcpu);
+}
+
+int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu,
+			     struct kvm_memory_mapping *mapping)
+{
+	u8 max_level, goal_level = PG_LEVEL_4K;
+	u32 error_code;
+	int r;
+
+	error_code = 0;
+	if (mapping->flags & KVM_MEMORY_MAPPING_FLAG_WRITE)
+		error_code |= PFERR_WRITE_MASK;
+	if (mapping->flags & KVM_MEMORY_MAPPING_FLAG_EXEC)
+		error_code |= PFERR_FETCH_MASK;
+	if (mapping->flags & KVM_MEMORY_MAPPING_FLAG_USER)
+		error_code |= PFERR_USER_MASK;
+	if (mapping->flags & KVM_MEMORY_MAPPING_FLAG_PRIVATE) {
+#ifdef PFERR_PRIVATE_ACCESS
+		error_code |= PFERR_PRIVATE_ACCESS;
+#else
+		return -OPNOTSUPP;
+#endif
+	}
+
+	if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_1G)) &&
+	    mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_1G))
+		max_level = PG_LEVEL_1G;
+	else if (IS_ALIGNED(mapping->base_gfn, KVM_PAGES_PER_HPAGE(PG_LEVEL_2M)) &&
+		 mapping->nr_pages >= KVM_PAGES_PER_HPAGE(PG_LEVEL_2M))
+		max_level = PG_LEVEL_2M;
+	else
+		max_level = PG_LEVEL_4K;
+
+	r = kvm_mmu_map_page(vcpu, gfn_to_gpa(mapping->base_gfn), error_code,
+			     max_level, &goal_level);
+	if (r)
+		return r;
+
+	if (mapping->source)
+		mapping->source += KVM_HPAGE_SIZE(goal_level);
+	mapping->base_gfn += KVM_PAGES_PER_HPAGE(goal_level);
+	mapping->nr_pages -= KVM_PAGES_PER_HPAGE(goal_level);
+	return r;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {