
[v3,09/21] KVM: X86: Don't track dirty for KVM_SET_[TSS_ADDR|IDENTITY_MAP_ADDR]

Message ID 20200109145729.32898-10-peterx@redhat.com (mailing list archive)
State New, archived
Series KVM: Dirty ring interface

Commit Message

Peter Xu Jan. 9, 2020, 2:57 p.m. UTC
Originally, we have three code paths that can dirty a page without
vcpu context for X86:

  - init_rmode_identity_map
  - init_rmode_tss
  - kvmgt_rw_gpa

init_rmode_identity_map and init_rmode_tss will be set up on the
destination VM no matter what (and the guest cannot even see them), so
it does not make sense to track them at all.

To do this, allow __x86_set_memory_region() to return the userspace
address that was just allocated to the caller.  Then in both of the
functions we directly write to the userspace address instead of
calling kvm_write_*() APIs.  We need to make sure that we have the
slots_lock held when accessing the userspace address.

Another trivial change is that we don't need to explicitly clear the
identity page table root in init_rmode_identity_map() because no
matter what we'll write to the whole page with 4M huge page entries.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  3 +-
 arch/x86/kvm/svm.c              |  3 +-
 arch/x86/kvm/vmx/vmx.c          | 68 ++++++++++++++++-----------------
 arch/x86/kvm/x86.c              | 18 +++++++--
 4 files changed, 51 insertions(+), 41 deletions(-)

Comments

Paolo Bonzini Jan. 19, 2020, 9:01 a.m. UTC | #1
On 09/01/20 15:57, Peter Xu wrote:
> -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
> +/*
> + * If `uaddr' is specified, `*uaddr' will be returned with the
> + * userspace address that was just allocated.  `uaddr' is only
> + * meaningful if the function returns zero, and `uaddr' will only be
> + * valid when with either the slots_lock or with the SRCU read lock
> + * held.  After we release the lock, the returned `uaddr' will be invalid.
> + */

In practice the address is still protected by the refcount, isn't it?
Only destroying the VM could invalidate it.

Paolo
Peter Xu Jan. 20, 2020, 6:45 a.m. UTC | #2
On Sun, Jan 19, 2020 at 10:01:50AM +0100, Paolo Bonzini wrote:
> On 09/01/20 15:57, Peter Xu wrote:
> > -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
> > +/*
> > + * If `uaddr' is specified, `*uaddr' will be returned with the
> > + * userspace address that was just allocated.  `uaddr' is only
> > + * meaningful if the function returns zero, and `uaddr' will only be
> > + * valid when with either the slots_lock or with the SRCU read lock
> > + * held.  After we release the lock, the returned `uaddr' will be invalid.
> > + */
> 
> In practice the address is still protected by the refcount, isn't it?
> Only destroying the VM could invalidate it.

Yes I think so.  I wanted to make it clear that uaddr is temporary,
however "will be invalid" could be too strong...  Thanks,
Sean Christopherson Jan. 21, 2020, 3:56 p.m. UTC | #3
On Thu, Jan 09, 2020 at 09:57:17AM -0500, Peter Xu wrote:
> Originally, we have three code paths that can dirty a page without
> vcpu context for X86:
> 
>   - init_rmode_identity_map
>   - init_rmode_tss
>   - kvmgt_rw_gpa
> 
> init_rmode_identity_map and init_rmode_tss will be set up on the
> destination VM no matter what (and the guest cannot even see them), so
> it does not make sense to track them at all.
> 
> To do this, allow __x86_set_memory_region() to return the userspace
> address that was just allocated to the caller.  Then in both of the
> functions we directly write to the userspace address instead of
> calling kvm_write_*() APIs.  We need to make sure that we have the
> slots_lock held when accessing the userspace address.
> 
> Another trivial change is that we don't need to explicitly clear the
> identity page table root in init_rmode_identity_map() because no
> matter what we'll write to the whole page with 4M huge page entries.
> 
> Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  3 +-
>  arch/x86/kvm/svm.c              |  3 +-
>  arch/x86/kvm/vmx/vmx.c          | 68 ++++++++++++++++-----------------
>  arch/x86/kvm/x86.c              | 18 +++++++--
>  4 files changed, 51 insertions(+), 41 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index eb6673c7d2e3..f536d139b3d2 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1618,7 +1618,8 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
>  
>  int kvm_is_in_guest(void);
>  
> -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
> +int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size,
> +			    unsigned long *uaddr);

No need for a new param, just return a "void __user *" (or "void *" if the
__user part requires lots of casting) and use ERR_PTR() to encode errors in
the return value.  I.e. return the userspace address.

The refactoring to return the address should be done in a separate patch as
prep work for the move to __copy_to_user().
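
E.g. the prototype and a caller would then look roughly like this
(untested sketch, reusing the existing names):

void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
				     u32 size);

	/* e.g. in vmx_set_tss_addr() */
	hva = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
				      PAGE_SIZE * 3);
	if (IS_ERR(hva))
		return PTR_ERR(hva);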

>  bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
>  bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
>  
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 8f1b715dfde8..03a344ce7b66 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -1698,7 +1698,8 @@ static int avic_init_access_page(struct kvm_vcpu *vcpu)
>  	ret = __x86_set_memory_region(kvm,
>  				      APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
>  				      APIC_DEFAULT_PHYS_BASE,
> -				      PAGE_SIZE);
> +				      PAGE_SIZE,
> +				      NULL);
>  	if (ret)
>  		goto out;
>  
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 7e3d370209e0..62175a246bcc 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -3441,34 +3441,28 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
>  	return true;
>  }
>  
> -static int init_rmode_tss(struct kvm *kvm)
> +static int init_rmode_tss(struct kvm *kvm, unsigned long *uaddr)

uaddr is not a pointer to an unsigned long, it's a pointer to a TSS.  Given
that it's dereferenced as a "void __user *", it's probably best passed as
exactly that.

This code also needs to be tested by doing unrestricted_guest=0 when
loading kvm_intel, because it's obviously broken.  __x86_set_memory_region()
takes an "unsigned long *", interpreted as a "pointer to a userspace
address", i.e. a "void __user **".  But the callers are treating the param
as an "unsigned long in userspace", e.g. init_rmode_identity_map() declares
uaddr as an "unsigned long *", when really it should be declaring a
straight "unsigned long" and passing "&uaddr".  The only thing that saves
KVM from dereferencing a bad pointer in __x86_set_memory_region() is that
uaddr is initialized to NULL.
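
Condensed, the mismatch is (hypothetical snippet, not the actual diff):

	/* as posted: uaddr stays NULL, so the hva is never returned */
	unsigned long *uaddr = NULL;
	r = __x86_set_memory_region(kvm, id, gpa, size, uaddr);

	/* what the "unsigned long *" parameter actually expects */
	unsigned long uaddr;
	r = __x86_set_memory_region(kvm, id, gpa, size, &uaddr);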

>  {
> -	gfn_t fn;
> +	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
>  	u16 data = 0;
>  	int idx, r;
>  
> -	idx = srcu_read_lock(&kvm->srcu);
> -	fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
> -	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
> -	if (r < 0)
> -		goto out;
> +	for (idx = 0; idx < 3; idx++) {
> +		r = __copy_to_user((void __user *)uaddr + PAGE_SIZE * idx,
> +				   zero_page, PAGE_SIZE);
> +		if (r)
> +			return -EFAULT;
> +	}
> +
>  	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
> -	r = kvm_write_guest_page(kvm, fn++, &data,
> -			TSS_IOPB_BASE_OFFSET, sizeof(u16));
> -	if (r < 0)
> -		goto out;
> -	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
> -	if (r < 0)
> -		goto out;
> -	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
> -	if (r < 0)
> -		goto out;
> +	r = __copy_to_user((void __user *)uaddr + TSS_IOPB_BASE_OFFSET,
> +			   &data, sizeof(data));
> +	if (r)
> +		return -EFAULT;
> +
>  	data = ~0;
> -	r = kvm_write_guest_page(kvm, fn, &data,
> -				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
> -				 sizeof(u8));
> -out:
> -	srcu_read_unlock(&kvm->srcu, idx);
> +	r = __copy_to_user((void __user *)uaddr - 1, &data, sizeof(data));
> +
>  	return r;

Why not "return __copy_to_user();"?

>  }
>  
> @@ -3478,6 +3472,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
>  	int i, r = 0;
>  	kvm_pfn_t identity_map_pfn;
>  	u32 tmp;
> +	unsigned long *uaddr = NULL;

Again, not a pointer to an unsigned long.

>  	/* Protect kvm_vmx->ept_identity_pagetable_done. */
>  	mutex_lock(&kvm->slots_lock);
> @@ -3490,21 +3485,21 @@ static int init_rmode_identity_map(struct kvm *kvm)
>  	identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
>  
>  	r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
> -				    kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
> +				    kvm_vmx->ept_identity_map_addr, PAGE_SIZE,
> +				    uaddr);
>  	if (r < 0)
>  		goto out;
>  
> -	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
> -	if (r < 0)
> -		goto out;
>  	/* Set up identity-mapping pagetable for EPT in real mode */
>  	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
>  		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
>  			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
> -		r = kvm_write_guest_page(kvm, identity_map_pfn,
> -				&tmp, i * sizeof(tmp), sizeof(tmp));
> -		if (r < 0)
> +		r = __copy_to_user((void __user *)uaddr + i * sizeof(tmp),
> +				   &tmp, sizeof(tmp));
> +		if (r) {
> +			r = -EFAULT;
>  			goto out;
> +		}
>  	}
>  	kvm_vmx->ept_identity_pagetable_done = true;
>  
> @@ -3537,7 +3532,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
>  	if (kvm->arch.apic_access_page_done)
>  		goto out;
>  	r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
> -				    APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
> +				    APIC_DEFAULT_PHYS_BASE, PAGE_SIZE, NULL);
>  	if (r)
>  		goto out;
>  
> @@ -4478,19 +4473,22 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
>  static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
>  {
>  	int ret;
> +	unsigned long *uaddr = NULL;
>  
>  	if (enable_unrestricted_guest)
>  		return 0;
>  
>  	mutex_lock(&kvm->slots_lock);
>  	ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
> -				      PAGE_SIZE * 3);
> -	mutex_unlock(&kvm->slots_lock);
> -
> +				      PAGE_SIZE * 3, uaddr);
>  	if (ret)
> -		return ret;
> +		goto out;
> +
>  	to_kvm_vmx(kvm)->tss_addr = addr;
> -	return init_rmode_tss(kvm);
> +	ret = init_rmode_tss(kvm, uaddr);
> +out:
> +	mutex_unlock(&kvm->slots_lock);

Unnecessary, see below.

> +	return ret;
>  }
>  
>  static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index c4d3972dcd14..ff97782b3919 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9584,7 +9584,15 @@ void kvm_arch_sync_events(struct kvm *kvm)
>  	kvm_free_pit(kvm);
>  }
>  
> -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
> +/*
> + * If `uaddr' is specified, `*uaddr' will be returned with the
> + * userspace address that was just allocated.  `uaddr' is only
> + * meaningful if the function returns zero, and `uaddr' will only be
> + * valid when with either the slots_lock or with the SRCU read lock
> + * held.  After we release the lock, the returned `uaddr' will be invalid.

This is all incorrect.  Neither of those locks has any bearing on the
validity of the hva.  slots_lock does as the name suggests and prevents
concurrent writes to the memslots.  The SRCU lock ensures the implicit
memslots lookup in kvm_clear_guest_page() won't result in a use-after-free
due to dereferencing old memslots.

Neither of those has anything to do with the userspace address, they're
both fully tied to KVM's gfn->hva lookup.  As Paolo pointed out, KVM's
mapping is instead tied to the lifecycle of the VM.  Note, even *that* has
no bearing on the validity of the mapping or address as KVM only increments
mm_count, not mm_users, i.e. guarantees the mm struct itself won't be freed
but doesn't ensure the vmas or associated page tables are valid.

Which is the entire point of using __copy_{to,from}_user(), as they
gracefully handle the scenario where the process has no valid mapping
and/or translation for the address.
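
I.e. the callers only need the usual pattern (sketch):

	if (__copy_to_user(hva, &data, sizeof(data)))
		return -EFAULT;	/* e.g. userspace munmap()'d the range */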

> + */
> +int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size,
> +			    unsigned long *uaddr)
>  {
>  	int i, r;
>  	unsigned long hva;

Note, hva is a straight "unsigned long".

> @@ -9608,6 +9616,8 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
>  			      MAP_SHARED | MAP_ANONYMOUS, 0);
>  		if (IS_ERR((void *)hva))
>  			return PTR_ERR((void *)hva);
> +		if (uaddr)
> +			*uaddr = hva;
>  	} else {
>  		if (!slot->npages)
>  			return 0;

@uaddr should be set to zero here.  Actually returning the address as a void *
will force this case to be handled correctly.
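
With the posted signature that would be roughly (sketch):

	} else {
		if (uaddr)
			*uaddr = 0;	/* deleting the slot, no hva to hand back */
		if (!slot->npages)
			return 0;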

> @@ -9651,10 +9661,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
>  		 */
>  		mutex_lock(&kvm->slots_lock);
>  		__x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
> -					0, 0);
> +					0, 0, NULL);
>  		__x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
> -					0, 0);
> -		__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
> +					0, 0, NULL);
> +		__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0, NULL);
>  		mutex_unlock(&kvm->slots_lock);
>  	}
>  	if (kvm_x86_ops->vm_destroy)
> -- 
> 2.24.1
>
Paolo Bonzini Jan. 21, 2020, 4:14 p.m. UTC | #4
On 21/01/20 16:56, Sean Christopherson wrote:
> This code also needs to be tested by doing unrestricted_guest=0 when
> loading kvm_intel, because it's obviously broken.

... as I had just found out after starting tests on kvm/queue.  Unqueued
this patch.

Paolo

> __x86_set_memory_region()
> takes an "unsigned long *", interpreted as a "pointer to a userspace
> address", i.e. a "void __user **".  But the callers are treating the param
> as an "unsigned long in userspace", e.g. init_rmode_identity_map() declares
> uaddr as an "unsigned long *", when really it should be declaring a
> straight "unsigned long" and passing "&uaddr".  The only thing that saves
> KVM from dereferencing a bad pointer in __x86_set_memory_region() is that
> uaddr is initialized to NULL
Peter Xu Jan. 28, 2020, 5:50 a.m. UTC | #5
On Tue, Jan 21, 2020 at 07:56:57AM -0800, Sean Christopherson wrote:
> On Thu, Jan 09, 2020 at 09:57:17AM -0500, Peter Xu wrote:
> > Originally, we have three code paths that can dirty a page without
> > vcpu context for X86:
> > 
> >   - init_rmode_identity_map
> >   - init_rmode_tss
> >   - kvmgt_rw_gpa
> > 
> > init_rmode_identity_map and init_rmode_tss will be set up on the
> > destination VM no matter what (and the guest cannot even see them), so
> > it does not make sense to track them at all.
> > 
> > To do this, allow __x86_set_memory_region() to return the userspace
> > address that was just allocated to the caller.  Then in both of the
> > functions we directly write to the userspace address instead of
> > calling kvm_write_*() APIs.  We need to make sure that we have the
> > slots_lock held when accessing the userspace address.
> > 
> > Another trivial change is that we don't need to explicitly clear the
> > identity page table root in init_rmode_identity_map() because no
> > matter what we'll write to the whole page with 4M huge page entries.
> > 
> > Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |  3 +-
> >  arch/x86/kvm/svm.c              |  3 +-
> >  arch/x86/kvm/vmx/vmx.c          | 68 ++++++++++++++++-----------------
> >  arch/x86/kvm/x86.c              | 18 +++++++--
> >  4 files changed, 51 insertions(+), 41 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index eb6673c7d2e3..f536d139b3d2 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1618,7 +1618,8 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
> >  
> >  int kvm_is_in_guest(void);
> >  
> > -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
> > +int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size,
> > +			    unsigned long *uaddr);
> 
> No need for a new param, just return a "void __user *" (or "void *" if the
> __user part requires lots of casting) and use ERR_PTR() to encode errors in
> the return value.  I.e. return the userspace address.
> 
> The refactoring to return the address should be done in a separate patch as
> prep work for the move to __copy_to_user().

Yes this sounds cleaner, will do.

> 
> >  bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
> >  bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
> >  
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > index 8f1b715dfde8..03a344ce7b66 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -1698,7 +1698,8 @@ static int avic_init_access_page(struct kvm_vcpu *vcpu)
> >  	ret = __x86_set_memory_region(kvm,
> >  				      APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
> >  				      APIC_DEFAULT_PHYS_BASE,
> > -				      PAGE_SIZE);
> > +				      PAGE_SIZE,
> > +				      NULL);
> >  	if (ret)
> >  		goto out;
> >  
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 7e3d370209e0..62175a246bcc 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -3441,34 +3441,28 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
> >  	return true;
> >  }
> >  
> > -static int init_rmode_tss(struct kvm *kvm)
> > +static int init_rmode_tss(struct kvm *kvm, unsigned long *uaddr)
> 
> uaddr is not a pointer to an unsigned long, it's a pointer to a TSS.  Given
> that it's dereferenced as a "void __user *", it's probably best passed as
> exactly that.
> 
> This code also needs to be tested by doing unrestricted_guest=0 when
> loading kvm_intel, because it's obviously broken.  __x86_set_memory_region()
> > takes an "unsigned long *", interpreted as a "pointer to a userspace
> address", i.e. a "void __user **".  But the callers are treating the param
> > as an "unsigned long in userspace", e.g. init_rmode_identity_map() declares
> uaddr as an "unsigned long *", when really it should be declaring a
> straight "unsigned long" and passing "&uaddr".  The only thing that saves
> KVM from dereferencing a bad pointer in __x86_set_memory_region() is that
> uaddr is initialized to NULL 

Yes it's broken.  Thanks very much for figuring it out.  I'll test
unrestricted_guest=N.

> 
> >  {
> > -	gfn_t fn;
> > +	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
> >  	u16 data = 0;
> >  	int idx, r;
> >  
> > -	idx = srcu_read_lock(&kvm->srcu);
> > -	fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
> > -	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
> > -	if (r < 0)
> > -		goto out;
> > +	for (idx = 0; idx < 3; idx++) {
> > +		r = __copy_to_user((void __user *)uaddr + PAGE_SIZE * idx,
> > +				   zero_page, PAGE_SIZE);
> > +		if (r)
> > +			return -EFAULT;
> > +	}
> > +
> >  	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
> > -	r = kvm_write_guest_page(kvm, fn++, &data,
> > -			TSS_IOPB_BASE_OFFSET, sizeof(u16));
> > -	if (r < 0)
> > -		goto out;
> > -	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
> > -	if (r < 0)
> > -		goto out;
> > -	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
> > -	if (r < 0)
> > -		goto out;
> > +	r = __copy_to_user((void __user *)uaddr + TSS_IOPB_BASE_OFFSET,
> > +			   &data, sizeof(data));
> > +	if (r)
> > +		return -EFAULT;
> > +
> >  	data = ~0;
> > -	r = kvm_write_guest_page(kvm, fn, &data,
> > -				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
> > -				 sizeof(u8));
> > -out:
> > -	srcu_read_unlock(&kvm->srcu, idx);
> > +	r = __copy_to_user((void __user *)uaddr - 1, &data, sizeof(data));
> > +
> >  	return r;
> 
> Why not "return __copy_to_user();"?

Sure.

> 
> >  }
> >  
> > @@ -3478,6 +3472,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
> >  	int i, r = 0;
> >  	kvm_pfn_t identity_map_pfn;
> >  	u32 tmp;
> > +	unsigned long *uaddr = NULL;
> 
> Again, not a pointer to an unsigned long.
> 
> >  	/* Protect kvm_vmx->ept_identity_pagetable_done. */
> >  	mutex_lock(&kvm->slots_lock);
> > @@ -3490,21 +3485,21 @@ static int init_rmode_identity_map(struct kvm *kvm)
> >  	identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
> >  
> >  	r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
> > -				    kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
> > +				    kvm_vmx->ept_identity_map_addr, PAGE_SIZE,
> > +				    uaddr);
> >  	if (r < 0)
> >  		goto out;
> >  
> > -	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
> > -	if (r < 0)
> > -		goto out;
> >  	/* Set up identity-mapping pagetable for EPT in real mode */
> >  	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
> >  		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
> >  			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
> > -		r = kvm_write_guest_page(kvm, identity_map_pfn,
> > -				&tmp, i * sizeof(tmp), sizeof(tmp));
> > -		if (r < 0)
> > +		r = __copy_to_user((void __user *)uaddr + i * sizeof(tmp),
> > +				   &tmp, sizeof(tmp));
> > +		if (r) {
> > +			r = -EFAULT;
> >  			goto out;
> > +		}
> >  	}
> >  	kvm_vmx->ept_identity_pagetable_done = true;
> >  
> > @@ -3537,7 +3532,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
> >  	if (kvm->arch.apic_access_page_done)
> >  		goto out;
> >  	r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
> > -				    APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
> > +				    APIC_DEFAULT_PHYS_BASE, PAGE_SIZE, NULL);
> >  	if (r)
> >  		goto out;
> >  
> > @@ -4478,19 +4473,22 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
> >  static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
> >  {
> >  	int ret;
> > +	unsigned long *uaddr = NULL;
> >  
> >  	if (enable_unrestricted_guest)
> >  		return 0;
> >  
> >  	mutex_lock(&kvm->slots_lock);
> >  	ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
> > -				      PAGE_SIZE * 3);
> > -	mutex_unlock(&kvm->slots_lock);
> > -
> > +				      PAGE_SIZE * 3, uaddr);
> >  	if (ret)
> > -		return ret;
> > +		goto out;
> > +
> >  	to_kvm_vmx(kvm)->tss_addr = addr;
> > -	return init_rmode_tss(kvm);
> > +	ret = init_rmode_tss(kvm, uaddr);
> > +out:
> > +	mutex_unlock(&kvm->slots_lock);
> 
> Unnecessary, see below.

Do you mean that we don't even need the lock?

I feel like this could at least fail lockdep.  More below.

[1]

> 
> > +	return ret;
> >  }
> >  
> >  static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index c4d3972dcd14..ff97782b3919 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -9584,7 +9584,15 @@ void kvm_arch_sync_events(struct kvm *kvm)
> >  	kvm_free_pit(kvm);
> >  }
> >  
> > -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
> > +/*
> > + * If `uaddr' is specified, `*uaddr' will be returned with the
> > + * userspace address that was just allocated.  `uaddr' is only
> > + * meaningful if the function returns zero, and `uaddr' will only be
> > + * valid when with either the slots_lock or with the SRCU read lock
> > + * held.  After we release the lock, the returned `uaddr' will be invalid.
> 
> This is all incorrect.  Neither of those locks has any bearing on the
> validity of the hva.  slots_lock does as the name suggests and prevents
> concurrent writes to the memslots.  The SRCU lock ensures the implicit
> memslots lookup in kvm_clear_guest_page() won't result in a use-after-free
> due to dereferencing old memslots.
> 
> Neither of those has anything to do with the userspace address, they're
> both fully tied to KVM's gfn->hva lookup.  As Paolo pointed out, KVM's
> mapping is instead tied to the lifecycle of the VM.  Note, even *that* has
> no bearing on the validity of the mapping or address as KVM only increments
> mm_count, not mm_users, i.e. guarantees the mm struct itself won't be freed
> but doesn't ensure the vmas or associated page tables are valid.
> 
> Which is the entire point of using __copy_{to,from}_user(), as they
> gracefully handle the scenario where the process has no valid mapping
> and/or translation for the address.

Sorry I don't understand.

I do think either the slots_lock or SRCU would protect at least the
existing kvm.memslots, and if so at least the previous vm_mmap()
return value should still be valid.  I agree that __copy_to_user()
will protect us from many cases from process mm pov (which allows page
faults inside), but again if the kvm.memslots is changed underneath us
then it's another story, IMHO, and that's why we need either the lock
or SRCU.

Or are you assuming that (1) __x86_set_memory_region() is only for the
3 private kvm memslots, and (2) currently the kvm private memory slots
will never change after VM is created and before VM is destroyed?  If
so, I agree with you.  However I don't see why we need to restrict
__x86_set_memory_region() with that assumption, after all taking a
lock is not expensive in this slow path.  Even if so, we'd better
comment above __x86_set_memory_region() about this, so we know that we
should not use __x86_set_memory_region() for future kvm internal
memslots that are prone to change during VM's lifecycle (while
currently it seems to be a very general interface).

Please let me know if I misunderstood your point.

Thanks,
Sean Christopherson Jan. 28, 2020, 6:24 p.m. UTC | #6
On Tue, Jan 28, 2020 at 01:50:05PM +0800, Peter Xu wrote:
> On Tue, Jan 21, 2020 at 07:56:57AM -0800, Sean Christopherson wrote:
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index c4d3972dcd14..ff97782b3919 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -9584,7 +9584,15 @@ void kvm_arch_sync_events(struct kvm *kvm)
> > >  	kvm_free_pit(kvm);
> > >  }
> > >  
> > > -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
> > > +/*
> > > + * If `uaddr' is specified, `*uaddr' will be returned with the
> > > + * userspace address that was just allocated.  `uaddr' is only
> > > + * meaningful if the function returns zero, and `uaddr' will only be
> > > + * valid when with either the slots_lock or with the SRCU read lock
> > > + * held.  After we release the lock, the returned `uaddr' will be invalid.
> > 
> > This is all incorrect.  Neither of those locks has any bearing on the
> > validity of the hva.  slots_lock does as the name suggests and prevents
> > concurrent writes to the memslots.  The SRCU lock ensures the implicit
> > memslots lookup in kvm_clear_guest_page() won't result in a use-after-free
> > due to dereferencing old memslots.
> > 
> > Neither of those has anything to do with the userspace address, they're
> > both fully tied to KVM's gfn->hva lookup.  As Paolo pointed out, KVM's
> > mapping is instead tied to the lifecycle of the VM.  Note, even *that* has
> > no bearing on the validity of the mapping or address as KVM only increments
> > mm_count, not mm_users, i.e. guarantees the mm struct itself won't be freed
> > but doesn't ensure the vmas or associated page tables are valid.
> > 
> > Which is the entire point of using __copy_{to,from}_user(), as they
> > gracefully handle the scenario where the process has no valid mapping
> > and/or translation for the address.
> 
> Sorry I don't understand.
> 
> I do think either the slots_lock or SRCU would protect at least the
> existing kvm.memslots, and if so at least the previous vm_mmap()
> return value should still be valid.

Nope.  kvm->slots_lock only protects gfn->hva lookups, e.g. userspace can
munmap() the range at any time.

> I agree that __copy_to_user() will protect us from many cases from process
> mm pov (which allows page faults inside), but again if the kvm.memslots is
> changed underneath us then it's another story, IMHO, and that's why we need
> either the lock or SRCU.

No, again, slots_lock and SRCU only protect gfn->hva lookups.

> Or are you assuming that (1) __x86_set_memory_region() is only for the
> 3 private kvm memslots, 

It's not an assumption, the entire purpose of __x86_set_memory_region()
is to provide support for private KVM memslots.

> and (2) currently the kvm private memory slots will never change after VM
> is created and before VM is destroyed?

No, I'm not assuming the private memslots are constant, e.g. the flow in
question, vmx_set_tss_addr() is directly tied to an unprotected ioctl().

KVM's sole responsibility for vmx_set_tss_addr() is to not crash the kernel.
Userspace is responsible for ensuring it doesn't break its guests, e.g.
that multiple calls to KVM_SET_TSS_ADDR are properly serialized.

In the existing code, KVM ensures it doesn't crash by holding the SRCU lock
for the duration of init_rmode_tss() so that the gfn->hva lookups in
kvm_clear_guest_page() don't dereference a stale memslots array.  In no way
does that ensure the validity of the resulting hva, e.g. multiple calls to
KVM_SET_TSS_ADDR would race to set vmx->tss_addr and so init_rmode_tss()
could be operating on a stale gpa.

Putting the onus on KVM to ensure atomicity is pointless because concurrent
calls to KVM_SET_TSS_ADDR would still race, i.e. the end value of
vmx->tss_addr would be non-deterministic.  The integrity of the underlying
TSS would be guaranteed, but that guarantee isn't part of KVM's ABI.

> If so, I agree with you.  However I don't see why we need to restrict
> __x86_set_memory_region() with that assumption, after all taking a
> lock is not expensive in this slow path.

In what way would not holding slots_lock in vmx_set_tss_addr() restrict
__x86_set_memory_region()?  Literally every other usage of
__x86_set_memory_region() holds slots_lock for the duration of creating
the private memslot, because in those flows, KVM *is* responsible for
ensuring correct ordering.
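
E.g. the existing flows all follow roughly this pattern (sketch):

	mutex_lock(&kvm->slots_lock);
	r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
				    APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
	mutex_unlock(&kvm->slots_lock);
	if (r)
		return r;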

> Even if so, we'd better comment above __x86_set_memory_region() about this,
> so we know that we should not use __x86_set_memory_region() for future kvm
> internal memslots that are prone to change during VM's lifecycle (while
> currently it seems to be a very general interface).

There is no such restriction.  Obviously such a flow would need to ensure
correctness, but hopefully that goes without saying.
Peter Xu Jan. 31, 2020, 3:08 p.m. UTC | #7
On Tue, Jan 28, 2020 at 10:24:03AM -0800, Sean Christopherson wrote:
> On Tue, Jan 28, 2020 at 01:50:05PM +0800, Peter Xu wrote:
> > On Tue, Jan 21, 2020 at 07:56:57AM -0800, Sean Christopherson wrote:
> > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > index c4d3972dcd14..ff97782b3919 100644
> > > > --- a/arch/x86/kvm/x86.c
> > > > +++ b/arch/x86/kvm/x86.c
> > > > @@ -9584,7 +9584,15 @@ void kvm_arch_sync_events(struct kvm *kvm)
> > > >  	kvm_free_pit(kvm);
> > > >  }
> > > >  
> > > > -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
> > > > +/*
> > > > + * If `uaddr' is specified, `*uaddr' will be returned with the
> > > > + * userspace address that was just allocated.  `uaddr' is only
> > > > + * meaningful if the function returns zero, and `uaddr' will only be
> > > > + * valid when with either the slots_lock or with the SRCU read lock
> > > > + * held.  After we release the lock, the returned `uaddr' will be invalid.
> > > 
> > > This is all incorrect.  Neither of those locks has any bearing on the
> > > validity of the hva.  slots_lock does as the name suggests and prevents
> > > concurrent writes to the memslots.  The SRCU lock ensures the implicit
> > > memslots lookup in kvm_clear_guest_page() won't result in a use-after-free
> > > due to dereferencing old memslots.
> > > 
> > > Neither of those has anything to do with the userspace address, they're
> > > both fully tied to KVM's gfn->hva lookup.  As Paolo pointed out, KVM's
> > > mapping is instead tied to the lifecycle of the VM.  Note, even *that* has
> > > no bearing on the validity of the mapping or address as KVM only increments
> > > mm_count, not mm_users, i.e. guarantees the mm struct itself won't be freed
> > > but doesn't ensure the vmas or associated page tables are valid.
> > > 
> > > Which is the entire point of using __copy_{to,from}_user(), as they
> > > gracefully handle the scenario where the process has no valid mapping
> > > and/or translation for the address.
> > 
> > Sorry I don't understand.
> > 
> > I do think either the slots_lock or SRCU would protect at least the
> > existing kvm.memslots, and if so at least the previous vm_mmap()
> > return value should still be valid.
> 
> Nope.  kvm->slots_lock only protects gfn->hva lookups, e.g. userspace can
> munmap() the range at any time.

Do we need to consider that?  If the userspace did this then it'll
corrupt itself, and imho private memory slot is not anything special
here comparing to the user memory slots.  For example, the userspace
can unmap any region after KVM_SET_USER_MEMORY_REGION ioctl even if
the region is filled into some of the userspace_addr of
kvm_userspace_memory_region, so the cached userspace_addr can be
invalid, then kvm_write_guest_page() can fail too with the same
reason.  IMHO kvm only need to make sure it handles the failure path
then it's perfectly fine.

> 
> > I agree that __copy_to_user() will protect us from many cases from process
> > mm pov (which allows page faults inside), but again if the kvm.memslots is
> > changed underneath us then it's another story, IMHO, and that's why we need
> > either the lock or SRCU.
> 
> No, again, slots_lock and SRCU only protect gfn->hva lookups.

Yes, then could you further explain why do you think we don't need the
slot lock?  

> 
> > Or are you assuming that (1) __x86_set_memory_region() is only for the
> > 3 private kvm memslots, 
> 
> It's not an assumption, the entire purpose of __x86_set_memory_region()
> is to provide support for private KVM memslots.
> 
> > and (2) currently the kvm private memory slots will never change after VM
> > is created and before VM is destroyed?
> 
> No, I'm not assuming the private memslots are constant, e.g. the flow in
> question, vmx_set_tss_addr() is directly tied to an unprotected ioctl().

Why is it unprotected?  Now vmx_set_tss_addr() is protected by the slots
lock so concurrent operation is safe, also it'll return -EEXIST if
called for more than once.

[1]

> 
> KVM's sole responsibility for vmx_set_tss_addr() is to not crash the kernel.
> Userspace is responsible for ensuring it doesn't break its guests, e.g.
> that multiple calls to KVM_SET_TSS_ADDR are properly serialized.
> 
> In the existing code, KVM ensures it doesn't crash by holding the SRCU lock
> for the duration of init_rmode_tss() so that the gfn->hva lookups in
> kvm_clear_guest_page() don't dereference a stale memslots array.

Here in the current master branch we have both the RCU lock and the
slot lock held, that's why I think we can safely remove the RCU lock
as long as we're still holding the slots lock.  We can't do the
reverse because otherwise multiple KVM_SET_TSS_ADDR could race.

> In no way
> does that ensure the validity of the resulting hva,

Yes, but as I mentioned, I don't think it's an issue to be considered
by KVM, otherwise we should have the same issue all over the places
when we fetch the cached userspace_addr from any user slots.

> e.g. multiple calls to
> KVM_SET_TSS_ADDR would race to set vmx->tss_addr and so init_rmode_tss()
> could be operating on a stale gpa.

Please refer to [1].

I just want to double-confirm on what we're discussing now. Are you
sure you're suggesting that we should remove the slot lock in
init_rmode_tss()?  Asked because you discussed quite a bit on how the
slot lock should protect GPA->HVA, about concurrency and so on, then
I'm even more confused...

Thanks,
Sean Christopherson Jan. 31, 2020, 7:33 p.m. UTC | #8
On Fri, Jan 31, 2020 at 10:08:32AM -0500, Peter Xu wrote:
> On Tue, Jan 28, 2020 at 10:24:03AM -0800, Sean Christopherson wrote:
> > On Tue, Jan 28, 2020 at 01:50:05PM +0800, Peter Xu wrote:
> > > On Tue, Jan 21, 2020 at 07:56:57AM -0800, Sean Christopherson wrote:
> > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > > index c4d3972dcd14..ff97782b3919 100644
> > > > > --- a/arch/x86/kvm/x86.c
> > > > > +++ b/arch/x86/kvm/x86.c
> > > > > @@ -9584,7 +9584,15 @@ void kvm_arch_sync_events(struct kvm *kvm)
> > > > >  	kvm_free_pit(kvm);
> > > > >  }
> > > > >  
> > > > > -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
> > > > > +/*
> > > > > + * If `uaddr' is specified, `*uaddr' will be returned with the
> > > > > + * userspace address that was just allocated.  `uaddr' is only
> > > > > + * meaningful if the function returns zero, and `uaddr' will only be
> > > > > + * valid when with either the slots_lock or with the SRCU read lock
> > > > > + * held.  After we release the lock, the returned `uaddr' will be invalid.
> > > > 
> > > > This is all incorrect.  Neither of those locks has any bearing on the
> > > > validity of the hva.  slots_lock does as the name suggests and prevents
> > > > concurrent writes to the memslots.  The SRCU lock ensures the implicit
> > > > memslots lookup in kvm_clear_guest_page() won't result in a use-after-free
> > > > due to dereferencing old memslots.
> > > > 
> > > > Neither of those has anything to do with the userspace address, they're
> > > > both fully tied to KVM's gfn->hva lookup.  As Paolo pointed out, KVM's
> > > > mapping is instead tied to the lifecycle of the VM.  Note, even *that* has
> > > > no bearing on the validity of the mapping or address as KVM only increments
> > > > mm_count, not mm_users, i.e. guarantees the mm struct itself won't be freed
> > > > but doesn't ensure the vmas or associated page tables are valid.
> > > > 
> > > > Which is the entire point of using __copy_{to,from}_user(), as they
> > > > gracefully handle the scenario where the process has no valid mapping
> > > > and/or translation for the address.
> > > 
> > > Sorry I don't understand.
> > > 
> > > I do think either the slots_lock or SRCU would protect at least the
> > > existing kvm.memslots, and if so at least the previous vm_mmap()
> > > return value should still be valid.
> > 
> > Nope.  kvm->slots_lock only protects gfn->hva lookups, e.g. userspace can
> > munmap() the range at any time.
> 
> Do we need to consider that?  If the userspace did this then it'll
> corrupt itself, and imho private memory slot is not anything special
> here comparing to the user memory slots.  For example, the userspace
> can unmap any region after KVM_SET_USER_MEMORY_REGION ioctl even if
> the region is filled into some of the userspace_addr of
> kvm_userspace_memory_region, so the cached userspace_addr can be
> invalid, then kvm_write_guest_page() can fail too with the same
> reason.  IMHO kvm only need to make sure it handles the failure path
> then it's perfectly fine.

Yes?  No?  My point is that your original comment's assertion that "'uaddr'
will only be valid when with either the slots_lock or with the SRCU read
lock held." is wrong and misleading.

> > > I agree that __copy_to_user() will protect us from many cases from process
> > > mm pov (which allows page faults inside), but again if the kvm.memslots is
> > > changed underneath us then it's another story, IMHO, and that's why we need
> > > either the lock or SRCU.
> > 
> > No, again, slots_lock and SRCU only protect gfn->hva lookups.
> 
> Yes, then could you further explain why do you think we don't need the
> slot lock?  

For the same reason we don't take mmap_sem, it gains us nothing, i.e. KVM
still has to use copy_{to,from}_user().

In the proposed __x86_set_memory_region() refactor, vmx_set_tss_addr()
would be provided the hva of the memory region.  Since slots_lock and SRCU
only protect gfn->hva, why would KVM take slots_lock since it already has
the hva?

> > > Or are you assuming that (1) __x86_set_memory_region() is only for the
> > > 3 private kvm memslots, 
> > 
> > It's not an assumption, the entire purpose of __x86_set_memory_region()
> > is to provide support for private KVM memslots.
> > 
> > > and (2) currently the kvm private memory slots will never change after VM
> > > is created and before VM is destroyed?
> > 
> > No, I'm not assuming the private memslots are constant, e.g. the flow in
> > question, vmx_set_tss_addr() is directly tied to an unprotected ioctl().
> 
> Why is it unprotected?

Because it doesn't need to be protected.

> Now vmx_set_tss_addr() is protected by the slots lock so concurrent operation
> is safe, also it'll return -EEXIST if called for more than once.

Returning -EEXIST is an ABI change, e.g. userspace can currently call
KVM_SET_TSS_ADDR any number of times, it just needs to ensure proper
serialization between calls.

If you want to change the ABI, then submit a patch to do exactly that.
But don't bury an ABI change under the pretense that it's a bug fix.

> [1]
> 
> > 
> > > KVM's sole responsibility for vmx_set_tss_addr() is to not crash the kernel.
> > Userspace is responsible for ensuring it doesn't break its guests, e.g.
> > that multiple calls to KVM_SET_TSS_ADDR are properly serialized.
> > 
> > In the existing code, KVM ensures it doesn't crash by holding the SRCU lock
> > for the duration of init_rmode_tss() so that the gfn->hva lookups in
> > kvm_clear_guest_page() don't dereference a stale memslots array.
> 
> Here in the current master branch we have both the RCU lock and the
> slot lock held, that's why I think we can safely remove the RCU lock
> as long as we're still holding the slots lock.  We can't do the
> reverse because otherwise multiple KVM_SET_TSS_ADDR could race.

Your wording is all messed up.  "we have both the RCU lock and the slot
lock held" is wrong.  KVM holds slot_lock around __x86_set_memory_region(),
because changing the memslots must be mutually exclusive.  It then *drops*
slots_lock because it's done writing the memslots and grabs the SRCU lock
in order to protect the gfn->hva lookups done by init_rmode_tss().  It
*intentionally* drops slots_lock because writing init_rmode_tss() does not
need to be a mutually exclusive operation, per KVM's existing ABI.

If KVM held both slots_lock and SRCU then __x86_set_memory_region() would
deadlock on synchronize_srcu().
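
I.e. a flow like this (hypothetical sketch) would hang:

	idx = srcu_read_lock(&kvm->srcu);
	/* blocks forever in synchronize_srcu(&kvm->srcu) */
	__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, PAGE_SIZE * 3);
	srcu_read_unlock(&kvm->srcu, idx);	/* never reached */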

> > In no way
> > does that ensure the validity of the resulting hva,
> 
> Yes, but as I mentioned, I don't think it's an issue to be considered
> by KVM, otherwise we should have the same issue all over the places
> when we fetch the cached userspace_addr from any user slots.

Huh?  Of course it's an issue that needs to be considered by KVM, e.g.
kvm_{read,write}_guest_cached() aren't using __copy_{to,}from_user() for
giggles.

> > e.g. multiple calls to
> > KVM_SET_TSS_ADDR would race to set vmx->tss_addr and so init_rmode_tss()
> > could be operating on a stale gpa.
> 
> Please refer to [1].
> 
> I just want to double-confirm on what we're discussing now. Are you
> sure you're suggesting that we should remove the slot lock in
> init_rmode_tss()?  Asked because you discussed quite a bit on how the
> slot lock should protect GPA->HVA, about concurrency and so on, then
> I'm even more confused...

Yes, if init_rmode_tss() is provided the hva then it does not need to
grab srcu_read_lock(&kvm->srcu) because it can directly call
__copy_{to,from}_user() instead of bouncing through the KVM helpers that
translate a gfn to hva.

The code can look like this.  That being said, I've completely lost track
of why __x86_set_memory_region() needs to provide the hva, i.e. have no
idea if we *should* do this, or it would be better to keep the current
code, which would be slower, but less custom.

static int init_rmode_tss(void __user *hva)
{
	const void *zero_page = (const void *)__va(page_to_phys(ZERO_PAGE(0)));
	u16 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
	int r;

	/* Zero the first TSS page and set the IOPB base offset. */
	r = __copy_to_user(hva, zero_page, PAGE_SIZE);
	if (r)
		return -EFAULT;

	r = __copy_to_user(hva + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16));
	if (r)
		return -EFAULT;

	/* Zero the remaining two pages of the TSS region. */
	hva += PAGE_SIZE;
	r = __copy_to_user(hva, zero_page, PAGE_SIZE);
	if (r)
		return -EFAULT;

	hva += PAGE_SIZE;
	r = __copy_to_user(hva, zero_page, PAGE_SIZE);
	if (r)
		return -EFAULT;

	/* Mark the end of the I/O bitmap. */
	data = ~0;
	hva += RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1;
	r = __copy_to_user(hva, &data, sizeof(u16));
	if (r)
		return -EFAULT;

	return 0;
}

static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	void __user *hva;

	if (enable_unrestricted_guest)
		return 0;

	mutex_lock(&kvm->slots_lock);
	hva = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
				      PAGE_SIZE * 3);
	mutex_unlock(&kvm->slots_lock);

	if (IS_ERR(hva))
		return PTR_ERR(hva);

	to_kvm_vmx(kvm)->tss_addr = addr;
	return init_rmode_tss(hva);
}

Yes, userspace can corrupt its VM by invoking KVM_SET_TSS_ADDR multiple
times without serializing the calls, but that's already true today.
Peter Xu Jan. 31, 2020, 8:28 p.m. UTC | #9
On Fri, Jan 31, 2020 at 11:33:01AM -0800, Sean Christopherson wrote:
> On Fri, Jan 31, 2020 at 10:08:32AM -0500, Peter Xu wrote:
> > On Tue, Jan 28, 2020 at 10:24:03AM -0800, Sean Christopherson wrote:
> > > On Tue, Jan 28, 2020 at 01:50:05PM +0800, Peter Xu wrote:
> > > > On Tue, Jan 21, 2020 at 07:56:57AM -0800, Sean Christopherson wrote:
> > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > > > index c4d3972dcd14..ff97782b3919 100644
> > > > > > --- a/arch/x86/kvm/x86.c
> > > > > > +++ b/arch/x86/kvm/x86.c
> > > > > > @@ -9584,7 +9584,15 @@ void kvm_arch_sync_events(struct kvm *kvm)
> > > > > >  	kvm_free_pit(kvm);
> > > > > >  }
> > > > > >  
> > > > > > -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
> > > > > > +/*
> > > > > > + * If `uaddr' is specified, `*uaddr' will be returned with the
> > > > > > + * userspace address that was just allocated.  `uaddr' is only
> > > > > > + * meaningful if the function returns zero, and `uaddr' will only be
> > > > > > + * valid when with either the slots_lock or with the SRCU read lock
> > > > > > + * held.  After we release the lock, the returned `uaddr' will be invalid.
> > > > > 
> > > > > This is all incorrect.  Neither of those locks has any bearing on the
> > > > > validity of the hva.  slots_lock does as the name suggests and prevents
> > > > > concurrent writes to the memslots.  The SRCU lock ensures the implicit
> > > > > memslots lookup in kvm_clear_guest_page() won't result in a use-after-free
> > > > > due to dereferencing old memslots.
> > > > > 
> > > > > Neither of those has anything to do with the userspace address, they're
> > > > > both fully tied to KVM's gfn->hva lookup.  As Paolo pointed out, KVM's
> > > > > mapping is instead tied to the lifecycle of the VM.  Note, even *that* has
> > > > > no bearing on the validity of the mapping or address as KVM only increments
> > > > > mm_count, not mm_users, i.e. guarantees the mm struct itself won't be freed
> > > > > but doesn't ensure the vmas or associated page tables are valid.
> > > > > 
> > > > > Which is the entire point of using __copy_{to,from}_user(), as they
> > > > > gracefully handle the scenario where the process has no valid mapping
> > > > > and/or translation for the address.
> > > > 
> > > > Sorry I don't understand.
> > > > 
> > > > I do think either the slots_lock or SRCU would protect at least the
> > > > existing kvm.memslots, and if so at least the previous vm_mmap()
> > > > return value should still be valid.
> > > 
> > > Nope.  kvm->slots_lock only protects gfn->hva lookups, e.g. userspace can
> > > munmap() the range at any time.
> > 
> > Do we need to consider that?  If the userspace did this then it'll
> > corrupt itself, and imho private memory slot is not anything special
> > here comparing to the user memory slots.  For example, the userspace
> > can unmap any region after KVM_SET_USER_MEMORY_REGION ioctl even if
> > the region is filled into some of the userspace_addr of
> > kvm_userspace_memory_region, so the cached userspace_addr can be
> > invalid, then kvm_write_guest_page() can fail too with the same
> > reason.  IMHO kvm only need to make sure it handles the failure path
> > then it's perfectly fine.
> 
> Yes?  No?  My point is that your original comment's assertion that "'uaddr'
> will only be valid when with either the slots_lock or with the SRCU read
> lock held." is wrong and misleading.

Yes I'll fix that.

> 
> > > > I agree that __copy_to_user() will protect us from many cases from process
> > > > mm pov (which allows page faults inside), but again if the kvm.memslots is
> > > > changed underneath us then it's another story, IMHO, and that's why we need
> > > > either the lock or SRCU.
> > > 
> > > No, again, slots_lock and SRCU only protect gfn->hva lookups.
> > 
> > Yes, then could you further explain why do you think we don't need the
> > slot lock?  
> 
> For the same reason we don't take mmap_sem, it gains us nothing, i.e. KVM
> still has to use copy_{to,from}_user().
> 
> In the proposed __x86_set_memory_region() refactor, vmx_set_tss_addr()
> would be provided the hva of the memory region.  Since slots_lock and SRCU
> only protect gfn->hva, why would KVM take slots_lock since it already has
> the hva?

OK so you're suggesting to unlock the lock earlier to not cover
init_rmode_tss() rather than dropping the whole lock...  Yes it looks
good to me.  I think that's the major confusion I got.

> 
> > > > Or are you assuming that (1) __x86_set_memory_region() is only for the
> > > > 3 private kvm memslots, 
> > > 
> > > It's not an assumption, the entire purpose of __x86_set_memory_region()
> > > is to provide support for private KVM memslots.
> > > 
> > > > and (2) currently the kvm private memory slots will never change after VM
> > > > is created and before VM is destroyed?
> > > 
> > > No, I'm not assuming the private memslots are constant, e.g. the flow in
> > > question, vmx_set_tss_addr() is directly tied to an unprotected ioctl().
> > 
> > Why is it unprotected?
> 
> Because it doesn't need to be protected.
> 
> > Now vmx_set_tss_addr() is protected by the slots lock so concurrent operation
> > is safe, also it'll return -EEXIST if called for more than once.
> 
> Returning -EEXIST is an ABI change, e.g. userspace can currently call
> KVM_SET_TSS_ADDR any number of times, it just needs to ensure proper
> serialization between calls.
> 
> If you want to change the ABI, then submit a patch to do exactly that.
> But don't bury an ABI change under the pretense that it's a bug fix.

Could you explain what do you mean by "ABI change"?

I was talking about the original code, not after applying the
patchset.  To be explicit, I mean [a] below:

int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size,
			    unsigned long *uaddr)
{
	int i, r;
	unsigned long hva;
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *slot, old;

	/* Called with kvm->slots_lock held.  */
	if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
		return -EINVAL;

	slot = id_to_memslot(slots, id);
	if (size) {
		if (slot->npages)
			return -EEXIST;  <------------------------ [a]
        }
        ...
}

> 
> > [1]
> > 
> > > 
> > > KVM's sole responsibility for vmx_set_tss_addr() is to not crash the kernel.
> > > Userspace is responsible for ensuring it doesn't break its guests, e.g.
> > > that multiple calls to KVM_SET_TSS_ADDR are properly serialized.
> > > 
> > > In the existing code, KVM ensures it doesn't crash by holding the SRCU lock
> > > for the duration of init_rmode_tss() so that the gfn->hva lookups in
> > > kvm_clear_guest_page() don't dereference a stale memslots array.
> > 
> > Here in the current master branch we have both the RCU lock and the
> > slot lock held, that's why I think we can safely remove the RCU lock
> > as long as we're still holding the slots lock.  We can't do the
> > reverse because otherwise multiple KVM_SET_TSS_ADDR could race.
> 
> Your wording is all messed up.  "we have both the RCU lock and the slot
> lock held" is wrong.

I did mess up with 2a5755bb21ee2.  We didn't take both locks here,
sorry.

> KVM holds slot_lock around __x86_set_memory_region(),
> because changing the memslots must be mutually exclusive.  It then *drops*
> slots_lock because it's done writing the memslots and grabs the SRCU lock
> in order to protect the gfn->hva lookups done by init_rmode_tss().  It
> *intentionally* drops slots_lock because writing init_rmode_tss() does not
> need to be a mutually exclusive operation, per KVM's existing ABI.
> 
> If KVM held both slots_lock and SRCU then __x86_set_memory_region() would
> deadlock on synchronize_srcu().
> 
> > > In no way
> > > does that ensure the validity of the resulting hva,
> > 
> > Yes, but as I mentioned, I don't think it's an issue to be considered
> > by KVM, otherwise we should have the same issue all over the places
> > when we fetch the cached userspace_addr from any user slots.
> 
> Huh?  Of course it's an issue that needs to be considered by KVM, e.g.
> kvm_{read,write}_guest_cached() aren't using __copy_{to,}from_user() for
> giggles.

The cache is for the GPA->HVA translation (struct gfn_to_hva_cache),
we still use __copy_{to,}from_user() upon the HVAs, no?

> 
> > > e.g. multiple calls to
> > > KVM_SET_TSS_ADDR would race to set vmx->tss_addr and so init_rmode_tss()
> > > could be operating on a stale gpa.
> > 
> > Please refer to [1].
> > 
> > I just want to double-confirm on what we're discussing now. Are you
> > sure you're suggesting that we should remove the slot lock in
> > init_rmode_tss()?  Asked because you discussed quite a bit on how the
> > slot lock should protect GPA->HVA, about concurrency and so on, then
> > I'm even more confused...
> 
> Yes, if init_rmode_tss() is provided the hva then it does not need to
> grab srcu_read_lock(&kvm->srcu) because it can directly call
> __copy_{to,from}_user() instead of bouncing through the KVM helpers that
> translate a gfn to hva.
> 
> The code can look like this.  That being said, I've completely lost track
> of why __x86_set_memory_region() needs to provide the hva, i.e. have no
> idea if we *should* do this, or it would be better to keep the current
> code, which would be slower, but less custom.
> 
> static int init_rmode_tss(void __user *hva)
> {
> 	const void *zero_page = (const void *)__va(page_to_phys(ZERO_PAGE(0)));
> 	u16 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
> 	int r;
> 
> 	r = __copy_to_user(hva, zero_page, PAGE_SIZE);
> 	if (r)
> 		return -EFAULT;
> 
> 	r = __copy_to_user(hva + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16));
> 	if (r)
> 		return -EFAULT;
> 
> 	hva += PAGE_SIZE;
> 	r = __copy_to_user(hva, zero_page, PAGE_SIZE);
> 	if (r)
> 		return -EFAULT;
> 
> 	hva += PAGE_SIZE;
> 	r = __copy_to_user(hva, zero_page, PAGE_SIZE);
> 	if (r)
> 		return -EFAULT;
> 
> 	data = ~0;
> 	hva += RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1;
> 	r = __copy_to_user(hva, &data, sizeof(u16));
> 	if (r)
> 		return -EFAULT;
> 
> 	return 0;
> }
> 
> static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
> {
> 	void __user *hva;
> 
> 	if (enable_unrestricted_guest)
> 		return 0;
> 
> 	mutex_lock(&kvm->slots_lock);
> 	hva = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
> 				      PAGE_SIZE * 3);
> 	mutex_unlock(&kvm->slots_lock);
> 
> 	if (IS_ERR(hva))
> 		return PTR_ERR(hva);
> 
> 	to_kvm_vmx(kvm)->tss_addr = addr;
> 	return init_rmode_tss(hva);
> }
> 
> Yes, userspace can corrupt its VM by invoking KVM_SET_TSS_ADDR multiple
> times without serializing the calls, but that's already true today.

But I still don't see why we have any problem here.  Only the first
thread will get the slots_lock here and succeed in this ioctl.  The rest
of the threads will fail with -EEXIST, no?
Sean Christopherson Jan. 31, 2020, 8:36 p.m. UTC | #10
On Fri, Jan 31, 2020 at 03:28:24PM -0500, Peter Xu wrote:
> On Fri, Jan 31, 2020 at 11:33:01AM -0800, Sean Christopherson wrote:
> > For the same reason we don't take mmap_sem, it gains us nothing, i.e. KVM
> > still has to use copy_{to,from}_user().
> > 
> > In the proposed __x86_set_memory_region() refactor, vmx_set_tss_addr()
> > would be provided the hva of the memory region.  Since slots_lock and SRCU
> > only protect gfn->hva, why would KVM take slots_lock since it already has
> > the hva?
> 
> OK so you're suggesting to unlock the lock earlier to not cover
> init_rmode_tss() rather than dropping the whole lock...  Yes it looks
> good to me.  I think that's the major confusion I got.

Ya.  And I missed where the -EEXIST was coming from.  I think we're on the
same page.

> > Returning -EEXIST is an ABI change, e.g. userspace can currently call
> > KVM_SET_TSS_ADDR any number of times, it just needs to ensure proper
> > serialization between calls.
> > 
> > If you want to change the ABI, then submit a patch to do exactly that.
> > But don't bury an ABI change under the pretense that it's a bug fix.
> 
> Could you explain what do you mean by "ABI change"?
> 
> I was talking about the original code, not after applying the
> patchset.  To be explicit, I mean [a] below:
> 
> int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size,
> 			    unsigned long *uaddr)
> {
> 	int i, r;
> 	unsigned long hva;
> 	struct kvm_memslots *slots = kvm_memslots(kvm);
> 	struct kvm_memory_slot *slot, old;
> 
> 	/* Called with kvm->slots_lock held.  */
> 	if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
> 		return -EINVAL;
> 
> 	slot = id_to_memslot(slots, id);
> 	if (size) {
> 		if (slot->npages)
> 			return -EEXIST;  <------------------------ [a]
>         }
>         ...
> }

Doh, I completely forgot that the second __x86_set_memory_region() would
fail.  Sorry :-(

> > > Yes, but as I mentioned, I don't think it's an issue to be considered
> > > by KVM, otherwise we should have the same issue all over the places
> > > when we fetch the cached userspace_addr from any user slots.
> > 
> > Huh?  Of course it's an issue that needs to be considered by KVM, e.g.
> > kvm_{read,write}_guest_cached() aren't using __copy_{to,}from_user() for
> > giggles.
> 
> The cache is for the GPA->HVA translation (struct gfn_to_hva_cache),
> we still use __copy_{to,}from_user() upon the HVAs, no?

I'm still lost on this one.  I'm pretty sure I'm incorrectly interpreting:
  
  I don't think it's an issue to be considered by KVM, otherwise we should
  have the same issue all over the places when we fetch the cached
  userspace_addr from any user slots.

What is the issue to which you are referring?
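
For context on the gfn_to_hva_cache point being debated: the cache
stores the gpa->hva translation (plus the memslot generation it was
computed under), and the eventual access is still a plain
__copy_{to,}from_user() on that hva.  A stripped-down model -- the
struct and helper names below are made up for illustration, not the
real kvm_write_guest_cached() internals:

struct hva_cache {
	gpa_t gpa;		/* guest physical address being cached */
	unsigned long hva;	/* cached userspace address for that gpa */
	u64 generation;		/* memslot generation the hva was computed under */
};

/* 'slots_generation' is whatever generation the caller is operating under. */
static int cached_write(struct hva_cache *c, u64 slots_generation,
			const void *data, unsigned long len)
{
	/* The cache only short-circuits the gpa->hva translation... */
	if (c->generation != slots_generation)
		return -EAGAIN;		/* stale; caller must re-init the cache */

	/* ...the access itself is still a plain uaccess on the hva. */
	if (__copy_to_user((void __user *)c->hva, data, len))
		return -EFAULT;

	return 0;
}

In other words, revalidation is about the translation, not about the
uaccess itself.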
Peter Xu Jan. 31, 2020, 8:55 p.m. UTC | #11
On Fri, Jan 31, 2020 at 12:36:22PM -0800, Sean Christopherson wrote:
> On Fri, Jan 31, 2020 at 03:28:24PM -0500, Peter Xu wrote:
> > On Fri, Jan 31, 2020 at 11:33:01AM -0800, Sean Christopherson wrote:
> > > For the same reason we don't take mmap_sem, it gains us nothing, i.e. KVM
> > > still has to use copy_{to,from}_user().
> > > 
> > > In the proposed __x86_set_memory_region() refactor, vmx_set_tss_addr()
> > > would be provided the hva of the memory region.  Since slots_lock and SRCU
> > > only protect gfn->hva, why would KVM take slots_lock since it already has
> > > the hva?
> > 
> > OK so you're suggesting to unlock the lock earlier to not cover
> > init_rmode_tss() rather than dropping the whole lock...  Yes it looks
> > good to me.  I think that's the major confusion I got.
> 
> Ya.  And I missed where the -EEXIST was coming from.  I think we're on the
> same page.

Good to know.  Btw, I would still prefer to keep the unlock after the
__copy_to_user()s, because "HVA is valid without lock" is only true
for these private memslots.  After all, this is a super slow path, so
I wouldn't mind holding the lock a bit longer.  Otherwise, if you
really prefer the unlock() to come earlier, I can add a comment above
the unlock:

  /*
   * We can unlock before using the HVA only because this KVM private
   * memory slot will never change until the end of VM lifecycle.
   */

> 
> > > Returning -EEXIST is an ABI change, e.g. userspace can currently call
> > > KVM_SET_TSS_ADDR any number of times, it just needs to ensure proper
> > > serialization between calls.
> > > 
> > > If you want to change the ABI, then submit a patch to do exactly that.
> > > But don't bury an ABI change under the pretense that it's a bug fix.
> > 
> > Could you explain what do you mean by "ABI change"?
> > 
> > I was talking about the original code, not after applying the
> > patchset.  To be explicit, I mean [a] below:
> > 
> > int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size,
> > 			    unsigned long *uaddr)
> > {
> > 	int i, r;
> > 	unsigned long hva;
> > 	struct kvm_memslots *slots = kvm_memslots(kvm);
> > 	struct kvm_memory_slot *slot, old;
> > 
> > 	/* Called with kvm->slots_lock held.  */
> > 	if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
> > 		return -EINVAL;
> > 
> > 	slot = id_to_memslot(slots, id);
> > 	if (size) {
> > 		if (slot->npages)
> > 			return -EEXIST;  <------------------------ [a]
> >         }
> >         ...
> > }
> 
> Doh, I completely forgot that the second __x86_set_memory_region() would
> fail.  Sorry :-(
> 
> > > > Yes, but as I mentioned, I don't think it's an issue to be considered
> > > > by KVM, otherwise we should have the same issue all over the places
> > > > when we fetch the cached userspace_addr from any user slots.
> > > 
> > > Huh?  Of course it's an issue that needs to be considered by KVM, e.g.
> > > kvm_{read,write}_guest_cached() aren't using __copy_{to,}from_user() for
> > > giggles.
> > 
> > The cache is for the GPA->HVA translation (struct gfn_to_hva_cache),
> > we still use __copy_{to,}from_user() upon the HVAs, no?
> 
> I'm still lost on this one.  I'm pretty sure I'm incorrectly interpreting:
>   
>   I don't think it's an issue to be considered by KVM, otherwise we should
>   have the same issue all over the places when we fetch the cached
>   userspace_addr from any user slots.
> 
> What is the issue to which you are referring?

The issue I was referring to is "the HVA can be unmapped by userspace
without KVM noticing".  I think we're actually on the same page here
too; my follow-up is really just a question about your statement above
that "kvm_{read,write}_guest_cached() aren't using
__copy_{to,}from_user()", because that goes against my understanding.
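
A minimal sketch of why that is tolerable: if userspace munmap()s the
region behind KVM's back, the uaccess simply faults and the caller
returns -EFAULT instead of corrupting anything (the helper name below
is made up for illustration):

static int fill_private_slot(void __user *hva, const void *buf,
			     unsigned long len)
{
	if (__copy_to_user(hva, buf, len))
		return -EFAULT;		/* hva unmapped or not writable */
	return 0;
}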
Sean Christopherson Jan. 31, 2020, 9:29 p.m. UTC | #12
On Fri, Jan 31, 2020 at 03:55:50PM -0500, Peter Xu wrote:
> On Fri, Jan 31, 2020 at 12:36:22PM -0800, Sean Christopherson wrote:
> > On Fri, Jan 31, 2020 at 03:28:24PM -0500, Peter Xu wrote:
> > > On Fri, Jan 31, 2020 at 11:33:01AM -0800, Sean Christopherson wrote:
> > > > For the same reason we don't take mmap_sem, it gains us nothing, i.e. KVM
> > > > still has to use copy_{to,from}_user().
> > > > 
> > > > In the proposed __x86_set_memory_region() refactor, vmx_set_tss_addr()
> > > > would be provided the hva of the memory region.  Since slots_lock and SRCU
> > > > only protect gfn->hva, why would KVM take slots_lock since it already has
> > > > the hva?
> > > 
> > > OK so you're suggesting to unlock the lock earlier to not cover
> > > init_rmode_tss() rather than dropping the whole lock...  Yes it looks
> > > good to me.  I think that's the major confusion I got.
> > 
> > Ya.  And I missed where the -EEXIST was coming from.  I think we're on the
> > same page.
> 
> Good to know.  Btw, for me I would still prefer to keep the lock be
> after the __copy_to_user()s because "HVA is valid without lock" is
> only true for these private memslots.

No.  From KVM's perspective, the HVA is *never* valid.  Even if you rewrote
this statement to say "the gfn->hva translation is valid without lock" it
would still be incorrect. 

KVM is *always* using HVAs without holding a lock, e.g. every time it enters
the guest it is dereferencing a memslot because the translations stored in
the TLB are effectively gfn->hva->hpa.  Obviously KVM ensures that it won't
dereference a memslot that has been deleted/moved, but it's a lot more
subtle than simply holding a lock.

> After all this is super slow path so I wouldn't mind to take the lock
> for some time longer.

Holding the lock doesn't affect this super slow vmx_set_tss_addr(), it
affects everything else that wants slots_lock.  Now, admittedly it's
extremely unlikely userspace is going to do KVM_SET_USER_MEMORY_REGION in
parallel, but that's not the point and it's not why I'm objecting to
holding the lock.

Holding the lock implies protection that is *not* provided.  You and I know
it's not needed for copy_{to,from}_user(), but look how long it's taken us
to get on the same page.  A future KVM developer comes along, sees this
code, and thinks "oh, I need to hold slots_lock to dereference a gfn", and
propagates the unnecessary locking to some other code.

> Or otherwise if you really like the unlock() to
> be earlier I can comment above the unlock:
> 
>   /*
>    * We can unlock before using the HVA only because this KVM private
>    * memory slot will never change until the end of VM lifecycle.
>    */

How about:

	/*
	 * No need to hold slots_lock while filling the TSS, the TSS private
	 * memslot is guaranteed to be valid until the VM is destroyed, i.e.
	 * there is no danger of corrupting guest memory by consuming a stale
	 * gfn->hva lookup.
	 */
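
Putting the two pieces together -- the earlier refactor sketch plus
this comment -- vmx_set_tss_addr() could end up roughly as below.
This assumes the variant where __x86_set_memory_region() returns the
hva (or an ERR_PTR) instead of filling an out-parameter; a sketch, not
the final patch:

static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	void __user *hva;

	if (enable_unrestricted_guest)
		return 0;

	mutex_lock(&kvm->slots_lock);
	hva = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
				      PAGE_SIZE * 3);
	mutex_unlock(&kvm->slots_lock);

	if (IS_ERR(hva))
		return PTR_ERR(hva);

	to_kvm_vmx(kvm)->tss_addr = addr;

	/*
	 * Per the comment proposed above: no need to hold slots_lock while
	 * filling the TSS, the private memslot stays valid until the VM is
	 * destroyed.
	 */
	return init_rmode_tss(hva);
}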
Peter Xu Jan. 31, 2020, 10:16 p.m. UTC | #13
On Fri, Jan 31, 2020 at 01:29:28PM -0800, Sean Christopherson wrote:
> On Fri, Jan 31, 2020 at 03:55:50PM -0500, Peter Xu wrote:
> > On Fri, Jan 31, 2020 at 12:36:22PM -0800, Sean Christopherson wrote:
> > > On Fri, Jan 31, 2020 at 03:28:24PM -0500, Peter Xu wrote:
> > > > On Fri, Jan 31, 2020 at 11:33:01AM -0800, Sean Christopherson wrote:
> > > > > For the same reason we don't take mmap_sem, it gains us nothing, i.e. KVM
> > > > > still has to use copy_{to,from}_user().
> > > > > 
> > > > > In the proposed __x86_set_memory_region() refactor, vmx_set_tss_addr()
> > > > > would be provided the hva of the memory region.  Since slots_lock and SRCU
> > > > > only protect gfn->hva, why would KVM take slots_lock since it already has
> > > > > the hva?
> > > > 
> > > > OK so you're suggesting to unlock the lock earlier to not cover
> > > > init_rmode_tss() rather than dropping the whole lock...  Yes it looks
> > > > good to me.  I think that's the major confusion I got.
> > > 
> > > Ya.  And I missed where the -EEXIST was coming from.  I think we're on the
> > > same page.
> > 
> > Good to know.  Btw, for me I would still prefer to keep the lock be
> > after the __copy_to_user()s because "HVA is valid without lock" is
> > only true for these private memslots.
> 
> No.  From KVM's perspective, the HVA is *never* valid.  Even if you rewrote
> this statement to say "the gfn->hva translation is valid without lock" it
> would still be incorrect. 
> 
> KVM is *always* using HVAs without holding a lock, e.g. every time it enters
> the guest it is dereferencing a memslot because the translations stored in
> the TLB are effectively gfn->hva->hpa.  Obviously KVM ensures that it won't
> dereference a memslot that has been deleted/moved, but it's a lot more
> subtle than simply holding a lock.
> 
> > After all this is super slow path so I wouldn't mind to take the lock
> > for some time longer.
> 
> Holding the lock doesn't affect this super slow vmx_set_tss_addr(), it
> affects everything else that wants slots_lock.  Now, admittedly it's
> extremely unlikely userspace is going to do KVM_SET_USER_MEMORY_REGION in
> parallel, but that's not the point and it's not why I'm objecting to
> holding the lock.
> 
> Holding the lock implies protection that is *not* provided.  You and I know
> it's not needed for copy_{to,from}_user(), but look how long it's taken us
> to get on the same page.  A future KVM developer comes along, sees this
> code, and thinks "oh, I need to hold slots_lock to dereference a gfn", and
> propagates the unnecessary locking to some other code.

At least for a user memory slot, we "need to hold slots_lock to
dereference a gfn" (or srcu), right?

You know, I'm suffering from jetlag today; I thought I was still
fine, but now I'm starting to doubt it. :-)

> 
> > Or otherwise if you really like the unlock() to
> > be earlier I can comment above the unlock:
> > 
> >   /*
> >    * We can unlock before using the HVA only because this KVM private
> >    * memory slot will never change until the end of VM lifecycle.
> >    */
> 
> How about:
> 
> 	/*
> 	 * No need to hold slots_lock while filling the TSS, the TSS private
> 	 * memslot is guaranteed to be valid until the VM is destroyed, i.e.
> 	 * there is no danger of corrupting guest memory by consuming a stale
> 	 * gfn->hva lookup.
> 	 */

Sure for this.
Sean Christopherson Jan. 31, 2020, 10:20 p.m. UTC | #14
On Fri, Jan 31, 2020 at 05:16:37PM -0500, Peter Xu wrote:
> On Fri, Jan 31, 2020 at 01:29:28PM -0800, Sean Christopherson wrote:
> > On Fri, Jan 31, 2020 at 03:55:50PM -0500, Peter Xu wrote:
> > > On Fri, Jan 31, 2020 at 12:36:22PM -0800, Sean Christopherson wrote:
> > > > On Fri, Jan 31, 2020 at 03:28:24PM -0500, Peter Xu wrote:
> > > > > On Fri, Jan 31, 2020 at 11:33:01AM -0800, Sean Christopherson wrote:
> > > > > > For the same reason we don't take mmap_sem, it gains us nothing, i.e. KVM
> > > > > > still has to use copy_{to,from}_user().
> > > > > > 
> > > > > > In the proposed __x86_set_memory_region() refactor, vmx_set_tss_addr()
> > > > > > would be provided the hva of the memory region.  Since slots_lock and SRCU
> > > > > > only protect gfn->hva, why would KVM take slots_lock since it already has
> > > > > > the hva?
> > > > > 
> > > > > OK so you're suggesting to unlock the lock earlier to not cover
> > > > > init_rmode_tss() rather than dropping the whole lock...  Yes it looks
> > > > > good to me.  I think that's the major confusion I got.
> > > > 
> > > > Ya.  And I missed where the -EEXIST was coming from.  I think we're on the
> > > > same page.
> > > 
> > > Good to know.  Btw, for me I would still prefer to keep the lock be
> > > after the __copy_to_user()s because "HVA is valid without lock" is
> > > only true for these private memslots.
> > 
> > No.  From KVM's perspective, the HVA is *never* valid.  Even if you rewrote
> > this statement to say "the gfn->hva translation is valid without lock" it
> > would still be incorrect. 
> > 
> > KVM is *always* using HVAs without holding a lock, e.g. every time it enters
> > the guest it is dereferencing a memslot because the translations stored in
> > the TLB are effectively gfn->hva->hpa.  Obviously KVM ensures that it won't
> > dereference a memslot that has been deleted/moved, but it's a lot more
> > subtle than simply holding a lock.
> > 
> > > After all this is super slow path so I wouldn't mind to take the lock
> > > for some time longer.
> > 
> > Holding the lock doesn't affect this super slow vmx_set_tss_addr(), it
> > affects everything else that wants slots_lock.  Now, admittedly it's
> > extremely unlikely userspace is going to do KVM_SET_USER_MEMORY_REGION in
> > parallel, but that's not the point and it's not why I'm objecting to
> > holding the lock.
> > 
> > Holding the lock implies protection that is *not* provided.  You and I know
> > it's not needed for copy_{to,from}_user(), but look how long it's taken us
> > to get on the same page.  A future KVM developer comes along, sees this
> > code, and thinks "oh, I need to hold slots_lock to dereference a gfn", and
> > propagates the unnecessary locking to some other code.
> 
> At least for a user memory slot, we "need to hold slots_lock to
> dereference a gfn" (or srcu), right?

Gah, that was supposed to be "dereference a hva".  Yes, a gfn->hva lookup
requires slots_lock or SRCU read lock.
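
For contrast with the TSS private slot, the usual pattern for a
regular user memslot keeps both the gfn->hva lookup and the access
under the SRCU read lock, roughly like the sketch below (the helper
name is made up for illustration):

static int write_user_slot_gfn(struct kvm *kvm, gfn_t gfn,
			       const void *data, unsigned long len)
{
	unsigned long hva;
	int idx, r = -EFAULT;

	idx = srcu_read_lock(&kvm->srcu);
	hva = gfn_to_hva(kvm, gfn);	/* gfn->hva lookup needs the lock */
	if (!kvm_is_error_hva(hva) &&
	    !__copy_to_user((void __user *)hva, data, len))
		r = 0;
	srcu_read_unlock(&kvm->srcu, idx);

	return r;
}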

> You know, I'm suffering from jetlag today; I thought I was still
> fine, but now I'm starting to doubt it. :-)

Unintentional gaslighting.  Or was it?  :-D
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index eb6673c7d2e3..f536d139b3d2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1618,7 +1618,8 @@  void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
 
 int kvm_is_in_guest(void);
 
-int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
+int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size,
+			    unsigned long *uaddr);
 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
 bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8f1b715dfde8..03a344ce7b66 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1698,7 +1698,8 @@  static int avic_init_access_page(struct kvm_vcpu *vcpu)
 	ret = __x86_set_memory_region(kvm,
 				      APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
 				      APIC_DEFAULT_PHYS_BASE,
-				      PAGE_SIZE);
+				      PAGE_SIZE,
+				      NULL);
 	if (ret)
 		goto out;
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 7e3d370209e0..62175a246bcc 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3441,34 +3441,28 @@  static bool guest_state_valid(struct kvm_vcpu *vcpu)
 	return true;
 }
 
-static int init_rmode_tss(struct kvm *kvm)
+static int init_rmode_tss(struct kvm *kvm, unsigned long *uaddr)
 {
-	gfn_t fn;
+	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
 	u16 data = 0;
 	int idx, r;
 
-	idx = srcu_read_lock(&kvm->srcu);
-	fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
-	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
-	if (r < 0)
-		goto out;
+	for (idx = 0; idx < 3; idx++) {
+		r = __copy_to_user((void __user *)uaddr + PAGE_SIZE * idx,
+				   zero_page, PAGE_SIZE);
+		if (r)
+			return -EFAULT;
+	}
+
 	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
-	r = kvm_write_guest_page(kvm, fn++, &data,
-			TSS_IOPB_BASE_OFFSET, sizeof(u16));
-	if (r < 0)
-		goto out;
-	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
-	if (r < 0)
-		goto out;
-	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
-	if (r < 0)
-		goto out;
+	r = __copy_to_user((void __user *)uaddr + TSS_IOPB_BASE_OFFSET,
+			   &data, sizeof(data));
+	if (r)
+		return -EFAULT;
+
 	data = ~0;
-	r = kvm_write_guest_page(kvm, fn, &data,
-				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
-				 sizeof(u8));
-out:
-	srcu_read_unlock(&kvm->srcu, idx);
+	r = __copy_to_user((void __user *)uaddr - 1, &data, sizeof(data));
+
 	return r;
 }
 
@@ -3478,6 +3472,7 @@  static int init_rmode_identity_map(struct kvm *kvm)
 	int i, r = 0;
 	kvm_pfn_t identity_map_pfn;
 	u32 tmp;
+	unsigned long *uaddr = NULL;
 
 	/* Protect kvm_vmx->ept_identity_pagetable_done. */
 	mutex_lock(&kvm->slots_lock);
@@ -3490,21 +3485,21 @@  static int init_rmode_identity_map(struct kvm *kvm)
 	identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
 
 	r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
-				    kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
+				    kvm_vmx->ept_identity_map_addr, PAGE_SIZE,
+				    uaddr);
 	if (r < 0)
 		goto out;
 
-	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
-	if (r < 0)
-		goto out;
 	/* Set up identity-mapping pagetable for EPT in real mode */
 	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
 		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
 			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
-		r = kvm_write_guest_page(kvm, identity_map_pfn,
-				&tmp, i * sizeof(tmp), sizeof(tmp));
-		if (r < 0)
+		r = __copy_to_user((void __user *)uaddr + i * sizeof(tmp),
+				   &tmp, sizeof(tmp));
+		if (r) {
+			r = -EFAULT;
 			goto out;
+		}
 	}
 	kvm_vmx->ept_identity_pagetable_done = true;
 
@@ -3537,7 +3532,7 @@  static int alloc_apic_access_page(struct kvm *kvm)
 	if (kvm->arch.apic_access_page_done)
 		goto out;
 	r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
-				    APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
+				    APIC_DEFAULT_PHYS_BASE, PAGE_SIZE, NULL);
 	if (r)
 		goto out;
 
@@ -4478,19 +4473,22 @@  static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 {
 	int ret;
+	unsigned long *uaddr = NULL;
 
 	if (enable_unrestricted_guest)
 		return 0;
 
 	mutex_lock(&kvm->slots_lock);
 	ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
-				      PAGE_SIZE * 3);
-	mutex_unlock(&kvm->slots_lock);
-
+				      PAGE_SIZE * 3, uaddr);
 	if (ret)
-		return ret;
+		goto out;
+
 	to_kvm_vmx(kvm)->tss_addr = addr;
-	return init_rmode_tss(kvm);
+	ret = init_rmode_tss(kvm, uaddr);
+out:
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
 }
 
 static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c4d3972dcd14..ff97782b3919 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9584,7 +9584,15 @@  void kvm_arch_sync_events(struct kvm *kvm)
 	kvm_free_pit(kvm);
 }
 
-int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
+/*
+ * If `uaddr' is specified, `*uaddr' will be returned with the
+ * userspace address that was just allocated.  `uaddr' is only
+ * meaningful if the function returns zero, and `uaddr' will only be
+ * valid when with either the slots_lock or with the SRCU read lock
+ * held.  After we release the lock, the returned `uaddr' will be invalid.
+ */
+int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size,
+			    unsigned long *uaddr)
 {
 	int i, r;
 	unsigned long hva;
@@ -9608,6 +9616,8 @@  int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 			      MAP_SHARED | MAP_ANONYMOUS, 0);
 		if (IS_ERR((void *)hva))
 			return PTR_ERR((void *)hva);
+		if (uaddr)
+			*uaddr = hva;
 	} else {
 		if (!slot->npages)
 			return 0;
@@ -9651,10 +9661,10 @@  void kvm_arch_destroy_vm(struct kvm *kvm)
 		 */
 		mutex_lock(&kvm->slots_lock);
 		__x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
-					0, 0);
+					0, 0, NULL);
 		__x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
-					0, 0);
-		__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+					0, 0, NULL);
+		__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0, NULL);
 		mutex_unlock(&kvm->slots_lock);
 	}
 	if (kvm_x86_ops->vm_destroy)