
[RFC,01/24] KVM: VMX: Generalize VPID allocation to be vendor-neutral

Message ID 20250326193619.3714986-2-yosry.ahmed@linux.dev (mailing list archive)
State New
Series KVM: SVM: Rework ASID management

Commit Message

Yosry Ahmed March 26, 2025, 7:35 p.m. UTC
Generalize the VMX VPID allocation code and move it to common code
in preparation for sharing with SVM. Create a generic struct
kvm_tlb_tags, representing a factory for VPIDs (or ASIDs later), and use
one for VPIDs.

Most of the functionality remains the same, with the following
differences:
- The enable_vpid checks are moved to the callers for allocate_vpid()
  and free_vpid(), as they are specific to VMX.
- The bitmap allocation is now dynamic (which will be required for SVM),
  so it is initialized and cleaned up in vmx_hardware_{setup/unsetup}().
- The range of valid TLB tags is expressed in terms of min/max instead
  of the number of tags, to support SVM use cases.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
---
 arch/x86/kvm/vmx/nested.c |  4 +--
 arch/x86/kvm/vmx/vmx.c    | 38 +++++--------------------
 arch/x86/kvm/vmx/vmx.h    |  4 +--
 arch/x86/kvm/x86.c        | 58 +++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.h        | 13 +++++++++
 5 files changed, 82 insertions(+), 35 deletions(-)
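
For orientation, the lifecycle of the new allocator, condensed from the diff
below (all identifiers are taken from the patch itself; the enable_vpid
checks now live at the call sites):

    struct kvm_tlb_tags vmx_vpids;

    /* hardware setup: VPID 0 is reserved for the host, so min = 1 */
    kvm_tlb_tags_init(&vmx_vpids, 1, VMX_NR_VPIDS - 1);

    /* vCPU creation: tag 0 doubles as "no tag" and is returned on failure */
    vmx->vpid = enable_vpid ? kvm_tlb_tags_alloc(&vmx_vpids) : 0;

    /* vCPU teardown: freeing tag 0 is a no-op */
    kvm_tlb_tags_free(&vmx_vpids, vmx->vpid);

    /* hardware unsetup */
    kvm_tlb_tags_destroy(&vmx_vpids);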

Comments

Nikunj A Dadhania March 27, 2025, 10:58 a.m. UTC | #1
Yosry Ahmed <yosry.ahmed@linux.dev> writes:

> Generalize the VMX VPID allocation code and make move it to common
> code

s/make//

> in preparation for sharing with SVM. Create a generic struct
> kvm_tlb_tags, representing a factory for VPIDs (or ASIDs later), and use
> one for VPIDs.
>
> Most of the functionality remains the same, with the following
> differences:
> - The enable_vpid checks are moved to the callers for allocate_vpid()
>   and free_vpid(), as they are specific to VMX.
> - The bitmap allocation is now dynamic (which will be required for SVM),
>   so it is initialized and cleaned up in vmx_hardware_{setup/unsetup}().
> - The range of valid TLB tags is expressed in terms of min/max instead
>   of the number of tags to support SVM use cases.
>
> Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> ---
>  arch/x86/kvm/vmx/nested.c |  4 +--
>  arch/x86/kvm/vmx/vmx.c    | 38 +++++--------------------
>  arch/x86/kvm/vmx/vmx.h    |  4 +--
>  arch/x86/kvm/x86.c        | 58 +++++++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/x86.h        | 13 +++++++++
>  5 files changed, 82 insertions(+), 35 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index d06e50d9c0e79..b017bd2eb2382 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -343,7 +343,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
>  	vmx->nested.vmxon = false;
>  	vmx->nested.smm.vmxon = false;
>  	vmx->nested.vmxon_ptr = INVALID_GPA;
> -	free_vpid(vmx->nested.vpid02);
> +	kvm_tlb_tags_free(&vmx_vpids, vmx->nested.vpid02);
>  	vmx->nested.posted_intr_nv = -1;
>  	vmx->nested.current_vmptr = INVALID_GPA;
>  	if (enable_shadow_vmcs) {
> @@ -5333,7 +5333,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
>  		     HRTIMER_MODE_ABS_PINNED);
>  	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
>  
> -	vmx->nested.vpid02 = allocate_vpid();
> +	vmx->nested.vpid02 = enable_vpid ? kvm_tlb_tags_alloc(&vmx_vpids) : 0;
>  
>  	vmx->nested.vmcs02_initialized = false;
>  	vmx->nested.vmxon = true;
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index b70ed72c1783d..f7ce75842fa26 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -496,8 +496,7 @@ DEFINE_PER_CPU(struct vmcs *, current_vmcs);
>   */
>  static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
>  
> -static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
> -static DEFINE_SPINLOCK(vmx_vpid_lock);
> +struct kvm_tlb_tags vmx_vpids;
>  
>  struct vmcs_config vmcs_config __ro_after_init;
>  struct vmx_capability vmx_capability __ro_after_init;
> @@ -3972,31 +3971,6 @@ static void seg_setup(int seg)
>  	vmcs_write32(sf->ar_bytes, ar);
>  }
>  
> -int allocate_vpid(void)
> -{
> -	int vpid;
> -
> -	if (!enable_vpid)
> -		return 0;
> -	spin_lock(&vmx_vpid_lock);
> -	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
> -	if (vpid < VMX_NR_VPIDS)
> -		__set_bit(vpid, vmx_vpid_bitmap);
> -	else
> -		vpid = 0;
> -	spin_unlock(&vmx_vpid_lock);
> -	return vpid;
> -}
> -
> -void free_vpid(int vpid)
> -{
> -	if (!enable_vpid || vpid == 0)
> -		return;
> -	spin_lock(&vmx_vpid_lock);
> -	__clear_bit(vpid, vmx_vpid_bitmap);
> -	spin_unlock(&vmx_vpid_lock);
> -}
> -
>  static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
>  {
>  	/*
> @@ -7559,7 +7533,7 @@ void vmx_vcpu_free(struct kvm_vcpu *vcpu)
>  
>  	if (enable_pml)
>  		vmx_destroy_pml_buffer(vmx);
> -	free_vpid(vmx->vpid);
> +	kvm_tlb_tags_free(&vmx_vpids, vmx->vpid);
>  	nested_vmx_free_vcpu(vcpu);
>  	free_loaded_vmcs(vmx->loaded_vmcs);
>  	free_page((unsigned long)vmx->ve_info);
> @@ -7578,7 +7552,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
>  
>  	err = -ENOMEM;
>  
> -	vmx->vpid = allocate_vpid();
> +	vmx->vpid = enable_vpid ? kvm_tlb_tags_alloc(&vmx_vpids) : 0;
>  
>  	/*
>  	 * If PML is turned on, failure on enabling PML just results in failure
> @@ -7681,7 +7655,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
>  free_pml:
>  	vmx_destroy_pml_buffer(vmx);
>  free_vpid:
> -	free_vpid(vmx->vpid);
> +	kvm_tlb_tags_free(&vmx_vpids, vmx->vpid);
>  	return err;
>  }
>  
> @@ -8373,6 +8347,7 @@ void vmx_hardware_unsetup(void)
>  		nested_vmx_hardware_unsetup();
>  
>  	free_kvm_area();
> +	kvm_tlb_tags_destroy(&vmx_vpids);
>  }
>  
>  void vmx_vm_destroy(struct kvm *kvm)
> @@ -8591,7 +8566,8 @@ __init int vmx_hardware_setup(void)
>  	kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
>  	kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
>  
> -	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
> +	/* VPID 0 is reserved for host, so min=1  */
> +	kvm_tlb_tags_init(&vmx_vpids, 1, VMX_NR_VPIDS - 1);

This needs to handle errors from kvm_tlb_tags_init().
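
One minimal way to honor that, assuming vmx_hardware_setup() is free to fail
at this point (a sketch, not the posted code):

    int r;

    /* VPID 0 is reserved for host, so min=1 */
    r = kvm_tlb_tags_init(&vmx_vpids, 1, VMX_NR_VPIDS - 1);
    if (r)
        return r;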

>  
>  	if (enable_ept)
>  		kvm_mmu_set_ept_masks(enable_ept_ad_bits,
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 951e44dc9d0ea..9bece3ea63eaa 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -376,10 +376,10 @@ struct kvm_vmx {
>  	u64 *pid_table;
>  };
>  
> +extern struct kvm_tlb_tags vmx_vpids;
> +
>  void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
>  			struct loaded_vmcs *buddy);
> -int allocate_vpid(void);
> -void free_vpid(int vpid);
>  void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
>  void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
>  void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 69c20a68a3f01..182f18ebc62f3 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -13992,6 +13992,64 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
>  }
>  EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
>  
> +int kvm_tlb_tags_init(struct kvm_tlb_tags *tlb_tags, unsigned int min,
> +		      unsigned int max)
> +{
> +	/*
> +	 * 0 is assumed to be the host's TLB tag and is returned on failed
> +	 * allocations.
> +	 */
> +	if (WARN_ON_ONCE(min == 0))
> +		return -1;

Probably -EINVAL?

> +
> +	/*
> +	 * Allocate enough bits to index the bitmap directly by the tag,
> +	 * potentially wasting a bit of memory.
> +	 */
> +	tlb_tags->bitmap = bitmap_zalloc(max + 1, GFP_KERNEL);
> +	if (!tlb_tags->bitmap)
> +		return -1;

-ENOMEM?
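
Folding both suggestions in, the error paths of kvm_tlb_tags_init() would
become (a sketch of the suggested change, not the posted code):

    if (WARN_ON_ONCE(min == 0))
        return -EINVAL;

    tlb_tags->bitmap = bitmap_zalloc(max + 1, GFP_KERNEL);
    if (!tlb_tags->bitmap)
        return -ENOMEM;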

> +
> +	tlb_tags->min = min;
> +	tlb_tags->max = max;
> +	spin_lock_init(&tlb_tags->lock);
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvm_tlb_tags_init);
> +
> +void kvm_tlb_tags_destroy(struct kvm_tlb_tags *tlb_tags)
> +{
> +	bitmap_free(tlb_tags->bitmap);

Do we need to take tlb_tags->lock here?

> +}
> +EXPORT_SYMBOL_GPL(kvm_tlb_tags_destroy);
> +
> +unsigned int kvm_tlb_tags_alloc(struct kvm_tlb_tags *tlb_tags)
> +{
> +	unsigned int tag;
> +
> +	spin_lock(&tlb_tags->lock);
> +	tag = find_next_zero_bit(tlb_tags->bitmap, tlb_tags->max + 1,
> +				 tlb_tags->min);
> +	if (tag <= tlb_tags->max)
> +		__set_bit(tag, tlb_tags->bitmap);
> +	else
> +		tag = 0;

In the event that KVM runs out of tags, adding WARN_ON_ONCE() here will
help debugging.
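
The suggested diagnostic would amount to something like the following sketch
(note Sean's objection to WARNs in #3 below):

    if (tag <= tlb_tags->max) {
        __set_bit(tag, tlb_tags->bitmap);
    } else {
        WARN_ON_ONCE(1);    /* out of TLB tags */
        tag = 0;
    }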

Regards
Nikunj
Yosry Ahmed March 27, 2025, 5:13 p.m. UTC | #2
On Thu, Mar 27, 2025 at 10:58:31AM +0000, Nikunj A Dadhania wrote:
> Yosry Ahmed <yosry.ahmed@linux.dev> writes:
> 
> > Generalize the VMX VPID allocation code and make move it to common
> > code
> 
> s/make//
> 
> [...]
> 
> > +int kvm_tlb_tags_init(struct kvm_tlb_tags *tlb_tags, unsigned int min,
> > +		      unsigned int max)
> > +{
> > +	/*
> > +	 * 0 is assumed to be the host's TLB tag and is returned on failed
> > +	 * allocations.
> > +	 */
> > +	if (WARN_ON_ONCE(min == 0))
> > +		return -1;
> 
> Probably -EINVAL?

Yeah we can use error codes for clarity, thanks.

> 
> > +
> > +	/*
> > +	 * Allocate enough bits to index the bitmap directly by the tag,
> > +	 * potentially wasting a bit of memory.
> > +	 */
> > +	tlb_tags->bitmap = bitmap_zalloc(max + 1, GFP_KERNEL);
> > +	if (!tlb_tags->bitmap)
> > +		return -1;
> 
> -ENOMEM?
> 
> > +
> > +	tlb_tags->min = min;
> > +	tlb_tags->max = max;
> > +	spin_lock_init(&tlb_tags->lock);
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(kvm_tlb_tags_init);
> > +
> > +void kvm_tlb_tags_destroy(struct kvm_tlb_tags *tlb_tags)
> > +{
> > +	bitmap_free(tlb_tags->bitmap);
> 
> Do we need to take tlb_tags->lock here?

Hmm, we could, but I think it's a caller bug if kvm_tlb_tags_destroy() is
allowed to race with any of the other functions (init/alloc/free).
kvm_tlb_tags_destroy() should be called when we are done using the
factory.

> 
> > +}
> > +EXPORT_SYMBOL_GPL(kvm_tlb_tags_destroy);
> > +
> > +unsigned int kvm_tlb_tags_alloc(struct kvm_tlb_tags *tlb_tags)
> > +{
> > +	unsigned int tag;
> > +
> > +	spin_lock(&tlb_tags->lock);
> > +	tag = find_next_zero_bit(tlb_tags->bitmap, tlb_tags->max + 1,
> > +				 tlb_tags->min);
> > +	if (tag <= tlb_tags->max)
> > +		__set_bit(tag, tlb_tags->bitmap);
> > +	else
> > +		tag = 0;
> 
> In the event that KVM runs out of tags, adding WARN_ON_ONCE() here will
> help debugging.

Yeah I wanted to do that, but we do not currently WARN in VMX if we run
out of VPIDs. I am fine with adding it if others are. My main concern
was whether there's some existing use case that routinely runs out of
VPIDs (although I cannot imagine one).

Sean Christopherson March 27, 2025, 7:42 p.m. UTC | #3
On Thu, Mar 27, 2025, Yosry Ahmed wrote:
> On Thu, Mar 27, 2025 at 10:58:31AM +0000, Nikunj A Dadhania wrote:
> > > +unsigned int kvm_tlb_tags_alloc(struct kvm_tlb_tags *tlb_tags)
> > > +{
> > > +	unsigned int tag;
> > > +
> > > +	spin_lock(&tlb_tags->lock);
> > > +	tag = find_next_zero_bit(tlb_tags->bitmap, tlb_tags->max + 1,
> > > +				 tlb_tags->min);
> > > +	if (tag <= tlb_tags->max)
> > > +		__set_bit(tag, tlb_tags->bitmap);
> > > +	else
> > > +		tag = 0;
> > 
> > In the event that KVM runs out of tags, adding WARN_ON_ONCE() here will
> > help debugging.
> 
> Yeah I wanted to do that, but we do not currently WARN in VMX if we run
> out of VPIDs. I am fine with adding it if others are. My main concern
> was whether there's some existing use case that routinely runs out of
> VPIDs (although I cannot imagine one).

No WARNs, it would be userspace-triggerable (hello, syzkaller).  If we really
want to harden things against performance issues due to unexpected VPID/ASID
exhaustion, I would rather do something like add a knob to fail VM or vCPU
creation if allocation fails (nested would just have to suffer).
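
Such a knob is not part of this series; a hypothetical shape for it, with the
parameter name and error code purely illustrative, could be:

    /* Hypothetical opt-in: fail vCPU creation when no VPID is available. */
    static bool strict_vpid __read_mostly;
    module_param(strict_vpid, bool, 0444);

    /* In vmx_vcpu_create(), before any other allocations: */
    vmx->vpid = enable_vpid ? kvm_tlb_tags_alloc(&vmx_vpids) : 0;
    if (enable_vpid && strict_vpid && !vmx->vpid)
        return -ENOSPC;    /* nothing to unwind yet; vpid 0 needs no free */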

Patch

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d06e50d9c0e79..b017bd2eb2382 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -343,7 +343,7 @@  static void free_nested(struct kvm_vcpu *vcpu)
 	vmx->nested.vmxon = false;
 	vmx->nested.smm.vmxon = false;
 	vmx->nested.vmxon_ptr = INVALID_GPA;
-	free_vpid(vmx->nested.vpid02);
+	kvm_tlb_tags_free(&vmx_vpids, vmx->nested.vpid02);
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.current_vmptr = INVALID_GPA;
 	if (enable_shadow_vmcs) {
@@ -5333,7 +5333,7 @@  static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 		     HRTIMER_MODE_ABS_PINNED);
 	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
 
-	vmx->nested.vpid02 = allocate_vpid();
+	vmx->nested.vpid02 = enable_vpid ? kvm_tlb_tags_alloc(&vmx_vpids) : 0;
 
 	vmx->nested.vmcs02_initialized = false;
 	vmx->nested.vmxon = true;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b70ed72c1783d..f7ce75842fa26 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -496,8 +496,7 @@  DEFINE_PER_CPU(struct vmcs *, current_vmcs);
  */
 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 
-static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
-static DEFINE_SPINLOCK(vmx_vpid_lock);
+struct kvm_tlb_tags vmx_vpids;
 
 struct vmcs_config vmcs_config __ro_after_init;
 struct vmx_capability vmx_capability __ro_after_init;
@@ -3972,31 +3971,6 @@  static void seg_setup(int seg)
 	vmcs_write32(sf->ar_bytes, ar);
 }
 
-int allocate_vpid(void)
-{
-	int vpid;
-
-	if (!enable_vpid)
-		return 0;
-	spin_lock(&vmx_vpid_lock);
-	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
-	if (vpid < VMX_NR_VPIDS)
-		__set_bit(vpid, vmx_vpid_bitmap);
-	else
-		vpid = 0;
-	spin_unlock(&vmx_vpid_lock);
-	return vpid;
-}
-
-void free_vpid(int vpid)
-{
-	if (!enable_vpid || vpid == 0)
-		return;
-	spin_lock(&vmx_vpid_lock);
-	__clear_bit(vpid, vmx_vpid_bitmap);
-	spin_unlock(&vmx_vpid_lock);
-}
-
 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
 {
 	/*
@@ -7559,7 +7533,7 @@  void vmx_vcpu_free(struct kvm_vcpu *vcpu)
 
 	if (enable_pml)
 		vmx_destroy_pml_buffer(vmx);
-	free_vpid(vmx->vpid);
+	kvm_tlb_tags_free(&vmx_vpids, vmx->vpid);
 	nested_vmx_free_vcpu(vcpu);
 	free_loaded_vmcs(vmx->loaded_vmcs);
 	free_page((unsigned long)vmx->ve_info);
@@ -7578,7 +7552,7 @@  int vmx_vcpu_create(struct kvm_vcpu *vcpu)
 
 	err = -ENOMEM;
 
-	vmx->vpid = allocate_vpid();
+	vmx->vpid = enable_vpid ? kvm_tlb_tags_alloc(&vmx_vpids) : 0;
 
 	/*
 	 * If PML is turned on, failure on enabling PML just results in failure
@@ -7681,7 +7655,7 @@  int vmx_vcpu_create(struct kvm_vcpu *vcpu)
 free_pml:
 	vmx_destroy_pml_buffer(vmx);
 free_vpid:
-	free_vpid(vmx->vpid);
+	kvm_tlb_tags_free(&vmx_vpids, vmx->vpid);
 	return err;
 }
 
@@ -8373,6 +8347,7 @@  void vmx_hardware_unsetup(void)
 		nested_vmx_hardware_unsetup();
 
 	free_kvm_area();
+	kvm_tlb_tags_destroy(&vmx_vpids);
 }
 
 void vmx_vm_destroy(struct kvm *kvm)
@@ -8591,7 +8566,8 @@  __init int vmx_hardware_setup(void)
 	kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
 	kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
 
-	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+	/* VPID 0 is reserved for host, so min=1  */
+	kvm_tlb_tags_init(&vmx_vpids, 1, VMX_NR_VPIDS - 1);
 
 	if (enable_ept)
 		kvm_mmu_set_ept_masks(enable_ept_ad_bits,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 951e44dc9d0ea..9bece3ea63eaa 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -376,10 +376,10 @@  struct kvm_vmx {
 	u64 *pid_table;
 };
 
+extern struct kvm_tlb_tags vmx_vpids;
+
 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
 			struct loaded_vmcs *buddy);
-int allocate_vpid(void);
-void free_vpid(int vpid);
 void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 69c20a68a3f01..182f18ebc62f3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13992,6 +13992,64 @@  int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
 }
 EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
 
+int kvm_tlb_tags_init(struct kvm_tlb_tags *tlb_tags, unsigned int min,
+		      unsigned int max)
+{
+	/*
+	 * 0 is assumed to be the host's TLB tag and is returned on failed
+	 * allocations.
+	 */
+	if (WARN_ON_ONCE(min == 0))
+		return -1;
+
+	/*
+	 * Allocate enough bits to index the bitmap directly by the tag,
+	 * potentially wasting a bit of memory.
+	 */
+	tlb_tags->bitmap = bitmap_zalloc(max + 1, GFP_KERNEL);
+	if (!tlb_tags->bitmap)
+		return -1;
+
+	tlb_tags->min = min;
+	tlb_tags->max = max;
+	spin_lock_init(&tlb_tags->lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_tlb_tags_init);
+
+void kvm_tlb_tags_destroy(struct kvm_tlb_tags *tlb_tags)
+{
+	bitmap_free(tlb_tags->bitmap);
+}
+EXPORT_SYMBOL_GPL(kvm_tlb_tags_destroy);
+
+unsigned int kvm_tlb_tags_alloc(struct kvm_tlb_tags *tlb_tags)
+{
+	unsigned int tag;
+
+	spin_lock(&tlb_tags->lock);
+	tag = find_next_zero_bit(tlb_tags->bitmap, tlb_tags->max + 1,
+				 tlb_tags->min);
+	if (tag <= tlb_tags->max)
+		__set_bit(tag, tlb_tags->bitmap);
+	else
+		tag = 0;
+	spin_unlock(&tlb_tags->lock);
+	return tag;
+}
+EXPORT_SYMBOL_GPL(kvm_tlb_tags_alloc);
+
+void kvm_tlb_tags_free(struct kvm_tlb_tags *tlb_tags, unsigned int tag)
+{
+	if (tag < tlb_tags->min || WARN_ON_ONCE(tag > tlb_tags->max))
+		return;
+
+	spin_lock(&tlb_tags->lock);
+	__clear_bit(tag, tlb_tags->bitmap);
+	spin_unlock(&tlb_tags->lock);
+}
+EXPORT_SYMBOL_GPL(kvm_tlb_tags_free);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 9dc32a4090761..9f84e933d189b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -652,4 +652,17 @@  int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
+struct kvm_tlb_tags {
+	spinlock_t	lock;
+	unsigned long	*bitmap;
+	unsigned int	min;
+	unsigned int	max;
+};
+
+int kvm_tlb_tags_init(struct kvm_tlb_tags *tlb_tags, unsigned int min,
+		      unsigned int max);
+void kvm_tlb_tags_destroy(struct kvm_tlb_tags *tlb_tags);
+unsigned int kvm_tlb_tags_alloc(struct kvm_tlb_tags *tlb_tags);
+void kvm_tlb_tags_free(struct kvm_tlb_tags *tlb_tags, unsigned int tag);
+
 #endif