
[08/15] KVM: VMX: Configure list of user return MSRs at module init

Message ID 20210504171734.1434054-9-seanjc@google.com (mailing list archive)
State New, archived
Series: KVM: x86: RDPID/RDTSCP fixes and uret MSR cleanups

Commit Message

Sean Christopherson May 4, 2021, 5:17 p.m. UTC
Configure the list of user return MSRs that are actually supported at
module init instead of reprobing the list of possible MSRs every time a
vCPU is created.  Curating the list on a per-vCPU basis is pointless; KVM
is completely hosed if the set of supported MSRs changes after module init,
or if the set of MSRs differs per physical CPU.

The per-vCPU lists also increase complexity (see __vmx_find_uret_msr()) and
create corner cases that _should_ be impossible, but theoretically exist
in KVM, e.g. advertising RDTSCP to userspace without actually being able to
virtualize RDTSCP if probing MSR_TSC_AUX fails.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/vmx/vmx.c | 61 ++++++++++++++++++++++++++++--------------
 arch/x86/kvm/vmx/vmx.h | 10 ++++++-
 2 files changed, 50 insertions(+), 21 deletions(-)
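
The approach, distilled: probe each candidate MSR once at module load, compact the survivors to the front of the now-writable list, and record the count; vCPU creation then walks only the supported prefix. The toy model below illustrates just that filtering pattern. It is standalone userspace C, msr_probe_ok() is a hypothetical stand-in for kvm_probe_user_return_msr(), and the real change is in the patch that follows.

	#include <stdbool.h>
	#include <stdio.h>

	/* Toy model of the patch's module-init filtering; not KVM code. */
	static unsigned int uret_msrs_list[] = {
		0xc0000084,	/* MSR_SYSCALL_MASK */
		0xc0000082,	/* MSR_LSTAR */
		0xc0000103,	/* MSR_TSC_AUX */
	};
	static int nr_uret_msrs;

	/*
	 * Hypothetical stand-in for kvm_probe_user_return_msr(): pretend
	 * MSR_TSC_AUX fails to probe on this host.
	 */
	static bool msr_probe_ok(unsigned int msr)
	{
		return msr != 0xc0000103;
	}

	static void setup_uret_msrs(void)
	{
		size_t i, n = sizeof(uret_msrs_list) / sizeof(uret_msrs_list[0]);

		/*
		 * Runs once at "module init": keep only the MSRs that probe
		 * OK, compacted to the front of the list.
		 */
		for (i = 0; i < n; ++i) {
			if (msr_probe_ok(uret_msrs_list[i]))
				uret_msrs_list[nr_uret_msrs++] = uret_msrs_list[i];
		}
	}

	int main(void)
	{
		setup_uret_msrs();
		printf("%d of 3 candidate uret MSRs supported\n", nr_uret_msrs);
		return 0;
	}

Once setup_uret_msrs() returns, every consumer can iterate [0, nr_uret_msrs) without ever seeing an unsupported entry, which is what allows the per-vCPU probing to be dropped.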

Comments

Maxim Levitsky May 10, 2021, 8:23 a.m. UTC | #1
On Tue, 2021-05-04 at 10:17 -0700, Sean Christopherson wrote:
> Configure the list of user return MSRs that are actually supported at
> module init instead of reprobing the list of possible MSRs every time a
> vCPU is created.  Curating the list on a per-vCPU basis is pointless; KVM
> is completely hosed if the set of supported MSRs changes after module init,
> or if the set of MSRs differs per physical CPU.
> 
> The per-vCPU lists also increase complexity (see __vmx_find_uret_msr()) and
> create corner cases that _should_ be impossible, but theoretically exist
> in KVM, e.g. advertising RDTSCP to userspace without actually being able to
> virtualize RDTSCP if probing MSR_TSC_AUX fails.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/vmx/vmx.c | 61 ++++++++++++++++++++++++++++--------------
>  arch/x86/kvm/vmx/vmx.h | 10 ++++++-
>  2 files changed, 50 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 42e4bbaa299a..68454b0de2b1 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -461,7 +461,7 @@ static unsigned long host_idt_base;
>   * support this emulation, IA32_STAR must always be included in
>   * vmx_uret_msrs_list[], even in i386 builds.
>   */
> -static const u32 vmx_uret_msrs_list[] = {
> +static u32 vmx_uret_msrs_list[] = {
>  #ifdef CONFIG_X86_64
>  	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
>  #endif
> @@ -469,6 +469,12 @@ static const u32 vmx_uret_msrs_list[] = {
>  	MSR_IA32_TSX_CTRL,
>  };
>  
> +/*
> + * Number of user return MSRs that are actually supported in hardware.
> + * vmx_uret_msrs_list is modified when KVM is loaded to drop unsupported MSRs.
> + */
> +static int vmx_nr_uret_msrs;
> +
>  #if IS_ENABLED(CONFIG_HYPERV)
>  static bool __read_mostly enlightened_vmcs = true;
>  module_param(enlightened_vmcs, bool, 0444);
> @@ -700,9 +706,16 @@ static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
>  {
>  	int i;
>  
> -	for (i = 0; i < vmx->nr_uret_msrs; ++i)
> +	/*
> +	 * Note, vmx->guest_uret_msrs is the same size as vmx_uret_msrs_list,
> +	 * but is ordered differently.  The MSR is matched against the list of
> +	 * supported uret MSRs using "slot", but the index that is returned is
> +	 * the index into guest_uret_msrs.
> +	 */
> +	for (i = 0; i < vmx_nr_uret_msrs; ++i) {
>  		if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
>  			return i;
> +	}
>  	return -1;
>  }
>  
> @@ -6929,18 +6942,10 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
>  			goto free_vpid;
>  	}
>  
> -	BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
> +	for (i = 0; i < vmx_nr_uret_msrs; ++i) {
> +		vmx->guest_uret_msrs[i].data = 0;
>  
> -	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
> -		u32 index = vmx_uret_msrs_list[i];
> -		int j = vmx->nr_uret_msrs;
> -
> -		if (kvm_probe_user_return_msr(index))
> -			continue;
> -
> -		vmx->guest_uret_msrs[j].slot = i;
I don't see anything initializing the .slot field after this patch.
The code that did so is removed later in the series, which masks the
bug, but for bisection's sake I think this patch should still be
fixed.

Other than this minor bug:

Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>

Best regards,
	Maxim Levitsky



> -		vmx->guest_uret_msrs[j].data = 0;
> -		switch (index) {
> +		switch (vmx_uret_msrs_list[i]) {
>  		case MSR_IA32_TSX_CTRL:
>  			/*
>  			 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
> @@ -6954,15 +6959,14 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
>  			 * host so that TSX remains always disabled.
>  			 */
>  			if (boot_cpu_has(X86_FEATURE_RTM))
> -				vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
> +				vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
>  			else
> -				vmx->guest_uret_msrs[j].mask = 0;
> +				vmx->guest_uret_msrs[i].mask = 0;
>  			break;
>  		default:
> -			vmx->guest_uret_msrs[j].mask = -1ull;
> +			vmx->guest_uret_msrs[i].mask = -1ull;
>  			break;
>  		}
> -		++vmx->nr_uret_msrs;
>  	}
>  
>  	err = alloc_loaded_vmcs(&vmx->vmcs01);
> @@ -7821,17 +7825,34 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
>  	.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
>  };
>  
> +static __init void vmx_setup_user_return_msrs(void)
> +{
> +	u32 msr;
> +	int i;
> +
> +	BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
> +
> +	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
> +		msr = vmx_uret_msrs_list[i];
> +
> +		if (kvm_probe_user_return_msr(msr))
> +			continue;
> +
> +		kvm_define_user_return_msr(vmx_nr_uret_msrs, msr);
> +		vmx_uret_msrs_list[vmx_nr_uret_msrs++] = msr;
> +	}
> +}
> +
>  static __init int hardware_setup(void)
>  {
>  	unsigned long host_bndcfgs;
>  	struct desc_ptr dt;
> -	int r, i, ept_lpage_level;
> +	int r, ept_lpage_level;
>  
>  	store_idt(&dt);
>  	host_idt_base = dt.address;
>  
> -	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
> -		kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
> +	vmx_setup_user_return_msrs();
>  
>  	if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
>  		return -EIO;
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 008cb87ff088..d71ed8b425c5 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -245,8 +245,16 @@ struct vcpu_vmx {
>  	u32                   idt_vectoring_info;
>  	ulong                 rflags;
>  
> +	/*
> +	 * User return MSRs are always emulated when enabled in the guest, but
> +	 * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside
> +	 * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to
> +	 * be loaded into hardware if those conditions aren't met.
> +	 * nr_active_uret_msrs tracks the number of MSRs that need to be loaded
> +	 * into hardware when running the guest.  guest_uret_msrs[] is resorted
> +	 * whenever the number of "active" uret MSRs is modified.
> +	 */
>  	struct vmx_uret_msr   guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
> -	int                   nr_uret_msrs;
>  	int                   nr_active_uret_msrs;
>  	bool                  guest_uret_msrs_loaded;
>  #ifdef CONFIG_X86_64
Sean Christopherson May 10, 2021, 3:13 p.m. UTC | #2
On Mon, May 10, 2021, Maxim Levitsky wrote:
> On Tue, 2021-05-04 at 10:17 -0700, Sean Christopherson wrote:
> > @@ -6929,18 +6942,10 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
> >  			goto free_vpid;
> >  	}
> >  
> > -	BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
> > +	for (i = 0; i < vmx_nr_uret_msrs; ++i) {
> > +		vmx->guest_uret_msrs[i].data = 0;
> >  
> > -	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
> > -		u32 index = vmx_uret_msrs_list[i];
> > -		int j = vmx->nr_uret_msrs;
> > -
> > -		if (kvm_probe_user_return_msr(index))
> > -			continue;
> > -
> > -		vmx->guest_uret_msrs[j].slot = i;
> I don't see anything initializing the .slot field after this patch.
> The code that did so is removed later in the series, which masks the
> bug, but for bisection's sake I think this patch should still be
> fixed.

Egad, indeed it's broken.  I'll retest the whole series to verify the other
patches will bisect cleanly.

Nice catch!
Maxim Levitsky May 11, 2021, 12:34 p.m. UTC | #3
On Mon, 2021-05-10 at 15:13 +0000, Sean Christopherson wrote:
> On Mon, May 10, 2021, Maxim Levitsky wrote:
> > On Tue, 2021-05-04 at 10:17 -0700, Sean Christopherson wrote:
> > > @@ -6929,18 +6942,10 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
> > >  			goto free_vpid;
> > >  	}
> > >  
> > > -	BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
> > > +	for (i = 0; i < vmx_nr_uret_msrs; ++i) {
> > > +		vmx->guest_uret_msrs[i].data = 0;
> > >  
> > > -	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
> > > -		u32 index = vmx_uret_msrs_list[i];
> > > -		int j = vmx->nr_uret_msrs;
> > > -
> > > -		if (kvm_probe_user_return_msr(index))
> > > -			continue;
> > > -
> > > -		vmx->guest_uret_msrs[j].slot = i;
> > I don't see anything initializing the .slot field after this patch.
> > The code that did so is removed later in the series, which masks the
> > bug, but for bisection's sake I think this patch should still be
> > fixed.
> 
> Egad, indeed it's broken.  I'll retest the whole series to verify the other
> patches will bisect cleanly.
> 
> Nice catch!
> 
Thanks!

Best regards,
	Maxim Levitsky
Sean Christopherson May 11, 2021, 8:10 p.m. UTC | #4
On Tue, May 11, 2021, Maxim Levitsky wrote:
> On Mon, 2021-05-10 at 15:13 +0000, Sean Christopherson wrote:
> > On Mon, May 10, 2021, Maxim Levitsky wrote:
> > > On Tue, 2021-05-04 at 10:17 -0700, Sean Christopherson wrote:
> > > > @@ -6929,18 +6942,10 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
> > > >  			goto free_vpid;
> > > >  	}
> > > >  
> > > > -	BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
> > > > +	for (i = 0; i < vmx_nr_uret_msrs; ++i) {
> > > > +		vmx->guest_uret_msrs[i].data = 0;
> > > >  
> > > > -	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
> > > > -		u32 index = vmx_uret_msrs_list[i];
> > > > -		int j = vmx->nr_uret_msrs;
> > > > -
> > > > -		if (kvm_probe_user_return_msr(index))
> > > > -			continue;
> > > > -
> > > > -		vmx->guest_uret_msrs[j].slot = i;
> > > I don't see anything initializing the .slot field after this patch.
> > > The code that did so is removed later in the series, which masks the
> > > bug, but for bisection's sake I think this patch should still be
> > > fixed.
> > 
> > Egad, indeed it's broken.  I'll retest the whole series to verify the other
> > patches will bisect cleanly.
> > 
> > Nice catch!
> > 
> Thanks!

Unfortunately this made its way to Linus before I could fix my goof.  Fingers
crossed no one is unfortunate enough to land on this while bisecting...
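
For the record, the minimal repair for the bug Maxim flagged is a one-liner in the rewritten vCPU-creation loop: because vmx_uret_msrs_list is compacted at module init, entry i of the per-vCPU array corresponds to supported MSR i, so the slot is simply the loop index. A sketch of that shape, as an illustration rather than necessarily the exact fix that eventually landed:

	for (i = 0; i < vmx_nr_uret_msrs; ++i) {
		/*
		 * The supported list was compacted at module init, so the
		 * per-vCPU index and the supported-list slot coincide.
		 */
		vmx->guest_uret_msrs[i].slot = i;
		vmx->guest_uret_msrs[i].data = 0;
		/* mask setup continues as in the patch */
	}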

Patch

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 42e4bbaa299a..68454b0de2b1 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -461,7 +461,7 @@ static unsigned long host_idt_base;
  * support this emulation, IA32_STAR must always be included in
  * vmx_uret_msrs_list[], even in i386 builds.
  */
-static const u32 vmx_uret_msrs_list[] = {
+static u32 vmx_uret_msrs_list[] = {
 #ifdef CONFIG_X86_64
 	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
 #endif
@@ -469,6 +469,12 @@ static const u32 vmx_uret_msrs_list[] = {
 	MSR_IA32_TSX_CTRL,
 };
 
+/*
+ * Number of user return MSRs that are actually supported in hardware.
+ * vmx_uret_msrs_list is modified when KVM is loaded to drop unsupported MSRs.
+ */
+static int vmx_nr_uret_msrs;
+
 #if IS_ENABLED(CONFIG_HYPERV)
 static bool __read_mostly enlightened_vmcs = true;
 module_param(enlightened_vmcs, bool, 0444);
@@ -700,9 +706,16 @@ static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
 
-	for (i = 0; i < vmx->nr_uret_msrs; ++i)
+	/*
+	 * Note, vmx->guest_uret_msrs is the same size as vmx_uret_msrs_list,
+	 * but is ordered differently.  The MSR is matched against the list of
+	 * supported uret MSRs using "slot", but the index that is returned is
+	 * the index into guest_uret_msrs.
+	 */
+	for (i = 0; i < vmx_nr_uret_msrs; ++i) {
 		if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
 			return i;
+	}
 	return -1;
 }
 
@@ -6929,18 +6942,10 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 			goto free_vpid;
 	}
 
-	BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
+	for (i = 0; i < vmx_nr_uret_msrs; ++i) {
+		vmx->guest_uret_msrs[i].data = 0;
 
-	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
-		u32 index = vmx_uret_msrs_list[i];
-		int j = vmx->nr_uret_msrs;
-
-		if (kvm_probe_user_return_msr(index))
-			continue;
-
-		vmx->guest_uret_msrs[j].slot = i;
-		vmx->guest_uret_msrs[j].data = 0;
-		switch (index) {
+		switch (vmx_uret_msrs_list[i]) {
 		case MSR_IA32_TSX_CTRL:
 			/*
 			 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
@@ -6954,15 +6959,14 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 			 * host so that TSX remains always disabled.
 			 */
 			if (boot_cpu_has(X86_FEATURE_RTM))
-				vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+				vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
 			else
-				vmx->guest_uret_msrs[j].mask = 0;
+				vmx->guest_uret_msrs[i].mask = 0;
 			break;
 		default:
-			vmx->guest_uret_msrs[j].mask = -1ull;
+			vmx->guest_uret_msrs[i].mask = -1ull;
 			break;
 		}
-		++vmx->nr_uret_msrs;
 	}
 
 	err = alloc_loaded_vmcs(&vmx->vmcs01);
@@ -7821,17 +7825,34 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
 };
 
+static __init void vmx_setup_user_return_msrs(void)
+{
+	u32 msr;
+	int i;
+
+	BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
+
+	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
+		msr = vmx_uret_msrs_list[i];
+
+		if (kvm_probe_user_return_msr(msr))
+			continue;
+
+		kvm_define_user_return_msr(vmx_nr_uret_msrs, msr);
+		vmx_uret_msrs_list[vmx_nr_uret_msrs++] = msr;
+	}
+}
+
 static __init int hardware_setup(void)
 {
 	unsigned long host_bndcfgs;
 	struct desc_ptr dt;
-	int r, i, ept_lpage_level;
+	int r, ept_lpage_level;
 
 	store_idt(&dt);
 	host_idt_base = dt.address;
 
-	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
-		kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
+	vmx_setup_user_return_msrs();
 
 	if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
 		return -EIO;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 008cb87ff088..d71ed8b425c5 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -245,8 +245,16 @@ struct vcpu_vmx {
 	u32                   idt_vectoring_info;
 	ulong                 rflags;
 
+	/*
+	 * User return MSRs are always emulated when enabled in the guest, but
+	 * only loaded into hardware when necessary, e.g. SYSCALL #UDs outside
+	 * of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to
+	 * be loaded into hardware if those conditions aren't met.
+	 * nr_active_uret_msrs tracks the number of MSRs that need to be loaded
+	 * into hardware when running the guest.  guest_uret_msrs[] is resorted
+	 * whenever the number of "active" uret MSRs is modified.
+	 */
 	struct vmx_uret_msr   guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
-	int                   nr_uret_msrs;
 	int                   nr_active_uret_msrs;
 	bool                  guest_uret_msrs_loaded;
 #ifdef CONFIG_X86_64
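
A closing note on the comment in __vmx_find_uret_msr(): guest_uret_msrs[] holds the same entries as the supported list but is re-sorted so that "active" MSRs sit at the front, so matching must go through the .slot back-reference while the value returned is the per-vCPU index. A self-contained toy model of that indirection, with illustrative names rather than KVM's:

	#include <stdint.h>

	struct uret_msr {
		int      slot;	/* index into the fixed supported-MSR list */
		uint64_t data;
		uint64_t mask;
	};

	/*
	 * supported[] keeps the module-init order; vcpu_msrs[] is re-sorted
	 * as MSRs become "active", so matching goes through .slot while the
	 * returned value indexes vcpu_msrs[].
	 */
	static int find_uret_msr(const uint32_t *supported,
				 const struct uret_msr *vcpu_msrs,
				 int nr, uint32_t msr)
	{
		int i;

		for (i = 0; i < nr; ++i) {
			if (supported[vcpu_msrs[i].slot] == msr)
				return i;	/* per-vCPU index, not supported-list index */
		}
		return -1;
	}

The indirection disappears later in the series once the per-vCPU array and the supported list share a single ordering, which is also why the .slot bug discussed above ends up masked in the final tree.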