
kvm: x86: Set highest physical address bits in non-present/reserved SPTEs

Message ID 20180814171534.61890-1-junaids@google.com (mailing list archive)
State New, archived
Series kvm: x86: Set highest physical address bits in non-present/reserved SPTEs

Commit Message

Junaid Shahid Aug. 14, 2018, 5:15 p.m. UTC
Always set the 5 upper-most supported physical address bits to 1 for SPTEs
that are marked as non-present or reserved, to make them unusable for
L1TF attacks from the guest. Currently, this just applies to MMIO SPTEs.
(We do not need to mark PTEs that are completely 0 as physical page 0
is already reserved.)

This allows mitigation of L1TF without disabling hyper-threading by using
shadow paging mode instead of EPT.

Signed-off-by: Junaid Shahid <junaids@google.com>
---
 arch/x86/kvm/mmu.c | 43 ++++++++++++++++++++++++++++++++++++++-----
 arch/x86/kvm/x86.c |  8 ++++++--
 2 files changed, 44 insertions(+), 7 deletions(-)
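
For illustration, here is a small userspace sketch of the mask the commit message describes. The rsvd_bits() helper and the length constant mirror the patch below; the MAXPHYADDR values are only examples, and this is not kernel code.

#include <stdint.h>
#include <stdio.h>

/* Same semantics as the kernel's rsvd_bits(): a mask of bits s..e, inclusive. */
static uint64_t rsvd_bits(int s, int e)
{
        return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
        const int len = 5;                       /* shadow_nonpresent_or_rsvd_mask_len */
        const int phys_bits[] = { 36, 46, 48 };

        for (unsigned i = 0; i < sizeof(phys_bits) / sizeof(phys_bits[0]); i++) {
                int p = phys_bits[i];
                uint64_t mask = 0;

                /* Only CPUs with fewer than 52 - 5 = 47 physical address bits
                 * (i.e. at most 46) get a mask; wider CPUs are assumed not
                 * vulnerable to L1TF. */
                if (p < 52 - len)
                        mask = rsvd_bits(p - len, p - 1);

                printf("phys_bits=%2d  shadow_nonpresent_or_rsvd_mask=%#llx\n",
                       p, (unsigned long long)mask);
        }
        return 0;
}

On a 46-bit CPU this sets bits 41..45 (0x3e0000000000), on a 36-bit CPU bits 31..35 (0xf80000000), and at 47 bits and above no mask is applied.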

Comments

Paolo Bonzini Aug. 14, 2018, 5:22 p.m. UTC | #1
On 14/08/2018 19:15, Junaid Shahid wrote:
>  	access &= ACC_WRITE_MASK | ACC_USER_MASK;
> -	mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT;
> +	mask |= shadow_mmio_value | shadow_nonpresent_or_rsvd_mask;
> +	mask |= access | gpa;
> +	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
> +		<< shadow_nonpresent_or_rsvd_mask_len;

Just a microscopic change here which is a little bit clearer in my opinion:

+       mask |= shadow_mmio_value | access;
+       mask |= gpa | shadow_nonpresent_or_rsvd_mask;
+       mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+               << shadow_nonpresent_or_rsvd_mask_len;

Thanks, queued for 4.19-rc1.

Paolo
Sakari Ailus Sept. 24, 2018, 7:57 p.m. UTC | #2
Hi Junaid,

(Cc'ing LKML.)

> commit 28a1f3ac1d0c8558ee4453d9634dad891a6e922e upstream.
> 
> Always set the 5 upper-most supported physical address bits to 1 for SPTEs
> that are marked as non-present or reserved, to make them unusable for
> L1TF attacks from the guest. Currently, this just applies to MMIO SPTEs.
> (We do not need to mark PTEs that are completely 0 as physical page 0
> is already reserved.)

I'm not subscribed to the kvm list, nor did I find the original e-mail with
the headers, so I constructed the reply from the bits I did find. If
something goes wrong, it's probably because of that.

This patch in the host kernel breaks KVM guests on Core 2 Duo (T9500). KVM
guests do not boot on a plain v4.18.9 host (which contains the patch);
reverting this patch fixes it.

The contents of /proc/cpuinfo are below in case it matters:

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 23
model name	: Intel(R) Core(TM)2 Duo CPU     T9500  @ 2.60GHz
stepping	: 6
microcode	: 0x60f
cpu MHz		: 798.083
cache size	: 6144 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 2
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 10
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good nopl cpuid aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm sse4_1 lahf_lm pti tpr_shadow vnmi flexpriority dtherm ida
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf
bogomips	: 5187.28
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 1
vendor_id	: GenuineIntel
cpu family	: 6
model		: 23
model name	: Intel(R) Core(TM)2 Duo CPU     T9500  @ 2.60GHz
stepping	: 6
microcode	: 0x60f
cpu MHz		: 798.062
cache size	: 6144 KB
physical id	: 0
siblings	: 2
core id		: 1
cpu cores	: 2
apicid		: 1
initial apicid	: 1
fpu		: yes
fpu_exception	: yes
cpuid level	: 10
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good nopl cpuid aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm sse4_1 lahf_lm pti tpr_shadow vnmi flexpriority dtherm ida
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf
bogomips	: 5187.28
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

> 
> This allows mitigation of L1TF without disabling hyper-threading by using
> shadow paging mode instead of EPT.
> 
> Signed-off-by: Junaid Shahid <junaids@google.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> ---
>  arch/x86/kvm/mmu.c | 43 ++++++++++++++++++++++++++++++++++++++-----
>  arch/x86/kvm/x86.c |  8 ++++++--
>  2 files changed, 44 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index a44e568363a4..42f1ba92622a 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -221,6 +221,17 @@ static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
>  						    PT64_EPT_EXECUTABLE_MASK;
>  static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
>  
> +/*
> + * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
> + * to guard against L1TF attacks.
> + */
> +static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
> +
> +/*
> + * The number of high-order 1 bits to use in the mask above.
> + */
> +static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
> +
>  static void mmu_spte_set(u64 *sptep, u64 spte);
>  
>  void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
> @@ -308,9 +319,13 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
>  {
>  	unsigned int gen = kvm_current_mmio_generation(vcpu);
>  	u64 mask = generation_mmio_spte_mask(gen);
> +	u64 gpa = gfn << PAGE_SHIFT;
>  
>  	access &= ACC_WRITE_MASK | ACC_USER_MASK;
> -	mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT;
> +	mask |= shadow_mmio_value | access;
> +	mask |= gpa | shadow_nonpresent_or_rsvd_mask;
> +	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
> +		<< shadow_nonpresent_or_rsvd_mask_len;
>  
>  	trace_mark_mmio_spte(sptep, gfn, access, gen);
>  	mmu_spte_set(sptep, mask);
> @@ -323,8 +338,14 @@ static bool is_mmio_spte(u64 spte)
>  
>  static gfn_t get_mmio_spte_gfn(u64 spte)
>  {
> -	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
> -	return (spte & ~mask) >> PAGE_SHIFT;
> +	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask |
> +		   shadow_nonpresent_or_rsvd_mask;
> +	u64 gpa = spte & ~mask;
> +
> +	gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
> +	       & shadow_nonpresent_or_rsvd_mask;
> +
> +	return gpa >> PAGE_SHIFT;
>  }
>  
>  static unsigned get_mmio_spte_access(u64 spte)
> @@ -381,7 +402,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
>  
> -static void kvm_mmu_clear_all_pte_masks(void)
> +static void kvm_mmu_reset_all_pte_masks(void)
>  {
>  	shadow_user_mask = 0;
>  	shadow_accessed_mask = 0;
> @@ -391,6 +412,18 @@ static void kvm_mmu_clear_all_pte_masks(void)
>  	shadow_mmio_mask = 0;
>  	shadow_present_mask = 0;
>  	shadow_acc_track_mask = 0;
> +
> +	/*
> +	 * If the CPU has 46 or less physical address bits, then set an
> +	 * appropriate mask to guard against L1TF attacks. Otherwise, it is
> +	 * assumed that the CPU is not vulnerable to L1TF.
> +	 */
> +	if (boot_cpu_data.x86_phys_bits <
> +	    52 - shadow_nonpresent_or_rsvd_mask_len)
> +		shadow_nonpresent_or_rsvd_mask =
> +			rsvd_bits(boot_cpu_data.x86_phys_bits -
> +				  shadow_nonpresent_or_rsvd_mask_len,
> +				  boot_cpu_data.x86_phys_bits - 1);
>  }
>  
>  static int is_cpuid_PSE36(void)
> @@ -5500,7 +5533,7 @@ int kvm_mmu_module_init(void)
>  {
>  	int ret = -ENOMEM;
>  
> -	kvm_mmu_clear_all_pte_masks();
> +	kvm_mmu_reset_all_pte_masks();
>  
>  	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
>  					    sizeof(struct pte_list_desc),
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 24c84aa87049..6e5626822743 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6506,8 +6506,12 @@ static void kvm_set_mmio_spte_mask(void)
>  	 * Set the reserved bits and the present bit of an paging-structure
>  	 * entry to generate page fault with PFER.RSV = 1.
>  	 */
> -	 /* Mask the reserved physical address bits. */
> -	mask = rsvd_bits(maxphyaddr, 51);
> +
> +	/*
> +	 * Mask the uppermost physical address bit, which would be reserved as
> +	 * long as the supported physical address width is less than 52.
> +	 */
> +	mask = 1ull << 51;
>  
>  	/* Set the present bit. */
>  	mask |= 1ull;
Junaid Shahid Sept. 25, 2018, 8:11 a.m. UTC | #3
On 09/24/2018 12:57 PM, Sakari Ailus wrote:
> 
> I'm not subscribed to the kvm list, nor did I find the original e-mail with
> the headers, so I constructed the reply from the bits I did find. If
> something goes wrong, it's probably because of that.
>
> This patch in the host kernel breaks KVM guests on Core 2 Duo (T9500). KVM
> guests do not boot on a plain v4.18.9 host (which contains the patch);
> reverting this patch fixes it.

Hi Sakari,

Could you please let me know what OS is running in the guest? Also, is there some kernel panic etc. in the guest OS when it doesn't boot, or does it fail in some other way?

Thanks,
Junaid
Sakari Ailus Sept. 25, 2018, 8:18 a.m. UTC | #4
Hi Junaid,

On Tue, Sep 25, 2018 at 01:11:06AM -0700, Junaid Shahid wrote:
> On 09/24/2018 12:57 PM, Sakari Ailus wrote:
> > 
> > I'm not subscribed to the kvm list, nor did I find the original e-mail with
> > the headers, so I constructed the reply from the bits I did find. If
> > something goes wrong, it's probably because of that.
> >
> > This patch in the host kernel breaks KVM guests on Core 2 Duo (T9500). KVM
> > guests do not boot on a plain v4.18.9 host (which contains the patch);
> > reverting this patch fixes it.
> 
> Hi Sakari,
> 
> Could you please let me know what OS is running in the guest? Also, is
> there some kernel panic etc. in the guest OS when it doesn't boot, or
> does it fail in some other way?

The guest is Debian running v4.18.9 kernel.

Everything simply stops once I'd expect the kernel to start executing the
initrd scripts. Before figuring out the host kernel was at fault, I was
first wondering if there was something wrong with the guest itself and
disabled a few things such as APIC or HPET, and different oopses followed
from that.

If there's no immediate, obvious fix, I'd strongly suggest reverting the
patch.
Sean Christopherson Sept. 25, 2018, 1:38 p.m. UTC | #5
On Tue, 2018-09-25 at 11:18 +0300, Sakari Ailus wrote:
> Hi Junaid,
> 
> On Tue, Sep 25, 2018 at 01:11:06AM -0700, Junaid Shahid wrote:
> > 
> > On 09/24/2018 12:57 PM, Sakari Ailus wrote:
> > > 
> > > 
> > > I'm not subscribed to the kvm list, nor did I find the original e-mail with
> > > the headers, so I constructed the reply from the bits I did find. If
> > > something goes wrong, it's probably because of that.
> > >
> > > This patch in the host kernel breaks KVM guests on Core 2 Duo (T9500). KVM
> > > guests do not boot on a plain v4.18.9 host (which contains the patch);
> > > reverting this patch fixes it.
> > Hi Sakari,
> > 
> > Could you please let me know what OS is running in the guest? Also, is
> > there some kernel panic etc. in the guest OS when it doesn't boot, or
> > does it fail in some other way?
> The guest is Debian running v4.18.9 kernel.
> 
> Everything simply stops once I'd expect the kernel to start executing the
> initrd scripts. Before figuring out the host kernel was at fault, I was
> first wondering if there was something wrong with the guest itself and
> disabled a few things such as APIC or HPET, and different oopses followed
> from that.
> 
> If there's no immediate, obvious fix, I'd strongly suggest reverting the
> patch.

I was able to reproduce this by disabling EPT on modern hardware.
I have a fix and will send a patch shortly.
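
For reference, here is a userspace sketch of the encode/decode pair from this patch, run with the 36-bit MAXPHYADDR reported in the cpuinfo above and the local APIC base as an example MMIO GPA. Under those assumptions it shows one way the relocated GPA bits could corrupt the recovered GFN: the GPA bit that collides with the mask is copied five positions higher on encode, and that copy is not stripped again on decode. This is only an illustration of a plausible failure mode; Sean's actual fix is not reproduced here.

#include <stdint.h>
#include <stdio.h>

/* Same semantics as the kernel's rsvd_bits(): a mask of bits s..e, inclusive. */
static uint64_t rsvd_bits(int s, int e)
{
        return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
        const int len = 5;                        /* shadow_nonpresent_or_rsvd_mask_len */
        const int phys_bits = 36;                 /* "address sizes: 36 bits physical" above */
        uint64_t rsvd = rsvd_bits(phys_bits - len, phys_bits - 1);  /* bits 31..35 */

        uint64_t gpa = 0xfee00000ULL;             /* local APIC base; bit 31 collides */

        /* Encode as in mark_mmio_spte() (mmio value, access and generation bits omitted). */
        uint64_t spte = gpa | rsvd;
        spte |= (gpa & rsvd) << len;              /* colliding bit 31 copied to bit 36 */

        /* Decode as in get_mmio_spte_gfn(): clear the mask, shift the copy back down. */
        uint64_t back = spte & ~rsvd;             /* the copy at bit 36 survives this step */
        back |= (spte >> len) & rsvd;

        printf("gpa       = %#llx\n", (unsigned long long)gpa);
        printf("recovered = %#llx%s\n", (unsigned long long)back,
               back == gpa ? "" : "   <-- stray high bit");
        return 0;
}

With these inputs the recovered address comes back as 0x10fee00000 rather than 0xfee00000, which would point the MMIO emulation at the wrong GFN. On a 46-bit CPU the mask sits at bits 41..45, where ordinary guest physical addresses never collide with it, so the round trip stays clean there.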

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6b8f11521c41..4cfc5c8b6e61 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -221,6 +221,17 @@  static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
 						    PT64_EPT_EXECUTABLE_MASK;
 static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
 
+/*
+ * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
+ * to guard against L1TF attacks.
+ */
+static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
+
+/*
+ * The number of high-order 1 bits to use in the mask above.
+ */
+static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
+
 static void mmu_spte_set(u64 *sptep, u64 spte);
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
@@ -308,9 +319,13 @@  static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 {
 	unsigned int gen = kvm_current_mmio_generation(vcpu);
 	u64 mask = generation_mmio_spte_mask(gen);
+	u64 gpa = gfn << PAGE_SHIFT;
 
 	access &= ACC_WRITE_MASK | ACC_USER_MASK;
-	mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT;
+	mask |= shadow_mmio_value | shadow_nonpresent_or_rsvd_mask;
+	mask |= access | gpa;
+	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+		<< shadow_nonpresent_or_rsvd_mask_len;
 
 	trace_mark_mmio_spte(sptep, gfn, access, gen);
 	mmu_spte_set(sptep, mask);
@@ -323,8 +338,14 @@  static bool is_mmio_spte(u64 spte)
 
 static gfn_t get_mmio_spte_gfn(u64 spte)
 {
-	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
-	return (spte & ~mask) >> PAGE_SHIFT;
+	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask |
+		   shadow_nonpresent_or_rsvd_mask;
+	u64 gpa = spte & ~mask;
+
+	gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
+	       & shadow_nonpresent_or_rsvd_mask;
+
+	return gpa >> PAGE_SHIFT;
 }
 
 static unsigned get_mmio_spte_access(u64 spte)
@@ -381,7 +402,7 @@  void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
-static void kvm_mmu_clear_all_pte_masks(void)
+static void kvm_mmu_reset_all_pte_masks(void)
 {
 	shadow_user_mask = 0;
 	shadow_accessed_mask = 0;
@@ -391,6 +412,18 @@  static void kvm_mmu_clear_all_pte_masks(void)
 	shadow_mmio_mask = 0;
 	shadow_present_mask = 0;
 	shadow_acc_track_mask = 0;
+
+	/* 
+	 * If the CPU has 46 or less physical address bits, then set an
+	 * appropriate mask to guard against L1TF attacks. Otherwise, it is
+	 * assumed that the CPU is not vulnerable to L1TF.
+	 */
+	if (boot_cpu_data.x86_phys_bits <
+	    52 - shadow_nonpresent_or_rsvd_mask_len) 
+		shadow_nonpresent_or_rsvd_mask = 
+			rsvd_bits(boot_cpu_data.x86_phys_bits -
+				  shadow_nonpresent_or_rsvd_mask_len,
+				  boot_cpu_data.x86_phys_bits - 1);
 }
 
 static int is_cpuid_PSE36(void)
@@ -5499,7 +5532,7 @@  int kvm_mmu_module_init(void)
 {
 	int ret = -ENOMEM;
 
-	kvm_mmu_clear_all_pte_masks();
+	kvm_mmu_reset_all_pte_masks();
 
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 					    sizeof(struct pte_list_desc),
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b812b3c5088..6cd9d62a2dd3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6473,8 +6473,12 @@  static void kvm_set_mmio_spte_mask(void)
 	 * Set the reserved bits and the present bit of an paging-structure
 	 * entry to generate page fault with PFER.RSV = 1.
 	 */
-	 /* Mask the reserved physical address bits. */
-	mask = rsvd_bits(maxphyaddr, 51);
+
+	/*
+	 * Mask the uppermost physical address bit, which would be reserved as
+	 * long as the supported physical address width is less than 52.
+	 */
+	mask = 1ull << 51;
 
 	/* Set the present bit. */
 	mask |= 1ull;
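
Finally, a small standalone check of the x86.c hunk above (rsvd_bits() is reimplemented locally for illustration): bit 51 lies inside the old rsvd_bits(maxphyaddr, 51) range for every supported physical address width below 52, so the new MAXPHYADDR-independent mask still marks a reserved physical address bit.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same semantics as the kernel's rsvd_bits(): a mask of bits s..e, inclusive. */
static uint64_t rsvd_bits(int s, int e)
{
        return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
        const uint64_t new_mask = (1ULL << 51) | 1ULL;    /* bit 51 + present bit */

        for (int maxphyaddr = 36; maxphyaddr <= 51; maxphyaddr++) {
                /* The pre-patch mask: all reserved physical bits + present bit. */
                uint64_t old_mask = rsvd_bits(maxphyaddr, 51) | 1ULL;

                /* Bit 51 is always part of the old reserved range, so the new
                 * mask is a subset of the old one. */
                assert((new_mask & old_mask) == new_mask);
        }
        printf("bit 51 is reserved for every MAXPHYADDR from 36 to 51\n");
        return 0;
}

At a width of 52 bits, bit 51 would become a valid address bit, which is why the comment in the hunk restricts the claim to widths below 52.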