diff mbox series

[v3,4/4] x86/kvm: use __decrypted attribute in shared variables

Message ID 1535567040-1370-5-git-send-email-brijesh.singh@amd.com (mailing list archive)
State New, archived
Headers show
Series x86: Fix SEV guest regression | expand

Commit Message

Brijesh Singh Aug. 29, 2018, 6:24 p.m. UTC
The following commit:

  368a540e0232 (x86/kvmclock: Remove memblock dependency)

caused SEV guest regression. When SEV is active, we map the shared
variables (wall_clock and hv_clock_boot) with C=0 to ensure that both
the guest and the hypervisor is able to access the data. To map the
variables we use kernel_physical_mapping_init() to split the large pages,
but this routine fails to allocate a new page. Before the above commit,
kvmclock initialization was called after memory allocator was available
but now its called early during boot.

Recently we added a special .data..decrypted section to hold the shared
variables. This section is mapped with C=0 early during boot. Use
__decrypted attribute to put the wall_clock and hv_clock_boot in
.data..decrypted section so that they are mapped with C=0.

Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: kvm@vger.kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-kernel@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: kvm@vger.kernel.org
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
---
 arch/x86/kernel/kvmclock.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

Comments

Sean Christopherson Aug. 29, 2018, 7:56 p.m. UTC | #1
On Wed, Aug 29, 2018 at 01:24:00PM -0500, Brijesh Singh wrote:
> The following commit:
> 
>   368a540e0232 (x86/kvmclock: Remove memblock dependency)

Checkpatch prefers:

    Commit 368a540e0232 ("x86/kvmclock: Remove memblock dependency")

That'll also save three lines in the commit message.
 
> caused SEV guest regression. When SEV is active, we map the shared
> variables (wall_clock and hv_clock_boot) with C=0 to ensure that both
> the guest and the hypervisor is able to access the data. To map the

Nit: s/is/are

> variables we use kernel_physical_mapping_init() to split the large pages,
> but this routine fails to allocate a new page. Before the above commit,
> kvmclock initialization was called after memory allocator was available
> but now its called early during boot.

What about something like this to make the issue a bit clearer:

  variables we use kernel_physical_mapping_init() to split the large pages,
  but splitting large pages requires allocating a new PMD, which fails now
  that kvmclock initialization is called early during boot.

> Recently we added a special .data..decrypted section to hold the shared
> variables. This section is mapped with C=0 early during boot. Use
> __decrypted attribute to put the wall_clock and hv_clock_boot in
> .data..decrypted section so that they are mapped with C=0.
> 
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: kvm@vger.kernel.org
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: linux-kernel@vger.kernel.org
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Sean Christopherson <sean.j.christopherson@intel.com>
> Cc: kvm@vger.kernel.org
> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> ---
>  arch/x86/kernel/kvmclock.c | 30 +++++++++++++++++++++++++-----
>  1 file changed, 25 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 1e67646..08f5f8a 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -28,6 +28,7 @@
>  #include <linux/sched/clock.h>
>  #include <linux/mm.h>
>  #include <linux/slab.h>
> +#include <linux/set_memory.h>
>  
>  #include <asm/hypervisor.h>
>  #include <asm/mem_encrypt.h>
> @@ -61,8 +62,8 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
>  	(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
>  
>  static struct pvclock_vsyscall_time_info
> -			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
> -static struct pvclock_wall_clock wall_clock;
> +			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __decrypted __aligned(PAGE_SIZE);
> +static struct pvclock_wall_clock wall_clock __decrypted;
>  static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
>  
>  static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
> @@ -267,10 +268,29 @@ static int kvmclock_setup_percpu(unsigned int cpu)
>  		return 0;
>  
>  	/* Use the static page for the first CPUs, allocate otherwise */
> -	if (cpu < HVC_BOOT_ARRAY_SIZE)
> +	if (cpu < HVC_BOOT_ARRAY_SIZE) {
>  		p = &hv_clock_boot[cpu];
> -	else
> -		p = kzalloc(sizeof(*p), GFP_KERNEL);
> +	} else {
> +		int rc;
> +		unsigned int sz = sizeof(*p);
> +
> +		if (sev_active())
> +			sz = PAGE_ALIGN(sz);

This is a definite downside to the section approach.  Unless I missed
something, the section padding goes to waste since we don't have a
mechanism in place to allocate into that section, e.g. as is we're
burning nearly 2mb of data since we're only using 4k of the 2mb page.
And every decrypted allocation can potentially fracture a large page
since the allocator is unaware of the decrypted requirement.  Might
not be an issue for kvmclock since it's a one-time allocation, but
we could suffer death by a thousand cuts if there are scenarios where
a decrypted allocation isn't persistent (VirtIO queues maybe?).

Duplicating the full kernel tables for C=0 accesses doesn't suffer
from these issues.  And I think potential corruption issues due to
mis-{aligned,size} objects can be detected through static analysis,
build assertions and/or runtime checks.

> +		p = kzalloc(sz, GFP_KERNEL);

For the SEV case, can't we do a straight kmalloc() since we zero
out the page after decrypting it?

> +
> +		/*
> +		 * The physical address of per-cpu variable will be shared with
> +		 * the hypervisor. Let's clear the C-bit before we assign the
> +		 * memory to per_cpu variable.
> +		 */
> +		if (p && sev_active()) {
> +			rc = set_memory_decrypted((unsigned long)p, sz >> PAGE_SHIFT);
> +			if (rc)
> +				return rc;
> +			memset(p, 0, sz);
> +		}
> +	}
>  
>  	per_cpu(hv_clock_per_cpu, cpu) = p;
>  	return p ? 0 : -ENOMEM;
> -- 
> 2.7.4
>
Brijesh Singh Aug. 30, 2018, 2:10 p.m. UTC | #2
On 08/29/2018 02:56 PM, Sean Christopherson wrote:
> On Wed, Aug 29, 2018 at 01:24:00PM -0500, Brijesh Singh wrote:
>> The following commit:
>>
>>    368a540e0232 (x86/kvmclock: Remove memblock dependency)
> 
> Checkpatch prefers:
> 
>      Commit 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
> 
> That'll also save three lines in the commit message.

Noted.


>   
>> caused SEV guest regression. When SEV is active, we map the shared
>> variables (wall_clock and hv_clock_boot) with C=0 to ensure that both
>> the guest and the hypervisor is able to access the data. To map the
> 
> Nit: s/is/are

Noted.

> 
>> variables we use kernel_physical_mapping_init() to split the large pages,
>> but this routine fails to allocate a new page. Before the above commit,
>> kvmclock initialization was called after memory allocator was available
>> but now its called early during boot.
> 
> What about something like this to make the issue a bit clearer:
> 
>    variables we use kernel_physical_mapping_init() to split the large pages,
>    but splitting large pages requires allocating a new PMD, which fails now
>    that kvmclock initialization is called early during boot.
> 

Much better.


>> Recently we added a special .data..decrypted section to hold the shared
>> variables. This section is mapped with C=0 early during boot. Use
>> __decrypted attribute to put the wall_clock and hv_clock_boot in
>> .data..decrypted section so that they are mapped with C=0.
>>
>> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
>> Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
>> Cc: Tom Lendacky <thomas.lendacky@amd.com>
>> Cc: kvm@vger.kernel.org
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Borislav Petkov <bp@suse.de>
>> Cc: "H. Peter Anvin" <hpa@zytor.com>
>> Cc: linux-kernel@vger.kernel.org
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> Cc: Sean Christopherson <sean.j.christopherson@intel.com>
>> Cc: kvm@vger.kernel.org
>> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
>> ---
>>   arch/x86/kernel/kvmclock.c | 30 +++++++++++++++++++++++++-----
>>   1 file changed, 25 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
>> index 1e67646..08f5f8a 100644
>> --- a/arch/x86/kernel/kvmclock.c
>> +++ b/arch/x86/kernel/kvmclock.c
>> @@ -28,6 +28,7 @@
>>   #include <linux/sched/clock.h>
>>   #include <linux/mm.h>
>>   #include <linux/slab.h>
>> +#include <linux/set_memory.h>
>>   
>>   #include <asm/hypervisor.h>
>>   #include <asm/mem_encrypt.h>
>> @@ -61,8 +62,8 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
>>   	(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
>>   
>>   static struct pvclock_vsyscall_time_info
>> -			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
>> -static struct pvclock_wall_clock wall_clock;
>> +			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __decrypted __aligned(PAGE_SIZE);
>> +static struct pvclock_wall_clock wall_clock __decrypted;
>>   static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
>>   
>>   static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
>> @@ -267,10 +268,29 @@ static int kvmclock_setup_percpu(unsigned int cpu)
>>   		return 0;
>>   
>>   	/* Use the static page for the first CPUs, allocate otherwise */
>> -	if (cpu < HVC_BOOT_ARRAY_SIZE)
>> +	if (cpu < HVC_BOOT_ARRAY_SIZE) {
>>   		p = &hv_clock_boot[cpu];
>> -	else
>> -		p = kzalloc(sizeof(*p), GFP_KERNEL);
>> +	} else {
>> +		int rc;
>> +		unsigned int sz = sizeof(*p);
>> +
>> +		if (sev_active())
>> +			sz = PAGE_ALIGN(sz);
> 
> This is a definite downside to the section approach.  Unless I missed
> something, the section padding goes to waste since we don't have a
> mechanism in place to allocate into that section, e.g. as is we're
> burning nearly 2mb of data since we're only using 4k of the 2mb page.
> And every decrypted allocation can potentially fracture a large page
> since the allocator is unaware of the decrypted requirement.  Might
> not be an issue for kvmclock since it's a one-time allocation, but
> we could suffer death by a thousand cuts if there are scenarios where
> a decrypted allocation isn't be persistent (VirtIO queues maybe?).
> 


The .data..decrypted section is used for storing static variables (which
need to be accessed unencrypted before dynamic allocators are ready).
If a caller does a dynamic allocation and wants to use the buffer as
decrypted, then she is responsible for setting/clearing the C-bit using
set_memory_{decrypted,encrypted}. Currently, the decrypted section
holds the wall_clock and hv_clock_boot variables, but its users will grow
when we add SEV-ES support. In the SEV-ES case, the GHCB (guest-host
communication block) needs to be accessed unencrypted very early during
boot.

SEV uses SWIOTLB for DMA buffers; the buffer pool is allocated and mapped
as decrypted during boot. For the SEV case, Virtio uses the DMA
APIs to allocate the VRING, so the caller does not need to map the buffers
as decrypted. Actually, there are very few
set_memory_{decrypted,encrypted} calls overall (during or after the
kernel boot).


> Duplicating the full kernel tables for C=0 accesses doesn't suffer
> from these issues.  And I think potential corruption issues due to
> mis-{aligned,size} objects can be detected through static analysis,
> build assertions and/or runtime checks.
> 
>> +		p = kzalloc(sz, GFP_KERNEL);
> 
> For the SEV case, can't we do a straight kmalloc() since we zero
> out the page after decrypting it?
> 


Sure, we can do kmalloc(), but IMO, since this is not a hot code path, doing
kmalloc() for SEV and kzalloc() for non-SEV does not buy much.



>> +
>> +		/*
>> +		 * The physical address of per-cpu variable will be shared with
>> +		 * the hypervisor. Let's clear the C-bit before we assign the
>> +		 * memory to per_cpu variable.
>> +		 */
>> +		if (p && sev_active()) {
>> +			rc = set_memory_decrypted((unsigned long)p, sz >> PAGE_SHIFT);
>> +			if (rc)
>> +				return rc;
>> +			memset(p, 0, sz);
>> +		}
>> +	}
>>   
>>   	per_cpu(hv_clock_per_cpu, cpu) = p;
>>   	return p ? 0 : -ENOMEM;
>> -- 
>> 2.7.4
>>
diff mbox series

Patch

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1e67646..08f5f8a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -28,6 +28,7 @@ 
 #include <linux/sched/clock.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/set_memory.h>
 
 #include <asm/hypervisor.h>
 #include <asm/mem_encrypt.h>
@@ -61,8 +62,8 @@  early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
 	(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
 
 static struct pvclock_vsyscall_time_info
-			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
-static struct pvclock_wall_clock wall_clock;
+			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __decrypted __aligned(PAGE_SIZE);
+static struct pvclock_wall_clock wall_clock __decrypted;
 static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
 
 static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
@@ -267,10 +268,29 @@  static int kvmclock_setup_percpu(unsigned int cpu)
 		return 0;
 
 	/* Use the static page for the first CPUs, allocate otherwise */
-	if (cpu < HVC_BOOT_ARRAY_SIZE)
+	if (cpu < HVC_BOOT_ARRAY_SIZE) {
 		p = &hv_clock_boot[cpu];
-	else
-		p = kzalloc(sizeof(*p), GFP_KERNEL);
+	} else {
+		int rc;
+		unsigned int sz = sizeof(*p);
+
+		if (sev_active())
+			sz = PAGE_ALIGN(sz);
+
+		p = kzalloc(sz, GFP_KERNEL);
+
+		/*
+		 * The physical address of per-cpu variable will be shared with
+		 * the hypervisor. Let's clear the C-bit before we assign the
+		 * memory to per_cpu variable.
+		 */
+		if (p && sev_active()) {
+			rc = set_memory_decrypted((unsigned long)p, sz >> PAGE_SHIFT);
+			if (rc)
+				return rc;
+			memset(p, 0, sz);
+		}
+	}
 
 	per_cpu(hv_clock_per_cpu, cpu) = p;
 	return p ? 0 : -ENOMEM;