diff mbox series

[v4,4/4] x86/kvm: use __decrypted attribute in shared variables

Message ID 1536024582-25700-5-git-send-email-brijesh.singh@amd.com (mailing list archive)
State New, archived
Headers show
Series x86: Fix SEV guest regression | expand

Commit Message

Brijesh Singh Sept. 4, 2018, 1:29 a.m. UTC
Commit: 368a540e0232 (x86/kvmclock: Remove memblock dependency)
caused SEV guest regression. When SEV is active, we map the shared
variables (wall_clock and hv_clock_boot) with C=0 to ensure that both
the guest and the hypervisor are able to access the data. To map the
variables we use kernel_physical_mapping_init() to split the large pages,
but splitting large pages requires allocating a new PMD, which fails now
that kvmclock initialization is called early during boot.

Recently we added a special .data..decrypted section to hold the shared
variables. This section is mapped with C=0 early during boot. Use
__decrypted attribute to put the wall_clock and hv_clock_boot in
.data..decrypted section so that they are mapped with C=0.

Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: kvm@vger.kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-kernel@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: kvm@vger.kernel.org
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
---
 arch/x86/kernel/kvmclock.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

Comments

Sean Christopherson Sept. 4, 2018, 10:32 p.m. UTC | #1
On Mon, Sep 03, 2018 at 08:29:42PM -0500, Brijesh Singh wrote:
> Commit: 368a540e0232 (x86/kvmclock: Remove memblock dependency)
> caused SEV guest regression. When SEV is active, we map the shared
> variables (wall_clock and hv_clock_boot) with C=0 to ensure that both
> the guest and the hypervisor are able to access the data. To map the
> variables we use kernel_physical_mapping_init() to split the large pages,
> but splitting large pages requires allocating a new PMD, which fails now
> that kvmclock initialization is called early during boot.
> 
> Recently we added a special .data..decrypted section to hold the shared
> variables. This section is mapped with C=0 early during boot. Use
> __decrypted attribute to put the wall_clock and hv_clock_boot in
> .data..decrypted section so that they are mapped with C=0.
> 
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
> Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: kvm@vger.kernel.org
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: linux-kernel@vger.kernel.org
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Sean Christopherson <sean.j.christopherson@intel.com>
> Cc: kvm@vger.kernel.org
> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> ---
>  arch/x86/kernel/kvmclock.c | 30 +++++++++++++++++++++++++-----
>  1 file changed, 25 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 1e67646..08f5f8a 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -28,6 +28,7 @@
>  #include <linux/sched/clock.h>
>  #include <linux/mm.h>
>  #include <linux/slab.h>
> +#include <linux/set_memory.h>
>  
>  #include <asm/hypervisor.h>
>  #include <asm/mem_encrypt.h>
> @@ -61,8 +62,8 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
>  	(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
>  
>  static struct pvclock_vsyscall_time_info
> -			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
> -static struct pvclock_wall_clock wall_clock;
> +			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __decrypted __aligned(PAGE_SIZE);
> +static struct pvclock_wall_clock wall_clock __decrypted;
>  static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
>  
>  static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
> @@ -267,10 +268,29 @@ static int kvmclock_setup_percpu(unsigned int cpu)
>  		return 0;
>  
>  	/* Use the static page for the first CPUs, allocate otherwise */
> -	if (cpu < HVC_BOOT_ARRAY_SIZE)
> +	if (cpu < HVC_BOOT_ARRAY_SIZE) {
>  		p = &hv_clock_boot[cpu];
> -	else
> -		p = kzalloc(sizeof(*p), GFP_KERNEL);
> +	} else {
> +		int rc;
> +		unsigned int sz = sizeof(*p);
> +
> +		if (sev_active())
> +			sz = PAGE_ALIGN(sz);

Hmm, again we're wasting a fairly sizable amount of memory since each
CPU is doing a separate 4k allocation.  What if we defined an auxiliary
array in __decrypted to be used for cpus > HVC_BOOT_ARRAY_SIZE when
SEV is active?  struct pvclock_vsyscall_time_info is 32 bytes so we
could handle the max of 8192 CPUs with 256kb of data (252kb if you
subtract the pre-existing 4k page), i.e. the SEV case wouldn't need
additional memory beyond the 2mb page that's reserved for __decrypted.
The non-SEV case could do free_kernel_image_pages() on the unused
array (which would need to be page sized) so it wouldn't waste memory.

> +
> +		p = kzalloc(sz, GFP_KERNEL);
> +
> +		/*
> +		 * The physical address of per-cpu variable will be shared with
> +		 * the hypervisor. Let's clear the C-bit before we assign the
> +		 * memory to per_cpu variable.
> +		 */
> +		if (p && sev_active()) {
> +			rc = set_memory_decrypted((unsigned long)p, sz >> PAGE_SHIFT);
> +			if (rc)

@p is being leaked if set_memory_decrypted() fails.

> +				return rc;
> +			memset(p, 0, sz);
> +		}
> +	}
>  
>  	per_cpu(hv_clock_per_cpu, cpu) = p;
>  	return p ? 0 : -ENOMEM;
> -- 
> 2.7.4
>
Brijesh Singh Sept. 5, 2018, 6:09 p.m. UTC | #2
Hi Sean,


On 09/04/2018 05:32 PM, Sean Christopherson wrote:

...

>>   
>>   	/* Use the static page for the first CPUs, allocate otherwise */
>> -	if (cpu < HVC_BOOT_ARRAY_SIZE)
>> +	if (cpu < HVC_BOOT_ARRAY_SIZE) {
>>   		p = &hv_clock_boot[cpu];
>> -	else
>> -		p = kzalloc(sizeof(*p), GFP_KERNEL);
>> +	} else {
>> +		int rc;
>> +		unsigned int sz = sizeof(*p);
>> +
>> +		if (sev_active())
>> +			sz = PAGE_ALIGN(sz);
> 
> Hmm, again we're wasting a fairly sizable amount of memory since each
> CPU is doing a separate 4k allocation.  What if we defined an auxiliary
> array in __decrypted to be used for cpus > HVC_BOOT_ARRAY_SIZE when
> SEV is active?  struct pvclock_vsyscall_time_info is 32 bytes so we
> could handle the max of 8192 CPUs with 256kb of data (252kb if you
> subtract the pre-existing 4k page), i.e. the SEV case wouldn't need
> additional memory beyond the 2mb page that's reserved for __decrypted.
> The non-SEV case could do free_kernel_image_pages() on the unused
> array (which would need to be page sized) so it wouldn't waste memory.


Sure, we could use unused memory range from .data..decrypted section
for cpu > HVC_BOOT_ARRAY_SIZE when SEV is active. I will rework the
patch.

-Brijesh
diff mbox series

Patch

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1e67646..08f5f8a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -28,6 +28,7 @@ 
 #include <linux/sched/clock.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/set_memory.h>
 
 #include <asm/hypervisor.h>
 #include <asm/mem_encrypt.h>
@@ -61,8 +62,8 @@  early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
 	(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
 
 static struct pvclock_vsyscall_time_info
-			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
-static struct pvclock_wall_clock wall_clock;
+			hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __decrypted __aligned(PAGE_SIZE);
+static struct pvclock_wall_clock wall_clock __decrypted;
 static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
 
 static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
@@ -267,10 +268,29 @@  static int kvmclock_setup_percpu(unsigned int cpu)
 		return 0;
 
 	/* Use the static page for the first CPUs, allocate otherwise */
-	if (cpu < HVC_BOOT_ARRAY_SIZE)
+	if (cpu < HVC_BOOT_ARRAY_SIZE) {
 		p = &hv_clock_boot[cpu];
-	else
-		p = kzalloc(sizeof(*p), GFP_KERNEL);
+	} else {
+		int rc;
+		unsigned int sz = sizeof(*p);
+
+		if (sev_active())
+			sz = PAGE_ALIGN(sz);
+
+		p = kzalloc(sz, GFP_KERNEL);
+
+		/*
+		 * The physical address of per-cpu variable will be shared with
+		 * the hypervisor. Let's clear the C-bit before we assign the
+		 * memory to per_cpu variable.
+		 */
+		if (p && sev_active()) {
+			rc = set_memory_decrypted((unsigned long)p, sz >> PAGE_SHIFT);
+			if (rc)
+				return rc;
+			memset(p, 0, sz);
+		}
+	}
 
 	per_cpu(hv_clock_per_cpu, cpu) = p;
 	return p ? 0 : -ENOMEM;