diff mbox series

[40/70] x86/sev-es: Setup per-cpu GHCBs for the runtime handler

Message ID 20200319091407.1481-41-joro@8bytes.org (mailing list archive)
State New, archived
Headers show
Series x86: SEV-ES Guest Support | expand

Commit Message

Joerg Roedel March 19, 2020, 9:13 a.m. UTC
From: Tom Lendacky <thomas.lendacky@amd.com>

The runtime handler needs a GHCB per CPU. Set them up and map them
unencrypted.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/x86/include/asm/mem_encrypt.h |  2 ++
 arch/x86/kernel/sev-es.c           | 28 +++++++++++++++++++++++++++-
 arch/x86/kernel/traps.c            |  3 +++
 3 files changed, 32 insertions(+), 1 deletion(-)

Comments

Mike Stunes April 14, 2020, 7:03 p.m. UTC | #1
On Mar 19, 2020, at 2:13 AM, Joerg Roedel <joro@8bytes.org> wrote:
> 
> From: Tom Lendacky <thomas.lendacky@amd.com>
> 
> The runtime handler needs a GHCB per CPU. Set them up and map them
> unencrypted.
> 
> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> Signed-off-by: Joerg Roedel <jroedel@suse.de>
> ---
> arch/x86/include/asm/mem_encrypt.h |  2 ++
> arch/x86/kernel/sev-es.c           | 28 +++++++++++++++++++++++++++-
> arch/x86/kernel/traps.c            |  3 +++
> 3 files changed, 32 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
> index c17980e8db78..4bf5286310a0 100644
> --- a/arch/x86/kernel/sev-es.c
> +++ b/arch/x86/kernel/sev-es.c
> @@ -197,6 +203,26 @@ static bool __init sev_es_setup_ghcb(void)
> 	return true;
> }
> 
> +void sev_es_init_ghcbs(void)
> +{
> +	int cpu;
> +
> +	if (!sev_es_active())
> +		return;
> +
> +	/* Allocate GHCB pages */
> +	ghcb_page = __alloc_percpu(sizeof(struct ghcb), PAGE_SIZE);
> +
> +	/* Initialize per-cpu GHCB pages */
> +	for_each_possible_cpu(cpu) {
> +		struct ghcb *ghcb = (struct ghcb *)per_cpu_ptr(ghcb_page, cpu);
> +
> +		set_memory_decrypted((unsigned long)ghcb,
> +				     sizeof(*ghcb) >> PAGE_SHIFT);
> +		memset(ghcb, 0, sizeof(*ghcb));
> +	}
> +}
> +

set_memory_decrypted needs to check the return value. I see it
consistently return ENOMEM. I've traced that back to split_large_page
in arch/x86/mm/pat/set_memory.c.
Tom Lendacky April 14, 2020, 8:04 p.m. UTC | #2
On 4/14/20 2:03 PM, Mike Stunes wrote:
> On Mar 19, 2020, at 2:13 AM, Joerg Roedel <joro@8bytes.org> wrote:
>>
>> From: Tom Lendacky <thomas.lendacky@amd.com>
>>
>> The runtime handler needs a GHCB per CPU. Set them up and map them
>> unencrypted.
>>
>> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
>> Signed-off-by: Joerg Roedel <jroedel@suse.de>
>> ---
>> arch/x86/include/asm/mem_encrypt.h |  2 ++
>> arch/x86/kernel/sev-es.c           | 28 +++++++++++++++++++++++++++-
>> arch/x86/kernel/traps.c            |  3 +++
>> 3 files changed, 32 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
>> index c17980e8db78..4bf5286310a0 100644
>> --- a/arch/x86/kernel/sev-es.c
>> +++ b/arch/x86/kernel/sev-es.c
>> @@ -197,6 +203,26 @@ static bool __init sev_es_setup_ghcb(void)
>> 	return true;
>> }
>>
>> +void sev_es_init_ghcbs(void)
>> +{
>> +	int cpu;
>> +
>> +	if (!sev_es_active())
>> +		return;
>> +
>> +	/* Allocate GHCB pages */
>> +	ghcb_page = __alloc_percpu(sizeof(struct ghcb), PAGE_SIZE);
>> +
>> +	/* Initialize per-cpu GHCB pages */
>> +	for_each_possible_cpu(cpu) {
>> +		struct ghcb *ghcb = (struct ghcb *)per_cpu_ptr(ghcb_page, cpu);
>> +
>> +		set_memory_decrypted((unsigned long)ghcb,
>> +				     sizeof(*ghcb) >> PAGE_SHIFT);
>> +		memset(ghcb, 0, sizeof(*ghcb));
>> +	}
>> +}
>> +
> 
> set_memory_decrypted needs to check the return value. I see it
> consistently return ENOMEM. I've traced that back to split_large_page
> in arch/x86/mm/pat/set_memory.c.

At that point the guest won't be able to communicate with the hypervisor, 
too. Maybe we should BUG() here to terminate further processing?

Thanks,
Tom

>
Dave Hansen April 14, 2020, 8:12 p.m. UTC | #3
On 4/14/20 1:04 PM, Tom Lendacky wrote:
>> set_memory_decrypted needs to check the return value. I see it
>> consistently return ENOMEM. I've traced that back to split_large_page
>> in arch/x86/mm/pat/set_memory.c.
> 
> At that point the guest won't be able to communicate with the
> hypervisor, too. Maybe we should BUG() here to terminate further
> processing?

Escalating an -ENOMEM into a crashed kernel seems a bit extreme.
Granted, the guest may be in an unrecoverable state, but the host
doesn't need to be too.
Tom Lendacky April 14, 2020, 8:16 p.m. UTC | #4
On 4/14/20 3:12 PM, Dave Hansen wrote:
> On 4/14/20 1:04 PM, Tom Lendacky wrote:
>>> set_memory_decrypted needs to check the return value. I see it
>>> consistently return ENOMEM. I've traced that back to split_large_page
>>> in arch/x86/mm/pat/set_memory.c.
>>
>> At that point the guest won't be able to communicate with the
>> hypervisor, too. Maybe we should BUG() here to terminate further
>> processing?
> 
> Escalating an -ENOMEM into a crashed kernel seems a bit extreme.
> Granted, the guest may be in an unrecoverable state, but the host
> doesn't need to be too.
> 

The host wouldn't be. This only happens in a guest, so it would be just 
causing the guest kernel to panic early in the boot.

Thanks,
Tom
Tom Lendacky April 14, 2020, 8:18 p.m. UTC | #5
On 4/14/20 3:16 PM, Tom Lendacky wrote:
> 
> 
> On 4/14/20 3:12 PM, Dave Hansen wrote:
>> On 4/14/20 1:04 PM, Tom Lendacky wrote:
>>>> set_memory_decrypted needs to check the return value. I see it
>>>> consistently return ENOMEM. I've traced that back to split_large_page
>>>> in arch/x86/mm/pat/set_memory.c.
>>>
>>> At that point the guest won't be able to communicate with the
>>> hypervisor, too. Maybe we should BUG() here to terminate further
>>> processing?
>>
>> Escalating an -ENOMEM into a crashed kernel seems a bit extreme.
>> Granted, the guest may be in an unrecoverable state, but the host
>> doesn't need to be too.
>>
> 
> The host wouldn't be. This only happens in a guest, so it would be just 
> causing the guest kernel to panic early in the boot.

And I should add that it would only impact an SEV-ES guest.

Thanks,
Tom

> 
> Thanks,
> Tom
>
Joerg Roedel April 15, 2020, 3:53 p.m. UTC | #6
Hi Mike,

On Tue, Apr 14, 2020 at 07:03:44PM +0000, Mike Stunes wrote:
> set_memory_decrypted needs to check the return value. I see it
> consistently return ENOMEM. I've traced that back to split_large_page
> in arch/x86/mm/pat/set_memory.c.

I agree that the return code needs to be checked. But I wonder why this
happens. The split_large_page() function returns -ENOMEM when
alloc_pages() fails. Do you boot the guest with minal RAM assigned?

Regards,

	Joerg
Joerg Roedel April 15, 2020, 3:54 p.m. UTC | #7
On Tue, Apr 14, 2020 at 03:04:42PM -0500, Tom Lendacky wrote:
> At that point the guest won't be able to communicate with the hypervisor,
> too. Maybe we should BUG() here to terminate further processing?

We could talk to the hypervisor, there is still the boot-GHCB in the
bss-decrypted section. But there is nothing that could be done here
anyway besides terminating the guest.


Regards,

	Joerg
Bo Gan April 23, 2020, 1:33 a.m. UTC | #8
On 4/15/20 8:53 AM, Joerg Roedel wrote:
> Hi Mike,
> 
> On Tue, Apr 14, 2020 at 07:03:44PM +0000, Mike Stunes wrote:
>> set_memory_decrypted needs to check the return value. I see it
>> consistently return ENOMEM. I've traced that back to split_large_page
>> in arch/x86/mm/pat/set_memory.c.
> 
> I agree that the return code needs to be checked. But I wonder why this
> happens. The split_large_page() function returns -ENOMEM when
> alloc_pages() fails. Do you boot the guest with minal RAM assigned?
> 
> Regards,
> 
> 	Joerg
> 

I just want to add some context around this. The call path that lead to 
the failure is like the following:

	__alloc_pages_slowpath
	__alloc_pages_nodemask
	alloc_pages_current
	alloc_pages
	split_large_page
	__change_page_attr
	__change_page_attr_set_clr
	__set_memory_enc_dec
	set_memory_decrypted
	sev_es_init_ghcbs
	trap_init   -> before mm_init (in init/main.c)
	start_kernel
	x86_64_start_reservations
	x86_64_start_kernel
	secondary_startup_64

At this time, mem_init hasn't been called yet (which would be called by 
mm_init). Thus, the free pages are still owned by memblock. It's in 
mem_init (x86/mm/init_64.c) that memblock_free_all gets called and free 
pages are released.

During testing, I've also noticed that debug_pagealloc=1 will make the 
issue disappear. That's because with debug_pagealloc=1, 
probe_page_size_mask in x86/mm/init.c will not allow large pages 
(2M/1G). Therefore, no split_large_page would happen. Similarly, if CPU 
doesn't have X86_FEATURE_PSE, there won't be large pages either.

Any thoughts? Maybe split_large_page should get pages from memblock at 
early boot?

Bo
Joerg Roedel April 23, 2020, 11:30 a.m. UTC | #9
On Wed, Apr 22, 2020 at 06:33:13PM -0700, Bo Gan wrote:
> On 4/15/20 8:53 AM, Joerg Roedel wrote:
> > Hi Mike,
> > 
> > On Tue, Apr 14, 2020 at 07:03:44PM +0000, Mike Stunes wrote:
> > > set_memory_decrypted needs to check the return value. I see it
> > > consistently return ENOMEM. I've traced that back to split_large_page
> > > in arch/x86/mm/pat/set_memory.c.
> > 
> > I agree that the return code needs to be checked. But I wonder why this
> > happens. The split_large_page() function returns -ENOMEM when
> > alloc_pages() fails. Do you boot the guest with minal RAM assigned?
> > 
> > Regards,
> > 
> > 	Joerg
> > 
> 
> I just want to add some context around this. The call path that lead to the
> failure is like the following:
> 
> 	__alloc_pages_slowpath
> 	__alloc_pages_nodemask
> 	alloc_pages_current
> 	alloc_pages
> 	split_large_page
> 	__change_page_attr
> 	__change_page_attr_set_clr
> 	__set_memory_enc_dec
> 	set_memory_decrypted
> 	sev_es_init_ghcbs
> 	trap_init   -> before mm_init (in init/main.c)
> 	start_kernel
> 	x86_64_start_reservations
> 	x86_64_start_kernel
> 	secondary_startup_64
> 
> At this time, mem_init hasn't been called yet (which would be called by
> mm_init). Thus, the free pages are still owned by memblock. It's in mem_init
> (x86/mm/init_64.c) that memblock_free_all gets called and free pages are
> released.
> 
> During testing, I've also noticed that debug_pagealloc=1 will make the issue
> disappear. That's because with debug_pagealloc=1, probe_page_size_mask in
> x86/mm/init.c will not allow large pages (2M/1G). Therefore, no
> split_large_page would happen. Similarly, if CPU doesn't have
> X86_FEATURE_PSE, there won't be large pages either.
> 
> Any thoughts? Maybe split_large_page should get pages from memblock at early
> boot?

Thanks for you analysis. I fixed it (verified by Mike) by using
early_set_memory_decrypted() instead of set_memory_decrypted(). I still
wonder why I didn't see that issue on my kernel. It has
DEBUG_PAGEALLOC=y set, but it is not enabled by default and I also
didn't pass the command-line parameter.

Regards,

	Joerg
diff mbox series

Patch

diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 6f61bb93366a..8b69b389688f 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -48,6 +48,7 @@  int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
 void __init mem_encrypt_init(void);
 void __init mem_encrypt_free_decrypted_mem(void);
 
+void __init sev_es_init_ghcbs(void);
 bool sme_active(void);
 bool sev_active(void);
 bool sev_es_active(void);
@@ -71,6 +72,7 @@  static inline void __init sme_early_init(void) { }
 static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
 static inline void __init sme_enable(struct boot_params *bp) { }
 
+static inline void sev_es_init_ghcbs(void) { }
 static inline bool sme_active(void) { return false; }
 static inline bool sev_active(void) { return false; }
 static inline bool sev_es_active(void) { return false; }
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index c17980e8db78..4bf5286310a0 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -8,8 +8,11 @@ 
  */
 
 #include <linux/sched/debug.h>	/* For show_regs() */
-#include <linux/kernel.h>
+#include <linux/percpu-defs.h>
+#include <linux/mem_encrypt.h>
 #include <linux/printk.h>
+#include <linux/set_memory.h>
+#include <linux/kernel.h>
 #include <linux/mm.h>
 
 #include <asm/trap_defs.h>
@@ -29,6 +32,9 @@  struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
  */
 struct ghcb __initdata *boot_ghcb;
 
+/* Runtime GHCB pointers */
+static struct ghcb __percpu *ghcb_page;
+
 /* Needed in vc_early_vc_forward_exception */
 extern void early_exception(struct pt_regs *regs, int trapnr);
 
@@ -197,6 +203,26 @@  static bool __init sev_es_setup_ghcb(void)
 	return true;
 }
 
+void sev_es_init_ghcbs(void)
+{
+	int cpu;
+
+	if (!sev_es_active())
+		return;
+
+	/* Allocate GHCB pages */
+	ghcb_page = __alloc_percpu(sizeof(struct ghcb), PAGE_SIZE);
+
+	/* Initialize per-cpu GHCB pages */
+	for_each_possible_cpu(cpu) {
+		struct ghcb *ghcb = (struct ghcb *)per_cpu_ptr(ghcb_page, cpu);
+
+		set_memory_decrypted((unsigned long)ghcb,
+				     sizeof(*ghcb) >> PAGE_SHIFT);
+		memset(ghcb, 0, sizeof(*ghcb));
+	}
+}
+
 static void __init vc_early_vc_forward_exception(struct es_em_ctxt *ctxt)
 {
 	int trapnr = ctxt->fi.vector;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 6ef00eb6fbb9..09bebda9b053 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -918,6 +918,9 @@  void __init trap_init(void)
 	/* Init cpu_entry_area before IST entries are set up */
 	setup_cpu_entry_areas();
 
+	/* Init GHCB memory pages when running as an SEV-ES guest */
+	sev_es_init_ghcbs();
+
 	idt_setup_traps();
 
 	/*