diff mbox

[v5,28/32] x86/mm, kexec: Allow kexec to be used with SME

Message ID 20170418212121.10190.94885.stgit@tlendack-t1.amdoffice.net (mailing list archive)
State New, archived
Headers show

Commit Message

Tom Lendacky April 18, 2017, 9:21 p.m. UTC
Provide support so that kexec can be used to boot a kernel when SME is
enabled.

Support is needed to allocate pages for kexec without encryption.  This
is needed in order to be able to reboot in the kernel in the same manner
as originally booted.

Additionally, when shutting down all of the CPUs we need to be sure to
flush the caches and then halt. This is needed when booting from a state
where SME was not active into a state where SME is active (or vice-versa).
Without these steps, it is possible for cache lines to exist for the same
physical location but tagged both with and without the encryption bit. This
can cause random memory corruption when caches are flushed depending on
which cacheline is written last.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 arch/x86/include/asm/init.h          |    1 +
 arch/x86/include/asm/irqflags.h      |    5 +++++
 arch/x86/include/asm/kexec.h         |    8 ++++++++
 arch/x86/include/asm/pgtable_types.h |    1 +
 arch/x86/kernel/machine_kexec_64.c   |   35 +++++++++++++++++++++++++++++++++-
 arch/x86/kernel/process.c            |   26 +++++++++++++++++++++++--
 arch/x86/mm/ident_map.c              |   11 +++++++----
 include/linux/kexec.h                |   14 ++++++++++++++
 kernel/kexec_core.c                  |    7 +++++++
 9 files changed, 101 insertions(+), 7 deletions(-)

Comments

Borislav Petkov May 17, 2017, 7:17 p.m. UTC | #1
On Tue, Apr 18, 2017 at 04:21:21PM -0500, Tom Lendacky wrote:
> Provide support so that kexec can be used to boot a kernel when SME is
> enabled.
> 
> Support is needed to allocate pages for kexec without encryption.  This
> is needed in order to be able to reboot in the kernel in the same manner
> as originally booted.
> 
> Additionally, when shutting down all of the CPUs we need to be sure to
> flush the caches and then halt. This is needed when booting from a state
> where SME was not active into a state where SME is active (or vice-versa).
> Without these steps, it is possible for cache lines to exist for the same
> physical location but tagged both with and without the encryption bit. This
> can cause random memory corruption when caches are flushed depending on
> which cacheline is written last.
> 
> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> ---
>  arch/x86/include/asm/init.h          |    1 +
>  arch/x86/include/asm/irqflags.h      |    5 +++++
>  arch/x86/include/asm/kexec.h         |    8 ++++++++
>  arch/x86/include/asm/pgtable_types.h |    1 +
>  arch/x86/kernel/machine_kexec_64.c   |   35 +++++++++++++++++++++++++++++++++-
>  arch/x86/kernel/process.c            |   26 +++++++++++++++++++++++--
>  arch/x86/mm/ident_map.c              |   11 +++++++----
>  include/linux/kexec.h                |   14 ++++++++++++++
>  kernel/kexec_core.c                  |    7 +++++++
>  9 files changed, 101 insertions(+), 7 deletions(-)

...

> @@ -86,7 +86,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
>  		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
>  	}
>  	pte = pte_offset_kernel(pmd, vaddr);
> -	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
> +	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
>  	return 0;
>  err:
>  	free_transition_pgtable(image);
> @@ -114,6 +114,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
>  		.alloc_pgt_page	= alloc_pgt_page,
>  		.context	= image,
>  		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
> +		.kernpg_flag	= _KERNPG_TABLE_NOENC,
>  	};
>  	unsigned long mstart, mend;
>  	pgd_t *level4p;
> @@ -597,3 +598,35 @@ void arch_kexec_unprotect_crashkres(void)
>  {
>  	kexec_mark_crashkres(false);
>  }
> +
> +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
> +{
> +	int ret;
> +
> +	if (sme_active()) {

	if (!sme_active())
		return 0;

	/*
	 * If SME...


> +		/*
> +		 * If SME is active we need to be sure that kexec pages are
> +		 * not encrypted because when we boot to the new kernel the
> +		 * pages won't be accessed encrypted (initially).
> +		 */
> +		ret = set_memory_decrypted((unsigned long)vaddr, pages);
> +		if (ret)
> +			return ret;
> +
> +		if (gfp & __GFP_ZERO)
> +			memset(vaddr, 0, pages * PAGE_SIZE);

This function is called after alloc_pages() which already zeroes memory
when __GFP_ZERO is supplied.

If you need to clear the memory *after* set_memory_encrypted() happens,
then you should probably mask out __GFP_ZERO before the alloc_pages()
call so as not to do it twice.

> +	}
> +
> +	return 0;
> +}
> +
> +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
> +{
> +	if (sme_active()) {
> +		/*
> +		 * If SME is active we need to reset the pages back to being
> +		 * an encrypted mapping before freeing them.
> +		 */
> +		set_memory_encrypted((unsigned long)vaddr, pages);
> +	}
> +}
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index 0bb8842..f4e5de6 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -24,6 +24,7 @@
>  #include <linux/cpuidle.h>
>  #include <trace/events/power.h>
>  #include <linux/hw_breakpoint.h>
> +#include <linux/kexec.h>
>  #include <asm/cpu.h>
>  #include <asm/apic.h>
>  #include <asm/syscalls.h>
> @@ -355,8 +356,25 @@ bool xen_set_default_idle(void)
>  	return ret;
>  }
>  #endif
> +
>  void stop_this_cpu(void *dummy)
>  {
> +	bool do_wbinvd_halt = false;
> +
> +	if (kexec_in_progress && boot_cpu_has(X86_FEATURE_SME)) {
> +		/*
> +		 * If we are performing a kexec and the processor supports
> +		 * SME then we need to clear out cache information before
> +		 * halting. With kexec, going from SME inactive to SME active
> +		 * requires clearing cache entries so that addresses without
> +		 * the encryption bit set don't corrupt the same physical
> +		 * address that has the encryption bit set when caches are
> +		 * flushed. Perform a wbinvd followed by a halt to achieve
> +		 * this.
> +		 */
> +		do_wbinvd_halt = true;
> +	}
> +
>  	local_irq_disable();
>  	/*
>  	 * Remove this CPU:
> @@ -365,8 +383,12 @@ void stop_this_cpu(void *dummy)
>  	disable_local_APIC();
>  	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
>  
> -	for (;;)
> -		halt();
> +	for (;;) {
> +		if (do_wbinvd_halt)
> +			native_wbinvd_halt();

No need for that native_wbinvd_halt() thing:

	for (;;) {
		if (do_wbinvd)
			wbinvd();

		halt();
	}

>  /*
> diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
> index 04210a2..2c9fd3e 100644
> --- a/arch/x86/mm/ident_map.c
> +++ b/arch/x86/mm/ident_map.c
> @@ -20,6 +20,7 @@ static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
>  static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>  			  unsigned long addr, unsigned long end)
>  {
> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;

You're already supplying a x86_mapping_info and thus you can init
kernpg_flag to default _KERNPG_TABLE and override it in the SME+kexec
case, as you already do. And this way you can simply do:

	set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag));

here and in the other pagetable functions I've snipped below, and save
yourself some lines.

...
Tom Lendacky May 19, 2017, 8:45 p.m. UTC | #2
On 5/17/2017 2:17 PM, Borislav Petkov wrote:
> On Tue, Apr 18, 2017 at 04:21:21PM -0500, Tom Lendacky wrote:
>> Provide support so that kexec can be used to boot a kernel when SME is
>> enabled.
>>
>> Support is needed to allocate pages for kexec without encryption.  This
>> is needed in order to be able to reboot in the kernel in the same manner
>> as originally booted.
>>
>> Additionally, when shutting down all of the CPUs we need to be sure to
>> flush the caches and then halt. This is needed when booting from a state
>> where SME was not active into a state where SME is active (or vice-versa).
>> Without these steps, it is possible for cache lines to exist for the same
>> physical location but tagged both with and without the encryption bit. This
>> can cause random memory corruption when caches are flushed depending on
>> which cacheline is written last.
>>
>> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
>> ---
>>  arch/x86/include/asm/init.h          |    1 +
>>  arch/x86/include/asm/irqflags.h      |    5 +++++
>>  arch/x86/include/asm/kexec.h         |    8 ++++++++
>>  arch/x86/include/asm/pgtable_types.h |    1 +
>>  arch/x86/kernel/machine_kexec_64.c   |   35 +++++++++++++++++++++++++++++++++-
>>  arch/x86/kernel/process.c            |   26 +++++++++++++++++++++++--
>>  arch/x86/mm/ident_map.c              |   11 +++++++----
>>  include/linux/kexec.h                |   14 ++++++++++++++
>>  kernel/kexec_core.c                  |    7 +++++++
>>  9 files changed, 101 insertions(+), 7 deletions(-)
>
> ...
>
>> @@ -86,7 +86,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
>>  		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
>>  	}
>>  	pte = pte_offset_kernel(pmd, vaddr);
>> -	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
>> +	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
>>  	return 0;
>>  err:
>>  	free_transition_pgtable(image);
>> @@ -114,6 +114,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
>>  		.alloc_pgt_page	= alloc_pgt_page,
>>  		.context	= image,
>>  		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
>> +		.kernpg_flag	= _KERNPG_TABLE_NOENC,
>>  	};
>>  	unsigned long mstart, mend;
>>  	pgd_t *level4p;
>> @@ -597,3 +598,35 @@ void arch_kexec_unprotect_crashkres(void)
>>  {
>>  	kexec_mark_crashkres(false);
>>  }
>> +
>> +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
>> +{
>> +	int ret;
>> +
>> +	if (sme_active()) {
>
> 	if (!sme_active())
> 		return 0;
>
> 	/*
> 	 * If SME...
>

Ok.

>
>> +		/*
>> +		 * If SME is active we need to be sure that kexec pages are
>> +		 * not encrypted because when we boot to the new kernel the
>> +		 * pages won't be accessed encrypted (initially).
>> +		 */
>> +		ret = set_memory_decrypted((unsigned long)vaddr, pages);
>> +		if (ret)
>> +			return ret;
>> +
>> +		if (gfp & __GFP_ZERO)
>> +			memset(vaddr, 0, pages * PAGE_SIZE);
>
> This function is called after alloc_pages() which already zeroes memory
> when __GFP_ZERO is supplied.
>
> If you need to clear the memory *after* set_memory_encrypted() happens,
> then you should probably mask out __GFP_ZERO before the alloc_pages()
> call so as not to do it twice.

I'll look into that.  I could put the memset() at the end of this
function so that it is done here no matter what.  And update the
default arch_kexec_post_alloc_pages() to also do the memset(). It
just hides the clearing of the pages a bit though by doing that.

>
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
>> +{
>> +	if (sme_active()) {
>> +		/*
>> +		 * If SME is active we need to reset the pages back to being
>> +		 * an encrypted mapping before freeing them.
>> +		 */
>> +		set_memory_encrypted((unsigned long)vaddr, pages);
>> +	}
>> +}
>> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
>> index 0bb8842..f4e5de6 100644
>> --- a/arch/x86/kernel/process.c
>> +++ b/arch/x86/kernel/process.c
>> @@ -24,6 +24,7 @@
>>  #include <linux/cpuidle.h>
>>  #include <trace/events/power.h>
>>  #include <linux/hw_breakpoint.h>
>> +#include <linux/kexec.h>
>>  #include <asm/cpu.h>
>>  #include <asm/apic.h>
>>  #include <asm/syscalls.h>
>> @@ -355,8 +356,25 @@ bool xen_set_default_idle(void)
>>  	return ret;
>>  }
>>  #endif
>> +
>>  void stop_this_cpu(void *dummy)
>>  {
>> +	bool do_wbinvd_halt = false;
>> +
>> +	if (kexec_in_progress && boot_cpu_has(X86_FEATURE_SME)) {
>> +		/*
>> +		 * If we are performing a kexec and the processor supports
>> +		 * SME then we need to clear out cache information before
>> +		 * halting. With kexec, going from SME inactive to SME active
>> +		 * requires clearing cache entries so that addresses without
>> +		 * the encryption bit set don't corrupt the same physical
>> +		 * address that has the encryption bit set when caches are
>> +		 * flushed. Perform a wbinvd followed by a halt to achieve
>> +		 * this.
>> +		 */
>> +		do_wbinvd_halt = true;
>> +	}
>> +
>>  	local_irq_disable();
>>  	/*
>>  	 * Remove this CPU:
>> @@ -365,8 +383,12 @@ void stop_this_cpu(void *dummy)
>>  	disable_local_APIC();
>>  	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
>>
>> -	for (;;)
>> -		halt();
>> +	for (;;) {
>> +		if (do_wbinvd_halt)
>> +			native_wbinvd_halt();
>
> No need for that native_wbinvd_halt() thing:
>
> 	for (;;) {
> 		if (do_wbinvd)
> 			wbinvd();
>
> 		halt();
> 	}
>

Actually there is.  The above will result in data in the cache because
halt() turns into a function call if CONFIG_PARAVIRT is defined (refer
to the comment above where do_wbinvd_halt is set to true). I could make
this a native_wbinvd() and native_halt() as long as those are
guaranteed to never turn into function calls.  But never say never, so
that's why I created native_wbinvd_halt().

Thanks,
Tom

>>  /*
>> diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
>> index 04210a2..2c9fd3e 100644
>> --- a/arch/x86/mm/ident_map.c
>> +++ b/arch/x86/mm/ident_map.c
>> @@ -20,6 +20,7 @@ static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
>>  static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>>  			  unsigned long addr, unsigned long end)
>>  {
>> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>
> You're already supplying a x86_mapping_info and thus you can init
> kernpg_flag to default _KERNPG_TABLE and override it in the SME+kexec
> case, as you already do. And this way you can simply do:
>
> 	set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag));
>
> here and in the other pagetable functions I've snipped below, and save
> yourself some lines.

Ok, I'll check into that.

Thanks,
Tom

>
> ...
>
Borislav Petkov May 19, 2017, 8:58 p.m. UTC | #3
On Fri, May 19, 2017 at 03:45:28PM -0500, Tom Lendacky wrote:
> Actually there is.  The above will result in data in the cache because
> halt() turns into a function call if CONFIG_PARAVIRT is defined (refer
> to the comment above where do_wbinvd_halt is set to true). I could make
> this a native_wbinvd() and native_halt()

That's why we have the native_* versions - to bypass paravirt crap.
Tom Lendacky May 19, 2017, 9:07 p.m. UTC | #4
On 5/19/2017 3:58 PM, Borislav Petkov wrote:
> On Fri, May 19, 2017 at 03:45:28PM -0500, Tom Lendacky wrote:
>> Actually there is.  The above will result in data in the cache because
>> halt() turns into a function call if CONFIG_PARAVIRT is defined (refer
>> to the comment above where do_wbinvd_halt is set to true). I could make
>> this a native_wbinvd() and native_halt()
>
> That's why we have the native_* versions - to bypass paravirt crap.

As long as those never change from static inline everything will be
fine. I can change it, but I really like how it explicitly indicates
what is needed in this case. Even if the function gets changed from
static inline the fact that the instructions are sequential in the
function covers that case.

Thanks,
Tom

>
Borislav Petkov May 19, 2017, 9:28 p.m. UTC | #5
On Fri, May 19, 2017 at 04:07:24PM -0500, Tom Lendacky wrote:
> As long as those never change from static inline everything will be
> fine. I can change it, but I really like how it explicitly indicates

I know what you want to do. But you're practically defining a helper
which contains two arbitrary instructions which probably no one else
will need.

So how about we simplify this function even more. We don't need to pay
attention to kexec being in progress because we're halting anyway so who
cares how fast we halt.

Might have to state that in the comment below though, instead of what's
there now.

And for the exact same moot reason, we don't need to look at SME CPUID
feature - we can just as well WBINVD unconditionally.

void stop_this_cpu(void *dummy)
{
        local_irq_disable();
        /*
         * Remove this CPU:
         */
        set_cpu_online(smp_processor_id(), false);
        disable_local_APIC();
        mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

        for (;;) {
                /*
                 * If we are performing a kexec and the processor supports
                 * SME then we need to clear out cache information before
                 * halting. With kexec, going from SME inactive to SME active
                 * requires clearing cache entries so that addresses without
                 * the encryption bit set don't corrupt the same physical
                 * address that has the encryption bit set when caches are
                 * flushed. Perform a wbinvd followed by a halt to achieve
                 * this.
                 */
                asm volatile("wbinvd; hlt" ::: "memory");
        }
}

How's that?
Tom Lendacky May 19, 2017, 9:38 p.m. UTC | #6
On 5/19/2017 4:28 PM, Borislav Petkov wrote:
> On Fri, May 19, 2017 at 04:07:24PM -0500, Tom Lendacky wrote:
>> As long as those never change from static inline everything will be
>> fine. I can change it, but I really like how it explicitly indicates
>
> I know what you want to do. But you're practically defining a helper
> which contains two arbitrary instructions which probably no one else
> will need.
>
> So how about we simplify this function even more. We don't need to pay
> attention to kexec being in progress because we're halting anyway so who
> cares how fast we halt.
>
> Might have to state that in the comment below though, instead of what's
> there now.
>
> And for the exact same moot reason, we don't need to look at SME CPUID
> feature - we can just as well WBINVD unconditionally.
>
> void stop_this_cpu(void *dummy)
> {
>         local_irq_disable();
>         /*
>          * Remove this CPU:
>          */
>         set_cpu_online(smp_processor_id(), false);
>         disable_local_APIC();
>         mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
>
>         for (;;) {
>                 /*
>                  * If we are performing a kexec and the processor supports
>                  * SME then we need to clear out cache information before
>                  * halting. With kexec, going from SME inactive to SME active
>                  * requires clearing cache entries so that addresses without
>                  * the encryption bit set don't corrupt the same physical
>                  * address that has the encryption bit set when caches are
>                  * flushed. Perform a wbinvd followed by a halt to achieve
>                  * this.
>                  */
>                 asm volatile("wbinvd; hlt" ::: "memory");
>         }
> }
>
> How's that?

I can live with that!

Thanks,
Tom

>
Xunlei Pang May 26, 2017, 4:17 a.m. UTC | #7
On 04/19/2017 at 05:21 AM, Tom Lendacky wrote:
> Provide support so that kexec can be used to boot a kernel when SME is
> enabled.
>
> Support is needed to allocate pages for kexec without encryption.  This
> is needed in order to be able to reboot in the kernel in the same manner
> as originally booted.

Hi Tom,

Looks like kdump will break, I didn't see the similar handling for kdump cases, see kernel:
    kimage_alloc_crash_control_pages(), kimage_load_crash_segment(), etc.

We need to support kdump with SME, kdump kernel/initramfs/purgatory/elfcorehdr/etc
are all loaded into the reserved memory(see crashkernel=X) by userspace kexec-tools.
I think a straightforward way would be to mark the whole reserved memory range without
encryption before loading all the kexec segments for kdump, I guess we can handle this
easily in arch_kexec_unprotect_crashkres().

Moreover, now that "elfcorehdr=X" is left as decrypted, it needs to be remapped to the
encrypted data.

Regards,
Xunlei

>
> Additionally, when shutting down all of the CPUs we need to be sure to
> flush the caches and then halt. This is needed when booting from a state
> where SME was not active into a state where SME is active (or vice-versa).
> Without these steps, it is possible for cache lines to exist for the same
> physical location but tagged both with and without the encryption bit. This
> can cause random memory corruption when caches are flushed depending on
> which cacheline is written last.
>
> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> ---
>  arch/x86/include/asm/init.h          |    1 +
>  arch/x86/include/asm/irqflags.h      |    5 +++++
>  arch/x86/include/asm/kexec.h         |    8 ++++++++
>  arch/x86/include/asm/pgtable_types.h |    1 +
>  arch/x86/kernel/machine_kexec_64.c   |   35 +++++++++++++++++++++++++++++++++-
>  arch/x86/kernel/process.c            |   26 +++++++++++++++++++++++--
>  arch/x86/mm/ident_map.c              |   11 +++++++----
>  include/linux/kexec.h                |   14 ++++++++++++++
>  kernel/kexec_core.c                  |    7 +++++++
>  9 files changed, 101 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
> index 737da62..b2ec511 100644
> --- a/arch/x86/include/asm/init.h
> +++ b/arch/x86/include/asm/init.h
> @@ -6,6 +6,7 @@ struct x86_mapping_info {
>  	void *context;			 /* context for alloc_pgt_page */
>  	unsigned long pmd_flag;		 /* page flag for PMD entry */
>  	unsigned long offset;		 /* ident mapping offset */
> +	unsigned long kernpg_flag;	 /* kernel pagetable flag override */
>  };
>  
>  int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
> index ac7692d..38b5920 100644
> --- a/arch/x86/include/asm/irqflags.h
> +++ b/arch/x86/include/asm/irqflags.h
> @@ -58,6 +58,11 @@ static inline __cpuidle void native_halt(void)
>  	asm volatile("hlt": : :"memory");
>  }
>  
> +static inline __cpuidle void native_wbinvd_halt(void)
> +{
> +	asm volatile("wbinvd; hlt" : : : "memory");
> +}
> +
>  #endif
>  
>  #ifdef CONFIG_PARAVIRT
> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
> index 70ef205..e8183ac 100644
> --- a/arch/x86/include/asm/kexec.h
> +++ b/arch/x86/include/asm/kexec.h
> @@ -207,6 +207,14 @@ struct kexec_entry64_regs {
>  	uint64_t r15;
>  	uint64_t rip;
>  };
> +
> +extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
> +				       gfp_t gfp);
> +#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
> +
> +extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
> +#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
> +
>  #endif
>  
>  typedef void crash_vmclear_fn(void);
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index ce8cb1c..0f326f4 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -213,6 +213,7 @@ enum page_cache_mode {
>  #define PAGE_KERNEL		__pgprot(__PAGE_KERNEL | _PAGE_ENC)
>  #define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
>  #define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
> +#define PAGE_KERNEL_EXEC_NOENC	__pgprot(__PAGE_KERNEL_EXEC)
>  #define PAGE_KERNEL_RX		__pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
>  #define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
>  #define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
> index 085c3b3..11c0ca9 100644
> --- a/arch/x86/kernel/machine_kexec_64.c
> +++ b/arch/x86/kernel/machine_kexec_64.c
> @@ -86,7 +86,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
>  		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
>  	}
>  	pte = pte_offset_kernel(pmd, vaddr);
> -	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
> +	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
>  	return 0;
>  err:
>  	free_transition_pgtable(image);
> @@ -114,6 +114,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
>  		.alloc_pgt_page	= alloc_pgt_page,
>  		.context	= image,
>  		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
> +		.kernpg_flag	= _KERNPG_TABLE_NOENC,
>  	};
>  	unsigned long mstart, mend;
>  	pgd_t *level4p;
> @@ -597,3 +598,35 @@ void arch_kexec_unprotect_crashkres(void)
>  {
>  	kexec_mark_crashkres(false);
>  }
> +
> +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
> +{
> +	int ret;
> +
> +	if (sme_active()) {
> +		/*
> +		 * If SME is active we need to be sure that kexec pages are
> +		 * not encrypted because when we boot to the new kernel the
> +		 * pages won't be accessed encrypted (initially).
> +		 */
> +		ret = set_memory_decrypted((unsigned long)vaddr, pages);
> +		if (ret)
> +			return ret;
> +
> +		if (gfp & __GFP_ZERO)
> +			memset(vaddr, 0, pages * PAGE_SIZE);
> +	}
> +
> +	return 0;
> +}
> +
> +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
> +{
> +	if (sme_active()) {
> +		/*
> +		 * If SME is active we need to reset the pages back to being
> +		 * an encrypted mapping before freeing them.
> +		 */
> +		set_memory_encrypted((unsigned long)vaddr, pages);
> +	}
> +}
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index 0bb8842..f4e5de6 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -24,6 +24,7 @@
>  #include <linux/cpuidle.h>
>  #include <trace/events/power.h>
>  #include <linux/hw_breakpoint.h>
> +#include <linux/kexec.h>
>  #include <asm/cpu.h>
>  #include <asm/apic.h>
>  #include <asm/syscalls.h>
> @@ -355,8 +356,25 @@ bool xen_set_default_idle(void)
>  	return ret;
>  }
>  #endif
> +
>  void stop_this_cpu(void *dummy)
>  {
> +	bool do_wbinvd_halt = false;
> +
> +	if (kexec_in_progress && boot_cpu_has(X86_FEATURE_SME)) {
> +		/*
> +		 * If we are performing a kexec and the processor supports
> +		 * SME then we need to clear out cache information before
> +		 * halting. With kexec, going from SME inactive to SME active
> +		 * requires clearing cache entries so that addresses without
> +		 * the encryption bit set don't corrupt the same physical
> +		 * address that has the encryption bit set when caches are
> +		 * flushed. Perform a wbinvd followed by a halt to achieve
> +		 * this.
> +		 */
> +		do_wbinvd_halt = true;
> +	}
> +
>  	local_irq_disable();
>  	/*
>  	 * Remove this CPU:
> @@ -365,8 +383,12 @@ void stop_this_cpu(void *dummy)
>  	disable_local_APIC();
>  	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
>  
> -	for (;;)
> -		halt();
> +	for (;;) {
> +		if (do_wbinvd_halt)
> +			native_wbinvd_halt();
> +		else
> +			halt();
> +	}
>  }
>  
>  /*
> diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
> index 04210a2..2c9fd3e 100644
> --- a/arch/x86/mm/ident_map.c
> +++ b/arch/x86/mm/ident_map.c
> @@ -20,6 +20,7 @@ static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
>  static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>  			  unsigned long addr, unsigned long end)
>  {
> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>  	unsigned long next;
>  
>  	for (; addr < end; addr = next) {
> @@ -39,7 +40,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>  		if (!pmd)
>  			return -ENOMEM;
>  		ident_pmd_init(info, pmd, addr, next);
> -		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
> +		set_pud(pud, __pud(__pa(pmd) | kernpg_flag));
>  	}
>  
>  	return 0;
> @@ -48,6 +49,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>  static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>  			  unsigned long addr, unsigned long end)
>  {
> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>  	unsigned long next;
>  
>  	for (; addr < end; addr = next) {
> @@ -67,7 +69,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>  		if (!pud)
>  			return -ENOMEM;
>  		ident_pud_init(info, pud, addr, next);
> -		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
> +		set_p4d(p4d, __p4d(__pa(pud) | kernpg_flag));
>  	}
>  
>  	return 0;
> @@ -76,6 +78,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>  int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>  			      unsigned long pstart, unsigned long pend)
>  {
> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>  	unsigned long addr = pstart + info->offset;
>  	unsigned long end = pend + info->offset;
>  	unsigned long next;
> @@ -104,14 +107,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>  		if (result)
>  			return result;
>  		if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
> -			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
> +			set_pgd(pgd, __pgd(__pa(p4d) | kernpg_flag));
>  		} else {
>  			/*
>  			 * With p4d folded, pgd is equal to p4d.
>  			 * The pgd entry has to point to the pud page table in this case.
>  			 */
>  			pud_t *pud = pud_offset(p4d, 0);
> -			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
> +			set_pgd(pgd, __pgd(__pa(pud) | kernpg_flag));
>  		}
>  	}
>  
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index d419d0e..1c76e3b 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -383,6 +383,20 @@ static inline void *boot_phys_to_virt(unsigned long entry)
>  	return phys_to_virt(boot_phys_to_phys(entry));
>  }
>  
> +#ifndef arch_kexec_post_alloc_pages
> +static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
> +					      gfp_t gfp)
> +{
> +	return 0;
> +}
> +#endif
> +
> +#ifndef arch_kexec_pre_free_pages
> +static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
> +{
> +}
> +#endif
> +
>  #else /* !CONFIG_KEXEC_CORE */
>  struct pt_regs;
>  struct task_struct;
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index bfe62d5..bb5e7e3 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -38,6 +38,7 @@
>  #include <linux/syscore_ops.h>
>  #include <linux/compiler.h>
>  #include <linux/hugetlb.h>
> +#include <linux/mem_encrypt.h>
>  
>  #include <asm/page.h>
>  #include <asm/sections.h>
> @@ -315,6 +316,9 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
>  		count = 1 << order;
>  		for (i = 0; i < count; i++)
>  			SetPageReserved(pages + i);
> +
> +		arch_kexec_post_alloc_pages(page_address(pages), count,
> +					    gfp_mask);
>  	}
>  
>  	return pages;
> @@ -326,6 +330,9 @@ static void kimage_free_pages(struct page *page)
>  
>  	order = page_private(page);
>  	count = 1 << order;
> +
> +	arch_kexec_pre_free_pages(page_address(page), count);
> +
>  	for (i = 0; i < count; i++)
>  		ClearPageReserved(page + i);
>  	__free_pages(page, order);
>
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
Dave Young May 27, 2017, 2:17 a.m. UTC | #8
On 05/26/17 at 12:17pm, Xunlei Pang wrote:
> On 04/19/2017 at 05:21 AM, Tom Lendacky wrote:
> > Provide support so that kexec can be used to boot a kernel when SME is
> > enabled.
> >
> > Support is needed to allocate pages for kexec without encryption.  This
> > is needed in order to be able to reboot in the kernel in the same manner
> > as originally booted.
> 
> Hi Tom,
> 
> Looks like kdump will break, I didn't see the similar handling for kdump cases, see kernel:
>     kimage_alloc_crash_control_pages(), kimage_load_crash_segment(), etc.
> 
> We need to support kdump with SME, kdump kernel/initramfs/purgatory/elfcorehdr/etc
> are all loaded into the reserved memory(see crashkernel=X) by userspace kexec-tools.

For kexec_load, it is loaded by kexec-tools, we have in kernel loader
syscall kexec_file_load, it is handled in kernel.

> I think a straightforward way would be to mark the whole reserved memory range without
> encryption before loading all the kexec segments for kdump, I guess we can handle this
> easily in arch_kexec_unprotect_crashkres().
> 
> Moreover, now that "elfcorehdr=X" is left as decrypted, it needs to be remapped to the
> encrypted data.

Tom, could you have a try on kdump according to suggestion from Xunlei?
It is just based on theoretical patch understanding, there could be
other issues when you work on it. Feel free to ask if we can help on
anything.

Thanks
Dave
Tom Lendacky May 30, 2017, 5:46 p.m. UTC | #9
On 5/25/2017 11:17 PM, Xunlei Pang wrote:
> On 04/19/2017 at 05:21 AM, Tom Lendacky wrote:
>> Provide support so that kexec can be used to boot a kernel when SME is
>> enabled.
>>
>> Support is needed to allocate pages for kexec without encryption.  This
>> is needed in order to be able to reboot in the kernel in the same manner
>> as originally booted.
> 
> Hi Tom,
> 
> Looks like kdump will break, I didn't see the similar handling for kdump cases, see kernel:
>      kimage_alloc_crash_control_pages(), kimage_load_crash_segment(), etc. >
> We need to support kdump with SME, kdump kernel/initramfs/purgatory/elfcorehdr/etc
> are all loaded into the reserved memory(see crashkernel=X) by userspace kexec-tools.
> I think a straightforward way would be to mark the whole reserved memory range without
> encryption before loading all the kexec segments for kdump, I guess we can handle this
> easily in arch_kexec_unprotect_crashkres().

Yes, that would work.

> 
> Moreover, now that "elfcorehdr=X" is left as decrypted, it needs to be remapped to the
> encrypted data.

This is an area that I'm not familiar with, so I don't completely
understand the flow in regards to where/when/how the ELF headers are
copied and what needs to be done.

Can you elaborate a bit on this?

Thanks,
Tom

> 
> Regards,
> Xunlei
> 
>>
>> Additionally, when shutting down all of the CPUs we need to be sure to
>> flush the caches and then halt. This is needed when booting from a state
>> where SME was not active into a state where SME is active (or vice-versa).
>> Without these steps, it is possible for cache lines to exist for the same
>> physical location but tagged both with and without the encryption bit. This
>> can cause random memory corruption when caches are flushed depending on
>> which cacheline is written last.
>>
>> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
>> ---
>>   arch/x86/include/asm/init.h          |    1 +
>>   arch/x86/include/asm/irqflags.h      |    5 +++++
>>   arch/x86/include/asm/kexec.h         |    8 ++++++++
>>   arch/x86/include/asm/pgtable_types.h |    1 +
>>   arch/x86/kernel/machine_kexec_64.c   |   35 +++++++++++++++++++++++++++++++++-
>>   arch/x86/kernel/process.c            |   26 +++++++++++++++++++++++--
>>   arch/x86/mm/ident_map.c              |   11 +++++++----
>>   include/linux/kexec.h                |   14 ++++++++++++++
>>   kernel/kexec_core.c                  |    7 +++++++
>>   9 files changed, 101 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
>> index 737da62..b2ec511 100644
>> --- a/arch/x86/include/asm/init.h
>> +++ b/arch/x86/include/asm/init.h
>> @@ -6,6 +6,7 @@ struct x86_mapping_info {
>>   	void *context;			 /* context for alloc_pgt_page */
>>   	unsigned long pmd_flag;		 /* page flag for PMD entry */
>>   	unsigned long offset;		 /* ident mapping offset */
>> +	unsigned long kernpg_flag;	 /* kernel pagetable flag override */
>>   };
>>   
>>   int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
>> index ac7692d..38b5920 100644
>> --- a/arch/x86/include/asm/irqflags.h
>> +++ b/arch/x86/include/asm/irqflags.h
>> @@ -58,6 +58,11 @@ static inline __cpuidle void native_halt(void)
>>   	asm volatile("hlt": : :"memory");
>>   }
>>   
>> +static inline __cpuidle void native_wbinvd_halt(void)
>> +{
>> +	asm volatile("wbinvd; hlt" : : : "memory");
>> +}
>> +
>>   #endif
>>   
>>   #ifdef CONFIG_PARAVIRT
>> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
>> index 70ef205..e8183ac 100644
>> --- a/arch/x86/include/asm/kexec.h
>> +++ b/arch/x86/include/asm/kexec.h
>> @@ -207,6 +207,14 @@ struct kexec_entry64_regs {
>>   	uint64_t r15;
>>   	uint64_t rip;
>>   };
>> +
>> +extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
>> +				       gfp_t gfp);
>> +#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
>> +
>> +extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
>> +#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
>> +
>>   #endif
>>   
>>   typedef void crash_vmclear_fn(void);
>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
>> index ce8cb1c..0f326f4 100644
>> --- a/arch/x86/include/asm/pgtable_types.h
>> +++ b/arch/x86/include/asm/pgtable_types.h
>> @@ -213,6 +213,7 @@ enum page_cache_mode {
>>   #define PAGE_KERNEL		__pgprot(__PAGE_KERNEL | _PAGE_ENC)
>>   #define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
>>   #define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
>> +#define PAGE_KERNEL_EXEC_NOENC	__pgprot(__PAGE_KERNEL_EXEC)
>>   #define PAGE_KERNEL_RX		__pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
>>   #define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
>>   #define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
>> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
>> index 085c3b3..11c0ca9 100644
>> --- a/arch/x86/kernel/machine_kexec_64.c
>> +++ b/arch/x86/kernel/machine_kexec_64.c
>> @@ -86,7 +86,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
>>   		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
>>   	}
>>   	pte = pte_offset_kernel(pmd, vaddr);
>> -	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
>> +	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
>>   	return 0;
>>   err:
>>   	free_transition_pgtable(image);
>> @@ -114,6 +114,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
>>   		.alloc_pgt_page	= alloc_pgt_page,
>>   		.context	= image,
>>   		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
>> +		.kernpg_flag	= _KERNPG_TABLE_NOENC,
>>   	};
>>   	unsigned long mstart, mend;
>>   	pgd_t *level4p;
>> @@ -597,3 +598,35 @@ void arch_kexec_unprotect_crashkres(void)
>>   {
>>   	kexec_mark_crashkres(false);
>>   }
>> +
>> +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
>> +{
>> +	int ret;
>> +
>> +	if (sme_active()) {
>> +		/*
>> +		 * If SME is active we need to be sure that kexec pages are
>> +		 * not encrypted because when we boot to the new kernel the
>> +		 * pages won't be accessed encrypted (initially).
>> +		 */
>> +		ret = set_memory_decrypted((unsigned long)vaddr, pages);
>> +		if (ret)
>> +			return ret;
>> +
>> +		if (gfp & __GFP_ZERO)
>> +			memset(vaddr, 0, pages * PAGE_SIZE);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
>> +{
>> +	if (sme_active()) {
>> +		/*
>> +		 * If SME is active we need to reset the pages back to being
>> +		 * an encrypted mapping before freeing them.
>> +		 */
>> +		set_memory_encrypted((unsigned long)vaddr, pages);
>> +	}
>> +}
>> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
>> index 0bb8842..f4e5de6 100644
>> --- a/arch/x86/kernel/process.c
>> +++ b/arch/x86/kernel/process.c
>> @@ -24,6 +24,7 @@
>>   #include <linux/cpuidle.h>
>>   #include <trace/events/power.h>
>>   #include <linux/hw_breakpoint.h>
>> +#include <linux/kexec.h>
>>   #include <asm/cpu.h>
>>   #include <asm/apic.h>
>>   #include <asm/syscalls.h>
>> @@ -355,8 +356,25 @@ bool xen_set_default_idle(void)
>>   	return ret;
>>   }
>>   #endif
>> +
>>   void stop_this_cpu(void *dummy)
>>   {
>> +	bool do_wbinvd_halt = false;
>> +
>> +	if (kexec_in_progress && boot_cpu_has(X86_FEATURE_SME)) {
>> +		/*
>> +		 * If we are performing a kexec and the processor supports
>> +		 * SME then we need to clear out cache information before
>> +		 * halting. With kexec, going from SME inactive to SME active
>> +		 * requires clearing cache entries so that addresses without
>> +		 * the encryption bit set don't corrupt the same physical
>> +		 * address that has the encryption bit set when caches are
>> +		 * flushed. Perform a wbinvd followed by a halt to achieve
>> +		 * this.
>> +		 */
>> +		do_wbinvd_halt = true;
>> +	}
>> +
>>   	local_irq_disable();
>>   	/*
>>   	 * Remove this CPU:
>> @@ -365,8 +383,12 @@ void stop_this_cpu(void *dummy)
>>   	disable_local_APIC();
>>   	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
>>   
>> -	for (;;)
>> -		halt();
>> +	for (;;) {
>> +		if (do_wbinvd_halt)
>> +			native_wbinvd_halt();
>> +		else
>> +			halt();
>> +	}
>>   }
>>   
>>   /*
>> diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
>> index 04210a2..2c9fd3e 100644
>> --- a/arch/x86/mm/ident_map.c
>> +++ b/arch/x86/mm/ident_map.c
>> @@ -20,6 +20,7 @@ static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
>>   static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>>   			  unsigned long addr, unsigned long end)
>>   {
>> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>>   	unsigned long next;
>>   
>>   	for (; addr < end; addr = next) {
>> @@ -39,7 +40,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>>   		if (!pmd)
>>   			return -ENOMEM;
>>   		ident_pmd_init(info, pmd, addr, next);
>> -		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
>> +		set_pud(pud, __pud(__pa(pmd) | kernpg_flag));
>>   	}
>>   
>>   	return 0;
>> @@ -48,6 +49,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>>   static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>>   			  unsigned long addr, unsigned long end)
>>   {
>> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>>   	unsigned long next;
>>   
>>   	for (; addr < end; addr = next) {
>> @@ -67,7 +69,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>>   		if (!pud)
>>   			return -ENOMEM;
>>   		ident_pud_init(info, pud, addr, next);
>> -		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
>> +		set_p4d(p4d, __p4d(__pa(pud) | kernpg_flag));
>>   	}
>>   
>>   	return 0;
>> @@ -76,6 +78,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>>   int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>>   			      unsigned long pstart, unsigned long pend)
>>   {
>> +	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>>   	unsigned long addr = pstart + info->offset;
>>   	unsigned long end = pend + info->offset;
>>   	unsigned long next;
>> @@ -104,14 +107,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>>   		if (result)
>>   			return result;
>>   		if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
>> -			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
>> +			set_pgd(pgd, __pgd(__pa(p4d) | kernpg_flag));
>>   		} else {
>>   			/*
>>   			 * With p4d folded, pgd is equal to p4d.
>>   			 * The pgd entry has to point to the pud page table in this case.
>>   			 */
>>   			pud_t *pud = pud_offset(p4d, 0);
>> -			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
>> +			set_pgd(pgd, __pgd(__pa(pud) | kernpg_flag));
>>   		}
>>   	}
>>   
>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>> index d419d0e..1c76e3b 100644
>> --- a/include/linux/kexec.h
>> +++ b/include/linux/kexec.h
>> @@ -383,6 +383,20 @@ static inline void *boot_phys_to_virt(unsigned long entry)
>>   	return phys_to_virt(boot_phys_to_phys(entry));
>>   }
>>   
>> +#ifndef arch_kexec_post_alloc_pages
>> +static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
>> +					      gfp_t gfp)
>> +{
>> +	return 0;
>> +}
>> +#endif
>> +
>> +#ifndef arch_kexec_pre_free_pages
>> +static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
>> +{
>> +}
>> +#endif
>> +
>>   #else /* !CONFIG_KEXEC_CORE */
>>   struct pt_regs;
>>   struct task_struct;
>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>> index bfe62d5..bb5e7e3 100644
>> --- a/kernel/kexec_core.c
>> +++ b/kernel/kexec_core.c
>> @@ -38,6 +38,7 @@
>>   #include <linux/syscore_ops.h>
>>   #include <linux/compiler.h>
>>   #include <linux/hugetlb.h>
>> +#include <linux/mem_encrypt.h>
>>   
>>   #include <asm/page.h>
>>   #include <asm/sections.h>
>> @@ -315,6 +316,9 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
>>   		count = 1 << order;
>>   		for (i = 0; i < count; i++)
>>   			SetPageReserved(pages + i);
>> +
>> +		arch_kexec_post_alloc_pages(page_address(pages), count,
>> +					    gfp_mask);
>>   	}
>>   
>>   	return pages;
>> @@ -326,6 +330,9 @@ static void kimage_free_pages(struct page *page)
>>   
>>   	order = page_private(page);
>>   	count = 1 << order;
>> +
>> +	arch_kexec_pre_free_pages(page_address(page), count);
>> +
>>   	for (i = 0; i < count; i++)
>>   		ClearPageReserved(page + i);
>>   	__free_pages(page, order);
>>
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
>
Borislav Petkov May 31, 2017, 10:01 a.m. UTC | #10
On Tue, May 30, 2017 at 12:46:14PM -0500, Tom Lendacky wrote:
> This is an area that I'm not familiar with, so I don't completely
> understand the flow in regards to where/when/how the ELF headers are
> copied and what needs to be done.

So my suggestion is still to put kexec/kdump on the backburner for now
and concentrate on the 30-ish patchset first. Once they're done, we can
start dealing with it. Ditto with the IOMMU side of things. One thing at
a time.

Thanks.
Xunlei Pang May 31, 2017, 3:03 p.m. UTC | #11
On 05/31/2017 at 01:46 AM, Tom Lendacky wrote:
> On 5/25/2017 11:17 PM, Xunlei Pang wrote:
>> On 04/19/2017 at 05:21 AM, Tom Lendacky wrote:
>>> Provide support so that kexec can be used to boot a kernel when SME is
>>> enabled.
>>>
>>> Support is needed to allocate pages for kexec without encryption.  This
>>> is needed in order to be able to reboot in the kernel in the same manner
>>> as originally booted.
>>
>> Hi Tom,
>>
>> Looks like kdump will break, I didn't see the similar handling for kdump cases, see kernel:
>>      kimage_alloc_crash_control_pages(), kimage_load_crash_segment(), etc. >
>> We need to support kdump with SME, kdump kernel/initramfs/purgatory/elfcorehdr/etc
>> are all loaded into the reserved memory(see crashkernel=X) by userspace kexec-tools.
>> I think a straightforward way would be to mark the whole reserved memory range without
>> encryption before loading all the kexec segments for kdump, I guess we can handle this
>> easily in arch_kexec_unprotect_crashkres().
>
> Yes, that would work.
>
>>
>> Moreover, now that "elfcorehdr=X" is left as decrypted, it needs to be remapped to the
>> encrypted data.
>
> This is an area that I'm not familiar with, so I don't completely
> understand the flow in regards to where/when/how the ELF headers are
> copied and what needs to be done.
>
> Can you elaborate a bit on this?

"elfcorehdr" is generated by userspace kexec-tools(git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git), it's
actually ELF CORE header data(elf header, PT_LOAD/PT_NOTE program header), see kexec/crashdump-elf.c::FUNC().

For kdump case, it will be put in some reserved crash memory allocated by kexec-tools, and passed the corresponding
start address of the allocated reserved crash memory to kdump kernel via "elfcorehdr=", please see kernel functions
setup_elfcorehdr() and vmcore_init() for how it is parsed by kdump kernel.

Regards,
Xunlei

>>
>>>
>>> Additionally, when shutting down all of the CPUs we need to be sure to
>>> flush the caches and then halt. This is needed when booting from a state
>>> where SME was not active into a state where SME is active (or vice-versa).
>>> Without these steps, it is possible for cache lines to exist for the same
>>> physical location but tagged both with and without the encryption bit. This
>>> can cause random memory corruption when caches are flushed depending on
>>> which cacheline is written last.
>>>
>>> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
>>> ---
>>>   arch/x86/include/asm/init.h          |    1 +
>>>   arch/x86/include/asm/irqflags.h      |    5 +++++
>>>   arch/x86/include/asm/kexec.h         |    8 ++++++++
>>>   arch/x86/include/asm/pgtable_types.h |    1 +
>>>   arch/x86/kernel/machine_kexec_64.c   |   35 +++++++++++++++++++++++++++++++++-
>>>   arch/x86/kernel/process.c            |   26 +++++++++++++++++++++++--
>>>   arch/x86/mm/ident_map.c              |   11 +++++++----
>>>   include/linux/kexec.h                |   14 ++++++++++++++
>>>   kernel/kexec_core.c                  |    7 +++++++
>>>   9 files changed, 101 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
>>> index 737da62..b2ec511 100644
>>> --- a/arch/x86/include/asm/init.h
>>> +++ b/arch/x86/include/asm/init.h
>>> @@ -6,6 +6,7 @@ struct x86_mapping_info {
>>>       void *context;             /* context for alloc_pgt_page */
>>>       unsigned long pmd_flag;         /* page flag for PMD entry */
>>>       unsigned long offset;         /* ident mapping offset */
>>> +    unsigned long kernpg_flag;     /* kernel pagetable flag override */
>>>   };
>>>     int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>>> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
>>> index ac7692d..38b5920 100644
>>> --- a/arch/x86/include/asm/irqflags.h
>>> +++ b/arch/x86/include/asm/irqflags.h
>>> @@ -58,6 +58,11 @@ static inline __cpuidle void native_halt(void)
>>>       asm volatile("hlt": : :"memory");
>>>   }
>>>   +static inline __cpuidle void native_wbinvd_halt(void)
>>> +{
>>> +    asm volatile("wbinvd; hlt" : : : "memory");
>>> +}
>>> +
>>>   #endif
>>>     #ifdef CONFIG_PARAVIRT
>>> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
>>> index 70ef205..e8183ac 100644
>>> --- a/arch/x86/include/asm/kexec.h
>>> +++ b/arch/x86/include/asm/kexec.h
>>> @@ -207,6 +207,14 @@ struct kexec_entry64_regs {
>>>       uint64_t r15;
>>>       uint64_t rip;
>>>   };
>>> +
>>> +extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
>>> +                       gfp_t gfp);
>>> +#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
>>> +
>>> +extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
>>> +#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
>>> +
>>>   #endif
>>>     typedef void crash_vmclear_fn(void);
>>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
>>> index ce8cb1c..0f326f4 100644
>>> --- a/arch/x86/include/asm/pgtable_types.h
>>> +++ b/arch/x86/include/asm/pgtable_types.h
>>> @@ -213,6 +213,7 @@ enum page_cache_mode {
>>>   #define PAGE_KERNEL        __pgprot(__PAGE_KERNEL | _PAGE_ENC)
>>>   #define PAGE_KERNEL_RO        __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
>>>   #define PAGE_KERNEL_EXEC    __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
>>> +#define PAGE_KERNEL_EXEC_NOENC    __pgprot(__PAGE_KERNEL_EXEC)
>>>   #define PAGE_KERNEL_RX        __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
>>>   #define PAGE_KERNEL_NOCACHE    __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
>>>   #define PAGE_KERNEL_LARGE    __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
>>> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
>>> index 085c3b3..11c0ca9 100644
>>> --- a/arch/x86/kernel/machine_kexec_64.c
>>> +++ b/arch/x86/kernel/machine_kexec_64.c
>>> @@ -86,7 +86,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
>>>           set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
>>>       }
>>>       pte = pte_offset_kernel(pmd, vaddr);
>>> -    set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
>>> +    set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
>>>       return 0;
>>>   err:
>>>       free_transition_pgtable(image);
>>> @@ -114,6 +114,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
>>>           .alloc_pgt_page    = alloc_pgt_page,
>>>           .context    = image,
>>>           .pmd_flag    = __PAGE_KERNEL_LARGE_EXEC,
>>> +        .kernpg_flag    = _KERNPG_TABLE_NOENC,
>>>       };
>>>       unsigned long mstart, mend;
>>>       pgd_t *level4p;
>>> @@ -597,3 +598,35 @@ void arch_kexec_unprotect_crashkres(void)
>>>   {
>>>       kexec_mark_crashkres(false);
>>>   }
>>> +
>>> +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
>>> +{
>>> +    int ret;
>>> +
>>> +    if (sme_active()) {
>>> +        /*
>>> +         * If SME is active we need to be sure that kexec pages are
>>> +         * not encrypted because when we boot to the new kernel the
>>> +         * pages won't be accessed encrypted (initially).
>>> +         */
>>> +        ret = set_memory_decrypted((unsigned long)vaddr, pages);
>>> +        if (ret)
>>> +            return ret;
>>> +
>>> +        if (gfp & __GFP_ZERO)
>>> +            memset(vaddr, 0, pages * PAGE_SIZE);
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
>>> +{
>>> +    if (sme_active()) {
>>> +        /*
>>> +         * If SME is active we need to reset the pages back to being
>>> +         * an encrypted mapping before freeing them.
>>> +         */
>>> +        set_memory_encrypted((unsigned long)vaddr, pages);
>>> +    }
>>> +}
>>> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
>>> index 0bb8842..f4e5de6 100644
>>> --- a/arch/x86/kernel/process.c
>>> +++ b/arch/x86/kernel/process.c
>>> @@ -24,6 +24,7 @@
>>>   #include <linux/cpuidle.h>
>>>   #include <trace/events/power.h>
>>>   #include <linux/hw_breakpoint.h>
>>> +#include <linux/kexec.h>
>>>   #include <asm/cpu.h>
>>>   #include <asm/apic.h>
>>>   #include <asm/syscalls.h>
>>> @@ -355,8 +356,25 @@ bool xen_set_default_idle(void)
>>>       return ret;
>>>   }
>>>   #endif
>>> +
>>>   void stop_this_cpu(void *dummy)
>>>   {
>>> +    bool do_wbinvd_halt = false;
>>> +
>>> +    if (kexec_in_progress && boot_cpu_has(X86_FEATURE_SME)) {
>>> +        /*
>>> +         * If we are performing a kexec and the processor supports
>>> +         * SME then we need to clear out cache information before
>>> +         * halting. With kexec, going from SME inactive to SME active
>>> +         * requires clearing cache entries so that addresses without
>>> +         * the encryption bit set don't corrupt the same physical
>>> +         * address that has the encryption bit set when caches are
>>> +         * flushed. Perform a wbinvd followed by a halt to achieve
>>> +         * this.
>>> +         */
>>> +        do_wbinvd_halt = true;
>>> +    }
>>> +
>>>       local_irq_disable();
>>>       /*
>>>        * Remove this CPU:
>>> @@ -365,8 +383,12 @@ void stop_this_cpu(void *dummy)
>>>       disable_local_APIC();
>>>       mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
>>>   -    for (;;)
>>> -        halt();
>>> +    for (;;) {
>>> +        if (do_wbinvd_halt)
>>> +            native_wbinvd_halt();
>>> +        else
>>> +            halt();
>>> +    }
>>>   }
>>>     /*
>>> diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
>>> index 04210a2..2c9fd3e 100644
>>> --- a/arch/x86/mm/ident_map.c
>>> +++ b/arch/x86/mm/ident_map.c
>>> @@ -20,6 +20,7 @@ static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
>>>   static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>>>                 unsigned long addr, unsigned long end)
>>>   {
>>> +    unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>>>       unsigned long next;
>>>         for (; addr < end; addr = next) {
>>> @@ -39,7 +40,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>>>           if (!pmd)
>>>               return -ENOMEM;
>>>           ident_pmd_init(info, pmd, addr, next);
>>> -        set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
>>> +        set_pud(pud, __pud(__pa(pmd) | kernpg_flag));
>>>       }
>>>         return 0;
>>> @@ -48,6 +49,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
>>>   static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>>>                 unsigned long addr, unsigned long end)
>>>   {
>>> +    unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>>>       unsigned long next;
>>>         for (; addr < end; addr = next) {
>>> @@ -67,7 +69,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>>>           if (!pud)
>>>               return -ENOMEM;
>>>           ident_pud_init(info, pud, addr, next);
>>> -        set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
>>> +        set_p4d(p4d, __p4d(__pa(pud) | kernpg_flag));
>>>       }
>>>         return 0;
>>> @@ -76,6 +78,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
>>>   int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>>>                     unsigned long pstart, unsigned long pend)
>>>   {
>>> +    unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
>>>       unsigned long addr = pstart + info->offset;
>>>       unsigned long end = pend + info->offset;
>>>       unsigned long next;
>>> @@ -104,14 +107,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
>>>           if (result)
>>>               return result;
>>>           if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
>>> -            set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
>>> +            set_pgd(pgd, __pgd(__pa(p4d) | kernpg_flag));
>>>           } else {
>>>               /*
>>>                * With p4d folded, pgd is equal to p4d.
>>>                * The pgd entry has to point to the pud page table in this case.
>>>                */
>>>               pud_t *pud = pud_offset(p4d, 0);
>>> -            set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
>>> +            set_pgd(pgd, __pgd(__pa(pud) | kernpg_flag));
>>>           }
>>>       }
>>>   diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>>> index d419d0e..1c76e3b 100644
>>> --- a/include/linux/kexec.h
>>> +++ b/include/linux/kexec.h
>>> @@ -383,6 +383,20 @@ static inline void *boot_phys_to_virt(unsigned long entry)
>>>       return phys_to_virt(boot_phys_to_phys(entry));
>>>   }
>>>   +#ifndef arch_kexec_post_alloc_pages
>>> +static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
>>> +                          gfp_t gfp)
>>> +{
>>> +    return 0;
>>> +}
>>> +#endif
>>> +
>>> +#ifndef arch_kexec_pre_free_pages
>>> +static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
>>> +{
>>> +}
>>> +#endif
>>> +
>>>   #else /* !CONFIG_KEXEC_CORE */
>>>   struct pt_regs;
>>>   struct task_struct;
>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>> index bfe62d5..bb5e7e3 100644
>>> --- a/kernel/kexec_core.c
>>> +++ b/kernel/kexec_core.c
>>> @@ -38,6 +38,7 @@
>>>   #include <linux/syscore_ops.h>
>>>   #include <linux/compiler.h>
>>>   #include <linux/hugetlb.h>
>>> +#include <linux/mem_encrypt.h>
>>>     #include <asm/page.h>
>>>   #include <asm/sections.h>
>>> @@ -315,6 +316,9 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
>>>           count = 1 << order;
>>>           for (i = 0; i < count; i++)
>>>               SetPageReserved(pages + i);
>>> +
>>> +        arch_kexec_post_alloc_pages(page_address(pages), count,
>>> +                        gfp_mask);
>>>       }
>>>         return pages;
>>> @@ -326,6 +330,9 @@ static void kimage_free_pages(struct page *page)
>>>         order = page_private(page);
>>>       count = 1 << order;
>>> +
>>> +    arch_kexec_pre_free_pages(page_address(page), count);
>>> +
>>>       for (i = 0; i < count; i++)
>>>           ClearPageReserved(page + i);
>>>       __free_pages(page, order);
>>>
>>>
>>> _______________________________________________
>>> kexec mailing list
>>> kexec@lists.infradead.org
>>> http://lists.infradead.org/mailman/listinfo/kexec
>>
Borislav Petkov May 31, 2017, 3:48 p.m. UTC | #12
On Wed, May 31, 2017 at 11:03:52PM +0800, Xunlei Pang wrote:
> For kdump case, it will be put in some reserved crash memory allocated
> by kexec-tools, and passed the corresponding start address of the
> allocated reserved crash memory to kdump kernel via "elfcorehdr=",
> please see kernel functions setup_elfcorehdr() and vmcore_init() for
> how it is parsed by kdump kernel.

... which could be a great way to pass the SME status to the second
kernel without any funky sysfs games.
diff mbox

Patch

diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 737da62..b2ec511 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -6,6 +6,7 @@  struct x86_mapping_info {
 	void *context;			 /* context for alloc_pgt_page */
 	unsigned long pmd_flag;		 /* page flag for PMD entry */
 	unsigned long offset;		 /* ident mapping offset */
+	unsigned long kernpg_flag;	 /* kernel pagetable flag override */
 };
 
 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index ac7692d..38b5920 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -58,6 +58,11 @@  static inline __cpuidle void native_halt(void)
 	asm volatile("hlt": : :"memory");
 }
 
+static inline __cpuidle void native_wbinvd_halt(void)
+{
+	asm volatile("wbinvd; hlt" : : : "memory");
+}
+
 #endif
 
 #ifdef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 70ef205..e8183ac 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -207,6 +207,14 @@  struct kexec_entry64_regs {
 	uint64_t r15;
 	uint64_t rip;
 };
+
+extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
+				       gfp_t gfp);
+#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
+
+extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
+#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
+
 #endif
 
 typedef void crash_vmclear_fn(void);
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index ce8cb1c..0f326f4 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -213,6 +213,7 @@  enum page_cache_mode {
 #define PAGE_KERNEL		__pgprot(__PAGE_KERNEL | _PAGE_ENC)
 #define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
 #define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_EXEC_NOENC	__pgprot(__PAGE_KERNEL_EXEC)
 #define PAGE_KERNEL_RX		__pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
 #define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
 #define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 085c3b3..11c0ca9 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -86,7 +86,7 @@  static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
 		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
 	}
 	pte = pte_offset_kernel(pmd, vaddr);
-	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
 	return 0;
 err:
 	free_transition_pgtable(image);
@@ -114,6 +114,7 @@  static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 		.alloc_pgt_page	= alloc_pgt_page,
 		.context	= image,
 		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
+		.kernpg_flag	= _KERNPG_TABLE_NOENC,
 	};
 	unsigned long mstart, mend;
 	pgd_t *level4p;
@@ -597,3 +598,35 @@  void arch_kexec_unprotect_crashkres(void)
 {
 	kexec_mark_crashkres(false);
 }
+
+int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
+{
+	int ret;
+
+	if (sme_active()) {
+		/*
+		 * If SME is active we need to be sure that kexec pages are
+		 * not encrypted because when we boot to the new kernel the
+		 * pages won't be accessed encrypted (initially).
+		 */
+		ret = set_memory_decrypted((unsigned long)vaddr, pages);
+		if (ret)
+			return ret;
+
+		if (gfp & __GFP_ZERO)
+			memset(vaddr, 0, pages * PAGE_SIZE);
+	}
+
+	return 0;
+}
+
+void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
+{
+	if (sme_active()) {
+		/*
+		 * If SME is active we need to reset the pages back to being
+		 * an encrypted mapping before freeing them.
+		 */
+		set_memory_encrypted((unsigned long)vaddr, pages);
+	}
+}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0bb8842..f4e5de6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -24,6 +24,7 @@ 
 #include <linux/cpuidle.h>
 #include <trace/events/power.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/kexec.h>
 #include <asm/cpu.h>
 #include <asm/apic.h>
 #include <asm/syscalls.h>
@@ -355,8 +356,25 @@  bool xen_set_default_idle(void)
 	return ret;
 }
 #endif
+
 void stop_this_cpu(void *dummy)
 {
+	bool do_wbinvd_halt = false;
+
+	if (kexec_in_progress && boot_cpu_has(X86_FEATURE_SME)) {
+		/*
+		 * If we are performing a kexec and the processor supports
+		 * SME then we need to clear out cache information before
+		 * halting. With kexec, going from SME inactive to SME active
+		 * requires clearing cache entries so that addresses without
+		 * the encryption bit set don't corrupt the same physical
+		 * address that has the encryption bit set when caches are
+		 * flushed. Perform a wbinvd followed by a halt to achieve
+		 * this.
+		 */
+		do_wbinvd_halt = true;
+	}
+
 	local_irq_disable();
 	/*
 	 * Remove this CPU:
@@ -365,8 +383,12 @@  void stop_this_cpu(void *dummy)
 	disable_local_APIC();
 	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
 
-	for (;;)
-		halt();
+	for (;;) {
+		if (do_wbinvd_halt)
+			native_wbinvd_halt();
+		else
+			halt();
+	}
 }
 
 /*
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 04210a2..2c9fd3e 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -20,6 +20,7 @@  static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
 static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
 			  unsigned long addr, unsigned long end)
 {
+	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
 	unsigned long next;
 
 	for (; addr < end; addr = next) {
@@ -39,7 +40,7 @@  static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
 		if (!pmd)
 			return -ENOMEM;
 		ident_pmd_init(info, pmd, addr, next);
-		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+		set_pud(pud, __pud(__pa(pmd) | kernpg_flag));
 	}
 
 	return 0;
@@ -48,6 +49,7 @@  static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
 static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
 			  unsigned long addr, unsigned long end)
 {
+	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
 	unsigned long next;
 
 	for (; addr < end; addr = next) {
@@ -67,7 +69,7 @@  static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
 		if (!pud)
 			return -ENOMEM;
 		ident_pud_init(info, pud, addr, next);
-		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+		set_p4d(p4d, __p4d(__pa(pud) | kernpg_flag));
 	}
 
 	return 0;
@@ -76,6 +78,7 @@  static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
 			      unsigned long pstart, unsigned long pend)
 {
+	unsigned long kernpg_flag = info->kernpg_flag ? : _KERNPG_TABLE;
 	unsigned long addr = pstart + info->offset;
 	unsigned long end = pend + info->offset;
 	unsigned long next;
@@ -104,14 +107,14 @@  int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
 		if (result)
 			return result;
 		if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
-			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+			set_pgd(pgd, __pgd(__pa(p4d) | kernpg_flag));
 		} else {
 			/*
 			 * With p4d folded, pgd is equal to p4d.
 			 * The pgd entry has to point to the pud page table in this case.
 			 */
 			pud_t *pud = pud_offset(p4d, 0);
-			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+			set_pgd(pgd, __pgd(__pa(pud) | kernpg_flag));
 		}
 	}
 
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index d419d0e..1c76e3b 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -383,6 +383,20 @@  static inline void *boot_phys_to_virt(unsigned long entry)
 	return phys_to_virt(boot_phys_to_phys(entry));
 }
 
+#ifndef arch_kexec_post_alloc_pages
+static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
+					      gfp_t gfp)
+{
+	return 0;
+}
+#endif
+
+#ifndef arch_kexec_pre_free_pages
+static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
+{
+}
+#endif
+
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index bfe62d5..bb5e7e3 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -38,6 +38,7 @@ 
 #include <linux/syscore_ops.h>
 #include <linux/compiler.h>
 #include <linux/hugetlb.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -315,6 +316,9 @@  static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 		count = 1 << order;
 		for (i = 0; i < count; i++)
 			SetPageReserved(pages + i);
+
+		arch_kexec_post_alloc_pages(page_address(pages), count,
+					    gfp_mask);
 	}
 
 	return pages;
@@ -326,6 +330,9 @@  static void kimage_free_pages(struct page *page)
 
 	order = page_private(page);
 	count = 1 << order;
+
+	arch_kexec_pre_free_pages(page_address(page), count);
+
 	for (i = 0; i < count; i++)
 		ClearPageReserved(page + i);
 	__free_pages(page, order);