diff mbox

[v5,29/32] x86/mm: Add support to encrypt the kernel in-place

Message ID 20170418212149.10190.70894.stgit@tlendack-t1.amdoffice.net (mailing list archive)
State New, archived
Headers show

Commit Message

Tom Lendacky April 18, 2017, 9:21 p.m. UTC
Add the support to encrypt the kernel in-place. This is done by creating
new page mappings for the kernel - a decrypted write-protected mapping
and an encrypted mapping. The kernel is encrypted by copying it through
a temporary buffer.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 arch/x86/include/asm/mem_encrypt.h |    6 +
 arch/x86/mm/Makefile               |    2 
 arch/x86/mm/mem_encrypt.c          |  262 ++++++++++++++++++++++++++++++++++++
 arch/x86/mm/mem_encrypt_boot.S     |  151 +++++++++++++++++++++
 4 files changed, 421 insertions(+)
 create mode 100644 arch/x86/mm/mem_encrypt_boot.S

Comments

Borislav Petkov May 18, 2017, 12:46 p.m. UTC | #1
On Tue, Apr 18, 2017 at 04:21:49PM -0500, Tom Lendacky wrote:
> Add the support to encrypt the kernel in-place. This is done by creating
> new page mappings for the kernel - a decrypted write-protected mapping
> and an encrypted mapping. The kernel is encrypted by copying it through
> a temporary buffer.
> 
> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> ---
>  arch/x86/include/asm/mem_encrypt.h |    6 +
>  arch/x86/mm/Makefile               |    2 
>  arch/x86/mm/mem_encrypt.c          |  262 ++++++++++++++++++++++++++++++++++++
>  arch/x86/mm/mem_encrypt_boot.S     |  151 +++++++++++++++++++++
>  4 files changed, 421 insertions(+)
>  create mode 100644 arch/x86/mm/mem_encrypt_boot.S
> 
> diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
> index b406df2..8f6f9b4 100644
> --- a/arch/x86/include/asm/mem_encrypt.h
> +++ b/arch/x86/include/asm/mem_encrypt.h
> @@ -31,6 +31,12 @@ static inline u64 sme_dma_mask(void)
>  	return ((u64)sme_me_mask << 1) - 1;
>  }
>  
> +void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
> +			 unsigned long decrypted_kernel_vaddr,
> +			 unsigned long kernel_len,
> +			 unsigned long encryption_wa,
> +			 unsigned long encryption_pgd);
> +
>  void __init sme_early_encrypt(resource_size_t paddr,
>  			      unsigned long size);
>  void __init sme_early_decrypt(resource_size_t paddr,
> diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
> index 9e13841..0633142 100644
> --- a/arch/x86/mm/Makefile
> +++ b/arch/x86/mm/Makefile
> @@ -38,3 +38,5 @@ obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
>  obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
>  obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
>  obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
> +
> +obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> index 30b07a3..0ff41a4 100644
> --- a/arch/x86/mm/mem_encrypt.c
> +++ b/arch/x86/mm/mem_encrypt.c
> @@ -24,6 +24,7 @@
>  #include <asm/setup.h>
>  #include <asm/bootparam.h>
>  #include <asm/cacheflush.h>
> +#include <asm/sections.h>
>  
>  /*
>   * Since SME related variables are set early in the boot process they must
> @@ -216,8 +217,269 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
>  	set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
>  }
>  
> +void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,

static

> +			  unsigned long end)
> +{
> +	unsigned long addr = start;
> +	pgdval_t *pgd_p;
> +
> +	while (addr < end) {
> +		unsigned long pgd_end;
> +
> +		pgd_end = (addr & PGDIR_MASK) + PGDIR_SIZE;
> +		if (pgd_end > end)
> +			pgd_end = end;
> +
> +		pgd_p = (pgdval_t *)pgd_base + pgd_index(addr);
> +		*pgd_p = 0;

Hmm, so this is a contiguous range from [start:end] which translates to
8-byte PGD pointers in the PGD page so you can simply memset that range,
no?

Instead of iterating over each one?

> +
> +		addr = pgd_end;
> +	}
> +}
> +
> +#define PGD_FLAGS	_KERNPG_TABLE_NOENC
> +#define PUD_FLAGS	_KERNPG_TABLE_NOENC
> +#define PMD_FLAGS	(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
> +
> +static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
> +				     unsigned long vaddr, pmdval_t pmd_val)
> +{
> +	pgdval_t pgd, *pgd_p;
> +	pudval_t pud, *pud_p;
> +	pmdval_t pmd, *pmd_p;

You should use the enclosing type, not the underlying one. I.e.,

	pgd_t *pgd;
	pud_t *pud;
	...

and then the macros native_p*d_val(), p*d_offset() and so on. I say
native_* because we don't want to have any paravirt nastiness here.
I believe your previous version was using the proper interfaces.

And the kernel has gotten 5-level pagetables support in
the meantime, so this'll need to start at p4d AFAICT.
arch/x86/mm/fault.c::dump_pagetable() looks like a good example to stare
at.

> +	pgd_p = (pgdval_t *)pgd_base + pgd_index(vaddr);
> +	pgd = *pgd_p;
> +	if (pgd) {
> +		pud_p = (pudval_t *)(pgd & ~PTE_FLAGS_MASK);
> +	} else {
> +		pud_p = pgtable_area;
> +		memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
> +		pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
> +
> +		*pgd_p = (pgdval_t)pud_p + PGD_FLAGS;
> +	}
> +
> +	pud_p += pud_index(vaddr);
> +	pud = *pud_p;
> +	if (pud) {
> +		if (pud & _PAGE_PSE)
> +			goto out;
> +
> +		pmd_p = (pmdval_t *)(pud & ~PTE_FLAGS_MASK);
> +	} else {
> +		pmd_p = pgtable_area;
> +		memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
> +		pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
> +
> +		*pud_p = (pudval_t)pmd_p + PUD_FLAGS;
> +	}
> +
> +	pmd_p += pmd_index(vaddr);
> +	pmd = *pmd_p;
> +	if (!pmd || !(pmd & _PAGE_PSE))
> +		*pmd_p = pmd_val;
> +
> +out:
> +	return pgtable_area;
> +}
> +
> +static unsigned long __init sme_pgtable_calc(unsigned long len)
> +{
> +	unsigned long pud_tables, pmd_tables;
> +	unsigned long total = 0;
> +
> +	/*
> +	 * Perform a relatively simplistic calculation of the pagetable
> +	 * entries that are needed. That mappings will be covered by 2MB
> +	 * PMD entries so we can conservatively calculate the required
> +	 * number of PUD and PMD structures needed to perform the mappings.
> +	 * Incrementing the count for each covers the case where the
> +	 * addresses cross entries.
> +	 */
> +	pud_tables = ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE;
> +	pud_tables++;
> +	pmd_tables = ALIGN(len, PUD_SIZE) / PUD_SIZE;
> +	pmd_tables++;
> +
> +	total += pud_tables * sizeof(pud_t) * PTRS_PER_PUD;
> +	total += pmd_tables * sizeof(pmd_t) * PTRS_PER_PMD;
> +
> +	/*
> +	 * Now calculate the added pagetable structures needed to populate
> +	 * the new pagetables.
> +	 */

Nice commenting, helps following what's going on.

> +	pud_tables = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
> +	pmd_tables = ALIGN(total, PUD_SIZE) / PUD_SIZE;
> +
> +	total += pud_tables * sizeof(pud_t) * PTRS_PER_PUD;
> +	total += pmd_tables * sizeof(pmd_t) * PTRS_PER_PMD;
> +
> +	return total;
> +}
> +
>  void __init sme_encrypt_kernel(void)
>  {
> +	pgd_t *pgd;
> +	void *pgtable_area;
> +	unsigned long kernel_start, kernel_end, kernel_len;
> +	unsigned long workarea_start, workarea_end, workarea_len;
> +	unsigned long execute_start, execute_end, execute_len;
> +	unsigned long pgtable_area_len;
> +	unsigned long decrypted_base;
> +	unsigned long paddr, pmd_flags;


Please sort the function's local variable declarations in reverse Christmas
tree order:

	<type> longest_variable_name;
	<type> shorter_var_name;
	<type> even_shorter;
	<type> i;

> +
> +	if (!sme_active())
> +		return;

...

> diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
> new file mode 100644
> index 0000000..fb58f9f
> --- /dev/null
> +++ b/arch/x86/mm/mem_encrypt_boot.S
> @@ -0,0 +1,151 @@
> +/*
> + * AMD Memory Encryption Support
> + *
> + * Copyright (C) 2016 Advanced Micro Devices, Inc.
> + *
> + * Author: Tom Lendacky <thomas.lendacky@amd.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/pgtable.h>
> +#include <asm/page.h>
> +#include <asm/processor-flags.h>
> +#include <asm/msr-index.h>
> +
> +	.text
> +	.code64
> +ENTRY(sme_encrypt_execute)
> +
> +	/*
> +	 * Entry parameters:
> +	 *   RDI - virtual address for the encrypted kernel mapping
> +	 *   RSI - virtual address for the decrypted kernel mapping
> +	 *   RDX - length of kernel
> +	 *   RCX - virtual address of the encryption workarea, including:
> +	 *     - stack page (PAGE_SIZE)
> +	 *     - encryption routine page (PAGE_SIZE)
> +	 *     - intermediate copy buffer (PMD_PAGE_SIZE)
> +	 *    R8 - physcial address of the pagetables to use for encryption
> +	 */
> +
> +	push	%rbp
> +	push	%r12
> +
> +	/* Set up a one page stack in the non-encrypted memory area */
> +	movq	%rsp, %rbp		/* Save current stack pointer */
> +	movq	%rcx, %rax		/* Workarea stack page */
> +	movq	%rax, %rsp		/* Set new stack pointer */
> +	addq	$PAGE_SIZE, %rsp	/* Stack grows from the bottom */
> +	addq	$PAGE_SIZE, %rax	/* Workarea encryption routine */
> +
> +	movq	%rdi, %r10		/* Encrypted kernel */
> +	movq	%rsi, %r11		/* Decrypted kernel */
> +	movq	%rdx, %r12		/* Kernel length */
> +
> +	/* Copy encryption routine into the workarea */
> +	movq	%rax, %rdi		/* Workarea encryption routine */
> +	leaq	.Lenc_start(%rip), %rsi	/* Encryption routine */
> +	movq	$(.Lenc_stop - .Lenc_start), %rcx	/* Encryption routine length */
> +	rep	movsb
> +
> +	/* Setup registers for call */
> +	movq	%r10, %rdi		/* Encrypted kernel */
> +	movq	%r11, %rsi		/* Decrypted kernel */
> +	movq	%r8, %rdx		/* Pagetables used for encryption */
> +	movq	%r12, %rcx		/* Kernel length */
> +	movq	%rax, %r8		/* Workarea encryption routine */
> +	addq	$PAGE_SIZE, %r8		/* Workarea intermediate copy buffer */
> +
> +	call	*%rax			/* Call the encryption routine */
> +
> +	movq	%rbp, %rsp		/* Restore original stack pointer */
> +
> +	pop	%r12
> +	pop	%rbp
> +
> +	ret
> +ENDPROC(sme_encrypt_execute)
> +
> +.Lenc_start:
> +ENTRY(sme_enc_routine)

A function called a "routine"? Why do we need the global symbol?
Nothing's referencing it AFAICT.

> +/*
> + * Routine used to encrypt kernel.
> + *   This routine must be run outside of the kernel proper since
> + *   the kernel will be encrypted during the process. So this
> + *   routine is defined here and then copied to an area outside
> + *   of the kernel where it will remain and run decrypted
> + *   during execution.
> + *
> + *   On entry the registers must be:
> + *     RDI - virtual address for the encrypted kernel mapping
> + *     RSI - virtual address for the decrypted kernel mapping
> + *     RDX - address of the pagetables to use for encryption
> + *     RCX - length of kernel
> + *      R8 - intermediate copy buffer
> + *
> + *     RAX - points to this routine
> + *
> + * The kernel will be encrypted by copying from the non-encrypted
> + * kernel space to an intermediate buffer and then copying from the
> + * intermediate buffer back to the encrypted kernel space. The physical
> + * addresses of the two kernel space mappings are the same which
> + * results in the kernel being encrypted "in place".
> + */
> +	/* Enable the new page tables */
> +	mov	%rdx, %cr3
> +
> +	/* Flush any global TLBs */
> +	mov	%cr4, %rdx
> +	andq	$~X86_CR4_PGE, %rdx
> +	mov	%rdx, %cr4
> +	orq	$X86_CR4_PGE, %rdx
> +	mov	%rdx, %cr4
> +
> +	/* Set the PAT register PA5 entry to write-protect */
> +	push	%rcx
> +	movl	$MSR_IA32_CR_PAT, %ecx
> +	rdmsr
> +	push	%rdx			/* Save original PAT value */
> +	andl	$0xffff00ff, %edx	/* Clear PA5 */
> +	orl	$0x00000500, %edx	/* Set PA5 to WP */

Maybe check first whether PA5 is already set correctly and avoid the
WRMSR and the restoring below too?

> +	wrmsr
> +	pop	%rdx			/* RDX contains original PAT value */
> +	pop	%rcx
> +
> +	movq	%rcx, %r9		/* Save kernel length */
> +	movq	%rdi, %r10		/* Save encrypted kernel address */
> +	movq	%rsi, %r11		/* Save decrypted kernel address */
> +
> +	wbinvd				/* Invalidate any cache entries */
> +
> +	/* Copy/encrypt 2MB at a time */
> +1:
> +	movq	%r11, %rsi		/* Source - decrypted kernel */
> +	movq	%r8, %rdi		/* Dest   - intermediate copy buffer */
> +	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
> +	rep	movsb

not movsQ?

> +	movq	%r8, %rsi		/* Source - intermediate copy buffer */
> +	movq	%r10, %rdi		/* Dest   - encrypted kernel */
> +	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
> +	rep	movsb
> +
> +	addq	$PMD_PAGE_SIZE, %r11
> +	addq	$PMD_PAGE_SIZE, %r10
> +	subq	$PMD_PAGE_SIZE, %r9	/* Kernel length decrement */
> +	jnz	1b			/* Kernel length not zero? */
> +
> +	/* Restore PAT register */
> +	push	%rdx			/* Save original PAT value */
> +	movl	$MSR_IA32_CR_PAT, %ecx
> +	rdmsr
> +	pop	%rdx			/* Restore original PAT value */
> +	wrmsr
> +
> +	ret
> +ENDPROC(sme_enc_routine)
> +.Lenc_stop:
>
Tom Lendacky May 25, 2017, 10:24 p.m. UTC | #2
On 5/18/2017 7:46 AM, Borislav Petkov wrote:
> On Tue, Apr 18, 2017 at 04:21:49PM -0500, Tom Lendacky wrote:
>> Add the support to encrypt the kernel in-place. This is done by creating
>> new page mappings for the kernel - a decrypted write-protected mapping
>> and an encrypted mapping. The kernel is encrypted by copying it through
>> a temporary buffer.
>>
>> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
>> ---
>>  arch/x86/include/asm/mem_encrypt.h |    6 +
>>  arch/x86/mm/Makefile               |    2
>>  arch/x86/mm/mem_encrypt.c          |  262 ++++++++++++++++++++++++++++++++++++
>>  arch/x86/mm/mem_encrypt_boot.S     |  151 +++++++++++++++++++++
>>  4 files changed, 421 insertions(+)
>>  create mode 100644 arch/x86/mm/mem_encrypt_boot.S
>>
>> diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
>> index b406df2..8f6f9b4 100644
>> --- a/arch/x86/include/asm/mem_encrypt.h
>> +++ b/arch/x86/include/asm/mem_encrypt.h
>> @@ -31,6 +31,12 @@ static inline u64 sme_dma_mask(void)
>>  	return ((u64)sme_me_mask << 1) - 1;
>>  }
>>
>> +void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
>> +			 unsigned long decrypted_kernel_vaddr,
>> +			 unsigned long kernel_len,
>> +			 unsigned long encryption_wa,
>> +			 unsigned long encryption_pgd);
>> +
>>  void __init sme_early_encrypt(resource_size_t paddr,
>>  			      unsigned long size);
>>  void __init sme_early_decrypt(resource_size_t paddr,
>> diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
>> index 9e13841..0633142 100644
>> --- a/arch/x86/mm/Makefile
>> +++ b/arch/x86/mm/Makefile
>> @@ -38,3 +38,5 @@ obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
>>  obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
>>  obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
>>  obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
>> +
>> +obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
>> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
>> index 30b07a3..0ff41a4 100644
>> --- a/arch/x86/mm/mem_encrypt.c
>> +++ b/arch/x86/mm/mem_encrypt.c
>> @@ -24,6 +24,7 @@
>>  #include <asm/setup.h>
>>  #include <asm/bootparam.h>
>>  #include <asm/cacheflush.h>
>> +#include <asm/sections.h>
>>
>>  /*
>>   * Since SME related variables are set early in the boot process they must
>> @@ -216,8 +217,269 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
>>  	set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
>>  }
>>
>> +void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
>
> static

Yup.

>
>> +			  unsigned long end)
>> +{
>> +	unsigned long addr = start;
>> +	pgdval_t *pgd_p;
>> +
>> +	while (addr < end) {
>> +		unsigned long pgd_end;
>> +
>> +		pgd_end = (addr & PGDIR_MASK) + PGDIR_SIZE;
>> +		if (pgd_end > end)
>> +			pgd_end = end;
>> +
>> +		pgd_p = (pgdval_t *)pgd_base + pgd_index(addr);
>> +		*pgd_p = 0;
>
> Hmm, so this is a contiguous range from [start:end] which translates to
> 8-byte PGD pointers in the PGD page so you can simply memset that range,
> no?
>
> Instead of iterating over each one?

I guess I could do that, but this will probably only end up clearing a
single PGD entry anyway since it's highly doubtful the address range
would cross a 512GB boundary.

>
>> +
>> +		addr = pgd_end;
>> +	}
>> +}
>> +
>> +#define PGD_FLAGS	_KERNPG_TABLE_NOENC
>> +#define PUD_FLAGS	_KERNPG_TABLE_NOENC
>> +#define PMD_FLAGS	(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
>> +
>> +static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
>> +				     unsigned long vaddr, pmdval_t pmd_val)
>> +{
>> +	pgdval_t pgd, *pgd_p;
>> +	pudval_t pud, *pud_p;
>> +	pmdval_t pmd, *pmd_p;
>
> You should use the enclosing type, not the underlying one. I.e.,
>
> 	pgd_t *pgd;
> 	pud_t *pud;
> 	...
>
> and then the macros native_p*d_val(), p*d_offset() and so on. I say
> native_* because we don't want to have any paravirt nastyness here.
> I believe your previous version was using the proper interfaces.

I won't be able to use the p*d_offset() macros since they use __va()
and we're identity mapped during this time (which is why I would guess
the proposed changes for the 5-level pagetables in
arch/x86/kernel/head64.c, __startup_64, don't use these macros
either). I should be able to use the native_set_p*d() and others though,
I'll look into that.

>
> And the kernel has gotten 5-level pagetables support in
> the meantime, so this'll need to start at p4d AFAICT.
> arch/x86/mm/fault.c::dump_pagetable() looks like a good example to stare
> at.

Yeah, I accounted for that in the other parts of the code but I need
to do that here also.

>
>> +	pgd_p = (pgdval_t *)pgd_base + pgd_index(vaddr);
>> +	pgd = *pgd_p;
>> +	if (pgd) {
>> +		pud_p = (pudval_t *)(pgd & ~PTE_FLAGS_MASK);
>> +	} else {
>> +		pud_p = pgtable_area;
>> +		memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
>> +		pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
>> +
>> +		*pgd_p = (pgdval_t)pud_p + PGD_FLAGS;
>> +	}
>> +
>> +	pud_p += pud_index(vaddr);
>> +	pud = *pud_p;
>> +	if (pud) {
>> +		if (pud & _PAGE_PSE)
>> +			goto out;
>> +
>> +		pmd_p = (pmdval_t *)(pud & ~PTE_FLAGS_MASK);
>> +	} else {
>> +		pmd_p = pgtable_area;
>> +		memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
>> +		pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
>> +
>> +		*pud_p = (pudval_t)pmd_p + PUD_FLAGS;
>> +	}
>> +
>> +	pmd_p += pmd_index(vaddr);
>> +	pmd = *pmd_p;
>> +	if (!pmd || !(pmd & _PAGE_PSE))
>> +		*pmd_p = pmd_val;
>> +
>> +out:
>> +	return pgtable_area;
>> +}
>> +
>> +static unsigned long __init sme_pgtable_calc(unsigned long len)
>> +{
>> +	unsigned long pud_tables, pmd_tables;
>> +	unsigned long total = 0;
>> +
>> +	/*
>> +	 * Perform a relatively simplistic calculation of the pagetable
>> +	 * entries that are needed. That mappings will be covered by 2MB
>> +	 * PMD entries so we can conservatively calculate the required
>> +	 * number of PUD and PMD structures needed to perform the mappings.
>> +	 * Incrementing the count for each covers the case where the
>> +	 * addresses cross entries.
>> +	 */
>> +	pud_tables = ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE;
>> +	pud_tables++;
>> +	pmd_tables = ALIGN(len, PUD_SIZE) / PUD_SIZE;
>> +	pmd_tables++;
>> +
>> +	total += pud_tables * sizeof(pud_t) * PTRS_PER_PUD;
>> +	total += pmd_tables * sizeof(pmd_t) * PTRS_PER_PMD;
>> +
>> +	/*
>> +	 * Now calculate the added pagetable structures needed to populate
>> +	 * the new pagetables.
>> +	 */
>
> Nice commenting, helps following what's going on.
>
>> +	pud_tables = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
>> +	pmd_tables = ALIGN(total, PUD_SIZE) / PUD_SIZE;
>> +
>> +	total += pud_tables * sizeof(pud_t) * PTRS_PER_PUD;
>> +	total += pmd_tables * sizeof(pmd_t) * PTRS_PER_PMD;
>> +
>> +	return total;
>> +}
>> +
>>  void __init sme_encrypt_kernel(void)
>>  {
>> +	pgd_t *pgd;
>> +	void *pgtable_area;
>> +	unsigned long kernel_start, kernel_end, kernel_len;
>> +	unsigned long workarea_start, workarea_end, workarea_len;
>> +	unsigned long execute_start, execute_end, execute_len;
>> +	unsigned long pgtable_area_len;
>> +	unsigned long decrypted_base;
>> +	unsigned long paddr, pmd_flags;
>
>
> Please sort function local variables declaration in a reverse christmas
> tree order:
>
> 	<type> longest_variable_name;
> 	<type> shorter_var_name;
> 	<type> even_shorter;
> 	<type> i;
>

Will do.

>> +
>> +	if (!sme_active())
>> +		return;
>
> ...
>
>> diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
>> new file mode 100644
>> index 0000000..fb58f9f
>> --- /dev/null
>> +++ b/arch/x86/mm/mem_encrypt_boot.S
>> @@ -0,0 +1,151 @@
>> +/*
>> + * AMD Memory Encryption Support
>> + *
>> + * Copyright (C) 2016 Advanced Micro Devices, Inc.
>> + *
>> + * Author: Tom Lendacky <thomas.lendacky@amd.com>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + */
>> +
>> +#include <linux/linkage.h>
>> +#include <asm/pgtable.h>
>> +#include <asm/page.h>
>> +#include <asm/processor-flags.h>
>> +#include <asm/msr-index.h>
>> +
>> +	.text
>> +	.code64
>> +ENTRY(sme_encrypt_execute)
>> +
>> +	/*
>> +	 * Entry parameters:
>> +	 *   RDI - virtual address for the encrypted kernel mapping
>> +	 *   RSI - virtual address for the decrypted kernel mapping
>> +	 *   RDX - length of kernel
>> +	 *   RCX - virtual address of the encryption workarea, including:
>> +	 *     - stack page (PAGE_SIZE)
>> +	 *     - encryption routine page (PAGE_SIZE)
>> +	 *     - intermediate copy buffer (PMD_PAGE_SIZE)
>> +	 *    R8 - physcial address of the pagetables to use for encryption
>> +	 */
>> +
>> +	push	%rbp
>> +	push	%r12
>> +
>> +	/* Set up a one page stack in the non-encrypted memory area */
>> +	movq	%rsp, %rbp		/* Save current stack pointer */
>> +	movq	%rcx, %rax		/* Workarea stack page */
>> +	movq	%rax, %rsp		/* Set new stack pointer */
>> +	addq	$PAGE_SIZE, %rsp	/* Stack grows from the bottom */
>> +	addq	$PAGE_SIZE, %rax	/* Workarea encryption routine */
>> +
>> +	movq	%rdi, %r10		/* Encrypted kernel */
>> +	movq	%rsi, %r11		/* Decrypted kernel */
>> +	movq	%rdx, %r12		/* Kernel length */
>> +
>> +	/* Copy encryption routine into the workarea */
>> +	movq	%rax, %rdi		/* Workarea encryption routine */
>> +	leaq	.Lenc_start(%rip), %rsi	/* Encryption routine */
>> +	movq	$(.Lenc_stop - .Lenc_start), %rcx	/* Encryption routine length */
>> +	rep	movsb
>> +
>> +	/* Setup registers for call */
>> +	movq	%r10, %rdi		/* Encrypted kernel */
>> +	movq	%r11, %rsi		/* Decrypted kernel */
>> +	movq	%r8, %rdx		/* Pagetables used for encryption */
>> +	movq	%r12, %rcx		/* Kernel length */
>> +	movq	%rax, %r8		/* Workarea encryption routine */
>> +	addq	$PAGE_SIZE, %r8		/* Workarea intermediate copy buffer */
>> +
>> +	call	*%rax			/* Call the encryption routine */
>> +
>> +	movq	%rbp, %rsp		/* Restore original stack pointer */
>> +
>> +	pop	%r12
>> +	pop	%rbp
>> +
>> +	ret
>> +ENDPROC(sme_encrypt_execute)
>> +
>> +.Lenc_start:
>> +ENTRY(sme_enc_routine)
>
> A function called a "routine"? Why do we need the global symbol?
> Nothing's referencing it AFAICT.

I can change the name. As for the use of ENTRY... without the
ENTRY/ENDPROC combination I was receiving a warning about a return
instruction outside of a callable function. It looks like I can just
define the "sme_enc_routine:" label with the ENDPROC and the warning
goes away and the global is avoided. It doesn't like the local labels
(.L...) so I'll use the new name.

>
>> +/*
>> + * Routine used to encrypt kernel.
>> + *   This routine must be run outside of the kernel proper since
>> + *   the kernel will be encrypted during the process. So this
>> + *   routine is defined here and then copied to an area outside
>> + *   of the kernel where it will remain and run decrypted
>> + *   during execution.
>> + *
>> + *   On entry the registers must be:
>> + *     RDI - virtual address for the encrypted kernel mapping
>> + *     RSI - virtual address for the decrypted kernel mapping
>> + *     RDX - address of the pagetables to use for encryption
>> + *     RCX - length of kernel
>> + *      R8 - intermediate copy buffer
>> + *
>> + *     RAX - points to this routine
>> + *
>> + * The kernel will be encrypted by copying from the non-encrypted
>> + * kernel space to an intermediate buffer and then copying from the
>> + * intermediate buffer back to the encrypted kernel space. The physical
>> + * addresses of the two kernel space mappings are the same which
>> + * results in the kernel being encrypted "in place".
>> + */
>> +	/* Enable the new page tables */
>> +	mov	%rdx, %cr3
>> +
>> +	/* Flush any global TLBs */
>> +	mov	%cr4, %rdx
>> +	andq	$~X86_CR4_PGE, %rdx
>> +	mov	%rdx, %cr4
>> +	orq	$X86_CR4_PGE, %rdx
>> +	mov	%rdx, %cr4
>> +
>> +	/* Set the PAT register PA5 entry to write-protect */
>> +	push	%rcx
>> +	movl	$MSR_IA32_CR_PAT, %ecx
>> +	rdmsr
>> +	push	%rdx			/* Save original PAT value */
>> +	andl	$0xffff00ff, %edx	/* Clear PA5 */
>> +	orl	$0x00000500, %edx	/* Set PA5 to WP */
>
> Maybe check first whether PA5 is already set correctly and avoid the
> WRMSR and the restoring below too?

In the overall scheme of things it's probably not that big a deal when
compared to everything that's about to happen below.

>
>> +	wrmsr
>> +	pop	%rdx			/* RDX contains original PAT value */
>> +	pop	%rcx
>> +
>> +	movq	%rcx, %r9		/* Save kernel length */
>> +	movq	%rdi, %r10		/* Save encrypted kernel address */
>> +	movq	%rsi, %r11		/* Save decrypted kernel address */
>> +
>> +	wbinvd				/* Invalidate any cache entries */
>> +
>> +	/* Copy/encrypt 2MB at a time */
>> +1:
>> +	movq	%r11, %rsi		/* Source - decrypted kernel */
>> +	movq	%r8, %rdi		/* Dest   - intermediate copy buffer */
>> +	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
>> +	rep	movsb
>
> not movsQ?

The hardware will try to optimize rep movsb into large chunks assuming
things are aligned, sizes are large enough, etc. so we don't have to
explicitly specify and set up for a rep movsq.

Thanks,
Tom

>
>> +	movq	%r8, %rsi		/* Source - intermediate copy buffer */
>> +	movq	%r10, %rdi		/* Dest   - encrypted kernel */
>> +	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
>> +	rep	movsb
>> +
>> +	addq	$PMD_PAGE_SIZE, %r11
>> +	addq	$PMD_PAGE_SIZE, %r10
>> +	subq	$PMD_PAGE_SIZE, %r9	/* Kernel length decrement */
>> +	jnz	1b			/* Kernel length not zero? */
>> +
>> +	/* Restore PAT register */
>> +	push	%rdx			/* Save original PAT value */
>> +	movl	$MSR_IA32_CR_PAT, %ecx
>> +	rdmsr
>> +	pop	%rdx			/* Restore original PAT value */
>> +	wrmsr
>> +
>> +	ret
>> +ENDPROC(sme_enc_routine)
>> +.Lenc_stop:
>>
>
Borislav Petkov May 26, 2017, 4:25 p.m. UTC | #3
On Thu, May 25, 2017 at 05:24:27PM -0500, Tom Lendacky wrote:
> I guess I could do that, but this will probably only end up clearing a
> single PGD entry anyway since it's highly doubtful the address range
> would cross a 512GB boundary.

Or you can compute how many PGD entries (each covering 512G) there are
and clear just the right amount. :^)

> I can change the name. As for the use of ENTRY... without the
> ENTRY/ENDPROC combination I was receiving a warning about a return
> instruction outside of a callable function. It looks like I can just
> define the "sme_enc_routine:" label with the ENDPROC and the warning
> goes away and the global is avoided. It doesn't like the local labels
> (.L...) so I'll use the new name.

Is that warning from objtool or where does it come from?

How do I trigger it locally?

> The hardware will try to optimize rep movsb into large chunks assuming
> things are aligned, sizes are large enough, etc. so we don't have to
> explicitly specify and setup for a rep movsq.

I thought the hw does that for movsq too?
Tom Lendacky May 30, 2017, 4:39 p.m. UTC | #4
On 5/26/2017 11:25 AM, Borislav Petkov wrote:
> On Thu, May 25, 2017 at 05:24:27PM -0500, Tom Lendacky wrote:
>> I guess I could do that, but this will probably only end up clearing a
>> single PGD entry anyway since it's highly doubtful the address range
>> would cross a 512GB boundary.
> 
> Or you can compute how many 512G-covering, i.e., PGD entries there are
> and clear just the right amnount. :^)
> 
>> I can change the name. As for the use of ENTRY... without the
>> ENTRY/ENDPROC combination I was receiving a warning about a return
>> instruction outside of a callable function. It looks like I can just
>> define the "sme_enc_routine:" label with the ENDPROC and the warning
>> goes away and the global is avoided. It doesn't like the local labels
>> (.L...) so I'll use the new name.
> 
> Is that warning from objtool or where does it come from?

Yes, it's from objtool:

arch/x86/mm/mem_encrypt_boot.o: warning: objtool: .text+0xd2: return 
instruction outside of a callable function

> 
> How do I trigger it locally?

I think having CONFIG_STACK_VALIDATION=y will trigger it.

> 
>> The hardware will try to optimize rep movsb into large chunks assuming
>> things are aligned, sizes are large enough, etc. so we don't have to
>> explicitly specify and setup for a rep movsq.
> 
> I thought the hw does that for movsq too?

It does.

Thanks,
Tom

>
diff mbox

Patch

diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index b406df2..8f6f9b4 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -31,6 +31,12 @@  static inline u64 sme_dma_mask(void)
 	return ((u64)sme_me_mask << 1) - 1;
 }
 
+void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
+			 unsigned long decrypted_kernel_vaddr,
+			 unsigned long kernel_len,
+			 unsigned long encryption_wa,
+			 unsigned long encryption_pgd);
+
 void __init sme_early_encrypt(resource_size_t paddr,
 			      unsigned long size);
 void __init sme_early_decrypt(resource_size_t paddr,
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 9e13841..0633142 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -38,3 +38,5 @@  obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+
+obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 30b07a3..0ff41a4 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -24,6 +24,7 @@ 
 #include <asm/setup.h>
 #include <asm/bootparam.h>
 #include <asm/cacheflush.h>
+#include <asm/sections.h>
 
 /*
  * Since SME related variables are set early in the boot process they must
@@ -216,8 +217,269 @@  void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
 	set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
 }
 
+void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
+			  unsigned long end)
+{
+	unsigned long addr = start;
+	pgdval_t *pgd_p;
+
+	while (addr < end) {
+		unsigned long pgd_end;
+
+		pgd_end = (addr & PGDIR_MASK) + PGDIR_SIZE;
+		if (pgd_end > end)
+			pgd_end = end;
+
+		pgd_p = (pgdval_t *)pgd_base + pgd_index(addr);
+		*pgd_p = 0;
+
+		addr = pgd_end;
+	}
+}
+
+#define PGD_FLAGS	_KERNPG_TABLE_NOENC
+#define PUD_FLAGS	_KERNPG_TABLE_NOENC
+#define PMD_FLAGS	(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
+
+static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
+				     unsigned long vaddr, pmdval_t pmd_val)
+{
+	pgdval_t pgd, *pgd_p;
+	pudval_t pud, *pud_p;
+	pmdval_t pmd, *pmd_p;
+
+	pgd_p = (pgdval_t *)pgd_base + pgd_index(vaddr);
+	pgd = *pgd_p;
+	if (pgd) {
+		pud_p = (pudval_t *)(pgd & ~PTE_FLAGS_MASK);
+	} else {
+		pud_p = pgtable_area;
+		memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+		pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
+
+		*pgd_p = (pgdval_t)pud_p + PGD_FLAGS;
+	}
+
+	pud_p += pud_index(vaddr);
+	pud = *pud_p;
+	if (pud) {
+		if (pud & _PAGE_PSE)
+			goto out;
+
+		pmd_p = (pmdval_t *)(pud & ~PTE_FLAGS_MASK);
+	} else {
+		pmd_p = pgtable_area;
+		memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
+		pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
+
+		*pud_p = (pudval_t)pmd_p + PUD_FLAGS;
+	}
+
+	pmd_p += pmd_index(vaddr);
+	pmd = *pmd_p;
+	if (!pmd || !(pmd & _PAGE_PSE))
+		*pmd_p = pmd_val;
+
+out:
+	return pgtable_area;
+}
+
+static unsigned long __init sme_pgtable_calc(unsigned long len)
+{
+	unsigned long pud_tables, pmd_tables;
+	unsigned long total = 0;
+
+	/*
+	 * Perform a relatively simplistic calculation of the pagetable
+	 * entries that are needed. The mappings will be covered by 2MB
+	 * PMD entries so we can conservatively calculate the required
+	 * number of PUD and PMD structures needed to perform the mappings.
+	 * Incrementing the count for each covers the case where the
+	 * addresses cross entries.
+	 */
+	pud_tables = ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE;
+	pud_tables++;
+	pmd_tables = ALIGN(len, PUD_SIZE) / PUD_SIZE;
+	pmd_tables++;
+
+	total += pud_tables * sizeof(pud_t) * PTRS_PER_PUD;
+	total += pmd_tables * sizeof(pmd_t) * PTRS_PER_PMD;
+
+	/*
+	 * Now calculate the added pagetable structures needed to populate
+	 * the new pagetables.
+	 */
+	pud_tables = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
+	pmd_tables = ALIGN(total, PUD_SIZE) / PUD_SIZE;
+
+	total += pud_tables * sizeof(pud_t) * PTRS_PER_PUD;
+	total += pmd_tables * sizeof(pmd_t) * PTRS_PER_PMD;
+
+	return total;
+}
+
 void __init sme_encrypt_kernel(void)
 {
+	pgd_t *pgd;
+	void *pgtable_area;
+	unsigned long kernel_start, kernel_end, kernel_len;
+	unsigned long workarea_start, workarea_end, workarea_len;
+	unsigned long execute_start, execute_end, execute_len;
+	unsigned long pgtable_area_len;
+	unsigned long decrypted_base;
+	unsigned long paddr, pmd_flags;
+
+	if (!sme_active())
+		return;
+
+	/*
+	 * Prepare for encrypting the kernel by building new pagetables with
+	 * the necessary attributes needed to encrypt the kernel in place.
+	 *
+	 *   One range of virtual addresses will map the memory occupied
+	 *   by the kernel as encrypted.
+	 *
+	 *   Another range of virtual addresses will map the memory occupied
+	 *   by the kernel as decrypted and write-protected.
+	 *
+	 *     The use of write-protect attribute will prevent any of the
+	 *     memory from being cached.
+	 */
+
+	/* Physical addresses give us the identity-mapped virtual addresses */
+	kernel_start = __pa_symbol(_text);
+	kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
+	kernel_len = kernel_end - kernel_start;
+
+	/* Set the encryption workarea to be immediately after the kernel */
+	workarea_start = kernel_end;
+
+	/*
+	 * Calculate required number of workarea bytes needed:
+	 *   executable encryption area size:
+	 *     stack page (PAGE_SIZE)
+	 *     encryption routine page (PAGE_SIZE)
+	 *     intermediate copy buffer (PMD_PAGE_SIZE)
+	 *   pagetable structures for the encryption of the kernel
+	 *   pagetable structures for workarea (in case not currently mapped)
+	 */
+	execute_start = workarea_start;
+	execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
+	execute_len = execute_end - execute_start;
+
+	/*
+	 * One PGD for both encrypted and decrypted mappings and a set of
+	 * PUDs and PMDs for each of the encrypted and decrypted mappings.
+	 */
+	pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
+	pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
+
+	/* PUDs and PMDs needed in the current pagetables for the workarea */
+	pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
+
+	/*
+	 * The total workarea includes the executable encryption area and
+	 * the pagetable area.
+	 */
+	workarea_len = execute_len + pgtable_area_len;
+	workarea_end = workarea_start + workarea_len;
+
+	/*
+	 * Set the address to the start of where newly created pagetable
+	 * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
+	 * structures are created when the workarea is added to the current
+	 * pagetables and when the new encrypted and decrypted kernel
+	 * mappings are populated.
+	 */
+	pgtable_area = (void *)execute_end;
+
+	/*
+	 * Make sure the current pagetable structure has entries for
+	 * addressing the workarea.
+	 */
+	pgd = (pgd_t *)native_read_cr3();
+	paddr = workarea_start;
+	while (paddr < workarea_end) {
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr,
+						paddr + PMD_FLAGS);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+	native_write_cr3((unsigned long)pgd);
+
+	/*
+	 * A new pagetable structure is being built to allow for the kernel
+	 * to be encrypted. It starts with an empty PGD that will then be
+	 * populated with new PUDs and PMDs as the encrypted and decrypted
+	 * kernel mappings are created.
+	 */
+	pgd = pgtable_area;
+	memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
+	pgtable_area += sizeof(*pgd) * PTRS_PER_PGD;
+
+	/* Add encrypted kernel (identity) mappings */
+	pmd_flags = PMD_FLAGS | _PAGE_ENC;
+	paddr = kernel_start;
+	while (paddr < kernel_end) {
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr,
+						paddr + pmd_flags);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/*
+	 * A different PGD index/entry must be used to get different
+	 * pagetable entries for the decrypted mapping. Choose the next
+	 * PGD index and convert it to a virtual address to be used as
+	 * the base of the mapping.
+	 */
+	decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
+	decrypted_base <<= PGDIR_SHIFT;
+
+	/* Add decrypted, write-protected kernel (non-identity) mappings */
+	pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
+	paddr = kernel_start;
+	while (paddr < kernel_end) {
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr + decrypted_base,
+						paddr + pmd_flags);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/* Add decrypted workarea mappings to both kernel mappings */
+	paddr = workarea_start;
+	while (paddr < workarea_end) {
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr,
+						paddr + PMD_FLAGS);
+
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr + decrypted_base,
+						paddr + PMD_FLAGS);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/* Perform the encryption */
+	sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
+			    kernel_len, workarea_start, (unsigned long)pgd);
+
+	/*
+	 * At this point we are running encrypted.  Remove the mappings for
+	 * the decrypted areas - all that is needed for this is to remove
+	 * the PGD entry/entries.
+	 */
+	sme_clear_pgd(pgd, kernel_start + decrypted_base,
+		      kernel_end + decrypted_base);
+
+	sme_clear_pgd(pgd, workarea_start + decrypted_base,
+		      workarea_end + decrypted_base);
+
+	/* Flush the TLB - no globals so cr3 is enough */
+	native_write_cr3(native_read_cr3());
 }
 
 unsigned long __init sme_enable(void)
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
new file mode 100644
index 0000000..fb58f9f
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -0,0 +1,151 @@ 
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+
+	.text
+	.code64
+ENTRY(sme_encrypt_execute)
+
+	/*
+	 * Entry parameters:
+	 *   RDI - virtual address for the encrypted kernel mapping
+	 *   RSI - virtual address for the decrypted kernel mapping
+	 *   RDX - length of kernel
+	 *   RCX - virtual address of the encryption workarea, including:
+	 *     - stack page (PAGE_SIZE)
+	 *     - encryption routine page (PAGE_SIZE)
+	 *     - intermediate copy buffer (PMD_PAGE_SIZE)
+	 *    R8 - physical address of the pagetables to use for encryption
+	 */
+
+	push	%rbp
+	push	%r12
+
+	/* Set up a one page stack in the non-encrypted memory area */
+	movq	%rsp, %rbp		/* Save current stack pointer */
+	movq	%rcx, %rax		/* Workarea stack page */
+	movq	%rax, %rsp		/* Set new stack pointer */
+	addq	$PAGE_SIZE, %rsp	/* Stack grows from the bottom */
+	addq	$PAGE_SIZE, %rax	/* Workarea encryption routine */
+
+	movq	%rdi, %r10		/* Encrypted kernel */
+	movq	%rsi, %r11		/* Decrypted kernel */
+	movq	%rdx, %r12		/* Kernel length */
+
+	/* Copy encryption routine into the workarea */
+	movq	%rax, %rdi		/* Workarea encryption routine */
+	leaq	.Lenc_start(%rip), %rsi	/* Encryption routine */
+	movq	$(.Lenc_stop - .Lenc_start), %rcx	/* Encryption routine length */
+	rep	movsb
+
+	/* Setup registers for call */
+	movq	%r10, %rdi		/* Encrypted kernel */
+	movq	%r11, %rsi		/* Decrypted kernel */
+	movq	%r8, %rdx		/* Pagetables used for encryption */
+	movq	%r12, %rcx		/* Kernel length */
+	movq	%rax, %r8		/* Workarea encryption routine */
+	addq	$PAGE_SIZE, %r8		/* Workarea intermediate copy buffer */
+
+	call	*%rax			/* Call the encryption routine */
+
+	movq	%rbp, %rsp		/* Restore original stack pointer */
+
+	pop	%r12
+	pop	%rbp
+
+	ret
+ENDPROC(sme_encrypt_execute)
+
+.Lenc_start:
+ENTRY(sme_enc_routine)
+/*
+ * Routine used to encrypt kernel.
+ *   This routine must be run outside of the kernel proper since
+ *   the kernel will be encrypted during the process. So this
+ *   routine is defined here and then copied to an area outside
+ *   of the kernel where it will remain and run decrypted
+ *   during execution.
+ *
+ *   On entry the registers must be:
+ *     RDI - virtual address for the encrypted kernel mapping
+ *     RSI - virtual address for the decrypted kernel mapping
+ *     RDX - address of the pagetables to use for encryption
+ *     RCX - length of kernel
+ *      R8 - intermediate copy buffer
+ *
+ *     RAX - points to this routine
+ *
+ * The kernel will be encrypted by copying from the non-encrypted
+ * kernel space to an intermediate buffer and then copying from the
+ * intermediate buffer back to the encrypted kernel space. The physical
+ * addresses of the two kernel space mappings are the same which
+ * results in the kernel being encrypted "in place".
+ */
+	/* Enable the new page tables */
+	mov	%rdx, %cr3
+
+	/* Flush any global TLBs */
+	mov	%cr4, %rdx
+	andq	$~X86_CR4_PGE, %rdx
+	mov	%rdx, %cr4
+	orq	$X86_CR4_PGE, %rdx
+	mov	%rdx, %cr4
+
+	/* Set the PAT register PA5 entry to write-protect */
+	push	%rcx
+	movl	$MSR_IA32_CR_PAT, %ecx
+	rdmsr
+	push	%rdx			/* Save original PAT value */
+	andl	$0xffff00ff, %edx	/* Clear PA5 */
+	orl	$0x00000500, %edx	/* Set PA5 to WP */
+	wrmsr
+	pop	%rdx			/* RDX contains original PAT value */
+	pop	%rcx
+
+	movq	%rcx, %r9		/* Save kernel length */
+	movq	%rdi, %r10		/* Save encrypted kernel address */
+	movq	%rsi, %r11		/* Save decrypted kernel address */
+
+	wbinvd				/* Invalidate any cache entries */
+
+	/* Copy/encrypt 2MB at a time */
+1:
+	movq	%r11, %rsi		/* Source - decrypted kernel */
+	movq	%r8, %rdi		/* Dest   - intermediate copy buffer */
+	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
+	rep	movsb
+
+	movq	%r8, %rsi		/* Source - intermediate copy buffer */
+	movq	%r10, %rdi		/* Dest   - encrypted kernel */
+	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
+	rep	movsb
+
+	addq	$PMD_PAGE_SIZE, %r11
+	addq	$PMD_PAGE_SIZE, %r10
+	subq	$PMD_PAGE_SIZE, %r9	/* Kernel length decrement */
+	jnz	1b			/* Kernel length not zero? */
+
+	/* Restore PAT register */
+	push	%rdx			/* Save original PAT value */
+	movl	$MSR_IA32_CR_PAT, %ecx
+	rdmsr
+	pop	%rdx			/* Restore original PAT value */
+	wrmsr
+
+	ret
+ENDPROC(sme_enc_routine)
+.Lenc_stop: