[RFC,v4,27/28] x86: Add support to encrypt the kernel in-place

Message ID 20170216154808.19244.475.stgit@tlendack-t1.amdoffice.net (mailing list archive)

Commit Message

Tom Lendacky Feb. 16, 2017, 3:48 p.m. UTC
This patch adds the support to encrypt the kernel in-place. This is
done by creating new page mappings for the kernel - a decrypted
write-protected mapping and an encrypted mapping. The kernel is encyrpted
by copying the kernel through a temporary buffer.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 arch/x86/kernel/Makefile           |    1 +
 arch/x86/kernel/mem_encrypt_boot.S |  156 +++++++++++++++++++++++++++++
 arch/x86/kernel/mem_encrypt_init.c |  191 ++++++++++++++++++++++++++++++++++++
 3 files changed, 348 insertions(+)
 create mode 100644 arch/x86/kernel/mem_encrypt_boot.S

Comments

Borislav Petkov March 1, 2017, 5:36 p.m. UTC | #1
On Thu, Feb 16, 2017 at 09:48:08AM -0600, Tom Lendacky wrote:
> This patch adds the support to encrypt the kernel in-place. This is
> done by creating new page mappings for the kernel - a decrypted
> write-protected mapping and an encrypted mapping. The kernel is encyrpted

s/encyrpted/encrypted/

> by copying the kernel through a temporary buffer.

"... by copying it... "

> 
> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> ---

...

> +ENTRY(sme_encrypt_execute)
> +
> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> +	/*
> +	 * Entry parameters:
> +	 *   RDI - virtual address for the encrypted kernel mapping
> +	 *   RSI - virtual address for the decrypted kernel mapping
> +	 *   RDX - length of kernel
> +	 *   RCX - address of the encryption workarea

						     , including:

> +	 *     - stack page (PAGE_SIZE)
> +	 *     - encryption routine page (PAGE_SIZE)
> +	 *     - intermediate copy buffer (PMD_PAGE_SIZE)
> +	 *    R8 - address of the pagetables to use for encryption
> +	 */
> +
> +	/* Set up a one page stack in the non-encrypted memory area */
> +	movq	%rcx, %rax
> +	addq	$PAGE_SIZE, %rax
> +	movq	%rsp, %rbp

%rbp is callee-saved and you're overwriting it here. You need to push it
first.

> +	movq	%rax, %rsp
> +	push	%rbp
> +
> +	push	%r12
> +	push	%r13

In general, just do all pushes on function entry and the pops on exit,
like the compiler does.

> +	movq	%rdi, %r10
> +	movq	%rsi, %r11
> +	movq	%rdx, %r12
> +	movq	%rcx, %r13
> +
> +	/* Copy encryption routine into the workarea */
> +	movq	%rax, %rdi
> +	leaq	.Lencrypt_start(%rip), %rsi
> +	movq	$(.Lencrypt_stop - .Lencrypt_start), %rcx
> +	rep	movsb
> +
> +	/* Setup registers for call */
> +	movq	%r10, %rdi
> +	movq	%r11, %rsi
> +	movq	%r8, %rdx
> +	movq	%r12, %rcx
> +	movq	%rax, %r8
> +	addq	$PAGE_SIZE, %r8
> +
> +	/* Call the encryption routine */
> +	call	*%rax
> +
> +	pop	%r13
> +	pop	%r12
> +
> +	pop	%rsp			/* Restore original stack pointer */
> +.Lencrypt_exit:

Please put side comments like this here:

ENTRY(sme_encrypt_execute)

#ifdef CONFIG_AMD_MEM_ENCRYPT
        /*
         * Entry parameters:
         *   RDI - virtual address for the encrypted kernel mapping
         *   RSI - virtual address for the decrypted kernel mapping
         *   RDX - length of kernel
         *   RCX - address of the encryption workarea
         *     - stack page (PAGE_SIZE)
         *     - encryption routine page (PAGE_SIZE)
         *     - intermediate copy buffer (PMD_PAGE_SIZE)
         *    R8 - address of the pagetables to use for encryption
         */

        /* Set up a one page stack in the non-encrypted memory area */
        movq    %rcx, %rax                      # %rax = workarea
        addq    $PAGE_SIZE, %rax                # %rax += 4096
        movq    %rsp, %rbp                      # stash stack ptr
        movq    %rax, %rsp                      # set new stack
        push    %rbp                            # needs to happen before the mov %rsp, %rbp

        push    %r12
        push    %r13

        movq    %rdi, %r10                      # encrypted kernel
        movq    %rsi, %r11                      # decrypted kernel
        movq    %rdx, %r12                      # kernel length
        movq    %rcx, %r13                      # workarea
	...

and so on.

...

> diff --git a/arch/x86/kernel/mem_encrypt_init.c b/arch/x86/kernel/mem_encrypt_init.c
> index 25af15d..07cbb90 100644
> --- a/arch/x86/kernel/mem_encrypt_init.c
> +++ b/arch/x86/kernel/mem_encrypt_init.c
> @@ -16,9 +16,200 @@
>  #ifdef CONFIG_AMD_MEM_ENCRYPT
>  
>  #include <linux/mem_encrypt.h>
> +#include <linux/mm.h>
> +
> +#include <asm/sections.h>
> +
> +extern void sme_encrypt_execute(unsigned long, unsigned long, unsigned long,
> +				void *, pgd_t *);

This belongs into mem_encrypt.h. And I think it already came up: please
use names for those params.
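
E.g. something like this in mem_encrypt.h (only a sketch - the names are
guesses based on how the function is documented and called below):

	void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
				 unsigned long decrypted_kernel_vaddr,
				 unsigned long kernel_len,
				 void *workarea, pgd_t *pgd);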

> +
> +#define PGD_FLAGS	_KERNPG_TABLE_NOENC
> +#define PUD_FLAGS	_KERNPG_TABLE_NOENC
> +#define PMD_FLAGS	__PAGE_KERNEL_LARGE_EXEC
> +
> +static void __init *sme_pgtable_entry(pgd_t *pgd, void *next_page,
> +				      void *vaddr, pmdval_t pmd_val)
> +{

sme_populate() or so sounds better.

> +	pud_t *pud;
> +	pmd_t *pmd;
> +
> +	pgd += pgd_index((unsigned long)vaddr);
> +	if (pgd_none(*pgd)) {
> +		pud = next_page;
> +		memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
> +		native_set_pgd(pgd,
> +			       native_make_pgd((unsigned long)pud + PGD_FLAGS));

Let it stick out, no need for those "stairs" in the vertical alignment :)

> +		next_page += sizeof(*pud) * PTRS_PER_PUD;
> +	} else {
> +		pud = (pud_t *)(native_pgd_val(*pgd) & ~PTE_FLAGS_MASK);
> +	}
> +
> +	pud += pud_index((unsigned long)vaddr);
> +	if (pud_none(*pud)) {
> +		pmd = next_page;
> +		memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
> +		native_set_pud(pud,
> +			       native_make_pud((unsigned long)pmd + PUD_FLAGS));
> +		next_page += sizeof(*pmd) * PTRS_PER_PMD;
> +	} else {
> +		pmd = (pmd_t *)(native_pud_val(*pud) & ~PTE_FLAGS_MASK);
> +	}
> +
> +	pmd += pmd_index((unsigned long)vaddr);
> +	if (pmd_none(*pmd) || !pmd_large(*pmd))
> +		native_set_pmd(pmd, native_make_pmd(pmd_val));
> +
> +	return next_page;
> +}
> +
> +static unsigned long __init sme_pgtable_calc(unsigned long start,
> +					     unsigned long end)
> +{
> +	unsigned long addr, total;
> +
> +	total = 0;
> +	addr = start;
> +	while (addr < end) {
> +		unsigned long pgd_end;
> +
> +		pgd_end = (addr & PGDIR_MASK) + PGDIR_SIZE;
> +		if (pgd_end > end)
> +			pgd_end = end;
> +
> +		total += sizeof(pud_t) * PTRS_PER_PUD * 2;
> +
> +		while (addr < pgd_end) {
> +			unsigned long pud_end;
> +
> +			pud_end = (addr & PUD_MASK) + PUD_SIZE;
> +			if (pud_end > end)
> +				pud_end = end;
> +
> +			total += sizeof(pmd_t) * PTRS_PER_PMD * 2;

That "* 2" is?

> +
> +			addr = pud_end;

So			addr += PUD_SIZE;

?

> +		}
> +
> +		addr = pgd_end;

So		addr += PGD_SIZE;

?

> +	total += sizeof(pgd_t) * PTRS_PER_PGD;
> +
> +	return total;
> +}
>  
>  void __init sme_encrypt_kernel(void)
>  {
> +	pgd_t *pgd;
> +	void *workarea, *next_page, *vaddr;
> +	unsigned long kern_start, kern_end, kern_len;
> +	unsigned long index, paddr, pmd_flags;
> +	unsigned long exec_size, full_size;
> +
> +	/* If SME is not active then no need to prepare */

That comment is obvious.

> +	if (!sme_active())
> +		return;
> +
> +	/* Set the workarea to be after the kernel */
> +	workarea = (void *)ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
> +
> +	/*
> +	 * Prepare for encrypting the kernel by building new pagetables with
> +	 * the necessary attributes needed to encrypt the kernel in place.
> +	 *
> +	 *   One range of virtual addresses will map the memory occupied
> +	 *   by the kernel as encrypted.
> +	 *
> +	 *   Another range of virtual addresses will map the memory occupied
> +	 *   by the kernel as decrypted and write-protected.
> +	 *
> +	 *     The use of write-protect attribute will prevent any of the
> +	 *     memory from being cached.
> +	 */
> +
> +	/* Physical address gives us the identity mapped virtual address */
> +	kern_start = __pa_symbol(_text);
> +	kern_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE) - 1;

So
	kern_end = (unsigned long)workarea - 1;

?

Also, you can make that workarea be unsigned long and cast it to void *
only when needed so that you don't need to cast it in here for the
calculations.
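
I.e. something like (a sketch only):

	unsigned long workarea;

	/* Set the workarea to be after the kernel */
	workarea = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);

	/* Physical address gives us the identity mapped virtual address */
	kern_start = __pa_symbol(_text);
	kern_end = workarea - 1;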

> +	kern_len = kern_end - kern_start + 1;
> +
> +	/*
> +	 * Calculate required number of workarea bytes needed:
> +	 *   executable encryption area size:
> +	 *     stack page (PAGE_SIZE)
> +	 *     encryption routine page (PAGE_SIZE)
> +	 *     intermediate copy buffer (PMD_PAGE_SIZE)
> +	 *   pagetable structures for workarea (in case not currently mapped)
> +	 *   pagetable structures for the encryption of the kernel
> +	 */
> +	exec_size = (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
> +
> +	full_size = exec_size;
> +	full_size += ALIGN(exec_size, PMD_PAGE_SIZE) / PMD_PAGE_SIZE *
> +		     sizeof(pmd_t) * PTRS_PER_PMD;
> +	full_size += sme_pgtable_calc(kern_start, kern_end + exec_size);
> +
> +	next_page = workarea + exec_size;

So next_page is the next free page after the workarea, correct? Because
of all things, *that* certainly needs a comment. It took me a while to
decipher what's going on here and I'm still not 100% clear.

> +	/* Make sure the current pagetables have entries for the workarea */
> +	pgd = (pgd_t *)native_read_cr3();
> +	paddr = (unsigned long)workarea;
> +	while (paddr < (unsigned long)workarea + full_size) {
> +		vaddr = (void *)paddr;
> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +					      paddr + PMD_FLAGS);
> +
> +		paddr += PMD_PAGE_SIZE;
> +	}
> +	native_write_cr3(native_read_cr3());

Why not

	native_write_cr3((unsigned long)pgd);

?

Now you can actually acknowledge that the code block in between changed
the hierarchy in pgd and you're reloading it.

> +	/* Calculate a PGD index to be used for the decrypted mapping */
> +	index = (pgd_index(kern_end + full_size) + 1) & (PTRS_PER_PGD - 1);
> +	index <<= PGDIR_SHIFT;

So call it decrypt_mapping_pgd or so. index doesn't say anything. Also,
move it right above where it is being used. This function is very hard
to follow as it is.

> +	/* Set and clear the PGD */

This needs more text: we're building a new temporary pagetable which
will have A, B and C mapped into it and blablabla...

> +	pgd = next_page;
> +	memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
> +	next_page += sizeof(*pgd) * PTRS_PER_PGD;
> +
> +	/* Add encrypted (identity) mappings for the kernel */
> +	pmd_flags = PMD_FLAGS | _PAGE_ENC;
> +	paddr = kern_start;
> +	while (paddr < kern_end) {
> +		vaddr = (void *)paddr;
> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +					      paddr + pmd_flags);
> +
> +		paddr += PMD_PAGE_SIZE;
> +	}
> +
> +	/* Add decrypted (non-identity) mappings for the kernel */
> +	pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
> +	paddr = kern_start;
> +	while (paddr < kern_end) {
> +		vaddr = (void *)(paddr + index);
> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +					      paddr + pmd_flags);
> +
> +		paddr += PMD_PAGE_SIZE;
> +	}
> +
> +	/* Add the workarea to both mappings */
> +	paddr = kern_end + 1;

	paddr = (unsigned long)workarea;

Now this makes sense when I read the comment above it.

> +	while (paddr < (kern_end + exec_size)) {

... which actually wants that exec_size to be called workarea_size. Then
it'll make more sense.

And then the thing above:

	next_page = workarea + exec_size;

would look like:

	next_page = workarea + workarea_size;

which would make even more sense. And since you have stuff called _start
and _end, you can do:

	next_page = workarea_start + workarea_size;

and now it would make the most sense. Eva! :-)

> +		vaddr = (void *)paddr;
> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +					      paddr + PMD_FLAGS);
> +
> +		vaddr = (void *)(paddr + index);
> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
> +					      paddr + PMD_FLAGS);
> +
> +		paddr += PMD_PAGE_SIZE;
> +	}
> +
> +	/* Perform the encryption */
> +	sme_encrypt_execute(kern_start, kern_start + index, kern_len,
> +			    workarea, pgd);
> +

Phew, that's one tough patch to review. I'd like to review it again in
your next submission.
Tom Lendacky March 2, 2017, 6:30 p.m. UTC | #2
On 3/1/2017 11:36 AM, Borislav Petkov wrote:
> On Thu, Feb 16, 2017 at 09:48:08AM -0600, Tom Lendacky wrote:
>> This patch adds the support to encrypt the kernel in-place. This is
>> done by creating new page mappings for the kernel - a decrypted
>> write-protected mapping and an encrypted mapping. The kernel is encyrpted
>
> s/encyrpted/encrypted/
>
>> by copying the kernel through a temporary buffer.
>
> "... by copying it... "

Ok.

>
>>
>> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
>> ---
>
> ...
>
>> +ENTRY(sme_encrypt_execute)
>> +
>> +#ifdef CONFIG_AMD_MEM_ENCRYPT
>> +	/*
>> +	 * Entry parameters:
>> +	 *   RDI - virtual address for the encrypted kernel mapping
>> +	 *   RSI - virtual address for the decrypted kernel mapping
>> +	 *   RDX - length of kernel
>> +	 *   RCX - address of the encryption workarea
>
> 						     , including:

Ok.

>
>> +	 *     - stack page (PAGE_SIZE)
>> +	 *     - encryption routine page (PAGE_SIZE)
>> +	 *     - intermediate copy buffer (PMD_PAGE_SIZE)
>> +	 *    R8 - address of the pagetables to use for encryption
>> +	 */
>> +
>> +	/* Set up a one page stack in the non-encrypted memory area */
>> +	movq	%rcx, %rax
>> +	addq	$PAGE_SIZE, %rax
>> +	movq	%rsp, %rbp
>
> %rbp is callee-saved and you're overwriting it here. You need to push it
> first.

Yup, I'll re-work the entry code based on this comment and the one
below.

>
>> +	movq	%rax, %rsp
>> +	push	%rbp
>> +
>> +	push	%r12
>> +	push	%r13
>
> In general, just do all pushes on function entry and the pops on exit,
> like the compiler does.
>
>> +	movq	%rdi, %r10
>> +	movq	%rsi, %r11
>> +	movq	%rdx, %r12
>> +	movq	%rcx, %r13
>> +
>> +	/* Copy encryption routine into the workarea */
>> +	movq	%rax, %rdi
>> +	leaq	.Lencrypt_start(%rip), %rsi
>> +	movq	$(.Lencrypt_stop - .Lencrypt_start), %rcx
>> +	rep	movsb
>> +
>> +	/* Setup registers for call */
>> +	movq	%r10, %rdi
>> +	movq	%r11, %rsi
>> +	movq	%r8, %rdx
>> +	movq	%r12, %rcx
>> +	movq	%rax, %r8
>> +	addq	$PAGE_SIZE, %r8
>> +
>> +	/* Call the encryption routine */
>> +	call	*%rax
>> +
>> +	pop	%r13
>> +	pop	%r12
>> +
>> +	pop	%rsp			/* Restore original stack pointer */
>> +.Lencrypt_exit:
>
> Please put side comments like this here:

Ok, can do.

>
> ENTRY(sme_encrypt_execute)
>
> #ifdef CONFIG_AMD_MEM_ENCRYPT
>         /*
>          * Entry parameters:
>          *   RDI - virtual address for the encrypted kernel mapping
>          *   RSI - virtual address for the decrypted kernel mapping
>          *   RDX - length of kernel
>          *   RCX - address of the encryption workarea
>          *     - stack page (PAGE_SIZE)
>          *     - encryption routine page (PAGE_SIZE)
>          *     - intermediate copy buffer (PMD_PAGE_SIZE)
>          *    R8 - address of the pagetables to use for encryption
>          */
>
>         /* Set up a one page stack in the non-encrypted memory area */
>         movq    %rcx, %rax                      # %rax = workarea
>         addq    $PAGE_SIZE, %rax                # %rax += 4096
>         movq    %rsp, %rbp                      # stash stack ptr
>         movq    %rax, %rsp                      # set new stack
>         push    %rbp                            # needs to happen before the mov %rsp, %rbp
>
>         push    %r12
>         push    %r13
>
>         movq    %rdi, %r10                      # encrypted kernel
>         movq    %rsi, %r11                      # decrypted kernel
>         movq    %rdx, %r12                      # kernel length
>         movq    %rcx, %r13                      # workarea
> 	...
>
> and so on.
>
> ...
>
>> diff --git a/arch/x86/kernel/mem_encrypt_init.c b/arch/x86/kernel/mem_encrypt_init.c
>> index 25af15d..07cbb90 100644
>> --- a/arch/x86/kernel/mem_encrypt_init.c
>> +++ b/arch/x86/kernel/mem_encrypt_init.c
>> @@ -16,9 +16,200 @@
>>  #ifdef CONFIG_AMD_MEM_ENCRYPT
>>
>>  #include <linux/mem_encrypt.h>
>> +#include <linux/mm.h>
>> +
>> +#include <asm/sections.h>
>> +
>> +extern void sme_encrypt_execute(unsigned long, unsigned long, unsigned long,
>> +				void *, pgd_t *);
>
> This belongs into mem_encrypt.h. And I think it already came up: please
> use names for those params.

Yup, will move it.

>
>> +
>> +#define PGD_FLAGS	_KERNPG_TABLE_NOENC
>> +#define PUD_FLAGS	_KERNPG_TABLE_NOENC
>> +#define PMD_FLAGS	__PAGE_KERNEL_LARGE_EXEC
>> +
>> +static void __init *sme_pgtable_entry(pgd_t *pgd, void *next_page,
>> +				      void *vaddr, pmdval_t pmd_val)
>> +{
>
> sme_populate() or so sounds better.

Ok.

>
>> +	pud_t *pud;
>> +	pmd_t *pmd;
>> +
>> +	pgd += pgd_index((unsigned long)vaddr);
>> +	if (pgd_none(*pgd)) {
>> +		pud = next_page;
>> +		memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
>> +		native_set_pgd(pgd,
>> +			       native_make_pgd((unsigned long)pud + PGD_FLAGS));
>
> Let it stick out, no need for those "stairs" in the vertical alignment :)

Ok.

>
>> +		next_page += sizeof(*pud) * PTRS_PER_PUD;
>> +	} else {
>> +		pud = (pud_t *)(native_pgd_val(*pgd) & ~PTE_FLAGS_MASK);
>> +	}
>> +
>> +	pud += pud_index((unsigned long)vaddr);
>> +	if (pud_none(*pud)) {
>> +		pmd = next_page;
>> +		memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
>> +		native_set_pud(pud,
>> +			       native_make_pud((unsigned long)pmd + PUD_FLAGS));
>> +		next_page += sizeof(*pmd) * PTRS_PER_PMD;
>> +	} else {
>> +		pmd = (pmd_t *)(native_pud_val(*pud) & ~PTE_FLAGS_MASK);
>> +	}
>> +
>> +	pmd += pmd_index((unsigned long)vaddr);
>> +	if (pmd_none(*pmd) || !pmd_large(*pmd))
>> +		native_set_pmd(pmd, native_make_pmd(pmd_val));
>> +
>> +	return next_page;
>> +}
>> +
>> +static unsigned long __init sme_pgtable_calc(unsigned long start,
>> +					     unsigned long end)
>> +{
>> +	unsigned long addr, total;
>> +
>> +	total = 0;
>> +	addr = start;
>> +	while (addr < end) {
>> +		unsigned long pgd_end;
>> +
>> +		pgd_end = (addr & PGDIR_MASK) + PGDIR_SIZE;
>> +		if (pgd_end > end)
>> +			pgd_end = end;
>> +
>> +		total += sizeof(pud_t) * PTRS_PER_PUD * 2;
>> +
>> +		while (addr < pgd_end) {
>> +			unsigned long pud_end;
>> +
>> +			pud_end = (addr & PUD_MASK) + PUD_SIZE;
>> +			if (pud_end > end)
>> +				pud_end = end;
>> +
>> +			total += sizeof(pmd_t) * PTRS_PER_PMD * 2;
>
> That "* 2" is?

The "* 2" here and above is that a PUD and a PMD is needed for both
the encrypted and decrypted mappings. I'll add a comment to clarify
that.
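
Something along these lines, perhaps (rough sketch of the comment, not
final wording):

	/*
	 * A PUD page and a PMD page may be needed for both the
	 * encrypted and the decrypted kernel mapping, hence the "* 2".
	 */
	total += sizeof(pud_t) * PTRS_PER_PUD * 2;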

>
>> +
>> +			addr = pud_end;
>
> So			addr += PUD_SIZE;
>
> ?

Yes, I believe that is correct.

>
>> +		}
>> +
>> +		addr = pgd_end;
>
> So		addr += PGD_SIZE;
>
> ?

Yup, I can do that here too (but need PGDIR_SIZE).

>
>> +	total += sizeof(pgd_t) * PTRS_PER_PGD;
>> +
>> +	return total;
>> +}
>>
>>  void __init sme_encrypt_kernel(void)
>>  {
>> +	pgd_t *pgd;
>> +	void *workarea, *next_page, *vaddr;
>> +	unsigned long kern_start, kern_end, kern_len;
>> +	unsigned long index, paddr, pmd_flags;
>> +	unsigned long exec_size, full_size;
>> +
>> +	/* If SME is not active then no need to prepare */
>
> That comment is obvious.

Ok.

>
>> +	if (!sme_active())
>> +		return;
>> +
>> +	/* Set the workarea to be after the kernel */
>> +	workarea = (void *)ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
>> +
>> +	/*
>> +	 * Prepare for encrypting the kernel by building new pagetables with
>> +	 * the necessary attributes needed to encrypt the kernel in place.
>> +	 *
>> +	 *   One range of virtual addresses will map the memory occupied
>> +	 *   by the kernel as encrypted.
>> +	 *
>> +	 *   Another range of virtual addresses will map the memory occupied
>> +	 *   by the kernel as decrypted and write-protected.
>> +	 *
>> +	 *     The use of write-protect attribute will prevent any of the
>> +	 *     memory from being cached.
>> +	 */
>> +
>> +	/* Physical address gives us the identity mapped virtual address */
>> +	kern_start = __pa_symbol(_text);
>> +	kern_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE) - 1;
>
> So
> 	kern_end = (unsigned long)workarea - 1;
>
> ?
>
> Also, you can make that workarea be unsigned long and cast it to void *
> only when needed so that you don't need to cast it in here for the
> calculations.

Ok, I'll rework this a bit.  I believe I can even get rid of the
"+ 1" and "- 1" stuff, too.

>
>> +	kern_len = kern_end - kern_start + 1;
>> +
>> +	/*
>> +	 * Calculate required number of workarea bytes needed:
>> +	 *   executable encryption area size:
>> +	 *     stack page (PAGE_SIZE)
>> +	 *     encryption routine page (PAGE_SIZE)
>> +	 *     intermediate copy buffer (PMD_PAGE_SIZE)
>> +	 *   pagetable structures for workarea (in case not currently mapped)
>> +	 *   pagetable structures for the encryption of the kernel
>> +	 */
>> +	exec_size = (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
>> +
>> +	full_size = exec_size;
>> +	full_size += ALIGN(exec_size, PMD_PAGE_SIZE) / PMD_PAGE_SIZE *
>> +		     sizeof(pmd_t) * PTRS_PER_PMD;
>> +	full_size += sme_pgtable_calc(kern_start, kern_end + exec_size);
>> +
>> +	next_page = workarea + exec_size;
>
> So next_page is the next free page after the workarea, correct? Because
> of all things, *that* certainly needs a comment. It took me a while to
> decipher what's going on here and I'm still not 100% clear.

So next_page is the first free page within the workarea in which a
pagetable entry (PGD, PUD or PMD) can be created when we are populating
the new mappings or adding the workarea to the current mapping.  Any
new pagetable structures that are created will use this value.
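
Something like this at the assignment, maybe (just a sketch):

	/*
	 * next_page is the first unused page of the workarea; new
	 * PGD/PUD/PMD pages needed while populating the mappings are
	 * carved off from here.
	 */
	next_page = workarea + exec_size;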

>
>> +	/* Make sure the current pagetables have entries for the workarea */
>> +	pgd = (pgd_t *)native_read_cr3();
>> +	paddr = (unsigned long)workarea;
>> +	while (paddr < (unsigned long)workarea + full_size) {
>> +		vaddr = (void *)paddr;
>> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
>> +					      paddr + PMD_FLAGS);
>> +
>> +		paddr += PMD_PAGE_SIZE;
>> +	}
>> +	native_write_cr3(native_read_cr3());
>
> Why not
>
> 	native_write_cr3((unsigned long)pgd);
>
> ?
>
> Now you can actually acknowledge that the code block in between changed
> the hierarchy in pgd and you're reloading it.

Ok, that makes sense.

>
>> +	/* Calculate a PGD index to be used for the decrypted mapping */
>> +	index = (pgd_index(kern_end + full_size) + 1) & (PTRS_PER_PGD - 1);
>> +	index <<= PGDIR_SHIFT;
>
> So call it decrypt_mapping_pgd or so. index doesn't say anything. Also,
> move it right above where it is being used. This function is very hard
> to follow as it is.

Ok, I'll work on the comment.  Something along the line of:

/*
  * The encrypted mapping of the kernel will use identity mapped
  * virtual addresses.  A different PGD index/entry must be used to
  * get different pagetable entries for the decrypted mapping.
  * Choose the next PGD index and convert it to a virtual address
  * to be used as the base of the mapping.
  */

>
>> +	/* Set and clear the PGD */
>
> This needs more text: we're building a new temporary pagetable which
> will have A, B and C mapped into it and blablabla...

Will do.

>
>> +	pgd = next_page;
>> +	memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
>> +	next_page += sizeof(*pgd) * PTRS_PER_PGD;
>> +
>> +	/* Add encrypted (identity) mappings for the kernel */
>> +	pmd_flags = PMD_FLAGS | _PAGE_ENC;
>> +	paddr = kern_start;
>> +	while (paddr < kern_end) {
>> +		vaddr = (void *)paddr;
>> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
>> +					      paddr + pmd_flags);
>> +
>> +		paddr += PMD_PAGE_SIZE;
>> +	}
>> +
>> +	/* Add decrypted (non-identity) mappings for the kernel */
>> +	pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
>> +	paddr = kern_start;
>> +	while (paddr < kern_end) {
>> +		vaddr = (void *)(paddr + index);
>> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
>> +					      paddr + pmd_flags);
>> +
>> +		paddr += PMD_PAGE_SIZE;
>> +	}
>> +
>> +	/* Add the workarea to both mappings */
>> +	paddr = kern_end + 1;
>
> 	paddr = (unsigned long)workarea;
>
> Now this makes sense when I read the comment above it.

Yup, it does.

>
>> +	while (paddr < (kern_end + exec_size)) {
>
> ... which actually wants that exec_size to be called workarea_size. Then
> it'll make more sense.

Except the workarea size includes both the encryption execution
size and the pagetable structure size.  I'll work on this to try
and clarify it better.
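
Maybe split the sizes out explicitly, something like (a sketch; the
calculation itself is unchanged from the patch):

	exec_size = (PAGE_SIZE * 2) + PMD_PAGE_SIZE;

	pgtable_size = ALIGN(exec_size, PMD_PAGE_SIZE) / PMD_PAGE_SIZE *
		       sizeof(pmd_t) * PTRS_PER_PMD;
	pgtable_size += sme_pgtable_calc(kern_start, kern_end + exec_size);

	workarea_size = exec_size + pgtable_size;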

>
> And then the thing above:
>
> 	next_page = workarea + exec_size;
>
> would look like:
>
> 	next_page = workarea + workarea_size;
>
> which would make even more sense. And since you have stuff called _start
> and _end, you can do:
>
> 	next_page = workarea_start + workarea_size;
>
> and now it would make the most sense. Eva! :-)
>
>> +		vaddr = (void *)paddr;
>> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
>> +					      paddr + PMD_FLAGS);
>> +
>> +		vaddr = (void *)(paddr + index);
>> +		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
>> +					      paddr + PMD_FLAGS);
>> +
>> +		paddr += PMD_PAGE_SIZE;
>> +	}
>> +
>> +	/* Perform the encryption */
>> +	sme_encrypt_execute(kern_start, kern_start + index, kern_len,
>> +			    workarea, pgd);
>> +
>
> Phew, that's one tough patch to review. I'd like to review it again in
> your next submission.

Most definitely.  I appreciate the feedback since I'm very close to
the code and have an understanding of what I'm doing. I'd like to be
sure that everyone can easily understand what is happening.

Thanks,
Tom

>
Borislav Petkov March 2, 2017, 6:51 p.m. UTC | #3
On Thu, Mar 02, 2017 at 12:30:31PM -0600, Tom Lendacky wrote:
> The "* 2" here and above is that a PUD and a PMD is needed for both
> the encrypted and decrypted mappings. I'll add a comment to clarify
> that.

Ah, makes sense. Definitely needs a comment.

> Yup, I can do that here too (but need PGDIR_SIZE).

Right, I did test and wanted to write PGDIR_SIZE but then ... I guess
something distracted me :-)

> So next_page is the first free page within the workarea in which a
> pagetable entry (PGD, PUD or PMD) can be created when we are populating
> the new mappings or adding the workarea to the current mapping.  Any
> new pagetable structures that are created will use this value.

Ok, so I guess this needs an overview comment with maybe some ascii
showing how workarea, exec_size, full_size and all those other things
play together.
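
Roughly something like this, maybe (sizes as in the current code,
boundaries approximate):

	/*
	 * workarea (PMD_PAGE_SIZE-aligned, placed after _end):
	 *
	 * +--------------------------------+ <- workarea
	 * | stack page          (PAGE_SIZE)|
	 * | encryption routine  (PAGE_SIZE)|  exec_size
	 * | copy buffer    (PMD_PAGE_SIZE) |
	 * +--------------------------------+ <- next_page = workarea + exec_size
	 * | pagetable pages (PGD/PUD/PMD)  |
	 * | allocated on demand            |  full_size - exec_size
	 * +--------------------------------+ <- workarea + full_size
	 */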

> Ok, I'll work on the comment.  Something along the line of:
>
> /*
>  * The encrypted mapping of the kernel will use identity mapped
>  * virtual addresses.  A different PGD index/entry must be used to
>  * get different pagetable entries for the decrypted mapping.
>  * Choose the next PGD index and convert it to a virtual address
>  * to be used as the base of the mapping.

Better.

> Except the workarea size includes both the encryption execution
> size and the pagetable structure size.  I'll work on this to try
> and clarify it better.

That's a useful piece of info, yap, the big picture could use some more
explanation.

> Most definitely.  I appreciate the feedback since I'm very close to
> the code and have an understanding of what I'm doing. I'd like to be
> sure that everyone can easily understand what is happening.

Nice!

Thanks.

Patch

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 33af80a..dc3ed84 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -142,4 +142,5 @@  ifeq ($(CONFIG_X86_64),y)
 	obj-y				+= vsmp_64.o
 
 	obj-y				+= mem_encrypt_init.o
+	obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
 endif
diff --git a/arch/x86/kernel/mem_encrypt_boot.S b/arch/x86/kernel/mem_encrypt_boot.S
new file mode 100644
index 0000000..58e1756
--- /dev/null
+++ b/arch/x86/kernel/mem_encrypt_boot.S
@@ -0,0 +1,156 @@ 
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+
+	.text
+	.code64
+ENTRY(sme_encrypt_execute)
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+	/*
+	 * Entry parameters:
+	 *   RDI - virtual address for the encrypted kernel mapping
+	 *   RSI - virtual address for the decrypted kernel mapping
+	 *   RDX - length of kernel
+	 *   RCX - address of the encryption workarea
+	 *     - stack page (PAGE_SIZE)
+	 *     - encryption routine page (PAGE_SIZE)
+	 *     - intermediate copy buffer (PMD_PAGE_SIZE)
+	 *    R8 - address of the pagetables to use for encryption
+	 */
+
+	/* Set up a one page stack in the non-encrypted memory area */
+	movq	%rcx, %rax
+	addq	$PAGE_SIZE, %rax
+	movq	%rsp, %rbp
+	movq	%rax, %rsp
+	push	%rbp
+
+	push	%r12
+	push	%r13
+
+	movq	%rdi, %r10
+	movq	%rsi, %r11
+	movq	%rdx, %r12
+	movq	%rcx, %r13
+
+	/* Copy encryption routine into the workarea */
+	movq	%rax, %rdi
+	leaq	.Lencrypt_start(%rip), %rsi
+	movq	$(.Lencrypt_stop - .Lencrypt_start), %rcx
+	rep	movsb
+
+	/* Setup registers for call */
+	movq	%r10, %rdi
+	movq	%r11, %rsi
+	movq	%r8, %rdx
+	movq	%r12, %rcx
+	movq	%rax, %r8
+	addq	$PAGE_SIZE, %r8
+
+	/* Call the encryption routine */
+	call	*%rax
+
+	pop	%r13
+	pop	%r12
+
+	pop	%rsp			/* Restore original stack pointer */
+.Lencrypt_exit:
+#endif	/* CONFIG_AMD_MEM_ENCRYPT */
+
+	ret
+ENDPROC(sme_encrypt_execute)
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/*
+ * Routine used to encrypt kernel.
+ *   This routine must be run outside of the kernel proper since
+ *   the kernel will be encrypted during the process. So this
+ *   routine is defined here and then copied to an area outside
+ *   of the kernel where it will remain and run decrypted
+ *   during execution.
+ *
+ *   On entry the registers must be:
+ *     RDI - virtual address for the encrypted kernel mapping
+ *     RSI - virtual address for the decrypted kernel mapping
+ *     RDX - address of the pagetables to use for encryption
+ *     RCX - length of kernel
+ *      R8 - intermediate copy buffer
+ *
+ *     RAX - points to this routine
+ *
+ * The kernel will be encrypted by copying from the non-encrypted
+ * kernel space to an intermediate buffer and then copying from the
+ * intermediate buffer back to the encrypted kernel space. The physical
+ * addresses of the two kernel space mappings are the same which
+ * results in the kernel being encrypted "in place".
+ */
+.Lencrypt_start:
+	/* Enable the new page tables */
+	mov	%rdx, %cr3
+
+	/* Flush any global TLBs */
+	mov	%cr4, %rdx
+	andq	$~X86_CR4_PGE, %rdx
+	mov	%rdx, %cr4
+	orq	$X86_CR4_PGE, %rdx
+	mov	%rdx, %cr4
+
+	/* Set the PAT register PA5 entry to write-protect */
+	push	%rcx
+	movl	$MSR_IA32_CR_PAT, %ecx
+	rdmsr
+	push	%rdx			/* Save original PAT value */
+	andl	$0xffff00ff, %edx	/* Clear PA5 */
+	orl	$0x00000500, %edx	/* Set PA5 to WP */
+	wrmsr
+	pop	%rdx			/* RDX contains original PAT value */
+	pop	%rcx
+
+	movq	%rcx, %r9		/* Save length */
+	movq	%rdi, %r10		/* Save destination address */
+	movq	%rsi, %r11		/* Save source address */
+
+	wbinvd				/* Invalidate any cache entries */
+
+	/* Copy/encrypt 2MB at a time */
+1:
+	movq	%r11, %rsi
+	movq	%r8, %rdi
+	movq	$PMD_PAGE_SIZE, %rcx
+	rep	movsb
+
+	movq	%r8, %rsi
+	movq	%r10, %rdi
+	movq	$PMD_PAGE_SIZE, %rcx
+	rep	movsb
+
+	addq	$PMD_PAGE_SIZE, %r11
+	addq	$PMD_PAGE_SIZE, %r10
+	subq	$PMD_PAGE_SIZE, %r9
+	jnz	1b
+
+	/* Restore PAT register */
+	push	%rdx
+	movl	$MSR_IA32_CR_PAT, %ecx
+	rdmsr
+	pop	%rdx
+	wrmsr
+
+	ret
+.Lencrypt_stop:
+#endif	/* CONFIG_AMD_MEM_ENCRYPT */
diff --git a/arch/x86/kernel/mem_encrypt_init.c b/arch/x86/kernel/mem_encrypt_init.c
index 25af15d..07cbb90 100644
--- a/arch/x86/kernel/mem_encrypt_init.c
+++ b/arch/x86/kernel/mem_encrypt_init.c
@@ -16,9 +16,200 @@ 
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 
 #include <linux/mem_encrypt.h>
+#include <linux/mm.h>
+
+#include <asm/sections.h>
+
+extern void sme_encrypt_execute(unsigned long, unsigned long, unsigned long,
+				void *, pgd_t *);
+
+#define PGD_FLAGS	_KERNPG_TABLE_NOENC
+#define PUD_FLAGS	_KERNPG_TABLE_NOENC
+#define PMD_FLAGS	__PAGE_KERNEL_LARGE_EXEC
+
+static void __init *sme_pgtable_entry(pgd_t *pgd, void *next_page,
+				      void *vaddr, pmdval_t pmd_val)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd += pgd_index((unsigned long)vaddr);
+	if (pgd_none(*pgd)) {
+		pud = next_page;
+		memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
+		native_set_pgd(pgd,
+			       native_make_pgd((unsigned long)pud + PGD_FLAGS));
+		next_page += sizeof(*pud) * PTRS_PER_PUD;
+	} else {
+		pud = (pud_t *)(native_pgd_val(*pgd) & ~PTE_FLAGS_MASK);
+	}
+
+	pud += pud_index((unsigned long)vaddr);
+	if (pud_none(*pud)) {
+		pmd = next_page;
+		memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
+		native_set_pud(pud,
+			       native_make_pud((unsigned long)pmd + PUD_FLAGS));
+		next_page += sizeof(*pmd) * PTRS_PER_PMD;
+	} else {
+		pmd = (pmd_t *)(native_pud_val(*pud) & ~PTE_FLAGS_MASK);
+	}
+
+	pmd += pmd_index((unsigned long)vaddr);
+	if (pmd_none(*pmd) || !pmd_large(*pmd))
+		native_set_pmd(pmd, native_make_pmd(pmd_val));
+
+	return next_page;
+}
+
+static unsigned long __init sme_pgtable_calc(unsigned long start,
+					     unsigned long end)
+{
+	unsigned long addr, total;
+
+	total = 0;
+	addr = start;
+	while (addr < end) {
+		unsigned long pgd_end;
+
+		pgd_end = (addr & PGDIR_MASK) + PGDIR_SIZE;
+		if (pgd_end > end)
+			pgd_end = end;
+
+		total += sizeof(pud_t) * PTRS_PER_PUD * 2;
+
+		while (addr < pgd_end) {
+			unsigned long pud_end;
+
+			pud_end = (addr & PUD_MASK) + PUD_SIZE;
+			if (pud_end > end)
+				pud_end = end;
+
+			total += sizeof(pmd_t) * PTRS_PER_PMD * 2;
+
+			addr = pud_end;
+		}
+
+		addr = pgd_end;
+	}
+	total += sizeof(pgd_t) * PTRS_PER_PGD;
+
+	return total;
+}
 
 void __init sme_encrypt_kernel(void)
 {
+	pgd_t *pgd;
+	void *workarea, *next_page, *vaddr;
+	unsigned long kern_start, kern_end, kern_len;
+	unsigned long index, paddr, pmd_flags;
+	unsigned long exec_size, full_size;
+
+	/* If SME is not active then no need to prepare */
+	if (!sme_active())
+		return;
+
+	/* Set the workarea to be after the kernel */
+	workarea = (void *)ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
+
+	/*
+	 * Prepare for encrypting the kernel by building new pagetables with
+	 * the necessary attributes needed to encrypt the kernel in place.
+	 *
+	 *   One range of virtual addresses will map the memory occupied
+	 *   by the kernel as encrypted.
+	 *
+	 *   Another range of virtual addresses will map the memory occupied
+	 *   by the kernel as decrypted and write-protected.
+	 *
+	 *     The use of write-protect attribute will prevent any of the
+	 *     memory from being cached.
+	 */
+
+	/* Physical address gives us the identity mapped virtual address */
+	kern_start = __pa_symbol(_text);
+	kern_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE) - 1;
+	kern_len = kern_end - kern_start + 1;
+
+	/*
+	 * Calculate required number of workarea bytes needed:
+	 *   executable encryption area size:
+	 *     stack page (PAGE_SIZE)
+	 *     encryption routine page (PAGE_SIZE)
+	 *     intermediate copy buffer (PMD_PAGE_SIZE)
+	 *   pagetable structures for workarea (in case not currently mapped)
+	 *   pagetable structures for the encryption of the kernel
+	 */
+	exec_size = (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
+
+	full_size = exec_size;
+	full_size += ALIGN(exec_size, PMD_PAGE_SIZE) / PMD_PAGE_SIZE *
+		     sizeof(pmd_t) * PTRS_PER_PMD;
+	full_size += sme_pgtable_calc(kern_start, kern_end + exec_size);
+
+	next_page = workarea + exec_size;
+
+	/* Make sure the current pagetables have entries for the workarea */
+	pgd = (pgd_t *)native_read_cr3();
+	paddr = (unsigned long)workarea;
+	while (paddr < (unsigned long)workarea + full_size) {
+		vaddr = (void *)paddr;
+		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
+					      paddr + PMD_FLAGS);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+	native_write_cr3(native_read_cr3());
+
+	/* Calculate a PGD index to be used for the decrypted mapping */
+	index = (pgd_index(kern_end + full_size) + 1) & (PTRS_PER_PGD - 1);
+	index <<= PGDIR_SHIFT;
+
+	/* Set and clear the PGD */
+	pgd = next_page;
+	memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
+	next_page += sizeof(*pgd) * PTRS_PER_PGD;
+
+	/* Add encrypted (identity) mappings for the kernel */
+	pmd_flags = PMD_FLAGS | _PAGE_ENC;
+	paddr = kern_start;
+	while (paddr < kern_end) {
+		vaddr = (void *)paddr;
+		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
+					      paddr + pmd_flags);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/* Add decrypted (non-identity) mappings for the kernel */
+	pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
+	paddr = kern_start;
+	while (paddr < kern_end) {
+		vaddr = (void *)(paddr + index);
+		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
+					      paddr + pmd_flags);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/* Add the workarea to both mappings */
+	paddr = kern_end + 1;
+	while (paddr < (kern_end + exec_size)) {
+		vaddr = (void *)paddr;
+		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
+					      paddr + PMD_FLAGS);
+
+		vaddr = (void *)(paddr + index);
+		next_page = sme_pgtable_entry(pgd, next_page, vaddr,
+					      paddr + PMD_FLAGS);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/* Perform the encryption */
+	sme_encrypt_execute(kern_start, kern_start + index, kern_len,
+			    workarea, pgd);
+
 }
 
 unsigned long __init sme_get_me_mask(void)