
[v25,07/21] x86/sgx: Enumerate and track EPC sections

Message ID 20200204060545.31729-8-jarkko.sakkinen@linux.intel.com (mailing list archive)
State New, archived
Series: Intel SGX foundations

Commit Message

Jarkko Sakkinen Feb. 4, 2020, 6:05 a.m. UTC
From: Sean Christopherson <sean.j.christopherson@intel.com>

Enumerate Enclave Page Cache (EPC) sections via CPUID and add the data
structures necessary to track EPC pages so that they can be allocated,
freed and managed. As a system may have multiple EPC sections, invoke CPUID
on SGX sub-leafs until an invalid leaf is encountered.

For simplicity, support a maximum of eight EPC sections. Existing client
hardware supports only a single section, while upcoming server hardware
will support at most eight sections. Bounding the number of sections also
allows the section ID to be embedded along with a page's offset in a single
unsigned long, enabling easy retrieval of both the VA and PA for a given
page.
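
For illustration, a minimal sketch of that encoding (the sketch_*() helpers
are hypothetical; the patch below implements the same logic in
sgx_epc_addr()):

/*
 * Descriptor layout this enables:
 *   bits 12-63: physical address of the EPC page
 *   bits  0-3:  index of the owning EPC section
 */
static unsigned long sketch_epc_pa(struct sgx_epc_page *page)
{
	return page->desc & PAGE_MASK;
}

static void *sketch_epc_va(struct sgx_epc_page *page)
{
	struct sgx_epc_section *s =
		&sgx_epc_sections[page->desc & SGX_EPC_SECTION_MASK];

	return s->va + (page->desc & PAGE_MASK) - s->pa;
}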

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Serge Ayoun <serge.ayoun@intel.com>
Signed-off-by: Serge Ayoun <serge.ayoun@intel.com>
Co-developed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 arch/x86/Kconfig                  |  14 +++
 arch/x86/kernel/cpu/Makefile      |   1 +
 arch/x86/kernel/cpu/sgx/Makefile  |   3 +
 arch/x86/kernel/cpu/sgx/main.c    | 151 ++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/sgx/reclaim.c |  87 +++++++++++++++++
 arch/x86/kernel/cpu/sgx/sgx.h     |  70 ++++++++++++++
 6 files changed, 326 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/sgx/Makefile
 create mode 100644 arch/x86/kernel/cpu/sgx/main.c
 create mode 100644 arch/x86/kernel/cpu/sgx/reclaim.c
 create mode 100644 arch/x86/kernel/cpu/sgx/sgx.h

Comments

Sean Christopherson Feb. 5, 2020, 7:57 p.m. UTC | #1
On Tue, Feb 04, 2020 at 08:05:31AM +0200, Jarkko Sakkinen wrote:
> From: Sean Christopherson <sean.j.christopherson@intel.com>
> 
> Enumerate Enclave Page Cache (EPC) sections via CPUID and add the data
> structures necessary to track EPC pages so that they can be allocated,
> freed and managed. As a system may have multiple EPC sections, invoke CPUID
> on SGX sub-leafs until an invalid leaf is encountered.
> 
> For simplicity, support a maximum of eight EPC sections. Existing client
> hardware supports only a single section, while upcoming server hardware
> will support at most eight sections. Bounding the number of sections also
> allows the section ID to be embedded along with a page's offset in a single
> unsigned long, enabling easy retrieval of both the VA and PA for a given
> page.

...

> +++ b/arch/x86/kernel/cpu/sgx/reclaim.c
> @@ -0,0 +1,87 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> +// Copyright(c) 2016-19 Intel Corporation.
> +
> +#include <linux/freezer.h>
> +#include <linux/highmem.h>
> +#include <linux/kthread.h>
> +#include <linux/pagemap.h>
> +#include <linux/ratelimit.h>
> +#include <linux/slab.h>
> +#include <linux/sched/mm.h>
> +#include <linux/sched/signal.h>
> +#include "encls.h"
> +
> +struct task_struct *ksgxswapd_tsk;
> +
> +/*
> + * Reset all pages to uninitialized state. Pages could still be in an
> + * initialized state after a kexec().
> + */
> +static void sgx_sanitize_section(struct sgx_epc_section *section)
> +{
> +	struct sgx_epc_page *page, *tmp;
> +	LIST_HEAD(secs_list);
> +	int ret;
> +
> +	while (!list_empty(&section->unsanitized_page_list)) {
> +		if (kthread_should_stop())
> +			return;
> +
> +		spin_lock(&section->lock);
> +
> +		page = list_first_entry(&section->unsanitized_page_list,
> +					struct sgx_epc_page, list);
> +
> +		ret = __eremove(sgx_epc_addr(page));
> +		if (!ret)
> +			list_move(&page->list, &section->page_list);
> +		else
> +			list_move_tail(&page->list, &secs_list);
> +
> +		spin_unlock(&section->lock);
> +
> +		cond_resched();
> +	}
> +
> +	list_for_each_entry_safe(page, tmp, &secs_list, list) {
> +		if (kthread_should_stop())
> +			return;
> +
> +		ret = __eremove(sgx_epc_addr(page));
> +		if (!WARN_ON_ONCE(ret)) {

Sadly, this WARN can fire after kexec() on systems with multiple EPC
sections if the SECS has child pages in another section.

Virtual EPC (KVM) has a similar issue, which I'm handling by collecting all
pages that fail a second EREMOVE in a global list, and then retrying every
page in the global list every time a virtual EPC is destroyed, i.e. whenever
pages might have been freed.

The same approach should work here, e.g. retry all SECS pages a third time
once all sections have been sanitized and WARN if EREMOVE fails a third
time on a page.
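
Roughly along these lines, as a sketch only (sgx_dirty_page_list and
sgx_sanitize_dirty_pages() are made-up names, not code from this series):
sgx_sanitize_section() would move twice-failed pages to the global list
instead of freeing them, and ksgxswapd() would call this once after all
sections have been walked.

/* Sketch: pages that failed EREMOVE twice, retried once more at the end. */
static LIST_HEAD(sgx_dirty_page_list);

static void sgx_sanitize_dirty_pages(void)
{
	struct sgx_epc_section *section;
	struct sgx_epc_page *page, *tmp;

	list_for_each_entry_safe(page, tmp, &sgx_dirty_page_list, list) {
		/* Third and final attempt; all sections are sanitized by now. */
		if (WARN_ON_ONCE(__eremove(sgx_epc_addr(page))))
			continue;

		section = sgx_epc_section(page);
		spin_lock(&section->lock);
		list_move(&page->list, &section->page_list);
		spin_unlock(&section->lock);
	}
}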

> +			spin_lock(&section->lock);
> +			list_move(&page->list, &section->page_list);
> +			spin_unlock(&section->lock);
> +		} else {
> +			list_del(&page->list);
> +			kfree(page);
> +		}
> +
> +		cond_resched();
> +	}
> +}
> +
> +static int ksgxswapd(void *p)
> +{
> +	int i;
> +
> +	set_freezable();
> +
> +	for (i = 0; i < sgx_nr_epc_sections; i++)
> +		sgx_sanitize_section(&sgx_epc_sections[i]);

I'm having second thoughts about proactively sanitizing the EPC sections.
I think it'd be better to do EREMOVE the first time a page is allocated,
e.g. add a SGX_EPC_PAGE_UNSANITIZED flag.

  1. Sanitizing EPC that's never used is a waste of cycles, especially on
     platforms with 64gb+ of EPC.

  2. Deferring to allocation time automatically scales with the number of
     tasks that are allocating EPC.  And, the CPU time will also be
     accounted to those tasks.

  3. Breaks on-demand paging when running in a VM, e.g. if the VMM chooses
     to allocate a physical EPC page when it's actually accessed by the
     VM.  I don't expect this to be a problem any time soon, as all VMMs
     will likely preallocate EPC pages until KVM (or any other hypervisor)
     gains EPC oversubscription support, which may or may not ever happen.
     But, I'd prefer to simply not have the problem in the first place.

The logic will be a bit more complicated, but not terribly so.  
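
As a rough sketch of that idea (SGX_EPC_PAGE_UNSANITIZED, the bit chosen for
it, and sgx_try_alloc_page() are assumptions here, not code from this
series):

#define SGX_EPC_PAGE_UNSANITIZED	BIT(4)	/* hypothetical flag bit in desc */

/* Sketch: EREMOVE a page lazily, the first time it is handed out. */
static struct sgx_epc_page *sgx_try_alloc_page(struct sgx_epc_section *section)
{
	struct sgx_epc_page *page;

	spin_lock(&section->lock);
	page = list_first_entry_or_null(&section->page_list,
					struct sgx_epc_page, list);
	if (page)
		list_del(&page->list);
	spin_unlock(&section->lock);

	if (!page)
		return ERR_PTR(-ENOMEM);

	if (page->desc & SGX_EPC_PAGE_UNSANITIZED) {
		if (__eremove(sgx_epc_addr(page)))
			return ERR_PTR(-EIO);	/* or park on a retry list */
		page->desc &= ~SGX_EPC_PAGE_UNSANITIZED;
	}

	return page;
}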

> +
> +	return 0;
> +}
Jarkko Sakkinen Feb. 5, 2020, 11:11 p.m. UTC | #2
On Wed, Feb 05, 2020 at 11:57:00AM -0800, Sean Christopherson wrote:
> > +		ret = __eremove(sgx_epc_addr(page));
> > +		if (!WARN_ON_ONCE(ret)) {
> 
> Sadly, this WARN can fire after kexec() on systems with multiple EPC
> sections if the SECS has child pages in another section.

What causes this?

/Jarkko
Jarkko Sakkinen Feb. 6, 2020, 3:34 p.m. UTC | #3
On Thu, Feb 06, 2020 at 01:11:47AM +0200, Jarkko Sakkinen wrote:
> On Wed, Feb 05, 2020 at 11:57:00AM -0800, Sean Christopherson wrote:
> > > +		ret = __eremove(sgx_epc_addr(page));
> > > +		if (!WARN_ON_ONCE(ret)) {
> > 
> > Sadly, this WARN can fire after kexec() on systems with multiple EPC
> > sections if the SECS has child pages in another section.
> 
> What causes this?

Ah, obviously this can happen given that the final loop is done per
section before the other sections are processed.

Let's fix the code first rather than change the approach based on code
that has an underlying regression. Performance can be fine-tuned even
after upstreaming. Especially if a performance fix adds complexity, it is
better to do that work once there is a mainline code base.

Given that the loop runs in a separate thread anyway, I'm not sure how
bad the performance issue really is. Performance changes should always be
driven by real workloads and statistics.

I think you'd fix this issue by first changing the function as follows:

static void sgx_sanitize_section(struct sgx_epc_section *section)
{
	struct sgx_epc_page *page, *tmp;
	LIST_HEAD(secs_list);
	int ret;

	while (!list_empty(&section->unsanitized_page_list)) {
		if (kthread_should_stop())
			return;

		spin_lock(&section->lock);

		page = list_first_entry(&section->unsanitized_page_list,
					struct sgx_epc_page, list);

		ret = __eremove(sgx_epc_addr(page));
		if (!ret)
			list_move(&page->list, &section->page_list);
		else
			list_move_tail(&page->list, &secs_list);

		spin_unlock(&section->lock);

		cond_resched();
	}

	list_splice_tail(&secs_list, &section->unsanitized_page_list);
}

Then in ksgxswapd() you'd

for (i = 0; i < sgx_nr_epc_sections; i++)
	sgx_sanitize_section(&sgx_epc_sections[i]);

/* 2nd round for SECS */
for (i = 0; i < sgx_nr_epc_sections; i++)
	sgx_sanitize_section(&sgx_epc_sections[i]);

Finally you'd:

for (i = 0; i < sgx_nr_epc_sections; i++)
	WARN_ONCE(!list_empty(&sgx_epc_sections[i].unsanitized_page_list),
		  "EPC section %d has unsanitized pages\n", i);

/Jarkko
Jarkko Sakkinen Feb. 6, 2020, 3:35 p.m. UTC | #4
On Wed, Feb 05, 2020 at 11:57:00AM -0800, Sean Christopherson wrote:
>   3. Breaks on-demand paging when running in a VM, e.g. if the VMM chooses
>      to allocate a physical EPC page when it's actually accessed by the
>      VM.  I don't expect this to be a problem any time soon, as all VMMs
>      will likely preallocate EPC pages until KVM (or any other hypervisor)
>      gains EPC oversubscription support, which may or may not ever happen.
>      But, I'd prefer to simply not have the problem in the first place.

So wouldn't it be better to revisit this when the VM changes are added?

/Jarkko
Sean Christopherson Feb. 6, 2020, 3:55 p.m. UTC | #5
On Thu, Feb 06, 2020 at 05:35:19PM +0200, Jarkko Sakkinen wrote:
> On Wed, Feb 05, 2020 at 11:57:00AM -0800, Sean Christopherson wrote:
> >   3. Breaks on-demand paging when running in a VM, e.g. if the VMM chooses
> >      to allocate a physical EPC page when it's actually accessed by the
> >      VM.  I don't expect this to be a problem any time soon, as all VMMs
> >      will likely preallocate EPC pages until KVM (or any other hypervisor)
> >      gains EPC oversusbscription support, which may or may not ever happen.
> >      But, I'd prefer to simply not have the problem in the first place.
> 
> So wouldn't it be better to revisit this when the VM changes are added.

No, because the guest kernel (this code) and the host hypervisor (KVM code)
are separate assets.  Folks will pick up this code, use it for guest kernels
and start deploying it, e.g. for cloud workloads.  At some point after KVM
support lands upstream (assuming we get there), CSPs et al will (in theory)
move to the upstream version of KVM instead of running out-of-tree patches.
But, the guest kernels will stay the same and continue to exhibit the
undesirable behavior.

KVM is also not the only hypervisor that supports SGX, e.g. Hyper-V already
supports exposing SGX to guests.

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 44d279698c0f..d11a41ef25af 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1949,6 +1949,20 @@  config X86_INTEL_TSX_MODE_AUTO
 	  side channel attacks- equals the tsx=auto command line parameter.
 endchoice
 
+config INTEL_SGX
+	bool "Intel SGX"
+	depends on X86_64 && CPU_SUP_INTEL
+	select SRCU
+	select MMU_NOTIFIER
+	help
+	  Intel(R) SGX is a set of CPU instructions that can be used by
+	  applications to set aside private regions of code and data, referred
+	  to as enclaves. An enclave's private memory can only be accessed by
+	  code running within the enclave. Accesses from outside the enclave,
+	  including other enclaves, are disallowed by hardware.
+
+	  If unsure, say N.
+
 config EFI
 	bool "EFI runtime service support"
 	depends on ACPI
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 7dc4ad68eb41..45534fb81007 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -46,6 +46,7 @@  obj-$(CONFIG_X86_MCE)			+= mce/
 obj-$(CONFIG_MTRR)			+= mtrr/
 obj-$(CONFIG_MICROCODE)			+= microcode/
 obj-$(CONFIG_X86_CPU_RESCTRL)		+= resctrl/
+obj-$(CONFIG_INTEL_SGX)			+= sgx/
 
 obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o
 
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
new file mode 100644
index 000000000000..2dec75916a5e
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -0,0 +1,3 @@ 
+obj-y += \
+	main.o \
+	reclaim.o
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
new file mode 100644
index 000000000000..38424c1e8341
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -0,0 +1,151 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-17 Intel Corporation.
+
+#include <linux/freezer.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include "encls.h"
+
+struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
+int sgx_nr_epc_sections;
+
+static void __init sgx_free_epc_section(struct sgx_epc_section *section)
+{
+	struct sgx_epc_page *page;
+
+	while (!list_empty(&section->page_list)) {
+		page = list_first_entry(&section->page_list,
+					struct sgx_epc_page, list);
+		list_del(&page->list);
+		kfree(page);
+	}
+
+	while (!list_empty(&section->unsanitized_page_list)) {
+		page = list_first_entry(&section->unsanitized_page_list,
+					struct sgx_epc_page, list);
+		list_del(&page->list);
+		kfree(page);
+	}
+
+	memunmap(section->va);
+}
+
+static bool __init sgx_alloc_epc_section(u64 addr, u64 size,
+					 unsigned long index,
+					 struct sgx_epc_section *section)
+{
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct sgx_epc_page *page;
+	unsigned long i;
+
+	section->va = memremap(addr, size, MEMREMAP_WB);
+	if (!section->va)
+		return false;
+
+	section->pa = addr;
+	spin_lock_init(&section->lock);
+	INIT_LIST_HEAD(&section->page_list);
+	INIT_LIST_HEAD(&section->unsanitized_page_list);
+
+	for (i = 0; i < nr_pages; i++) {
+		page = kzalloc(sizeof(*page), GFP_KERNEL);
+		if (!page)
+			goto err_out;
+
+		page->desc = (addr + (i << PAGE_SHIFT)) | index;
+		list_add_tail(&page->list, &section->unsanitized_page_list);
+	}
+
+	return true;
+
+err_out:
+	sgx_free_epc_section(section);
+	return false;
+}
+
+static void __init sgx_page_cache_teardown(void)
+{
+	int i;
+
+	for (i = 0; i < sgx_nr_epc_sections; i++)
+		sgx_free_epc_section(&sgx_epc_sections[i]);
+}
+
+/**
+ * A section metric is concatenated from two CPUID words: bits 12-31 of @low
+ * provide bits 12-31 of the metric and bits 0-19 of @high provide bits 32-51
+ * of the metric.
+ */
+static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
+{
+	return (low & GENMASK_ULL(31, 12)) +
+	       ((high & GENMASK_ULL(19, 0)) << 32);
+}
+
+static bool __init sgx_page_cache_init(void)
+{
+	u32 eax, ebx, ecx, edx, type;
+	u64 pa, size;
+	int i;
+
+	for (i = 0; i <= ARRAY_SIZE(sgx_epc_sections); i++) {
+		cpuid_count(SGX_CPUID, i + SGX_CPUID_FIRST_VARIABLE_SUB_LEAF,
+			    &eax, &ebx, &ecx, &edx);
+
+		type = eax & SGX_CPUID_SUB_LEAF_TYPE_MASK;
+		if (type == SGX_CPUID_SUB_LEAF_INVALID)
+			break;
+
+		if (type != SGX_CPUID_SUB_LEAF_EPC_SECTION) {
+			pr_err_once("Unknown EPC section type: %u\n", type);
+			break;
+		}
+
+		if (i == ARRAY_SIZE(sgx_epc_sections)) {
+			pr_warn("No free slot for an EPC section\n");
+			break;
+		}
+
+		pa = sgx_calc_section_metric(eax, ebx);
+		size = sgx_calc_section_metric(ecx, edx);
+
+		pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);
+
+		if (!sgx_alloc_epc_section(pa, size, i, &sgx_epc_sections[i])) {
+			pr_err("No free memory for an EPC section\n");
+			break;
+		}
+
+		sgx_nr_epc_sections++;
+	}
+
+	if (!sgx_nr_epc_sections) {
+		pr_err("There are zero EPC sections.\n");
+		return false;
+	}
+
+	return true;
+}
+
+static void __init sgx_init(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_SGX))
+		return;
+
+	if (!sgx_page_cache_init())
+		return;
+
+	if (!sgx_page_reclaimer_init())
+		goto err_page_cache;
+
+	return;
+
+err_page_cache:
+	sgx_page_cache_teardown();
+}
+
+arch_initcall(sgx_init);
diff --git a/arch/x86/kernel/cpu/sgx/reclaim.c b/arch/x86/kernel/cpu/sgx/reclaim.c
new file mode 100644
index 000000000000..f071158d34f6
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/reclaim.c
@@ -0,0 +1,87 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-19 Intel Corporation.
+
+#include <linux/freezer.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include "encls.h"
+
+struct task_struct *ksgxswapd_tsk;
+
+/*
+ * Reset all pages to uninitialized state. Pages could still be in an
+ * initialized state after a kexec().
+ */
+static void sgx_sanitize_section(struct sgx_epc_section *section)
+{
+	struct sgx_epc_page *page, *tmp;
+	LIST_HEAD(secs_list);
+	int ret;
+
+	while (!list_empty(&section->unsanitized_page_list)) {
+		if (kthread_should_stop())
+			return;
+
+		spin_lock(&section->lock);
+
+		page = list_first_entry(&section->unsanitized_page_list,
+					struct sgx_epc_page, list);
+
+		ret = __eremove(sgx_epc_addr(page));
+		if (!ret)
+			list_move(&page->list, &section->page_list);
+		else
+			list_move_tail(&page->list, &secs_list);
+
+		spin_unlock(&section->lock);
+
+		cond_resched();
+	}
+
+	list_for_each_entry_safe(page, tmp, &secs_list, list) {
+		if (kthread_should_stop())
+			return;
+
+		ret = __eremove(sgx_epc_addr(page));
+		if (!WARN_ON_ONCE(ret)) {
+			spin_lock(&section->lock);
+			list_move(&page->list, &section->page_list);
+			spin_unlock(&section->lock);
+		} else {
+			list_del(&page->list);
+			kfree(page);
+		}
+
+		cond_resched();
+	}
+}
+
+static int ksgxswapd(void *p)
+{
+	int i;
+
+	set_freezable();
+
+	for (i = 0; i < sgx_nr_epc_sections; i++)
+		sgx_sanitize_section(&sgx_epc_sections[i]);
+
+	return 0;
+}
+
+bool __init sgx_page_reclaimer_init(void)
+{
+	struct task_struct *tsk;
+
+	tsk = kthread_run(ksgxswapd, NULL, "ksgxswapd");
+	if (IS_ERR(tsk))
+		return false;
+
+	ksgxswapd_tsk = tsk;
+
+	return true;
+}
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
new file mode 100644
index 000000000000..aad30980be32
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -0,0 +1,70 @@ 
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+#ifndef _X86_SGX_H
+#define _X86_SGX_H
+
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <asm/asm.h>
+#include "arch.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "sgx: " fmt
+
+struct sgx_epc_page {
+	unsigned long desc;
+	struct list_head list;
+};
+
+/**
+ * struct sgx_epc_section
+ *
+ * The firmware can define multiple chunks of EPC in different areas of
+ * physical memory, e.g. one for each NUMA node. This structure stores the
+ * EPC pages of one EPC section and the virtual memory area where the
+ * pages have been mapped.
+ */
+struct sgx_epc_section {
+	unsigned long pa;
+	void *va;
+	struct list_head page_list;
+	struct list_head unsanitized_page_list;
+	spinlock_t lock;
+};
+
+/**
+ * enum sgx_epc_page_desc - bits and masks for an EPC page's descriptor
+ * %SGX_EPC_SECTION_MASK:	SGX allows multiple EPC sections in physical
+ *				memory. Existing and near-future hardware
+ *				defines at most eight sections; the low four
+ *				bits of the descriptor hold the section index.
+ */
+enum sgx_epc_page_desc {
+	SGX_EPC_SECTION_MASK			= GENMASK_ULL(3, 0),
+	/* bits 12-63 are reserved for the physical page address of the page */
+};
+
+#define SGX_MAX_EPC_SECTIONS (SGX_EPC_SECTION_MASK + 1)
+
+extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
+
+static inline struct sgx_epc_section *sgx_epc_section(struct sgx_epc_page *page)
+{
+	return &sgx_epc_sections[page->desc & SGX_EPC_SECTION_MASK];
+}
+
+static inline void *sgx_epc_addr(struct sgx_epc_page *page)
+{
+	struct sgx_epc_section *section = sgx_epc_section(page);
+
+	return section->va + (page->desc & PAGE_MASK) - section->pa;
+}
+
+extern int sgx_nr_epc_sections;
+extern struct task_struct *ksgxswapd_tsk;
+
+bool __init sgx_page_reclaimer_init(void);
+
+#endif /* _X86_SGX_H */