diff mbox series

[v17,18/23] platform/x86: Intel SGX driver

Message ID 20181116010412.23967-19-jarkko.sakkinen@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series [v17,01/23] x86/sgx: Update MAINTAINERS | expand

Commit Message

Jarkko Sakkinen Nov. 16, 2018, 1:01 a.m. UTC
Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
can be used by applications to set aside private regions of code and
data. The code outside the enclave is disallowed to access the memory
inside the enclave by the CPU access control.

SGX driver provides a ioctl API for loading and initializing enclaves.
Address range for enclaves is reserved with mmap() and they are
destroyed with munmap(). Enclave construction, measurement and
initialization is done with the provided the ioctl API.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Serge Ayoun <serge.ayoun@intel.com>
Signed-off-by: Serge Ayoun <serge.ayoun@intel.com>
Co-developed-by: Shay Katz-zamir <shay.katz-zamir@intel.com>
Signed-off-by: Shay Katz-zamir <shay.katz-zamir@intel.com>
Co-developed-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
 arch/x86/include/uapi/asm/sgx.h            |  59 ++
 drivers/platform/x86/Kconfig               |   2 +
 drivers/platform/x86/Makefile              |   1 +
 drivers/platform/x86/intel_sgx/Kconfig     |  20 +
 drivers/platform/x86/intel_sgx/Makefile    |  12 +
 drivers/platform/x86/intel_sgx/sgx.h       | 180 +++++
 drivers/platform/x86/intel_sgx/sgx_encl.c  | 784 +++++++++++++++++++++
 drivers/platform/x86/intel_sgx/sgx_ioctl.c | 234 ++++++
 drivers/platform/x86/intel_sgx/sgx_main.c  | 267 +++++++
 drivers/platform/x86/intel_sgx/sgx_util.c  |  85 +++
 drivers/platform/x86/intel_sgx/sgx_vma.c   |  43 ++
 11 files changed, 1687 insertions(+)
 create mode 100644 arch/x86/include/uapi/asm/sgx.h
 create mode 100644 drivers/platform/x86/intel_sgx/Kconfig
 create mode 100644 drivers/platform/x86/intel_sgx/Makefile
 create mode 100644 drivers/platform/x86/intel_sgx/sgx.h
 create mode 100644 drivers/platform/x86/intel_sgx/sgx_encl.c
 create mode 100644 drivers/platform/x86/intel_sgx/sgx_ioctl.c
 create mode 100644 drivers/platform/x86/intel_sgx/sgx_main.c
 create mode 100644 drivers/platform/x86/intel_sgx/sgx_util.c
 create mode 100644 drivers/platform/x86/intel_sgx/sgx_vma.c

Comments

Randy Dunlap Nov. 16, 2018, 1:37 a.m. UTC | #1
On 11/15/18 5:01 PM, Jarkko Sakkinen wrote:
> Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
> can be used by applications to set aside private regions of code and
> data. The code outside the enclave is disallowed to access the memory
> inside the enclave by the CPU access control.
> 
> SGX driver provides a ioctl API for loading and initializing enclaves.
> Address range for enclaves is reserved with mmap() and they are
> destroyed with munmap(). Enclave construction, measurement and
> initialization is done with the provided the ioctl API.
> 
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Co-developed-by: Serge Ayoun <serge.ayoun@intel.com>
> Signed-off-by: Serge Ayoun <serge.ayoun@intel.com>
> Co-developed-by: Shay Katz-zamir <shay.katz-zamir@intel.com>
> Signed-off-by: Shay Katz-zamir <shay.katz-zamir@intel.com>
> Co-developed-by: Suresh Siddha <suresh.b.siddha@intel.com>
> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
> ---

> diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
> new file mode 100644
> index 000000000000..aadf9c76e360
> --- /dev/null
> +++ b/arch/x86/include/uapi/asm/sgx.h
> @@ -0,0 +1,59 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> +/**
> + * Copyright(c) 2016-18 Intel Corporation.
> + */
> +#ifndef _UAPI_ASM_X86_SGX_H
> +#define _UAPI_ASM_X86_SGX_H
> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +#define SGX_MAGIC 0xA4
> +
> +#define SGX_IOC_ENCLAVE_CREATE \
> +	_IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create)
> +#define SGX_IOC_ENCLAVE_ADD_PAGE \
> +	_IOW(SGX_MAGIC, 0x01, struct sgx_enclave_add_page)
> +#define SGX_IOC_ENCLAVE_INIT \
> +	_IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
> +
> +/* IOCTL return values */
> +#define SGX_POWER_LOST_ENCLAVE		0x40000000


Hi,
The ioctl magic number should be documented in
Documentation/ioctl/ioctl-number.txt.

ta.
Jarkko Sakkinen Nov. 16, 2018, 11:23 a.m. UTC | #2
On Thu, Nov 15, 2018 at 05:37:27PM -0800, Randy Dunlap wrote:
> On 11/15/18 5:01 PM, Jarkko Sakkinen wrote:
> > Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
> > can be used by applications to set aside private regions of code and
> > data. The code outside the enclave is disallowed to access the memory
> > inside the enclave by the CPU access control.
> > 
> > SGX driver provides a ioctl API for loading and initializing enclaves.
> > Address range for enclaves is reserved with mmap() and they are
> > destroyed with munmap(). Enclave construction, measurement and
> > initialization is done with the provided the ioctl API.
> > 
> > Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> > Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > Co-developed-by: Serge Ayoun <serge.ayoun@intel.com>
> > Signed-off-by: Serge Ayoun <serge.ayoun@intel.com>
> > Co-developed-by: Shay Katz-zamir <shay.katz-zamir@intel.com>
> > Signed-off-by: Shay Katz-zamir <shay.katz-zamir@intel.com>
> > Co-developed-by: Suresh Siddha <suresh.b.siddha@intel.com>
> > Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
> > ---
> 
> > diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
> > new file mode 100644
> > index 000000000000..aadf9c76e360
> > --- /dev/null
> > +++ b/arch/x86/include/uapi/asm/sgx.h
> > @@ -0,0 +1,59 @@
> > +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> > +/**
> > + * Copyright(c) 2016-18 Intel Corporation.
> > + */
> > +#ifndef _UAPI_ASM_X86_SGX_H
> > +#define _UAPI_ASM_X86_SGX_H
> > +
> > +#include <linux/types.h>
> > +#include <linux/ioctl.h>
> > +
> > +#define SGX_MAGIC 0xA4
> > +
> > +#define SGX_IOC_ENCLAVE_CREATE \
> > +	_IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create)
> > +#define SGX_IOC_ENCLAVE_ADD_PAGE \
> > +	_IOW(SGX_MAGIC, 0x01, struct sgx_enclave_add_page)
> > +#define SGX_IOC_ENCLAVE_INIT \
> > +	_IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
> > +
> > +/* IOCTL return values */
> > +#define SGX_POWER_LOST_ENCLAVE		0x40000000
> 
> 
> Hi,
> The ioctl magic number should be documented in
> Documentation/ioctl/ioctl-number.txt.
> 
> ta.
> -- 
> ~Randy

Thank you. Addressed now in https://github.com/jsakkine-intel/linux-sgx.git.

/Jarkko
Jarkko Sakkinen Nov. 19, 2018, 3:06 p.m. UTC | #3
On Fri, Nov 16, 2018 at 03:01:25AM +0200, Jarkko Sakkinen wrote:
> Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
> can be used by applications to set aside private regions of code and
> data. The code outside the enclave is disallowed to access the memory
> inside the enclave by the CPU access control.
> 
> SGX driver provides a ioctl API for loading and initializing enclaves.
> Address range for enclaves is reserved with mmap() and they are
> destroyed with munmap(). Enclave construction, measurement and
> initialization is done with the provided the ioctl API.
> 
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Co-developed-by: Serge Ayoun <serge.ayoun@intel.com>
> Signed-off-by: Serge Ayoun <serge.ayoun@intel.com>
> Co-developed-by: Shay Katz-zamir <shay.katz-zamir@intel.com>
> Signed-off-by: Shay Katz-zamir <shay.katz-zamir@intel.com>
> Co-developed-by: Suresh Siddha <suresh.b.siddha@intel.com>
> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>

Question: should be dissolve the driver completely and move this code to
arch/x86/kernel/cpu/sgx/ (and rename intel_sgx.c as main.c)? Swapping
patch removes the possibility to compile this as a module anyway.

Would make also maintainer hierarchy more clear and clean albeit that
cannot be a guiding reason to do such change. Here's the current
MAINTAINERS entry in my master:

INTEL SGX
M:	Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
M:	Sean Christopherson <sean.j.christopherson@intel.com>
L:	linux-sgx@vger.kernel.org
S:	Maintained
Q:	https://patchwork.kernel.org/project/intel-sgx/list/
T:	git https://github.com/jsakkine-intel/linux-sgx.git
F:	arch/x86/include/asm/sgx.h
F:	arch/x86/include/uapi/asm/sgx.h
F:	arch/x86/kernel/cpu/intel_sgx.c
F:	drivers/platform/x86/intel_sgx/*
K:	\bSGX_

If we do this, we would end up with this:

INTEL SGX
M:	Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
M:	Sean Christopherson <sean.j.christopherson@intel.com>
L:	linux-sgx@vger.kernel.org
S:	Maintained
Q:	https://patchwork.kernel.org/project/intel-sgx/list/
T:	git https://github.com/jsakkine-intel/linux-sgx.git
F:	arch/x86/include/asm/sgx.h
F:	arch/x86/include/uapi/asm/sgx.h
F:	arch/x86/kernel/cpu/sgx/*
K:	\bSGX_

Then once the base code has been merged I would put my PRs to x86
maintainers for subsequent kernel releases.

/Jarkko
Andy Lutomirski Nov. 19, 2018, 3:29 p.m. UTC | #4
On Thu, Nov 15, 2018 at 5:08 PM Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
> can be used by applications to set aside private regions of code and
> data. The code outside the enclave is disallowed to access the memory
> inside the enclave by the CPU access control.
>
> SGX driver provides a ioctl API for loading and initializing enclaves.
> Address range for enclaves is reserved with mmap() and they are
> destroyed with munmap(). Enclave construction, measurement and
> initialization is done with the provided the ioctl API.
>

I brought this up a while back, and I think I should re-ask it now
that this driver is getting close to ready:

As it stands, there's just one SGX character device, and I imagine
that it'll be available to unprivileged applications.  I'm concerned
that this isn't quite what we want.  I certainly think that everyone,
or at least almost everyone, ought to be able to run normal enclaves.
But I think that we should consider restricting who can run specially
privileged enclaves.  In particular, the ability to run enclaves with
the provisioning bit set is somewhat sensitive, since it effectively
allows access to a stable fingerprint of the system.  Before flexible
LC, this wasn't such a big deal, since only Intel's provisioning
enclave could see the key, and Intel's enclave has some degree of
control of what is done with the key.  With flex LC, this protection
is lost.

But this is maybe more of a big deal than just access to a stable
fingerprint.  The ability to provision a remote attestation protocol
is a key part of running SGX malware, and SGX malware is surely going
to exist some day.  (Sure, Intel will try to block access to the
actual attestation service for malware, but I doubt that Intel will be
able to fully defend it.)

So I propose that there be a few device nodes.  Maybe
/dev/sgx/unprivilegd and /dev/sgx/provisioning?  The default mode of
the latter could be 0600.  If you've opened the unprivileged node, you
can only run enclaves without any special permission bits set.

We should also consider whether we allow the unprivileged node to run
launch enclaves, and, for that matter, whether we allow user code to
run launch enclaves at all, given that they're not useful with the
current architecture of the driver.

--Andy
Jarkko Sakkinen Nov. 19, 2018, 4:19 p.m. UTC | #5
On Mon, Nov 19, 2018 at 07:29:25AM -0800, Andy Lutomirski wrote:
> On Thu, Nov 15, 2018 at 5:08 PM Jarkko Sakkinen
> <jarkko.sakkinen@linux.intel.com> wrote:
> >
> > Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
> > can be used by applications to set aside private regions of code and
> > data. The code outside the enclave is disallowed to access the memory
> > inside the enclave by the CPU access control.
> >
> > SGX driver provides a ioctl API for loading and initializing enclaves.
> > Address range for enclaves is reserved with mmap() and they are
> > destroyed with munmap(). Enclave construction, measurement and
> > initialization is done with the provided the ioctl API.
> >
> 
> I brought this up a while back, and I think I should re-ask it now
> that this driver is getting close to ready:
> 
> As it stands, there's just one SGX character device, and I imagine
> that it'll be available to unprivileged applications.  I'm concerned
> that this isn't quite what we want.  I certainly think that everyone,
> or at least almost everyone, ought to be able to run normal enclaves.
> But I think that we should consider restricting who can run specially
> privileged enclaves.  In particular, the ability to run enclaves with
> the provisioning bit set is somewhat sensitive, since it effectively
> allows access to a stable fingerprint of the system.  Before flexible
> LC, this wasn't such a big deal, since only Intel's provisioning
> enclave could see the key, and Intel's enclave has some degree of
> control of what is done with the key.  With flex LC, this protection
> is lost.
> 
> But this is maybe more of a big deal than just access to a stable
> fingerprint.  The ability to provision a remote attestation protocol
> is a key part of running SGX malware, and SGX malware is surely going
> to exist some day.  (Sure, Intel will try to block access to the
> actual attestation service for malware, but I doubt that Intel will be
> able to fully defend it.)
> 
> So I propose that there be a few device nodes.  Maybe
> /dev/sgx/unprivilegd and /dev/sgx/provisioning?  The default mode of
> the latter could be 0600.  If you've opened the unprivileged node, you
> can only run enclaves without any special permission bits set.

What would the use case for unprivileged i.e. this configuration would
mean depending on permissions?

There would be three types of users:

1. Ones that have access to neither of the devices.
2. Ones that have access to unprivileged. Who are these?
3. Ones that have access to provisioning.

> We should also consider whether we allow the unprivileged node to run
> launch enclaves, and, for that matter, whether we allow user code to
> run launch enclaves at all, given that they're not useful with the
> current architecture of the driver.

ATM the launch enclave bit is disallowed by the kernel in the current
patch set. I don't really see any use case to allow them except if we
want to allow run enclaves in an environment where the MSRs are rdonly.

/Jarkko
Jethro Beekman Nov. 19, 2018, 4:22 p.m. UTC | #6
On 2018-11-19 20:36, Jarkko Sakkinen wrote:
> Question: should be dissolve the driver completely and move this code to
> arch/x86/kernel/cpu/sgx/ (and rename intel_sgx.c as main.c)? Swapping
> patch removes the possibility to compile this as a module anyway.

No. We should keep the capability to build this as a module for other 
users of SGX. What is the swapping patch and why doesn't allow building 
as a module?

--
Jethro Beekman | Fortanix
Andy Lutomirski Nov. 19, 2018, 4:59 p.m. UTC | #7
On Mon, Nov 19, 2018 at 8:19 AM Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> On Mon, Nov 19, 2018 at 07:29:25AM -0800, Andy Lutomirski wrote:
> > On Thu, Nov 15, 2018 at 5:08 PM Jarkko Sakkinen
> > <jarkko.sakkinen@linux.intel.com> wrote:
> > >
> > > Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
> > > can be used by applications to set aside private regions of code and
> > > data. The code outside the enclave is disallowed to access the memory
> > > inside the enclave by the CPU access control.
> > >
> > > SGX driver provides a ioctl API for loading and initializing enclaves.
> > > Address range for enclaves is reserved with mmap() and they are
> > > destroyed with munmap(). Enclave construction, measurement and
> > > initialization is done with the provided the ioctl API.
> > >
> >
> > I brought this up a while back, and I think I should re-ask it now
> > that this driver is getting close to ready:
> >
> > As it stands, there's just one SGX character device, and I imagine
> > that it'll be available to unprivileged applications.  I'm concerned
> > that this isn't quite what we want.  I certainly think that everyone,
> > or at least almost everyone, ought to be able to run normal enclaves.
> > But I think that we should consider restricting who can run specially
> > privileged enclaves.  In particular, the ability to run enclaves with
> > the provisioning bit set is somewhat sensitive, since it effectively
> > allows access to a stable fingerprint of the system.  Before flexible
> > LC, this wasn't such a big deal, since only Intel's provisioning
> > enclave could see the key, and Intel's enclave has some degree of
> > control of what is done with the key.  With flex LC, this protection
> > is lost.
> >
> > But this is maybe more of a big deal than just access to a stable
> > fingerprint.  The ability to provision a remote attestation protocol
> > is a key part of running SGX malware, and SGX malware is surely going
> > to exist some day.  (Sure, Intel will try to block access to the
> > actual attestation service for malware, but I doubt that Intel will be
> > able to fully defend it.)
> >
> > So I propose that there be a few device nodes.  Maybe
> > /dev/sgx/unprivilegd and /dev/sgx/provisioning?  The default mode of
> > the latter could be 0600.  If you've opened the unprivileged node, you
> > can only run enclaves without any special permission bits set.
>
> What would the use case for unprivileged i.e. this configuration would
> mean depending on permissions?

Everything except the quoting/provisioning code would use the unprivileged node.

>
> There would be three types of users:
>
> 1. Ones that have access to neither of the devices.
> 2. Ones that have access to unprivileged. Who are these?

Either 0666 (world) or an sgx group.

> 3. Ones that have access to provisioning.

Root only.

The idea here is that, under normal circumstances, provisioning only
runs once, or at least only runs rarely.  So, rather than the SDK
running provisioning whenever it feels like doing so (which is the
current behavior, I imagine, although I haven't looked), there would
be a privileged program, perhaps a systemd unit that runs when needed,
that produces the key material needed for remote attestation, and
non-root users that need attestation would get the keying material
from the provisioning service.  And the provisioning service could
implement its own policy.  Ideally, the service wouldn't give the
sealed keys to users at all but would, instead, just provide the
entire attestation service over a UNIX socket, which would make
provisioning capabilities revocable.

Does this make sense?

--Andy
Jarkko Sakkinen Nov. 19, 2018, 5:19 p.m. UTC | #8
On Mon, Nov 19, 2018 at 04:22:54PM +0000, Jethro Beekman wrote:
> On 2018-11-19 20:36, Jarkko Sakkinen wrote:
> > Question: should be dissolve the driver completely and move this code to
> > arch/x86/kernel/cpu/sgx/ (and rename intel_sgx.c as main.c)? Swapping
> > patch removes the possibility to compile this as a module anyway.
> 
> No. We should keep the capability to build this as a module for other users
> of SGX. What is the swapping patch and why doesn't allow building as a
> module?

In some earlier version of the patch set the request was to remove the
callback interface to the driver so now core does direct calls to the
swapping operations.

/Jarkko
Jethro Beekman Nov. 19, 2018, 5:39 p.m. UTC | #9
On 2018-11-19 22:49, Jarkko Sakkinen wrote:
> On Mon, Nov 19, 2018 at 04:22:54PM +0000, Jethro Beekman wrote:
>> On 2018-11-19 20:36, Jarkko Sakkinen wrote:
>>> Question: should be dissolve the driver completely and move this code to
>>> arch/x86/kernel/cpu/sgx/ (and rename intel_sgx.c as main.c)? Swapping
>>> patch removes the possibility to compile this as a module anyway.
>>
>> No. We should keep the capability to build this as a module for other users
>> of SGX. What is the swapping patch and why doesn't allow building as a
>> module?
> 
> In some earlier version of the patch set the request was to remove the
> callback interface to the driver so now core does direct calls to the
> swapping operations.

I see, you're talking about this part (from PATCH v17 20/23):

> This has the unfortunate and odd side effect of preventing the SGX driver from 
> being compiled as a loadable module.

But you also write:

> However, this should be a temporary situation

And in PATCH v17 19/23:

> Allowing the userspace-facing driver to be built as a loaded module is 
> desirable

I wholeheartedly agree with that last statement.

Let's keep it as separated as possible for now until KVM lands and we 
get the module capability back, instead of integrating everything so 
tightly that that is no longer possible.

--
Jethro Beekman | Fortanix
Andy Lutomirski Nov. 19, 2018, 6:18 p.m. UTC | #10
> On Nov 19, 2018, at 10:19 AM, Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> wrote:
> 
>> On Mon, Nov 19, 2018 at 04:22:54PM +0000, Jethro Beekman wrote:
>>> On 2018-11-19 20:36, Jarkko Sakkinen wrote:
>>> Question: should be dissolve the driver completely and move this code to
>>> arch/x86/kernel/cpu/sgx/ (and rename intel_sgx.c as main.c)? Swapping
>>> patch removes the possibility to compile this as a module anyway.
>> 
>> No. We should keep the capability to build this as a module for other users
>> of SGX. What is the swapping patch and why doesn't allow building as a
>> module?
> 
> In some earlier version of the patch set the request was to remove the
> callback interface to the driver so now core does direct calls to the
> swapping operations.
> 
> 

Once static_call lands, this will be a great use for it.
Jarkko Sakkinen Nov. 20, 2018, 10:58 a.m. UTC | #11
On Mon, Nov 19, 2018 at 05:39:04PM +0000, Jethro Beekman wrote:
> Let's keep it as separated as possible for now until KVM lands and we get
> the module capability back, instead of integrating everything so tightly
> Jethro Beekman | Fortanix

We can keep the driver structure and still move it under
arch/x86/kernel/cpu. When landing the KVM code we can then allow
the driver part to be compiled as module. It is not the only LKM
under arch/x86.

Would also simplify both way we develop SGX code alot, not least
because there would be clear consumer of the pull requests.

/Jarkko
Jarkko Sakkinen Nov. 20, 2018, 11 a.m. UTC | #12
On Mon, Nov 19, 2018 at 11:18:54AM -0700, Andy Lutomirski wrote:
> Once static_call lands, this will be a great use for it.

When is it expected to land? Cool feature BTW!

/Jarkko
Dr. Greg Nov. 20, 2018, 11:15 a.m. UTC | #13
On Mon, Nov 19, 2018 at 07:29:25AM -0800, Andy Lutomirski wrote:

Good morning, I hope this note finds the week going well for everyone.

> On Thu, Nov 15, 2018 at 5:08 PM Jarkko Sakkinen
> <jarkko.sakkinen@linux.intel.com> wrote:
> >
> > Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
> > can be used by applications to set aside private regions of code and
> > data. The code outside the enclave is disallowed to access the memory
> > inside the enclave by the CPU access control.
> >
> > SGX driver provides a ioctl API for loading and initializing enclaves.
> > Address range for enclaves is reserved with mmap() and they are
> > destroyed with munmap(). Enclave construction, measurement and
> > initialization is done with the provided the ioctl API.

> I brought this up a while back, and I think I should re-ask it now
> that this driver is getting close to ready:
>
> As it stands, there's just one SGX character device, and I imagine
> that it'll be available to unprivileged applications.  I'm concerned
> that this isn't quite what we want.  I certainly think that
> everyone, or at least almost everyone, ought to be able to run
> normal enclaves.  But I think that we should consider restricting
> who can run specially privileged enclaves.  In particular, the
> ability to run enclaves with the provisioning bit set is somewhat
> sensitive, since it effectively allows access to a stable
> fingerprint of the system.  Before flexible LC, this wasn't such a
> big deal, since only Intel's provisioning enclave could see the key,
> and Intel's enclave has some degree of control of what is done with
> the key.  With flex LC, this protection is lost.

When I saw this thread over the weekend I was going to reply to it and
raise this issue but Andy beat me to it.  Beyond that I was busy
trying to get enough firwewood ready to stay warm for the winter so
other priorities prevailed.. :-)

About 6-8 months ago, Andy and I had a lively discussion on this list
about whether or not there was a need, from a security perspective,
for enclave based LC.  I understand the sentiment against enclave
based LC, particularly with only a single vendor in control, but when
the entire spectrum of SGX security guarantees are considered, things
begin to fray at the edges a bit when LC is delegated to the kernel.

As everyone knows, in the most general sense, SGX is designed to
implement security in an IAGO threat environment, ie. where the entire
platform is compromised.  This is conceptually, if not provably
possible, when all of the security contracts are fulfilled by
enclaves.  Placing LC in the kernel has the effect of bringing the
kernel into the TCB footprint of the overall security guarantees the
technology is designed to deliver, which include privacy and
anonymity.

So, as Andy points out, we now need to work these security
considerations into the design of what will be 'the' Linux SGX driver.

Moving LC into the kernel means that rigid security controls are no
longer available for two security sensitive assets; The Provisioning
Key (PK) and the Provisioning Seal Key (PSK).

The PK is used to generate a unique Platform Provisioning IDentity
(PPID) in the Endpoint Selection (ES) phase of the Intel Enhanced
Privacy ID (EPID) provisioning protocol.  With the proposed driver, as
Andy notes above, there is now the opportunity for adversarial
generation of a unique platform identity by anyone who can create and
run an enclave.

The ENCLU[EGETKEY] instruction does not include the OWNEREPOCH values
in the key derivation process so the PK is in effect a permanent and
unique identity for a platform.

The PSK is one of the statically derived symmetric keys that are
available for sealing/encryption of data.  The PSK is used to encrypt
and protect the EPID private key that Intel provisions to a platform.
This private key is used by the quoting enclave to sign attestation
reports for enclaves running on the platform.  The SGX privacy
guarantees are a function of the security characteristics of EPID
keys.

So, loss of the PSK has implications with respect to the binding of an
EPID to a platform and what entities can use the EPID private key.  In
the enclave LC model the only entities with access to the key are the
provisioning (PVE) and quoting (QE) enclaves.

For the benefit of those not deeply involved in all this.  Shortly
after the release of the Linux SDK/PSW, Intel evolved the ability to
uniquely identify a platform and verify that an enclave is running on
a 'GenuineIntel' platform into the Platform Certification Enclave
(PCE).  This enclave has the provisioning attribute set and uses
access to the PK to implement this functionality.

Section 5.8.2 of the Costan and Devadas SGX review paper talk about
the PK and the PSK.  This section notes, in a referral to section
5.9.3, that restriction of general access to the PROVISION attribute
by enclaves is important for SGX privacy guarantees.

So I believe it is fair to state that this issue requires careful
consideration as we move the Linux SGX eco-system forward.

> But this is maybe more of a big deal than just access to a stable
> fingerprint.  The ability to provision a remote attestation protocol
> is a key part of running SGX malware, and SGX malware is surely
> going to exist some day.  (Sure, Intel will try to block access to
> the actual attestation service for malware, but I doubt that Intel
> will be able to fully defend it.)

Malware would not necessarily need the Intel attestation service.
Once access to the PROVISION bit is available, malware teams could
simply build their own attestation service.

Malware authors are a bit limited now by the general design of the
Intel PSW that mediates all enclave platform functionality through the
aesm_service binary.  The group that I direct SGX engineering for,
IDfusion, developed a complete replacement for the Intel PSW.  Our
work is a minimum footprint implementation that focuses on the
application of SGX to intelligent network endpoint devices and
embedded environments.

Our PSW supports the generation of 'fat binaries' where all of the
functionality, including enclaves, can be embedded into a single
statically linked and self-contained binary.  We developed a proof of
concept application, based on the Apache Struts vulnerability, that
loads one of these binaries on a platform which then sets up a secured
and attested channel for spirting information off of the malware
target.

So the malware concerns are not theoretical, as we are not an
extraordinarily clever group of individuals.  We are considering an
open-source release of the tooling we have developed and even if we
don't end up doing that someone else could easily implement equivalent
functionality.

We've written at some length about these issues in the following Intel
SGX developer forum threads if anyone is interested:

https://software.intel.com/en-us/forums/intel-software-guard-extensions-intel-sgx/topic/781730

https://software.intel.com/en-us/forums/intel-software-guard-extensions-intel-sgx/topic/671517

From a hardware perspective, FLC has now emerged on the Xeon E series
of processors.  FLC isn't being deployed for the sake of openness or
the open-source community, it is rather, secondary to a vertical
marketing strategy that Intel is implementing for SGX.

Intel is offering a subscription based service for data-center
applications of SGX.  Attestation is based on an ECDSA secured
certification that the enclave is running on a 'GenuineIntel' platform
with a specific platform security version (PSVN), see the discussion
on the PCE enclave above.

Non server class, ie. workstations and embedded class devices, will
presumably use a provisioning and EPID based strategy.

> So I propose that there be a few device nodes.  Maybe
> /dev/sgx/unprivilegd and /dev/sgx/provisioning?  The default mode of
> the latter could be 0600.  If you've opened the unprivileged node,
> you can only run enclaves without any special permission bits set.
>
> We should also consider whether we allow the unprivileged node to
> run launch enclaves, and, for that matter, whether we allow user
> code to run launch enclaves at all, given that they're not useful
> with the current architecture of the driver.

Doing this right is probably going to take a bit more then two
separate device nodes.

I believe what the Linux community needs to do is to step back and
consider how to create a driver architecture that supports what will
be the certain stratification of SGX hardware.  I don't see the
current driver being useful for anything but data-center class
hardware for the intermediate if not foreseeable future.

In addition, given what was detailed above, I don't see how this
driver is going to be satisfactory for those of us delivering
platforms with stringent security and privacy guarantees that are
relevant in an IAGO model.  Like it or not the most stringent security
contracts are only available in a full enclave based model.

As a result, like it or not, the current driver needs the ability to
function in both launch control modes.

> --Andy

Hopefully the above information is useful to the development dialogue.

Developing a community driver is tedious at best, particularly for
hardware such as this.  Our personal thanks to Jarkko and others who
have been working through these issues.

Best wishes for a productive remainder of the week and for a pleasant
Thanksgiving holiday for those who will be celebrating that.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"So you got your butt kicked by an 'old' guy.

 Before you taunted him did it ever cross your mind that the $1200
 Schmoelke aero-bars he was laying on and the $900 Rocket7 cycling
 shoes he was wearing might mean that the $10,000 custom bike frame he
 was riding was used for more than transportation to the Dairy Queen
 each night for a Dilly Bar?"
                                -- Dr. G.W. Wettstein
                                   Resurrection
Jarkko Sakkinen Nov. 20, 2018, 12:04 p.m. UTC | #14
On Mon, Nov 19, 2018 at 08:59:24AM -0800, Andy Lutomirski wrote:
> The idea here is that, under normal circumstances, provisioning only
> runs once, or at least only runs rarely.  So, rather than the SDK
> running provisioning whenever it feels like doing so (which is the
> current behavior, I imagine, although I haven't looked), there would
> be a privileged program, perhaps a systemd unit that runs when needed,
> that produces the key material needed for remote attestation, and
> non-root users that need attestation would get the keying material
> from the provisioning service.  And the provisioning service could
> implement its own policy.  Ideally, the service wouldn't give the
> sealed keys to users at all but would, instead, just provide the
> entire attestation service over a UNIX socket, which would make
> provisioning capabilities revocable.
> 
> Does this make sense?

Yes, it does for me at least now that you brought some context.

/Jarkko
Jarkko Sakkinen Nov. 21, 2018, 3:24 p.m. UTC | #15
On Tue, Nov 20, 2018 at 12:58:17PM +0200, Jarkko Sakkinen wrote:
> On Mon, Nov 19, 2018 at 05:39:04PM +0000, Jethro Beekman wrote:
> > Let's keep it as separated as possible for now until KVM lands and we get
> > the module capability back, instead of integrating everything so tightly
> > Jethro Beekman | Fortanix
> 
> We can keep the driver structure and still move it under
> arch/x86/kernel/cpu. When landing the KVM code we can then allow
> the driver part to be compiled as module. It is not the only LKM
> under arch/x86.
> 
> Would also simplify both way we develop SGX code alot, not least
> because there would be clear consumer of the pull requests.

The purpose is not to thrawt the driver structure for user space facing
part. The purpose is only make logicistics work better for
post-upstreaming maintenance.

I'll move on with this for v18 unless there is a good reason not to.

/Jarkko
Dr. Greg Nov. 22, 2018, 11:12 a.m. UTC | #16
On Tue, Nov 20, 2018 at 02:04:42PM +0200, Jarkko Sakkinen wrote:

Good morning to everyone, Happy Thanksgiving to those who are
celebrating the holiday.

> On Mon, Nov 19, 2018 at 08:59:24AM -0800, Andy Lutomirski wrote:
> > The idea here is that, under normal circumstances, provisioning only
> > runs once, or at least only runs rarely.  So, rather than the SDK
> > running provisioning whenever it feels like doing so (which is the
> > current behavior, I imagine, although I haven't looked), there would
> > be a privileged program, perhaps a systemd unit that runs when needed,
> > that produces the key material needed for remote attestation, and
> > non-root users that need attestation would get the keying material
> > from the provisioning service.  And the provisioning service could
> > implement its own policy.  Ideally, the service wouldn't give the
> > sealed keys to users at all but would, instead, just provide the
> > entire attestation service over a UNIX socket, which would make
> > provisioning capabilities revocable.
> > 
> > Does this make sense?

> Yes, it does for me at least now that you brought some context.

Let me see if I can add a bit of additional context to the above to
frame further discussion regarding two major needs of the driver
before it lands.

What Andy is describing is how the current system already works.  The
driver is at the root of a fairly complex eco-system of code,
cryptography and protocols that implement SGX functionality.  This
software stack is known as the SGX Platform SoftWare (PSW) or SGX
runtime.

The Intel provided runtime is implemented in C++ and, depending on how
you count it, clocks in at around 50+ KLOC.  All of this ends up as a
single 1.8 megabyte binary named aesm_service that links against 35
shared libraries and is run by systemd.

This binary implements the functionality needed to load, initialize,
run and attest enclaves.  It also implements communications with the
Intel provisioning and attestation services which is needed to
provision a private EPID key to the platform and to verify the status
of an enclave attestation quote from a remote platform.

In order to achieve the SGX/IAGO security model, a lot of this
functionality is implemented by choreographing exchanges between six
Intel supplied and signed enclaves.  Intel supplies source code to
these enclaves and understanding how all of this works requires an
understanding of that codebase as well.  To top if off there is also a
50K hunk of signed Java bytecode that gets stuffed into the Management
Engine if you are interested in platform services.

All of the above is what we wrote an independent implementation of, in
straight C, that is capable of linking against the MUSL C library with
only libelf and OpenSSL as dependencies.  We developed all of this to
support a reasonably sophisticated multi-enclave SGX security
application that implements modeling the runtime behavior of
applications running on the Linux kernel.  That application uses an
alternate enclave attestation and communications architecture that we
independently developed.

I don't describe the above to hype or promote what we do.  Everyone
discussing these issues is a professional software engineer or
architect.  As such, you will know that by the time you get done doing
all of the above, to the point where you are willing to take it to
Washington, DC to do live technology demonstrations to government
agencies with seven minutes of setup time, you are going to have to be
pretty confident that you know how all of the pieces are supposed to
go together.

Based on this experience, if the proposed driver lands in its current
state, Linux mainline will have, at least from a privacy perspective,
an inferior implementation of SGX.  In addition, we are not confident
the driver will be useful to anything other then server class hardware
and will be incapable of supporting virtually all of the existing SGX
hardware in the field.

This is NOT a criticism of Jarkko's work or the overall technical
implementation and quality of the driver.  We actually use and test a
modified version of the proposed driver, along with the out of tree
driver in our platforms.

At a high level, addressing these issues is straight forward.  First,
the driver needs to support authorization equivalent to that which is
implemented in the current Intel Launch Enclave, ie. control over the
SGX_FLAGS_PROVISION_KEY attribute.  Secondly, the driver needs to drop
its prohibition against launch enclaves, ie. returning EINVAL when a
request is made to initialize enclaves which have the
SGX_FLAGS_EINITTOKEN_KEY attribute set.

There will be some devil in the details with respect to both of these
issues, but those discussions can follow later.  Addressing these two
issues will at least create an environment where the proposed in-tree
driver is equivalent in privacy and functionality to the out of tree
driver.

SGX is a remarkably complex piece of machinery.  Producing a useful
driver requires the consideration of a lot of issues which, in our
opinion, have not been fully represented in the discussions to date.

> /Jarkko

I hope the above is useful for framing future discussions.

Have a good remainder of the week.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"I suppose that could could happen but he wouldn't know a Galois Field
 if it kicked him in the nuts."
                                -- Anonymous mathematician
                                   Resurrection.
Andy Lutomirski Nov. 22, 2018, 3:21 p.m. UTC | #17
On Thu, Nov 22, 2018 at 3:12 AM Dr. Greg <greg@enjellic.com> wrote:
>
> On Tue, Nov 20, 2018 at 02:04:42PM +0200, Jarkko Sakkinen wrote:
>
> Good morning to everyone, Happy Thanksgiving to those who are
> celebrating the holiday.
>
> > On Mon, Nov 19, 2018 at 08:59:24AM -0800, Andy Lutomirski wrote:
> > > The idea here is that, under normal circumstances, provisioning only
> > > runs once, or at least only runs rarely.  So, rather than the SDK
> > > running provisioning whenever it feels like doing so (which is the
> > > current behavior, I imagine, although I haven't looked), there would
> > > be a privileged program, perhaps a systemd unit that runs when needed,
> > > that produces the key material needed for remote attestation, and
> > > non-root users that need attestation would get the keying material
> > > from the provisioning service.  And the provisioning service could
> > > implement its own policy.  Ideally, the service wouldn't give the
> > > sealed keys to users at all but would, instead, just provide the
> > > entire attestation service over a UNIX socket, which would make
> > > provisioning capabilities revocable.
> > >
> > > Does this make sense?
>
> > Yes, it does for me at least now that you brought some context.
>
> Let me see if I can add a bit of additional context to the above to
> frame further discussion regarding two major needs of the driver
> before it lands.
>
> What Andy is describing is how the current system already works.  The
> driver is at the root of a fairly complex eco-system of code,
> cryptography and protocols that implement SGX functionality.  This
> software stack is known as the SGX Platform SoftWare (PSW) or SGX
> runtime.
>
> The Intel provided runtime is implemented in C++ and, depending on how
> you count it, clocks in at around 50+ KLOC.  All of this ends up as a
> single 1.8 megabyte binary named aesm_service that links against 35
> shared libraries and is run by systemd.
>
> This binary implements the functionality needed to load, initialize,
> run and attest enclaves.  It also implements communications with the
> Intel provisioning and attestation services which is needed to
> provision a private EPID key to the platform and to verify the status
> of an enclave attestation quote from a remote platform.
>
> In order to achieve the SGX/IAGO security model, a lot of this
> functionality is implemented by choreographing exchanges between six
> Intel supplied and signed enclaves.  Intel supplies source code to
> these enclaves and understanding how all of this works requires an
> understanding of that codebase as well.  To top if off there is also a
> 50K hunk of signed Java bytecode that gets stuffed into the Management
> Engine if you are interested in platform services.
>

It's very nice of Intel to supply source. Also, yikes, they wrote
their ME blob in Java?

> All of the above is what we wrote an independent implementation of, in
> straight C, that is capable of linking against the MUSL C library with
> only libelf and OpenSSL as dependencies.  We developed all of this to
> support a reasonably sophisticated multi-enclave SGX security
> application that implements modeling the runtime behavior of
> applications running on the Linux kernel.  That application uses an
> alternate enclave attestation and communications architecture that we
> independently developed.

How do you generate your root of trust for attestation if you don't
use Intel's service?  If I understand right, Intel's mechanism boils
down to Intel possessing something that is functionally equivalent to
a list of valid provisioning keys associated with genuine Intel chips.
(It could be literal provisioning keys, or keying material from which
the provisioning keys are derived, or even public keys for which the
private parts are derived from the provisioning keys.  But all of this
boils down to the CPU, via some enclaves, proving knowledge of the
provisioning key that is available when MRSIGNER is Intel.

I could imagine replacing this by running Intel's mechanism once to
generate a new root of trust for your own mechanism, or even by buying
a bunch of CPUs, determining their provisioning keys with MRSIGNER=Dr.
Greg, and enrolling those.  But I don't see any way to bootstrap an
attestation service on a CPU you've never physically possessed without
running *something* with MRSIGNER=Intel.

> Based on this experience, if the proposed driver lands in its current
> state, Linux mainline will have, at least from a privacy perspective,
> an inferior implementation of SGX.  In addition, we are not confident
> the driver will be useful to anything other then server class hardware
> and will be incapable of supporting virtually all of the existing SGX
> hardware in the field.

I'm a little bit puzzled by your privacy claims.  If an attacker
controls the kernel, then the secrets in your enclave may still be
secure, but you have very little privacy in terms of trying to prevent
the attacker from tying your communication to your physical machine.

> At a high level, addressing these issues is straight forward.  First,
> the driver needs to support authorization equivalent to that which is
> implemented in the current Intel Launch Enclave, ie. control over the
> SGX_FLAGS_PROVISION_KEY attribute.

I agree, hence my email :)

> Secondly, the driver needs to drop
> its prohibition against launch enclaves, ie. returning EINVAL when a
> request is made to initialize enclaves which have the
> SGX_FLAGS_EINITTOKEN_KEY attribute set.
>

Can you explain your use case for a launch enclave?  The current
driver won't allow you to supply an EINITTOKEN, so I'm not immediately
seeing what you would do with a launch token.  If you really do need
it, I suppose we could have 'unprivileged', 'provisioning', and
'launch' nodes, and maybe even 'launch_and_provision', but that's
starting to have some combinatorial issues.
Andy Lutomirski Nov. 22, 2018, 8:56 p.m. UTC | #18
On Thu, Nov 22, 2018 at 3:12 AM Dr. Greg <greg@enjellic.com> wrote:
> In addition, we are not confident
> the driver will be useful to anything other then server class hardware
> and will be incapable of supporting virtually all of the existing SGX
> hardware in the field.

I forgot to mention: I have a plain old retail Intel Celeron (I think
-- it's not in front of me right now) that has Flex LC.  I suspect
that, going forward, all new Intel CPUs will support Flex LC.  I
really wish that Intel would document all of the detailed capabilities
of all their CPUs somewhere.

There has been some talk of how the driver could support old CPUs.  It
would involve a rather different LE being signed by Intel, but it
should be more or less transparent to user code if it happens.
Dr. Greg Nov. 23, 2018, 10:39 a.m. UTC | #19
On Thu, Nov 22, 2018 at 12:56:23PM -0800, Andy Lutomirski wrote:

Good morning to everyone.

> On Thu, Nov 22, 2018 at 3:12 AM Dr. Greg <greg@enjellic.com> wrote:
> > In addition, we are not confident
> > the driver will be useful to anything other then server class hardware
> > and will be incapable of supporting virtually all of the existing SGX
> > hardware in the field.

> I forgot to mention: I have a plain old retail Intel Celeron (I
> think -- it's not in front of me right now) that has Flex LC.  I
> suspect that, going forward, all new Intel CPUs will support Flex
> LC.  I really wish that Intel would document all of the detailed
> capabilities of all their CPUs somewhere.

Interesting, that contradicts the read we had gotten on FLC.

The only specific CPU's that we know of that are acknowledged to have
FLC are the XEON E series in support of the PCE/ECDSA attestation for
data-center use of SGX.

I'm assuming this is a really new system?  If so that matches our
experience with respect to the fact that SGX2/EDMM capable systems
slipped out quietly in early generation NUC SOC's.

Your sentinments are correct with respect to getting solid guidance on
hardware, it is a significant issue, particularly with respect to
SGX.

> There has been some talk of how the driver could support old CPUs.
> It would involve a rather different LE being signed by Intel, but it
> should be more or less transparent to user code if it happens.

I will reply to the LE and driver capability issues in my response to
your earlier e-mail so the thread does not diverge.

Jarkko, when this driver lands it will set the SGX ABI in stone for
Linux.  It would be very, very helpful to the development community if
there was some official guidance from Intel on whether or not FLC will
be a universal feature on all hardware and the date that is going to
happen or has happened.

If this turns out to be an OEM selectable issue via firmware
customizations, things will be even more problematic.

As things stand now, the proposed mainline SGX driver is going to be
useless for a lot of hardware that is out in the field.  Linux has
always had the reputation for supporting legacy hardware so this
directly contravenes that principal and needlessly so.

Dr. Greg

As always,
Dr. Greg Wettstein, Ph.D, Worker
IDfusion, LLC
4206 N. 19th Ave.           Implementing measured information privacy
Fargo, ND  58102            and integrity architectures.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: gw@idfusion.org
------------------------------------------------------------------------------
"Simplicity is prerequisite for reliability."
                                -- Edsger W. Dijkstra
Jarkko Sakkinen Nov. 24, 2018, 4:15 p.m. UTC | #20
On Tue, Nov 20, 2018 at 05:15:08AM -0600, Dr. Greg wrote:
> Malware would not necessarily need the Intel attestation service.
> Once access to the PROVISION bit is available, malware teams could
> simply build their own attestation service.

AFAIK not possible as they wouldn't have access to the root provisioning
key. Can be confirmed from the SDM's key derivation table (41-56).

/Jarkko
Jarkko Sakkinen Nov. 24, 2018, 4:45 p.m. UTC | #21
On Fri, Nov 23, 2018 at 04:39:23AM -0600, Dr. Greg wrote:
> Jarkko, when this driver lands it will set the SGX ABI in stone for
> Linux.  It would be very, very helpful to the development community if
> there was some official guidance from Intel on whether or not FLC will
> be a universal feature on all hardware and the date that is going to
> happen or has happened.

I seriously don't know but I can take this message to the mothership...

> If this turns out to be an OEM selectable issue via firmware
> customizations, things will be even more problematic.

Point taken.

> As things stand now, the proposed mainline SGX driver is going to be
> useless for a lot of hardware that is out in the field.  Linux has
> always had the reputation for supporting legacy hardware so this
> directly contravenes that principal and needlessly so.

I do get your point about supporting the legacy. I'm wondering if this
is still a deal breaker for Linux or not.

/Jarkko
Jarkko Sakkinen Nov. 24, 2018, 5:21 p.m. UTC | #22
On Thu, Nov 22, 2018 at 07:21:08AM -0800, Andy Lutomirski wrote:
> > At a high level, addressing these issues is straight forward.  First,
> > the driver needs to support authorization equivalent to that which is
> > implemented in the current Intel Launch Enclave, ie. control over the
> > SGX_FLAGS_PROVISION_KEY attribute.
> 
> I agree, hence my email :)

Started to scratch my head that is it really an issue that any enclave
can provision in the end?

Direct quote from your first response:

"In particular, the ability to run enclaves with the provisioning bit set
is somewhat sensitive, since it effectively allows access to a stable
fingerprint of the system."

As can be seen from the key derivation table this does not exactly hold
so you should refine your original argument before we can consider any
type of change.

I just don't see what it is so wrong for any enclave to be able to tell
that it really is an enclave.

/Jarkko
Dr. Greg Nov. 24, 2018, 7:24 p.m. UTC | #23
On Sat, Nov 24, 2018 at 08:15:21AM -0800, Jarkko Sakkinen wrote:

> On Tue, Nov 20, 2018 at 05:15:08AM -0600, Dr. Greg wrote:
> > Malware would not necessarily need the Intel attestation service.
> > Once access to the PROVISION bit is available, malware teams could
> > simply build their own attestation service.

> AFAIK not possible as they wouldn't have access to the root
> provisioning key. Can be confirmed from the SDM's key derivation
> table (41-56).

What provisioning and attestation is all about is establishing an
identity binding for a platform in question.  The standard Intel
service binds the identity of a platform to an EPID private key.

With access to the SGX_FLAGS_PROVISION_BIT an enclave can generate a
perpetual identity for a platform based on the identity modulus
signature (MRSIGNER) of the key that signs the signature structure of
the enclave.  Without access to the root provisioning key a security
quorum or group has to be implemented via a subscription or enrollment
model but that is arguably not much of an obstacle.

That is pretty much the way standard botware works now.

Without provisions for cryptographically secure authorization and
policy enforcement in the driver, we will be creating infrastructure
for a new generation of botware/malware whose mothership will know
that a participating platform is running with full confidentiality and
integrity protections.

> /Jarkko

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"Remember that when you take down the fishhouse you can't put
 the minnows back into the lake, so throw them out on the ice.
 Make sure you stomp on any of the live ones so they don't suffer."
                                -- Fritz Wettstein
                                   At the lake
Dr. Greg Nov. 24, 2018, 8:13 p.m. UTC | #24
On Sat, Nov 24, 2018 at 09:21:14AM -0800, Jarkko Sakkinen wrote:
> On Thu, Nov 22, 2018 at 07:21:08AM -0800, Andy Lutomirski wrote:
> > > At a high level, addressing these issues is straight forward.  First,
> > > the driver needs to support authorization equivalent to that which is
> > > implemented in the current Intel Launch Enclave, ie. control over the
> > > SGX_FLAGS_PROVISION_KEY attribute.
> > 
> > I agree, hence my email :)

> Started to scratch my head that is it really an issue that any
> enclave can provision in the end?
>
> Direct quote from your first response:
>
> "In particular, the ability to run enclaves with the provisioning
> bit set is somewhat sensitive, since it effectively allows access to
> a stable fingerprint of the system."
>
> As can be seen from the key derivation table this does not exactly
> hold so you should refine your original argument before we can
> consider any type of change.
>
> I just don't see what it is so wrong for any enclave to be able to
> tell that it really is an enclave.

This isn't about an enclave being able to tell that it is really an
enclave.  As I noted in my previous reply, access to the provisioning
bit allows an enclave author to create a perpetual hardware identifier
for a platform based on a signing key of their choosing, along with a
few other incidentals, all of which are completely under the control
of the enclave author.

The Intel SGX architects, at least originally, felt strongly enough
about this issue to use the Launch Enclave to implement
cryptographically secure policy control over access to the
SGX_FLAGS_PROVISION_KEY attribute.  See the 'if' clause that begins on
line 219 of psw/ae/le/launch_enclave.cpp in the current HEAD of the
Linux SGX SDK which is currently bf22963411.

Let me describe an entirely contrived example but one which is
representative of the threat.

I'm a web-site that wants to consistently and reliably track platforms
that visit a site.  Without cryptographically secure policy
enforcement in the SGX eco-system I push an enclave to the platform
which only computes the MRSIGNER specific derived provisioning key and
returns it to the web-site.

From that point onward I will always be able to identify the platform,
as long as the enclave can be executed on the platform.  Unlike
cookies, there is nothing to delete since the aggressor enclave only
needs to exist long enough to be run and generate the derived
provisioning key, no trace of the fingerprinting remains thereafter.

If the proposed driver is to be a functional replacement for the
existing SGX eco-system it needs to offer privacy and platform
security guarantees at least comparable to what is available on a
non-FLC system.  That means at least some semblance of
cryptographically secure policy management on at least two fronts.

We can propose a general architecture that we believe satisfies these
needs without compromising the upstream integrity of the kernel with
respect to free and open systems.  A solution that could arguably
protect user's investment in current non-FLC hardware as well.

We would be happy to articulate the outline of that but I don't want
to waste anyone's time, including ours, if everyone's mind has been
made up as to what the driver should and should not do.

We are clearly capable of making the proposed driver do whatever we
want it to do.  Our concern is that Linux security architects that
choose to use this technology have the best tools available to them,
within the constraints of upstream sensibility, without whacking on
the kernel.

As it stands now the driver has both privacy and potential system
security issues which translate into useability and desirability
implications for SGX on Linux moving forward.

> /Jarkko

Have a good remainder of the weekend.

I need to get back to my MIG welder out in the shop.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"Attendants at a service station in Eunice, Louisiana, handed more than
 $100 to a naked man who claimed to have a gun in his pocket."
                                -- Unknown
Jarkko Sakkinen Nov. 25, 2018, 2:53 p.m. UTC | #25
On Sat, Nov 24, 2018 at 09:21:14AM -0800, Jarkko Sakkinen wrote:
> On Thu, Nov 22, 2018 at 07:21:08AM -0800, Andy Lutomirski wrote:
> > > At a high level, addressing these issues is straight forward.  First,
> > > the driver needs to support authorization equivalent to that which is
> > > implemented in the current Intel Launch Enclave, ie. control over the
> > > SGX_FLAGS_PROVISION_KEY attribute.
> > 
> > I agree, hence my email :)
> 
> Started to scratch my head that is it really an issue that any enclave
> can provision in the end?
> 
> Direct quote from your first response:
> 
> "In particular, the ability to run enclaves with the provisioning bit set
> is somewhat sensitive, since it effectively allows access to a stable
> fingerprint of the system."
> 
> As can be seen from the key derivation table this does not exactly hold
> so you should refine your original argument before we can consider any
> type of change.
> 
> I just don't see what it is so wrong for any enclave to be able to tell
> that it really is an enclave.

I mean I can understand why Greg wants LE although I don't understand
what benefit does it bring to anyone to lock in for enclave to allow
to identify itself.

What you are proposing does not really bring any additional security if
we consider a threat model where the kernel is an adversary but it makes
the software stack more clanky to use.

/Jarkko
Andy Lutomirski Nov. 25, 2018, 4:22 p.m. UTC | #26
>> On Nov 25, 2018, at 6:53 AM, Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> wrote:
>> 
>> On Sat, Nov 24, 2018 at 09:21:14AM -0800, Jarkko Sakkinen wrote:
>> On Thu, Nov 22, 2018 at 07:21:08AM -0800, Andy Lutomirski wrote:
>>>> At a high level, addressing these issues is straight forward.  First,
>>>> the driver needs to support authorization equivalent to that which is
>>>> implemented in the current Intel Launch Enclave, ie. control over the
>>>> SGX_FLAGS_PROVISION_KEY attribute.
>>> 
>>> I agree, hence my email :)
>> 
>> Started to scratch my head that is it really an issue that any enclave
>> can provision in the end?
>> 
>> Direct quote from your first response:
>> 
>> "In particular, the ability to run enclaves with the provisioning bit set
>> is somewhat sensitive, since it effectively allows access to a stable
>> fingerprint of the system."
>> 
>> As can be seen from the key derivation table this does not exactly hold
>> so you should refine your original argument before we can consider any
>> type of change.
>> 
>> I just don't see what it is so wrong for any enclave to be able to tell
>> that it really is an enclave.
> 
> I mean I can understand why Greg wants LE although I don't understand
> what benefit does it bring to anyone to lock in for enclave to allow
> to identify itself.
> 
> What you are proposing does not really bring any additional security if
> we consider a threat model where the kernel is an adversary but it makes
> the software stack more clanky to use.

Agreed. What I’m proposing adds additional security if the kernel is *not* compromised.

There are other ways to accomplish it that might be better in some respects.  For example, there could be /dev/sgx and /dev/sgx_rights/provision.  The former exposes the whole sgx API, except that it doesn’t allow provisioning by default. The latter does nothing by itself. To run a provisioning enclave, you open both nodes, then do something like:

ioctl(sgx, SGX_IOC_ADD_RIGHT, sgx_provisioning);

This requires extra syscalls, but it doesn’t have the combinatorial explosion problem.
Dr. Greg Nov. 25, 2018, 6:55 p.m. UTC | #27
On Sun, Nov 25, 2018 at 08:22:35AM -0800, Andy Lutomirski wrote:

Good morning to everyone, I hope the weekend continues to proceed
well.

Proposal follows below for kernel based policy management of enclaves
if people want to skip forward.

> >> On Nov 25, 2018, at 6:53 AM, Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> wrote:
> >> 
> >> On Sat, Nov 24, 2018 at 09:21:14AM -0800, Jarkko Sakkinen wrote:
> >> On Thu, Nov 22, 2018 at 07:21:08AM -0800, Andy Lutomirski wrote:
> >>>> At a high level, addressing these issues is straight forward.  First,
> >>>> the driver needs to support authorization equivalent to that which is
> >>>> implemented in the current Intel Launch Enclave, ie. control over the
> >>>> SGX_FLAGS_PROVISION_KEY attribute.
> >>> 
> >>> I agree, hence my email :)
> >> 
> >> Started to scratch my head that is it really an issue that any enclave
> >> can provision in the end?
> >> 
> >> Direct quote from your first response:
> >> 
> >> "In particular, the ability to run enclaves with the provisioning bit set
> >> is somewhat sensitive, since it effectively allows access to a stable
> >> fingerprint of the system."
> >> 
> >> As can be seen from the key derivation table this does not exactly hold
> >> so you should refine your original argument before we can consider any
> >> type of change.
> >> 
> >> I just don't see what it is so wrong for any enclave to be able to tell
> >> that it really is an enclave.

> > I mean I can understand why Greg wants LE although I don't understand
> > what benefit does it bring to anyone to lock in for enclave to allow
> > to identify itself.
> > 
> > What you are proposing does not really bring any additional security if
> > we consider a threat model where the kernel is an adversary but it makes
> > the software stack more clanky to use.

> Agreed. What I'm proposing adds additional security if the kernel is
> *not* compromised.

Let me use this to stress a concept that I believe is important in
this discussion.

SGX is enabling technology that allows developers to create software
architectures that will deliver their stated security and privacy
guarantees irrespective of platform state.  It does this by linking
'islands' of execution (enclaves) together through a web of
cryptographic guarantees.

The notion of a launch enclave is critical to establishing these
guarantees.  As soon as the kernel becomes involved in implementing
SGX security policy the architecture becomes vulnerable to kernel
and/or privilege modification attacks.

We've talked at length about the provisioning bit, I won't go into
details unless people are interested, but the EPID provisioning
protocol implements an SGX mediated cryptographic contract that a
perpetual platform identifier will not be disclosed to anyone but
Intel.  The launch enclave is critical to that guarantee.

It is completely understandable why a locked down, (non-FLC) hardware
platform, is undesirable in this community.  That doesn't mean that a
launch enclave as a concept is unneeded or necessarily evil.

In an FLC environment the kernel assumes responsibility for SGX
privacy and security.  This means that we need to do at least as well
with a kernel based model as to what is currently available.

> There are other ways to accomplish it that might be better in some
> respects.  For example, there could be /dev/sgx and
> /dev/sgx_rights/provision.  The former exposes the whole sgx API,
> except that it doesn???t allow provisioning by default. The latter
> does nothing by itself. To run a provisioning enclave, you open both
> nodes, then do something like:
>
> ioctl(sgx, SGX_IOC_ADD_RIGHT, sgx_provisioning);
>
> This requires extra syscalls, but it doesn't have the combinatorial
> explosion problem.

Here is a proposal for the driver to add the needed policy control
that is 'SGXy' in nature.  The 'SGXy' way is to use MRSIGNER values as
the currency for security policy management.

The driver should establish the equivalent of three linked lists,
maintainable via /sysfs pseudo-files or equivalent plumbing.  The
lists are referenced by the kernel to enforce the following policies.

1.) The right to initialize an enclave without special attributes.

2.) The right to initialize an enclave with the PROVISION_KEY attribute set.

3.) The right to initialize an enclave with the LICENSE_KEY attribute set.

The lists are populated with MRSIGNER values of enclaves that are
allowed to initialize under the specified conditions.

The driver should either establish a 'seal' file or value,
ie. MRSIGNER value of all zero's, that once written will not allow
further modifications of the list(s).  This will allow
cryptographically guaranteed policies to be setup at early boot that
will limit the ability of subsequent DAC compromises to affect policy
management.

The lists are obviously vulnerable to a kernel compromise but the
vulnerability scope is significantly limited vs. 'can I get root or
some other userid'.  If we are really concerned about the scope of
that vulnerability there could be an option on TPM based systems to
verify a hash value over the lists once sealed on each enclave
initialization.  We have already conceded that EINIT isn't going to be
any type of speed daemon.

On an FLC system the driver verifies that the submitted enclave has an
MRSIGNER value on one of the lists consistent with the attributes of
the enclave before loading the value into the identity modulus
signature registers.

In this model, I would argue that the driver does not need to
arbitrarily exclude launch enclaves as it does now, since the kernel
has the ability to specify acceptable launch enclaves.  The driver API
can alaso continue to accept an EINITTOKEN which maintains
compatibility with the current ABI.  Punishment can be inflicted on
non-FLC hardware owners by issueing EINVAL if an EINITTOKEN is
specified on platforms with fixed launch keys.

This also has the effect of allowing multiple launch enclaves at the
platform owner's discretion.  I know there was some sentiment, and
Jarkko had code, that used a launch enclave at a fixed location such
as /lib/firmware.  That has the disadvantage of requiring that the
kernel know about all the different ways that a launch enclave might
be used or setup.  It also establishes a cryptographic rather then a
filesystem based guarantee on the launch enclave being used.

If the lists are empty the kernel simply proceeds as it does now and
loads any enclave submitted to it.

I believe this architecture has a number of merits.  It largely
preserves compatibility with current PSW's and provides a mechanism
for cryptographically enforced policy that is consistent with the SGX
architecture.

I need to get Christmas lights put up on the house for the squirrels
to eat so I will leave this proposal open for debate.

Have a good remainder of the weekend or whats left of it.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"Some of them are.  A surprising number aren't.  A personal favorite of
 mine was the log from a cracker who couldn't figure out how to untar
 and install the trojan package he'd ftped onto the machine.  He tried a
 few times, and then eventually gave up and logged out."
                                -- Nat Lanza
Jarkko Sakkinen Nov. 25, 2018, 11:51 p.m. UTC | #28
On Sun, Nov 25, 2018 at 12:55:24PM -0600, Dr. Greg wrote:
> Good morning to everyone, I hope the weekend continues to proceed
> well.

Thank you, wish the same. Surviving from jetlag after flying from
Finland to Portland yesterday..

Greg, Andy, I'll check your remarks with thought after I'm done with v18
i.e. have less things to multitask with. Is that OK for you? Probably
v18 won't be the last the version anyway.

/Jarkko
Andy Lutomirski Nov. 26, 2018, 12:37 a.m. UTC | #29
Bah, I hit send on a partially written draft. I’ll try again soon.

--Andy

> On Nov 25, 2018, at 1:59 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> 
> 
> 
>> On Nov 25, 2018, at 10:55 AM, Dr. Greg <greg@enjellic.com> wrote:
>> 
> 
>> 
>> 
>> The notion of a launch enclave is critical to establishing these
>> guarantees.  As soon as the kernel becomes involved in implementing
>> SGX security policy the architecture becomes vulnerable to kernel
>> and/or privilege modification attacks.
>> 
>> We've talked at length about the provisioning bit, I won't go into
>> details unless people are interested, but the EPID provisioning
>> protocol implements an SGX mediated cryptographic contract that a
>> perpetual platform identifier will not be disclosed to anyone but
>> Intel.  
> 
> As a reviewer, and as an occasional academic cryptographer, I need to put my foot down here.  This is inaccurate.
> 
> There is an SGX-mediated contract that says:
> 
> 1. For any given public key p, a perpetual platform identifier ID_p exists and will only be disclosed to the holder of the corresponding private key p_priv or to someone to whom the private key holder permits (intentionally or otherwise) to use that identifier.
> 
> 2. The ability described in #1 is available to anyone whom the kernel and launch enclave (if the MSRs are locked ) permits (intentionally or otherwise) to use it.
> 
> No, I have no clue why Intel did it this way.  I consider it to be a mistake.
> 
>> The launch enclave is critical to that guarantee.
>> 
>> It is completely understandable why a locked down, (non-FLC) hardware
>> platform, is undesirable in this community.  That doesn't mean that a
>> launch enclave as a concept is unneeded or necessarily evil.
>> 
>> In an FLC environment the kernel assumes responsibility for SGX
>> privacy and security.  This means that we need to do at least as well
>> with a kernel based model as to what is currently available.
>> 
>>> There are other ways to accomplish it that might be better in some
>>> respects.  For example, there could be /dev/sgx and
>>> /dev/sgx_rights/provision.  The former exposes the whole sgx API,
>>> except that it doesn???t allow provisioning by default. The latter
>>> does nothing by itself. To run a provisioning enclave, you open both
>>> nodes, then do something like:
>>> 
>>> ioctl(sgx, SGX_IOC_ADD_RIGHT, sgx_provisioning);
>>> 
>>> This requires extra syscalls, but it doesn't have the combinatorial
>>> explosion problem.
>> 
>> Here is a proposal for the driver to add the needed policy control
>> that is 'SGXy' in nature.  The 'SGXy' way is to use MRSIGNER values as
>> the currency for security policy management.
>> 
>> The driver should establish the equivalent of three linked lists,
>> maintainable via /sysfs pseudo-files or equivalent plumbing.  The
>> lists are referenced by the kernel to enforce the following policies.
>> 
>> 1.) The right to initialize an enclave without special attributes.
>> 
>> 2.) The right to initialize an enclave with the PROVISION_KEY attribute set.
>> 
>> 3.) The right to initialize an enclave with the LICENSE_KEY attribute set.
>> 
>> The lists are populated with MRSIGNER values of enclaves that are
>> allowed to initialize under the specified conditions.
> 
> NAK because this is insufficient.  You’re thinking of a model in which SGX-like protection is all that’s needed.  This is an inadequate model of the real world.  The attack I’m most concerned about wrt provisioning is an attack in which an unauthorized user is permitted 
> 
> The use case I see for attestation *privacy*
> 
>> 
>> The driver should either establish a 'seal' file or value,
>> ie. MRSIGNER value of all zero's, that once written will not allow
>> further modifications of the list(s).  This will allow
>> cryptographically guaranteed policies to be setup at early boot that
>> will limit the ability of subsequent DAC compromises to affect policy
>> management.
>> 
>> The lists are obviously vulnerable to a kernel compromise but the
>> vulnerability scope is significantly limited vs. 'can I get root or
>> some other userid'.  If we are really concerned about the scope of
>> that vulnerability there could be an option on TPM based systems to
>> verify a hash value over the lists once sealed on each enclave
>> initialization.  We have already conceded that EINIT isn't going to be
>> any type of speed daemon.
>> 
>> On an FLC system the driver verifies that the submitted enclave has an
>> MRSIGNER value on one of the lists consistent with the attributes of
>> the enclave before loading the value into the identity modulus
>> signature registers.
>> 
>> In this model, I would argue that the driver does not need to
>> arbitrarily exclude launch enclaves as it does now, since the kernel
>> has the ability to specify acceptable launch enclaves.  The driver API
>> can alaso continue to accept an EINITTOKEN which maintains
>> compatibility with the current ABI.  Punishment can be inflicted on
>> non-FLC hardware owners by issueing EINVAL if an EINITTOKEN is
>> specified on platforms with fixed launch keys.
>> 
>> This also has the effect of allowing multiple launch enclaves at the
>> platform owner's discretion.  I know there was some sentiment, and
>> Jarkko had code, that used a launch enclave at a fixed location such
>> as /lib/firmware.  That has the disadvantage of requiring that the
>> kernel know about all the different ways that a launch enclave might
>> be used or setup.  It also establishes a cryptographic rather then a
>> filesystem based guarantee on the launch enclave being used.
>> 
>> If the lists are empty the kernel simply proceeds as it does now and
>> loads any enclave submitted to it.
>> 
>> I believe this architecture has a number of merits.  It largely
>> preserves compatibility with current PSW's and provides a mechanism
>> for cryptographically enforced policy that is consistent with the SGX
>> architecture.
>> 
>> I need to get Christmas lights put up on the house for the squirrels
>> to eat so I will leave this proposal open for debate.
>> 
>> Have a good remainder of the weekend or whats left of it.
>> 
>> Dr. Greg
>> 
>> As always,
>> Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
>> 4206 N. 19th Ave.           Specializing in information infra-structure
>> Fargo, ND  58102            development.
>> PH: 701-281-1686
>> FAX: 701-281-3949           EMAIL: greg@enjellic.com
>> ------------------------------------------------------------------------------
>> "Some of them are.  A surprising number aren't.  A personal favorite of
>> mine was the log from a cracker who couldn't figure out how to untar
>> and install the trojan package he'd ftped onto the machine.  He tried a
>> few times, and then eventually gave up and logged out."
>>                               -- Nat Lanza
Dr. Greg Nov. 26, 2018, 11 a.m. UTC | #30
On Sun, Nov 25, 2018 at 04:37:00PM -0800, Andy Lutomirski wrote:

> Bah, I hit send on a partially written draft. I???ll try again soon.
> 
> --Andy

Your first issue seems to be complete so I will respond to that in
order to make sure we are not talking past one another.

> > On Nov 25, 2018, at 1:59 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> >> On Nov 25, 2018, at 10:55 AM, Dr. Greg <greg@enjellic.com> wrote:
> >> 
> >> The notion of a launch enclave is critical to establishing these
> >> guarantees.  As soon as the kernel becomes involved in implementing
> >> SGX security policy the architecture becomes vulnerable to kernel
> >> and/or privilege modification attacks.
> >> 
> >> We've talked at length about the provisioning bit, I won't go into
> >> details unless people are interested, but the EPID provisioning
> >> protocol implements an SGX mediated cryptographic contract that a
> >> perpetual platform identifier will not be disclosed to anyone but
> >> Intel.  

> As a reviewer, and as an occasional academic cryptographer, I need
> to put my foot down here.  This is inaccurate.

I certainly wouldn't try to engage in a debate at the level of
academic cryptography, so I want to clarify that I was speaking
specifically with respect to the ability to use the Intel supplied
Platform Certification Enclave (PCE) to obtain a perpetual platform
identifier.

There could certainly be an academic level weakness in SGX, or in the
provisioning protocol, but at the end of the day the important issue
seems to be whether or not a PCE enclave can be exploited by anyone
with execution access to the enclave to generate a perpetual
identifier for a platform.

> There is an SGX-mediated contract that says:
>
> 1. For any given public key p, a perpetual platform identifier ID_p
> exists and will only be disclosed to the holder of the corresponding
> private key p_priv or to someone to whom the private key holder
> permits (intentionally or otherwise) to use that identifier.
>
> 2. The ability described in #1 is available to anyone whom the
> kernel and launch enclave (if the MSRs are locked ) permits
> (intentionally or otherwise) to use it.

Let me see if I can respond directly to point two as it seems the most
important.

In the EPID provisioning model, the PCE and the ProVisioning Enclave
(PVE) both have posession of the PROVISION attribute and thus access
to the derivation of an MRSIGNER specific provisioning key.

The Intel supplied launch enclave (LE) specifically denies
initialization of enclaves which have the PROVISION attribute set,
with the exception of enclaves whose MRSIGNER values match those of
the keys that Intel uses to sign the PCE and PVE enclaves.  See line
219 of psw/ae/le/launch_enclave.cpp in the Intel SDK.

In the message one phase of the provisioning protocol Intel supplies
the platform with a 3K RSA public key (PEK).  The identity of that key
is confirmed by an ECC256 based signature over the key.  Intel embeds
the public gx and gy curve points for the signature key in their SDK,
see ae/data/constants/linux/peksk_pub.hh.

The PVE verifies the signature of the PEK and generates a SHA256
verification hash over a message containing the key and uses that as
the data field in an attestation report that is generated against the
target information for the PCE.  The PVE rejects generation of the
report if the PCE target information does not have the PROVISION
attribute set.  See line 124 of psw/ae/pve/provision_msg1.cpp.

This report, along with the PEK, is submitted to the PCE enclave in
order to generate the Platform Provisioning IDentity (PPID), which is
the privacy critical identifier.  The PCE verifies the report
generated by the PVE and rejects the request to generate the PPID if
the report was generated by an enclave that was not initialized with
the PROVISION bit set, see line 109 of psw/ae/pce/pce.cpp.

The PCE enclave then recomputes the message hash using the PEK that it
is provided and verifies that the value matches the value in the data
field of the attestation report from the PVE enclave.  If the values
do not match the PPID generation request is denied.  The PCE enclave
then encrypts the PPID with the PEK key and the encrypted PPID is
returned to Intel to use as the platform identifier.

The PPID is the CMAC over a 16 byte null message which uses a derived
provisioning key based on CPUSVN and ISVSVN values set to zero.  See
the get_ppid() function in psw/ae/pce/pce_helper.cpp.

I believe this effectively denies the ability of anyone other then
Intel, who holds the private portion of the ECC256 signature key used
to authenticate the PEK, from using the PCE enclave to generate a
platform identifier.

As I conceded above, there could be an academic deficiency in all
this, I'm not qualified to comment.  I believe there is a reasonably
solid functional guarantee that on a locked platform the process
cannot be easily subverted by a privacy aggressor.

We contend that the model we propose below can also deliver this
guarantee as long as ring 0 privileges are not compromised by an
aggressor, which is the best that an FLC platform can do.

Once again, the important design factor in all of this is the premise
that the launch enclave will not allow enclaves other then the PCE and
PVE to access the PROVISION bit.  Hence my comments about SGX being
about establishing islands of trust and the negotiation of security
contexts between those islands of trust.

> No, I have no clue why Intel did it this way.  I consider it to be a
> mistake.

Are you referring to there being a mistake in the trust relationships
that the provisioning protocol implements or the overall concept of a
provisioning key?

We've got over two man years into re-implementing all of this.  The
Intel code is a bit challenging to follow and not well documented, it
is now.... :-), but we have developed a great deal of respect for how
good the individuals behind the design of this were.

> >> The launch enclave is critical to that guarantee.
> >> 
> >> It is completely understandable why a locked down, (non-FLC) hardware
> >> platform, is undesirable in this community.  That doesn't mean that a
> >> launch enclave as a concept is unneeded or necessarily evil.
> >> 
> >> In an FLC environment the kernel assumes responsibility for SGX
> >> privacy and security.  This means that we need to do at least as well
> >> with a kernel based model as to what is currently available.
> >> 
> >>> There are other ways to accomplish it that might be better in some
> >>> respects.  For example, there could be /dev/sgx and
> >>> /dev/sgx_rights/provision.  The former exposes the whole sgx API,
> >>> except that it doesn???t allow provisioning by default. The latter
> >>> does nothing by itself. To run a provisioning enclave, you open both
> >>> nodes, then do something like:
> >>> 
> >>> ioctl(sgx, SGX_IOC_ADD_RIGHT, sgx_provisioning);
> >>> 
> >>> This requires extra syscalls, but it doesn't have the combinatorial
> >>> explosion problem.
> >> 
> >> Here is a proposal for the driver to add the needed policy control
> >> that is 'SGXy' in nature.  The 'SGXy' way is to use MRSIGNER values as
> >> the currency for security policy management.
> >> 
> >> The driver should establish the equivalent of three linked lists,
> >> maintainable via /sysfs pseudo-files or equivalent plumbing.  The
> >> lists are referenced by the kernel to enforce the following policies.
> >> 
> >> 1.) The right to initialize an enclave without special attributes.
> >> 
> >> 2.) The right to initialize an enclave with the PROVISION_KEY attribute set.
> >> 
> >> 3.) The right to initialize an enclave with the LICENSE_KEY attribute set.
> >> 
> >> The lists are populated with MRSIGNER values of enclaves that are
> >> allowed to initialize under the specified conditions.

> NAK because this is insufficient.  You're thinking of a model in
> which SGX-like protection is all that's needed.  This is an
> inadequate model of the real world.  The attack I'm most concerned
> about wrt provisioning is an attack in which an unauthorized user is
> permitted.

We will be interested in your comments as to why the proposal is
insufficient in the real world of FLC.

I believe the proposed architecture can be defended as being effective
in the real world, as it allows the root user to use cryptographic
protections of access to the PROVISION bit and to enclave execution in
general.  On FLC that is the strongest guarantee that can be
delivered.

When we speak of 'unauthorized' users I believe we are speaking in the
parlance of discretionary access controls which has a much wider TCB
scope then the cryptographic model we are proposing.  The model we
propose allows the platform owner (root) to effectively implement the
same level of security over the PROVISION bit that current locked
platforms have, in a free and open fashion of course.

We can certainly attempt to explain our position further.

> The use case I see for attestation *privacy*

Things seemed to end here so I assume that is where your e-mail went
awry.

I hope the clarifications provided above will assist further
discussion.

Have a good day.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"When I am working on a problem I never think about beauty.  I only
 think about how to solve the problem.  But when I have finished, if
 the solution is not beautiful, I know it is wrong."
                                -- Buckminster Fuller
Andy Lutomirski Nov. 26, 2018, 6:22 p.m. UTC | #31
On Mon, Nov 26, 2018 at 3:00 AM Dr. Greg <greg@enjellic.com> wrote:
>
> On Sun, Nov 25, 2018 at 04:37:00PM -0800, Andy Lutomirski wrote:
>
> > Bah, I hit send on a partially written draft. I???ll try again soon.
> >
> > --Andy
>
> Your first issue seems to be complete so I will respond to that in
> order to make sure we are not talking past one another.

It wasn't, but your answer is enlightening!  I've read the SGX
*manual*, but I hadn't dug through the actual Intel-supplied enclaves.
So, when I said that the LE isn't an important part of the overall
trust model, I meant that it isn't *in hardware*.  It's certainly
possible to write SGX software that weakens the security of the
overall system, and Intel seems to have done so:

>
> The Intel supplied launch enclave (LE) specifically denies
> initialization of enclaves which have the PROVISION attribute set,
> with the exception of enclaves whose MRSIGNER values match those of
> the keys that Intel uses to sign the PCE and PVE enclaves.  See line
> 219 of psw/ae/le/launch_enclave.cpp in the Intel SDK.

This seems entirely reasonable.  (But see below...)

> This report, along with the PEK, is submitted to the PCE enclave in
> order to generate the Platform Provisioning IDentity (PPID), which is
> the privacy critical identifier.  The PCE verifies the report
> generated by the PVE and rejects the request to generate the PPID if
> the report was generated by an enclave that was not initialized with
> the PROVISION bit set, see line 109 of psw/ae/pce/pce.cpp.

...and this seems entirely unreasonable.  Your description does indeed
appear consistent with the code: the PCE will hand out the PPID to any
requesting enclave that has the PROVISION bit set, so you are correct
that:

> Once again, the important design factor in all of this is the premise
> that the launch enclave will not allow enclaves other then the PCE and
> PVE to access the PROVISION bit.

But here's where the whole thing goes off the rails.  I would argue
that the Intel-supplied (and Intel-signed, apparently!) PCE is just
straight-up buggy.  What Intel is *trying* to do is to hand out the
PPID to an appropriately signed enclave.  What they actually did is to
hand out the PPID to any enclave that has the PROVISION bit set.  This
is poor design because it overload the PROVISION bit.  That bit is
supposed to mean "may use EGETKEY to obtain provisioning and
provisioning seal keys", which is not actually what Intel wants here.
It's also poor design without FLC because it pointlessly relies on the
LE to enforce a restriction on the use of provisioning enclaves, when
the code could instead have checked MRSIGNER..  And it's just straight
up wrong with FLC because there is no guarantee whatsoever that the LE
being used is Intel's.  And, for that matter, there is no guarantee
that the requesting enclave doesn't have the DEBUG bit set.

(It's also poor design because the PCE doesn't appear to verify that
the report passed in is actually intended to be associated with a call
to get_ppid().  There appear to be reports associated with provision
"msg1" and "msg3".  If it's possible to get a valid report for msg3 to
be accepted as a msg1 report of vice versa, then it might be game
over.)

Sorry, but this is not Linux's problem.  The right fix is, in my
opinion, entirely clear: the PCE should check MRSIGNER and possibly
even MRENCLAVE in the report it receives.  Intel needs to fix their
PCE, sign a fixed version, and find some way to revoke, to the extent
possible, the old one.  And the SGX enclave authors need to understand
something that is apparently subtle: THE LAUNCH POLICY IS NOT PART OF
THE TCB AND SHOULD NOT BE RELIED UPON.  Enclaves can and should be
written to remain secure even in the complete absence of any form of
launch control.

I went and filed a bug on github.  Let's see what happens:

https://github.com/intel/linux-sgx/issues/345

Also, the whole SGX report mechanism seems to be misused in the SDK.
An SGX report is a cryptographic primitive that essentially acts like
a signed blob.  Building a secure protocol on top of signed messages
or on top of reports takes more than just making up ad hoc blob
formats and signing them.  There needs to be domain separation between
different messages, and this seems to be entirely missing.  Do you
know if Intel has had a serious audit of their platform enclave code
done?

>
> > No, I have no clue why Intel did it this way.  I consider it to be a
> > mistake.
>
> Are you referring to there being a mistake in the trust relationships
> that the provisioning protocol implements or the overall concept of a
> provisioning key?

I'm referring to the hardware's policy as to when keys that don't
depend on OWNEREPOCH can be obtained.  As far as I know, the only real
need for such keys is to verify that the running platform is a real
Intel platform, which means that access to the provisioning key is
only useful to Intel-approved services.  Why didn't Intel enforce this
in hardware or microcode?  I see no reason that EGETKEY should hand
out those key types of the enclave is not signed by Intel.  For that
matter, I also don't see why the provisioning seal key needs to exist
-- the regular seal key could be used instead and, if OWNEREPOCH
changes, the platform could just re-certify itself.

> > >> The driver should establish the equivalent of three linked lists,
> > >> maintainable via /sysfs pseudo-files or equivalent plumbing.  The
> > >> lists are referenced by the kernel to enforce the following policies.
> > >>
> > >> 1.) The right to initialize an enclave without special attributes.
> > >>
> > >> 2.) The right to initialize an enclave with the PROVISION_KEY attribute set.
> > >>
> > >> 3.) The right to initialize an enclave with the LICENSE_KEY attribute set.
> > >>
> > >> The lists are populated with MRSIGNER values of enclaves that are
> > >> allowed to initialize under the specified conditions.
>
> > NAK because this is insufficient.  You're thinking of a model in
> > which SGX-like protection is all that's needed.  This is an
> > inadequate model of the real world.  The attack I'm most concerned
> > about wrt provisioning is an attack in which an unauthorized user is
> > permitted.
>
> We will be interested in your comments as to why the proposal is
> insufficient in the real world of FLC.

That's what you get for reading my unfinished email :)

Your proposal fails to protect against SGX malware.  Here's an SGX
malware use case: the attacker writes a malicious enclave and gets a
victim machine to run it as a non-root user.  They bundle it with a
valid Intel-signed copy of the relevant platform enclaves and use them
to bootstrap the attestation process.  The malicious enclave attests
its identity to a command-and-control server and obtains malicious
code to run.  The good guys can't reverse engineer the malware
enclave, because they can't pass the attestation check and therefore
can't obtain the encrypted code.

This isn't prevented by your proposed solution: the provisioning
enclaves are all signed by Intel.  What's needed is a check that
prevents unauthorized *users* from running them.

--Andy
Jarkko Sakkinen Nov. 26, 2018, 7:39 p.m. UTC | #32
On Sat, Nov 24, 2018 at 01:24:54PM -0600, Dr. Greg wrote:
> On Sat, Nov 24, 2018 at 08:15:21AM -0800, Jarkko Sakkinen wrote:
> 
> > On Tue, Nov 20, 2018 at 05:15:08AM -0600, Dr. Greg wrote:
> > > Malware would not necessarily need the Intel attestation service.
> > > Once access to the PROVISION bit is available, malware teams could
> > > simply build their own attestation service.
> 
> > AFAIK not possible as they wouldn't have access to the root
> > provisioning key. Can be confirmed from the SDM's key derivation
> > table (41-56).
> 
> What provisioning and attestation is all about is establishing an
> identity binding for a platform in question.  The standard Intel
> service binds the identity of a platform to an EPID private key.
> 
> With access to the SGX_FLAGS_PROVISION_BIT an enclave can generate a
> perpetual identity for a platform based on the identity modulus
> signature (MRSIGNER) of the key that signs the signature structure of
> the enclave.  Without access to the root provisioning key a security
> quorum or group has to be implemented via a subscription or enrollment
> model but that is arguably not much of an obstacle.
> 
> That is pretty much the way standard botware works now.
> 
> Without provisions for cryptographically secure authorization and
> policy enforcement in the driver, we will be creating infrastructure
> for a new generation of botware/malware whose mothership will know
> that a participating platform is running with full confidentiality and
> integrity protections.

OK, I think I got what you mean.

With free access to the provision the bot net controller could be sure
that a node is running inside an enclave. Is this what you are worried
about? Please correct if not or even if there is a slight drift on what
you are trying to state.

/Jarkko
Jarkko Sakkinen Nov. 26, 2018, 9:15 p.m. UTC | #33
On Sat, Nov 24, 2018 at 02:13:18PM -0600, Dr. Greg wrote:
> This isn't about an enclave being able to tell that it is really an
> enclave.  As I noted in my previous reply, access to the provisioning
> bit allows an enclave author to create a perpetual hardware identifier
> for a platform based on a signing key of their choosing, along with a
> few other incidentals, all of which are completely under the control
> of the enclave author.

I think I'm now in the same page with the issue now. Thanks for the
patience explaining this.

/Jarkko
Jarkko Sakkinen Nov. 26, 2018, 9:51 p.m. UTC | #34
On Sun, Nov 25, 2018 at 08:22:35AM -0800, Andy Lutomirski wrote:
> Agreed. What I’m proposing adds additional security if the kernel is
> *not* compromised.

And even if the kernel is compromised evil use will detected quicker
i.e. compromissed kernel is "better" than a kernel that allows to
use provisioning freely.

> There are other ways to accomplish it that might be better in some
> respects.  For example, there could be /dev/sgx and
> /dev/sgx_rights/provision.  The former exposes the whole sgx API,
> except that it doesn’t allow provisioning by default. The latter does
> nothing by itself. To run a provisioning enclave, you open both nodes,
> then do something like:
> 
> ioctl(sgx, SGX_IOC_ADD_RIGHT, sgx_provisioning);
> 
> This requires extra syscalls, but it doesn’t have the combinatorial
> explosion problem.

I like this design because it is extendable. I'm now also in the same
page why we need to protect provisioning in the first place. I would
slight restructure this as

/dev/sgx/control
/dev/sgx/attributes/provision

Looks cleaner and the root /dev directory is less polluted.

BTW, off-topic from this but should we remove ENCLAVE from IOC names as
they all concern enclaves anyway? Seems kind of redundant. I.e.

SGX_IOC_ENCLAVE_CREATE -> SGX_IOC_CREATE
SGX_IOC_ENCLAVE_ADD_PAGE -> SGX_IOC_ADD_PAGE
SGX_IOC_ENCLAVE_INIT -> SGX_IOC_INIT

/Jarkko
Jarkko Sakkinen Nov. 26, 2018, 10:16 p.m. UTC | #35
On Mon, Nov 26, 2018 at 05:00:39AM -0600, Dr. Greg wrote:
> We will be interested in your comments as to why the proposal is
> insufficient in the real world of FLC.
> 
> I believe the proposed architecture can be defended as being effective
> in the real world, as it allows the root user to use cryptographic
> protections of access to the PROVISION bit and to enclave execution in
> general.  On FLC that is the strongest guarantee that can be
> delivered.
> 
> When we speak of 'unauthorized' users I believe we are speaking in the
> parlance of discretionary access controls which has a much wider TCB
> scope then the cryptographic model we are proposing.  The model we
> propose allows the platform owner (root) to effectively implement the
> same level of security over the PROVISION bit that current locked
> platforms have, in a free and open fashion of course.
> 
> We can certainly attempt to explain our position further.

I think kernel controlled provision would in all cases lower the
mitigations of thread scenarios (at least what you've presented so far)
assuming that a compromissed kernel could be detected fairly quickly,
wouldn't it?

Even without SGX, having a compromissed kernel, you can anyhow stealth
your malware in many ways.

/Jarkko
Jarkko Sakkinen Nov. 26, 2018, 11:04 p.m. UTC | #36
On Mon, Nov 26, 2018 at 01:51:45PM -0800, Jarkko Sakkinen wrote:
> > ioctl(sgx, SGX_IOC_ADD_RIGHT, sgx_provisioning);
> > 
> > This requires extra syscalls, but it doesn’t have the combinatorial
> > explosion problem.
> 
> I like this design because it is extendable. I'm now also in the same
> page why we need to protect provisioning in the first place. I would
> slight restructure this as
> 
> /dev/sgx/control
> /dev/sgx/attributes/provision

I guess it would be OK to upstream only control node first as long as
provision attribute is denied in order to keep the already huge patch
set a tiny bit smaller?

/Jarkko
Jethro Beekman Nov. 27, 2018, 7:46 a.m. UTC | #37
On 2018-11-27 03:21, Jarkko Sakkinen wrote:
> BTW, off-topic from this but should we remove ENCLAVE from IOC names as
> they all concern enclaves anyway? Seems kind of redundant. I.e.
> 
> SGX_IOC_ENCLAVE_CREATE -> SGX_IOC_CREATE
> SGX_IOC_ENCLAVE_ADD_PAGE -> SGX_IOC_ADD_PAGE
> SGX_IOC_ENCLAVE_INIT -> SGX_IOC_INIT 

Future ioctls might be added that deal with system-global SGX things? 
Like an interface to communicate with the in-kernel LE or something.

Jethro Beekman | Fortanix
Dr. Greg Nov. 27, 2018, 8:55 a.m. UTC | #38
On Mon, Nov 26, 2018 at 03:04:36PM -0800, Jarkko Sakkinen wrote:

Good morning to everyone.

> On Mon, Nov 26, 2018 at 01:51:45PM -0800, Jarkko Sakkinen wrote:
> > > ioctl(sgx, SGX_IOC_ADD_RIGHT, sgx_provisioning);
> > > 
> > > This requires extra syscalls, but it doesn???t have the combinatorial
> > > explosion problem.
> > 
> > I like this design because it is extendable. I'm now also in the same
> > page why we need to protect provisioning in the first place. I would
> > slight restructure this as
> > 
> > /dev/sgx/control
> > /dev/sgx/attributes/provision

> I guess it would be OK to upstream only control node first as long
> as provision attribute is denied in order to keep the already huge
> patch set a tiny bit smaller?

At this point in time I believe there is a consensus that the driver
needs a policy management framework of some type for an optimum
implementation.  The PROVISION attribute has privacy implications and
unrestricted access to release mode (full security) is problematic.

Since the thread has become a bit divergent I wanted to note that we
have offered a proposal for a general policy management framework
based on MRSIGNER values.  This framework is consistent with the SGX
security model, ie. cryptographic rather then DAC based policy
controls.  This framework also allows a much more flexible policy
implementation that doesn't result in combinatoric issues.

Our framework also allows the preservation of the current ABI which
allows an EINITTOKEN to be passed in from userspace.  The framework
also supports the ability to specify that only a kernel based launch
enclave (LE) should be available if the platform owner or distribution
should desire to implement such a model.

The policy management framework is straight forward.  Three linked
lists or their equivalent which are populated through /sysfs
pseudo-files or equivalent plumbing.  Each list is populated with
MRSIGNER values for signing keys that are allowed to initialize
enclaves under three separate conditions.

1.) General enclaves without special attribute bits.

2.) Enclaves with the SGX_FLAGS_PROVISION_KEY attribute set. - i.e.,
'Provisioning Enclaves'.

3.) Enclaves with the SGX_FLAGS_LICENSE_KEY attribute set - i.e., 'Launch
Enclaves'.

An all-null MRSIGNER value serves as a 'sealing' value that locks a
list from any further modifications.

This architecture allows platform policies to be specified and then
sealed at early boot by the root user.  At that point cryptographic
policy controls are in place rather then DAC based controls, the
latter of which have perpetual security liabilities in addition to the
useability constraints inherent in a DAC or device node model.

We have developed an independent implementation of the PSW and
arguably have as much experience with issues surrounding how to
interact with the device driver as anyone.  We have spent a lot of
time thinking about these issues and the above framework provides the
most flexible architecture available.

> /Jarkko

We would be happy to discuss specific aspects of the implementation.

Have a good day.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"Remember that when you take down the fishhouse you can't put
 the minnows back into the lake, so throw them out on the ice.
 Make sure you stomp on any of the live ones so they don't suffer."
                                -- Fritz Wettstein
                                   At the lake
Jarkko Sakkinen Nov. 27, 2018, 4:36 p.m. UTC | #39
On Tue, Nov 27, 2018 at 07:46:48AM +0000, Jethro Beekman wrote:
> On 2018-11-27 03:21, Jarkko Sakkinen wrote:
> > BTW, off-topic from this but should we remove ENCLAVE from IOC names as
> > they all concern enclaves anyway? Seems kind of redundant. I.e.
> > 
> > SGX_IOC_ENCLAVE_CREATE -> SGX_IOC_CREATE
> > SGX_IOC_ENCLAVE_ADD_PAGE -> SGX_IOC_ADD_PAGE
> > SGX_IOC_ENCLAVE_INIT -> SGX_IOC_INIT
> 
> Future ioctls might be added that deal with system-global SGX things? Like
> an interface to communicate with the in-kernel LE or something.

Yea, maybe better to keep it just in case. The names are not too long
anyway. Then the new ioctl should be SGX_IOC_ENCLAVE_SET_ATTRIBUTE.

/Jarkko
Jarkko Sakkinen Nov. 27, 2018, 4:41 p.m. UTC | #40
On Tue, Nov 27, 2018 at 02:55:33AM -0600, Dr. Greg wrote:
> Since the thread has become a bit divergent I wanted to note that we
> have offered a proposal for a general policy management framework
> based on MRSIGNER values.  This framework is consistent with the SGX
> security model, ie. cryptographic rather then DAC based policy
> controls.  This framework also allows a much more flexible policy
> implementation that doesn't result in combinatoric issues.
> 
> Our framework also allows the preservation of the current ABI which
> allows an EINITTOKEN to be passed in from userspace.  The framework
> also supports the ability to specify that only a kernel based launch
> enclave (LE) should be available if the platform owner or distribution
> should desire to implement such a model.
> 
> The policy management framework is straight forward.  Three linked
> lists or their equivalent which are populated through /sysfs
> pseudo-files or equivalent plumbing.  Each list is populated with
> MRSIGNER values for signing keys that are allowed to initialize
> enclaves under three separate conditions.
> 
> 1.) General enclaves without special attribute bits.
> 
> 2.) Enclaves with the SGX_FLAGS_PROVISION_KEY attribute set. - i.e.,
> 'Provisioning Enclaves'.
> 
> 3.) Enclaves with the SGX_FLAGS_LICENSE_KEY attribute set - i.e., 'Launch
> Enclaves'.
> 
> An all-null MRSIGNER value serves as a 'sealing' value that locks a
> list from any further modifications.
> 
> This architecture allows platform policies to be specified and then
> sealed at early boot by the root user.  At that point cryptographic
> policy controls are in place rather then DAC based controls, the
> latter of which have perpetual security liabilities in addition to the
> useability constraints inherent in a DAC or device node model.
> 
> We have developed an independent implementation of the PSW and
> arguably have as much experience with issues surrounding how to
> interact with the device driver as anyone.  We have spent a lot of
> time thinking about these issues and the above framework provides the
> most flexible architecture available.

Sounds like a lot bloat and policy added to the kernel whereas with
Andy's proposal you can implement logic to a daemon and provide only
mechanism to do it.

/Jarkko
Jarkko Sakkinen Nov. 27, 2018, 4:46 p.m. UTC | #41
On Tue, Nov 27, 2018 at 02:55:33AM -0600, Dr. Greg wrote:
> 3.) Enclaves with the SGX_FLAGS_LICENSE_KEY attribute set - i.e., 'Launch
> Enclaves'.

Kernel does not have to manage this. If the MSRs are read-only, they
should match your LE. If the MSRs writable, you don't need an LE.

This whole scheme sounds like adding own SELinux for SGX and it is
only words. No code available.

/Jarkko
Andy Lutomirski Nov. 27, 2018, 5:55 p.m. UTC | #42
> On Nov 27, 2018, at 8:41 AM, Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> wrote:
> 
>> On Tue, Nov 27, 2018 at 02:55:33AM -0600, Dr. Greg wrote:
>> Since the thread has become a bit divergent I wanted to note that we
>> have offered a proposal for a general policy management framework
>> based on MRSIGNER values.  This framework is consistent with the SGX
>> security model, ie. cryptographic rather then DAC based policy
>> controls.  This framework also allows a much more flexible policy
>> implementation that doesn't result in combinatoric issues.
>> 
>> Our framework also allows the preservation of the current ABI which
>> allows an EINITTOKEN to be passed in from userspace.  The framework
>> also supports the ability to specify that only a kernel based launch
>> enclave (LE) should be available if the platform owner or distribution
>> should desire to implement such a model.
>> 
>> The policy management framework is straight forward.  Three linked
>> lists or their equivalent which are populated through /sysfs
>> pseudo-files or equivalent plumbing.  Each list is populated with
>> MRSIGNER values for signing keys that are allowed to initialize
>> enclaves under three separate conditions.
>> 
>> 1.) General enclaves without special attribute bits.
>> 
>> 2.) Enclaves with the SGX_FLAGS_PROVISION_KEY attribute set. - i.e.,
>> 'Provisioning Enclaves'.
>> 
>> 3.) Enclaves with the SGX_FLAGS_LICENSE_KEY attribute set - i.e., 'Launch
>> Enclaves'.
>> 
>> An all-null MRSIGNER value serves as a 'sealing' value that locks a
>> list from any further modifications.
>> 
>> This architecture allows platform policies to be specified and then
>> sealed at early boot by the root user.  At that point cryptographic
>> policy controls are in place rather then DAC based controls, the
>> latter of which have perpetual security liabilities in addition to the
>> useability constraints inherent in a DAC or device node model.
>> 
>> We have developed an independent implementation of the PSW and
>> arguably have as much experience with issues surrounding how to
>> interact with the device driver as anyone.  We have spent a lot of
>> time thinking about these issues and the above framework provides the
>> most flexible architecture available.
> 
> Sounds like a lot bloat and policy added to the kernel whereas with
> Andy's proposal you can implement logic to a daemon and provide only
> mechanism to do it.
> 
> 

Well, almost. We’d need SGX_IOC_FREEZE_MR{ENCLAVE,SIGNER} or similar.  Or maybe the daemon could handle the entire loading process.  But this can wait until after the main driver is upstream.

This does lead to a question: enclaves are kind-of-sort-of mapped into a given address space. What happens if you issue the various ioctls in the context of a different mm?  For that matter, can two processes mmap the same enclave?
Jarkko Sakkinen Nov. 28, 2018, 5:08 a.m. UTC | #43
On Sat, Nov 24, 2018 at 08:45:34AM -0800, Jarkko Sakkinen wrote:
> On Fri, Nov 23, 2018 at 04:39:23AM -0600, Dr. Greg wrote:
> > Jarkko, when this driver lands it will set the SGX ABI in stone for
> > Linux.  It would be very, very helpful to the development community if
> > there was some official guidance from Intel on whether or not FLC will
> > be a universal feature on all hardware and the date that is going to
> > happen or has happened.
> 
> I seriously don't know but I can take this message to the mothership...

LC enabling is essentially a platform vendors choice, not Intels choice,
like many other CPU features that Linux is dependent on. Of course, if
Linux ends supporting only LC that will without doubt have a big impact
on vendors so in that way it is indirectly also communitys choice.

/Jarkko
Jethro Beekman Nov. 28, 2018, 5:38 a.m. UTC | #44
2018-11-28 10:38, Jarkko Sakkinen wrote:
> On Sat, Nov 24, 2018 at 08:45:34AM -0800, Jarkko Sakkinen wrote:
>> On Fri, Nov 23, 2018 at 04:39:23AM -0600, Dr. Greg wrote:
>>> Jarkko, when this driver lands it will set the SGX ABI in stone for
>>> Linux.  It would be very, very helpful to the development community if
>>> there was some official guidance from Intel on whether or not FLC will
>>> be a universal feature on all hardware and the date that is going to
>>> happen or has happened.
>>
>> I seriously don't know but I can take this message to the mothership...
> 
> LC enabling is essentially a platform vendors choice, not Intels choice,
> like many other CPU features that Linux is dependent on. Of course, if
> Linux ends supporting only LC that will without doubt have a big impact
> on vendors so in that way it is indirectly also communitys choice.
> 
> /Jarkko
> 

Jarkko, it would be good if Intel at least documented FLC and 
recommended a particular practice in the SGX BIOS Writer's guide, in a 
similar way to how the software control interface is documented.

--
Jethro Beekman | Fortanix
Dr. Greg Nov. 28, 2018, 10:49 a.m. UTC | #45
On Tue, Nov 27, 2018 at 09:55:45AM -0800, Andy Lutomirski wrote:

> > On Nov 27, 2018, at 8:41 AM, Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> wrote:
> > 
> >> On Tue, Nov 27, 2018 at 02:55:33AM -0600, Dr. Greg wrote:
> >> Since the thread has become a bit divergent I wanted to note that we
> >> have offered a proposal for a general policy management framework
> >> based on MRSIGNER values.  This framework is consistent with the SGX
> >> security model, ie. cryptographic rather then DAC based policy
> >> controls.  This framework also allows a much more flexible policy
> >> implementation that doesn't result in combinatoric issues.
> >> 
> >> Our framework also allows the preservation of the current ABI which
> >> allows an EINITTOKEN to be passed in from userspace.  The framework
> >> also supports the ability to specify that only a kernel based launch
> >> enclave (LE) should be available if the platform owner or distribution
> >> should desire to implement such a model.
> >> 
> >> The policy management framework is straight forward.  Three linked
> >> lists or their equivalent which are populated through /sysfs
> >> pseudo-files or equivalent plumbing.  Each list is populated with
> >> MRSIGNER values for signing keys that are allowed to initialize
> >> enclaves under three separate conditions.
> >> 
> >> 1.) General enclaves without special attribute bits.
> >> 
> >> 2.) Enclaves with the SGX_FLAGS_PROVISION_KEY attribute set. - i.e.,
> >> 'Provisioning Enclaves'.
> >> 
> >> 3.) Enclaves with the SGX_FLAGS_LICENSE_KEY attribute set - i.e., 'Launch
> >> Enclaves'.
> >> 
> >> An all-null MRSIGNER value serves as a 'sealing' value that locks a
> >> list from any further modifications.
> >> 
> >> This architecture allows platform policies to be specified and then
> >> sealed at early boot by the root user.  At that point cryptographic
> >> policy controls are in place rather then DAC based controls, the
> >> latter of which have perpetual security liabilities in addition to the
> >> useability constraints inherent in a DAC or device node model.
> >> 
> >> We have developed an independent implementation of the PSW and
> >> arguably have as much experience with issues surrounding how to
> >> interact with the device driver as anyone.  We have spent a lot of
> >> time thinking about these issues and the above framework provides the
> >> most flexible architecture available.
> > 
> > Sounds like a lot bloat and policy added to the kernel whereas with
> > Andy's proposal you can implement logic to a daemon and provide only
> > mechanism to do it.

> Well, almost. We'd need SGX_IOC_FREEZE_MR{ENCLAVE,SIGNER} or
> similar.  Or maybe the daemon could handle the entire loading process.
> But this can wait until after the main driver is upstream.
>
> This does lead to a question: enclaves are kind-of-sort-of mapped
> into a given address space. What happens if you issue the various
> ioctls in the context of a different mm?  For that matter, can two
> processes mmap the same enclave?

Fascinating.

We've been carrying a patch, that drops in on top of the proposed
kernel driver, that implements the needed policy management framework
for DAC fragile (FLC) platforms.  After a meeting yesterday with the
client that is funding the work, a decision was made to release the
enhancements when the SGX driver goes mainline.  That will at least
give developers the option of creating solutions on Linux that
implement the security guarantees that SGX was designed to deliver.

Most importantly, since it implements a driver consistent with the
design of SGX, it has the added benefit of allowing system
administrators the ability to enable the driver to work on non-FLC
(locked) platforms.  Since Jarkko confirmed that FLC is the option of
platform vendors, this would seem to be important as SGX on Linux will
only work in a random fashion dependent on the whims of hardware OEM's
in probably a SKU dependent fashion.  Which is why the client has
interest in the work.

Best wishes for a productive remainder of the week.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"Five year projections, are you kidding me.  We don't know what we are
 supposed to be doing at the 4 o'clock meeting this afternoon."
                                -- Terry Wieland
                                   Resurrection
Jarkko Sakkinen Nov. 28, 2018, 7:22 p.m. UTC | #46
On Wed, Nov 28, 2018 at 04:49:41AM -0600, Dr. Greg wrote:
> We've been carrying a patch, that drops in on top of the proposed
> kernel driver, that implements the needed policy management framework
> for DAC fragile (FLC) platforms.  After a meeting yesterday with the
> client that is funding the work, a decision was made to release the
> enhancements when the SGX driver goes mainline.  That will at least
> give developers the option of creating solutions on Linux that
> implement the security guarantees that SGX was designed to deliver.

We do not need yet another policy management framework to the *kernel*.

The token based approach that Andy is proposing is proven and well
established method to create a mechanism. You can then create a daemon
to user space that decides who it wants to send tokes.

/Jarkko
Andy Lutomirski Nov. 28, 2018, 9:52 p.m. UTC | #47
On Tue, Nov 27, 2018 at 12:55 AM Dr. Greg <greg@enjellic.com> wrote:
> Since the thread has become a bit divergent I wanted to note that we
> have offered a proposal for a general policy management framework
> based on MRSIGNER values.  This framework is consistent with the SGX
> security model, ie. cryptographic rather then DAC based policy
> controls.  This framework also allows a much more flexible policy
> implementation that doesn't result in combinatoric issues.

Can you give a concrete explanation of a problem that your proposal
would solve?  As far as I can tell, it gets rid of a case in which an
unprivileged attacker who can run enclaves but hasn't compromised the
kernel can learn the PPID and other SGX-related permanent platform
identifiers, but it does nothing to prevent the same attacker from
learning non-SGX-related permanent identifiers, nor does it prevent
the attacker from using the Intel quoting enclave (unless configured
in a surprising way) and thus attesting to a remote system.

So what problem does it solve?
Pavel Machek Dec. 9, 2018, 5:01 p.m. UTC | #48
Hi!

> > There would be three types of users:
> >
> > 1. Ones that have access to neither of the devices.
> > 2. Ones that have access to unprivileged. Who are these?
> 
> Either 0666 (world) or an sgx group.

Sgx group, please. Or even better, what is generic term for sgx? We
probably want to use that, as sgx is likely trademarked and certainly
Intelism.
								Pavel
Pavel Machek Dec. 9, 2018, 5:01 p.m. UTC | #49
Hi!

> On Thu, Nov 15, 2018 at 5:08 PM Jarkko Sakkinen
> <jarkko.sakkinen@linux.intel.com> wrote:
> >
> > Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
> > can be used by applications to set aside private regions of code and
> > data. The code outside the enclave is disallowed to access the memory
> > inside the enclave by the CPU access control.
> >
> > SGX driver provides a ioctl API for loading and initializing enclaves.
> > Address range for enclaves is reserved with mmap() and they are
> > destroyed with munmap(). Enclave construction, measurement and
> > initialization is done with the provided the ioctl API.
> >
> 
> I brought this up a while back, and I think I should re-ask it now
> that this driver is getting close to ready:
> 
> As it stands, there's just one SGX character device, and I imagine
> that it'll be available to unprivileged applications.  I'm concerned
> that this isn't quite what we want.  I certainly think that everyone,
> or at least almost everyone, ought to be able to run normal
> enclaves.

I don't think nobody or postfix or guest should be running enclaves on
my systems. First, I'd like to  be able to debug my systems.

Second, sgx quite complex and tricky. It may turn out to be secure in
the end, but I'd not be surprised if we got few CVEs before we get
there.

Last, I'd hate to find out in few years that I can't switch to amd
cpu because firefox now requires sgx.

Just make it root-only or 660 by default. Users can get permission in
similar way they get rights to audio..

								Pavel
Dr. Greg Dec. 10, 2018, 10:49 a.m. UTC | #50
On Wed, Nov 28, 2018 at 11:22:28AM -0800, Jarkko Sakkinen wrote:

Good morning, I hope everyone had a pleasant weekend.

> On Wed, Nov 28, 2018 at 04:49:41AM -0600, Dr. Greg wrote:
> > We've been carrying a patch, that drops in on top of the proposed
> > kernel driver, that implements the needed policy management framework
> > for DAC fragile (FLC) platforms.  After a meeting yesterday with the
> > client that is funding the work, a decision was made to release the
> > enhancements when the SGX driver goes mainline.  That will at least
> > give developers the option of creating solutions on Linux that
> > implement the security guarantees that SGX was designed to deliver.

> We do not need yet another policy management framework to the *kernel*.
>
> The token based approach that Andy is proposing is proven and well
> established method to create a mechanism. You can then create a
> daemon to user space that decides who it wants to send tokes.

I guess there will be plenty of time to argue about all of that.

In the meantime, I wanted to confirm that your jarkko-sgx/master
branch contains the proposed driver that is headed upstream.  Before
adding the SFLC patches we thought it best to run the driver through
some testing in order to verify that any problems we generated where
attributable to our work and not the base driver.

At the current time jarkko-sgx/master appears to be having difficulty
initializing the unit test enclave for our trusted runtime API
librarary.  Enclave creation and loading appear to work fine, things
go south after the EINIT ioctl is called on the loaded image.

We specifically isolated the regressions to occur secondary to the
EINIT ioctl being called.  We modified our sgx-load test utility to
pause with the image loaded, but not initialized.  We generated a fair
amount of system activity while the process was holding the enclave
image open and there were no issues.  The process was then allowed to
unmap the virtual memory image without calling EINIT and the system
was fine after that as well.

Symptoms vary, but in all cases appear to be linked to corruption of
the virtual memory infrastructure.  In all cases, the kernel ends up
at a point where any attempt to start a new process hangs and becomes
uninterruptible.  The full kernel failure does not appear to be
synchronous with when EINIT is called, which would support the notion
that something is going wrong with the VM management that is being
workqueue deferred.

This is with your MPX patch applied that corrects issues with the
wrong memory management context being acted upon by that system.  In
any event, the kernel configuration being used for testing does not
have MPX support even enabled.  Given that the changelog for the patch
is indicating the new driver is attempting something unique with
workqueue deferred VM management, it would seem possible that the
driver is tickling bad and possibly untested behavior elsewhere in the
kernel as well.

The enclave in question is not terribly sophisticated by the standards
of our other enclaves, but it is a non-trivial test of SGX
functionality.  It weighs in at about 156K and is generated and signed
in debug mode with version 1.4 compliant metadata.  Obviously it
initializes and runs fine with the out-of-tree driver.

We managed to capture two separate sets of error logs/backtraces that
are included below.  As I'm sure you know, without module support,
working on all of this is a bit painful as it requires the classic
edit-compile-link-boot-whimper procedure.... :-)

Given that the self-test committed to the kernel sources is a trivial
one page enclave and the proposed driver ABI is incompatible with the
released Intel Linux PSW/SDK, this may be the most challenging test
the driver has been put through.  Unless your PSW/SDK team is testing
the new driver behind the scenes.

Obviously let us know if jarkko-master/sgx is not where the action is
at or if you would like us to move forward with alternative testing.

Regression traces follow:

Event 1: -------------------------------------------------------------------
Dec  9 07:35:15 nuc2 kernel: general protection fault: 0000 [#1] SMP PTI
Dec  9 07:35:15 nuc2 kernel: CPU: 1 PID: 1594 Comm: less Not tainted 4.20.0-rc2-sgx-nuc2+ #11
Dec  9 07:35:15 nuc2 kernel: Hardware name: Intel Corporation NUC7CJYH/NUC7JYB, BIOS JYGLKCPX.86A.0046.2018.1103.1316 11/03/2018
Dec  9 07:35:15 nuc2 kernel: RIP: 0010:unmap_vmas+0x3c/0x83
Dec  9 07:35:15 nuc2 kernel: Code: 49 89 cc 53 48 89 f3 4c 8b 6e 40 49 83 bd a0 03 00 00 00 74 32 b9 01 00 00 00 4c 89 e2 4c 89 f6 4c 89 ef e8 db be 01 00 eb 1d <4c> 39 23 73 1d 48 89 de 45 31 c0 4c 89 e1 4c 89 f2 4c 89 ff e8 cb
Dec  9 07:35:15 nuc2 kernel: RSP: 0018:ffff9fd7404c7d90 EFLAGS: 00010282
Dec  9 07:35:15 nuc2 kernel: RAX: 000000000007755e RBX: ffff0f66fad412e0 RCX: 0000000000000000
Dec  9 07:35:15 nuc2 kernel: RDX: ffff8b66f9e42ee0 RSI: ffff8b66f9e42c00 RDI: ffff9fd7404c7dc8
Dec  9 07:35:15 nuc2 kernel: RBP: ffff9fd7404c7db8 R08: 0000000000000014 R09: 000000000007755e
Dec  9 07:35:15 nuc2 kernel: R10: ffff9fd7404c7cc0 R11: 0000000000000000 R12: ffffffffffffffff
Dec  9 07:35:15 nuc2 kernel: R13: ffff8b66f9e42c00 R14: 0000000000000000 R15: ffff9fd7404c7dc8
Dec  9 07:35:15 nuc2 kernel: FS:  0000000000000000(0000) GS:ffff8b66fbe80000(0000) knlGS:0000000000000000
Dec  9 07:35:15 nuc2 kernel: CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
Dec  9 07:35:15 nuc2 kernel: CR2: 00000000f7e5cce8 CR3: 000000012ec0a000 CR4: 0000000000340ee0
Dec  9 07:35:15 nuc2 kernel: Call Trace:
Dec  9 07:35:15 nuc2 kernel:  exit_mmap+0xab/0x146
Dec  9 07:35:15 nuc2 kernel:  ? __handle_mm_fault+0x6f8/0xb0e
Dec  9 07:35:15 nuc2 kernel:  mmput+0x20/0xa9
Dec  9 07:35:15 nuc2 kernel:  do_exit+0x39d/0x8ad
Dec  9 07:35:15 nuc2 kernel:  ? handle_mm_fault+0x172/0x1c4
Dec  9 07:35:15 nuc2 kernel:  do_group_exit+0x3f/0x96
Dec  9 07:35:15 nuc2 kernel:  __ia32_sys_exit_group+0x12/0x12
Dec  9 07:35:15 nuc2 kernel:  do_fast_syscall_32+0xfd/0x1c1
Dec  9 07:35:15 nuc2 kernel:  entry_SYSENTER_compat+0x7c/0x8e
Dec  9 07:35:15 nuc2 kernel: RIP: 0023:0xf7f638d9
Dec  9 07:35:15 nuc2 kernel: Code: Bad RIP value.
Dec  9 07:35:15 nuc2 kernel: RSP: 002b:00000000ff93594c EFLAGS: 00000206 ORIG_RAX: 00000000000000fc
Dec  9 07:35:15 nuc2 kernel: RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000000000
Dec  9 07:35:15 nuc2 kernel: RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000f7f05288
Dec  9 07:35:15 nuc2 kernel: RBP: 00000000ff935978 R08: 0000000000000000 R09: 0000000000000000
Dec  9 07:35:15 nuc2 kernel: R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
Dec  9 07:35:15 nuc2 kernel: R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
Dec  9 07:35:15 nuc2 kernel: Modules linked in:
Dec  9 07:35:15 nuc2 kernel: ---[ end trace 590ee48fe9cfd7a6 ]---
Dec  9 07:35:15 nuc2 kernel: RIP: 0010:unmap_vmas+0x3c/0x83
Dec  9 07:35:15 nuc2 kernel: Code: 49 89 cc 53 48 89 f3 4c 8b 6e 40 49 83 bd a0 03 00 00 00 74 32 b9 01 00 00 00 4c 89 e2 4c 89 f6 4c 89 ef e8 db be 01 00 eb 1d <4c> 39 23 73 1d 48 89 de 45 31 c0 4c 89 e1 4c 89 f2 4c 89 ff e8 cb
Dec  9 07:35:15 nuc2 kernel: RSP: 0018:ffff9fd7404c7d90 EFLAGS: 00010282
Dec  9 07:35:15 nuc2 kernel: RAX: 000000000007755e RBX: ffff0f66fad412e0 RCX: 0000000000000000
Dec  9 07:35:15 nuc2 kernel: RDX: ffff8b66f9e42ee0 RSI: ffff8b66f9e42c00 RDI: ffff9fd7404c7dc8
Dec  9 07:35:15 nuc2 kernel: RBP: ffff9fd7404c7db8 R08: 0000000000000014 R09: 000000000007755e
Dec  9 07:35:15 nuc2 kernel: R10: ffff9fd7404c7cc0 R11: 0000000000000000 R12: ffffffffffffffff
Dec  9 07:35:15 nuc2 kernel: R13: ffff8b66f9e42c00 R14: 0000000000000000 R15: ffff9fd7404c7dc8
Dec  9 07:35:15 nuc2 kernel: FS:  0000000000000000(0000) GS:ffff8b66fbe80000(0000) knlGS:0000000000000000
Dec  9 07:35:15 nuc2 kernel: CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
Dec  9 07:35:15 nuc2 kernel: CR2: 00000000f7f638af CR3: 000000012ec0a000 CR4: 0000000000340ee0
Dec  9 07:35:15 nuc2 kernel: Fixing recursive fault but reboot is needed!
---------------------------------------------------------------------------

Test 2: --------------------------------------------------------------------
Dec  9 07:55:51 nuc2 kernel: BUG: Bad rss-counter state mm:0000000004eb5fd2 idx:0 val:226
Dec  9 07:55:51 nuc2 kernel: BUG: Bad rss-counter state mm:0000000004eb5fd2 idx:1 val:46
Dec  9 07:55:51 nuc2 kernel: BUG: non-zero pgtables_bytes on freeing mm: 12288
Dec  9 07:56:12 nuc2 kernel: sgx-load[1759]: segfault at 80 ip 0000000000402015 sp 00007ffe727f6a30 error 4 in sgx-load[400000+b000]
Dec  9 07:56:12 nuc2 kernel: Code: ff 41 b8 8c 02 00 00 b9 90 78 40 00 ba 55 77 40 00 be cc 74 40 00 48 89 ef 31 c0 e8 35 ef ff ff e9 1e ff ff ff 48 83 4b 50 01 <49> 8b 8c 24 80 00 00 00 48 89 8b a0 00 00 00 49 8b 8c 24 88 00 00
Dec  9 07:56:17 nuc2 kernel: BUG: Bad rss-counter state mm:00000000666f29a9 idx:0 val:1
Dec  9 07:56:17 nuc2 kernel: BUG: Bad rss-counter state mm:00000000666f29a9 idx:1 val:9
Dec  9 07:56:17 nuc2 kernel: BUG: non-zero pgtables_bytes on freeing mm: 4096
Dec  9 07:56:25 nuc2 kernel: BUG: Bad rss-counter state mm:00000000f23b96cf idx:1 val:4
Dec  9 07:57:17 nuc2 kernel: rcu: INFO: rcu_sched self-detected stall on CPU
Dec  9 07:57:17 nuc2 kernel: rcu: ^I0-....: (14999 ticks this GP) idle=55e/1/0x4000000000000002 softirq=3304/3304 fqs=7499 
Dec  9 07:57:17 nuc2 kernel: rcu: ^I (t=15000 jiffies g=5665 q=50)
Dec  9 07:57:17 nuc2 kernel: NMI backtrace for cpu 0
Dec  9 07:57:17 nuc2 kernel: CPU: 0 PID: 1761 Comm: less Not tainted 4.20.0-rc2-sgx-nuc2+ #11
Dec  9 07:57:17 nuc2 kernel: Hardware name: Intel Corporation NUC7CJYH/NUC7JYB, BIOS JYGLKCPX.86A.0046.2018.1103.1316 11/03/2018
Dec  9 07:57:17 nuc2 kernel: Call Trace:
Dec  9 07:57:17 nuc2 kernel:  <IRQ>
Dec  9 07:57:17 nuc2 kernel:  dump_stack+0x4d/0x63
Dec  9 07:57:17 nuc2 kernel:  nmi_cpu_backtrace+0x7a/0x8b
Dec  9 07:57:17 nuc2 kernel:  ? lapic_can_unplug_cpu+0x98/0x98
----------------------------------------------------------------------------

> /Jarkko

Best wishes for a productive week.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"(3)  With sufficient thrust, pigs fly just fine.  However, this is not
      necessarily a good idea.  It is hard to be sure where they are
      going to land, and it could be dangerous sitting under them as they
      fly overhead."
                                -- RFC 1925
                                   Fundamental Truths of Networking
Dr. Greg Dec. 10, 2018, 2:46 p.m. UTC | #51
On Sun, Dec 09, 2018 at 06:01:32PM +0100, Pavel Machek wrote:

> Hi!

Good morning to everyone.

> > On Thu, Nov 15, 2018 at 5:08 PM Jarkko Sakkinen
> > <jarkko.sakkinen@linux.intel.com> wrote:
> > >
> > > Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
> > > can be used by applications to set aside private regions of code and
> > > data. The code outside the enclave is disallowed to access the memory
> > > inside the enclave by the CPU access control.
> > >
> > > SGX driver provides a ioctl API for loading and initializing enclaves.
> > > Address range for enclaves is reserved with mmap() and they are
> > > destroyed with munmap(). Enclave construction, measurement and
> > > initialization is done with the provided the ioctl API.
> > 
> > I brought this up a while back, and I think I should re-ask it now
> > that this driver is getting close to ready:
> > 
> > As it stands, there's just one SGX character device, and I imagine
> > that it'll be available to unprivileged applications.  I'm concerned
> > that this isn't quite what we want.  I certainly think that everyone,
> > or at least almost everyone, ought to be able to run normal
> > enclaves.

> I don't think nobody or postfix or guest should be running enclaves
> on my systems. First, I'd like to be able to debug my systems.
>
> Second, sgx quite complex and tricky. It may turn out to be secure
> in the end, but I'd not be surprised if we got few CVEs before we
> get there.
>
> Last, I'd hate to find out in few years that I can't switch to amd
> cpu because firefox now requires sgx.
>
> Just make it root-only or 660 by default. Users can get permission
> in similar way they get rights to audio..

I'm not sure that using root or group restricted access to a character
device is going to stop an ISV from embracing a technology, but that
is an alternate debate.

Relying on discretionary, or mandatory for that matter, access
controls is not consistent with the security architecture of SGX.  The
technology was designed to provide robustness in the face of
aggressors who may have compromised the operating system or hardware
platform.

The lingua franca of SGX security and access controls are MRSIGNER
values.  The SFLC patches that we will be making available, once we
are convinced the upstream driver is working, implement MRSIGNER based
security controls with an absolutely minimal TCB footprint in the
kernel.  This strategy allows the platform owner to use SGX compliant
and cryptographically enforced access controls.

Just as an aside, secondary to our perception that this technology and
what it can do is not widely understood, we are developing a 2-part
LWN article series on SGX and its implications for Linux.

> 								Pavel

Best wishes for a good day and a productive week.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"... remember that innovation is saying 'no' to 1000 things."
                                -- Moxie Marlinspike
Jarkko Sakkinen Dec. 12, 2018, 6 p.m. UTC | #52
On Mon, Dec 10, 2018 at 04:49:08AM -0600, Dr. Greg wrote:
> In the meantime, I wanted to confirm that your jarkko-sgx/master
> branch contains the proposed driver that is headed upstream.  Before
> adding the SFLC patches we thought it best to run the driver through
> some testing in order to verify that any problems we generated where
> attributable to our work and not the base driver.

The master branch is by definition unstable at the moment i.e. it can
sometimes (not often) contain unfinished changes. Use next for testing.
I update next when I consider the master contents "stable enough".

Thanks.

/Jarkko
Dr. Greg Dec. 14, 2018, 11:59 p.m. UTC | #53
On Wed, Dec 12, 2018 at 08:00:36PM +0200, Jarkko Sakkinen wrote:

Good evening, I hope the week has gone well for everyone.

> On Mon, Dec 10, 2018 at 04:49:08AM -0600, Dr. Greg wrote:
> > In the meantime, I wanted to confirm that your jarkko-sgx/master
> > branch contains the proposed driver that is headed upstream.
> > Before adding the SFLC patches we thought it best to run the
> > driver through some testing in order to verify that any problems
> > we generated where attributable to our work and not the base
> > driver.
>
> The master branch is by definition unstable at the moment i.e. it
> can sometimes (not often) contain unfinished changes. Use next for
> testing.  I update next when I consider the master contents "stable
> enough".

I noticed in the last day or so that you appeared to sync
jarkko-sgx/master with jarkko-sgx/next, so I checked out a local
branch against jarkko-sgx/next and ran it against our unit tests.
Based on what we are seeing the driver is still experiencing issues
with initialization of a non-trivial enclave.

On the first test boot of the new kernel, the EINIT ioctl consistently
returned EBUSY over multiple invocations of the unit test.  This did
not appear to generate any negative issues with the kernel at large.

We rebooted the box to run the test against a fresh kernel load.  This
time around we experienced issues similar to what we had previously
described.  The EINIT ioctl generates a segmentation fault which seems
to largely incapacitate the kernel.

Here are the logs and first backtrace from the test:

---------------------------------------------------------------------------
Dec 14 13:25:06 nuc2 kernel: PGD 4f001067 P4D 4f001067 PUD 0 
Dec 14 13:25:06 nuc2 kernel: BUG: unable to handle kernel paging request at ffff97bf3ae916fe
Dec 14 13:25:06 nuc2 kernel: Oops: 0002 [#1] SMP PTI
Dec 14 13:25:06 nuc2 kernel: CPU: 1 PID: 34 Comm: kworker/1:1 Not tainted 4.20.0-rc2-sgx-nuc2+ #12
Dec 14 13:25:06 nuc2 kernel: Hardware name: Intel Corporation NUC7CJYH/NUC7JYB, BIOS JYGLKCPX.86A.0046.2018.1103.1316 11/03/2018
Dec 14 13:25:06 nuc2 kernel: Workqueue: events cache_reap
Dec 14 13:25:06 nuc2 kernel: RIP: 0010:free_block+0xe3/0x182
Dec 14 13:25:06 nuc2 kernel: Code: 20 45 29 d4 41 d3 ec 0f b6 4f 1d 45 01 e2 41 d3 ea 41 8b 49 30 ff c9 49 83 79 20 00 41 89 49 30 75 04 4d 89 59 20 4d 8b 59 20 <45> 88 14 0b 49 8d 49 08 41 83 79 30 00 75 1a 4c 8b 50 28 49 89 4a
Dec 14 13:25:06 nuc2 kernel: RSP: 0018:ffffb90800123db0 EFLAGS: 00210046
Dec 14 13:25:06 nuc2 kernel: RAX: ffff97be3b419080 RBX: 000000000000000f RCX: 00000000ffffff7e
Dec 14 13:25:06 nuc2 kernel: RDX: 0000000000000018 RSI: ffffd907ffc82b70 RDI: ffff97be3b44c200
Dec 14 13:25:06 nuc2 kernel: RBP: ffffb90800123dd8 R08: ffffb90800123e10 R09: fffff9b345eba440
Dec 14 13:25:06 nuc2 kernel: R10: 000000000051f663 R11: ffff97be3ae91780 R12: 0000000011ede5c3
Dec 14 13:25:06 nuc2 kernel: R13: ffffffff80000000 R14: ffff97be3b419088 R15: ffff97be3b4190a8
Dec 14 13:25:06 nuc2 kernel: FS:  0000000000000000(0000) GS:ffff97be3be80000(0000) knlGS:0000000000000000
Dec 14 13:25:06 nuc2 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Dec 14 13:25:06 nuc2 kernel: CR2: ffff97bf3ae916fe CR3: 000000004ec0a000 CR4: 0000000000340ee0
Dec 14 13:25:06 nuc2 kernel: Call Trace:
Dec 14 13:25:06 nuc2 kernel:  drain_array_locked+0x50/0x75
Dec 14 13:25:06 nuc2 kernel:  drain_array.constprop.67+0x57/0x72
Dec 14 13:25:06 nuc2 kernel:  cache_reap+0x58/0x101
Dec 14 13:25:06 nuc2 kernel:  process_one_work+0x183/0x271
Dec 14 13:25:06 nuc2 kernel:  worker_thread+0x1e5/0x2b4
Dec 14 13:25:06 nuc2 kernel:  ? cancel_delayed_work_sync+0x10/0x10
Dec 14 13:25:06 nuc2 kernel:  kthread+0x116/0x11e
Dec 14 13:25:06 nuc2 kernel:  ? kthread_park+0x7e/0x7e
Dec 14 13:25:06 nuc2 kernel:  ret_from_fork+0x1f/0x40
Dec 14 13:25:06 nuc2 kernel: Modules linked in:
Dec 14 13:25:06 nuc2 kernel: CR2: ffff97bf3ae916fe
Dec 14 13:25:06 nuc2 kernel: ---[ end trace 7f5dc24edc7285b3 ]---
Dec 14 13:25:06 nuc2 kernel: RIP: 0010:free_block+0xe3/0x182
Dec 14 13:25:06 nuc2 kernel: Code: 20 45 29 d4 41 d3 ec 0f b6 4f 1d 45 01 e2 41 d3 ea 41 8b 49 30 ff c9 49 83 79 20 00 41 89 49 30 75 04 4d 89 59 20 4d 8b 59 20 <45> 88 14 0b 49 8d 49 08 41 83 79 30 00 75 1a 4c 8b 50 28 49 89 4a
Dec 14 13:25:06 nuc2 kernel: RSP: 0018:ffffb90800123db0 EFLAGS: 00210046
Dec 14 13:25:06 nuc2 kernel: RAX: ffff97be3b419080 RBX: 000000000000000f RCX: 00000000ffffff7e
Dec 14 13:25:06 nuc2 kernel: RDX: 0000000000000018 RSI: ffffd907ffc82b70 RDI: ffff97be3b44c200
Dec 14 13:25:06 nuc2 kernel: RBP: ffffb90800123dd8 R08: ffffb90800123e10 R09: fffff9b345eba440
Dec 14 13:25:06 nuc2 kernel: R10: 000000000051f663 R11: ffff97be3ae91780 R12: 0000000011ede5c3
Dec 14 13:25:06 nuc2 kernel: R13: ffffffff80000000 R14: ffff97be3b419088 R15: ffff97be3b4190a8
Dec 14 13:25:06 nuc2 kernel: FS:  0000000000000000(0000) GS:ffff97be3be80000(0000) knlGS:0000000000000000
Dec 14 13:25:06 nuc2 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Dec 14 13:25:06 nuc2 kernel: CR2: ffff97bf3ae916fe CR3: 000000004ec0a000 CR4: 0000000000340ee0
Dec 14 13:25:21 nuc2 kernel: sgx-load[1596]: segfault at 80 ip 0000000000402015 sp 00007ffdb267e2f0 error 4 in sgx-load[400000+b000]
Dec 14 13:25:21 nuc2 kernel: Code: ff 41 b8 8c 02 00 00 b9 90 78 40 00 ba 5d 77 40 00 be cc 74 40 00 48 89 ef 31 c0 e8 35 ef ff ff e9 1e ff ff ff 48 83 4b 50 01 <49> 8b 8c 24 80 00 00 00 48 89 8b a0 00 00 00 49 8b 8c 24 88 00 00
---------------------------------------------------------------------------

This is a post 'make distclean' compile from a fresh branch of
jarkko-sgx/next with no modifications.

For testing purposes we created a branch of our PSW and dropped the
EINITTOKEN pointer from the sgx_enclave_init structure in order to
make our runtime compatible with the new variant of
SGX_IOC_ENCLAVE_INIT.  As I noted in my previous e-mail, our runtime
doesn't appear to be having any issues with the creation and load of
the enclave.

We are assuming there is an intent for the new driver to be reasonably
compatible with the current Intel PSW/SDK.  Even if this isn't the
case it would seem to be problematic if it is possible for a badly
formed IOCTL call to tip the kernel over.

Jethro are you guys testing the driver against any non-trivial
enclaves?

> Thanks.
> 
> /Jarkko

Let us know if you would like us to experiment with anything in
particular.

Have a good weekend.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"You and Uncle Pete drank the whole thing?  That was a $250.00 bottle
 of whisky.

 Yeah, it was good."
                                -- Rick Engen
                                   Resurrection.
Sean Christopherson Dec. 15, 2018, 12:06 a.m. UTC | #54
On Fri, Dec 14, 2018 at 05:59:17PM -0600, Dr. Greg wrote:
> On Wed, Dec 12, 2018 at 08:00:36PM +0200, Jarkko Sakkinen wrote:
> 
> Good evening, I hope the week has gone well for everyone.
> 
> > On Mon, Dec 10, 2018 at 04:49:08AM -0600, Dr. Greg wrote:
> > > In the meantime, I wanted to confirm that your jarkko-sgx/master
> > > branch contains the proposed driver that is headed upstream.
> > > Before adding the SFLC patches we thought it best to run the
> > > driver through some testing in order to verify that any problems
> > > we generated where attributable to our work and not the base
> > > driver.
> >
> > The master branch is by definition unstable at the moment i.e. it
> > can sometimes (not often) contain unfinished changes. Use next for
> > testing.  I update next when I consider the master contents "stable
> > enough".
> 
> I noticed in the last day or so that you appeared to sync
> jarkko-sgx/master with jarkko-sgx/next, so I checked out a local
> branch against jarkko-sgx/next and ran it against our unit tests.
> Based on what we are seeing the driver is still experiencing issues
> with initialization of a non-trivial enclave.

master branch is broken, looks like the VMA code Jarkko is reworking is
buggy.  I should be able to help debug this next week.

[  504.149548] ------------[ cut here ]------------
[  504.149550] kernel BUG at /home/sean/go/src/kernel.org/linux/mm/mmap.c:669!
[  504.150288] invalid opcode: 0000 [#1] SMP
[  504.150614] CPU: 2 PID: 237 Comm: kworker/u20:2 Not tainted 4.20.0-rc2+ #267
[  504.151165] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
[  504.151818] Workqueue: sgx-encl-wq sgx_encl_release_worker
[  504.152267] RIP: 0010:__vma_adjust+0x64a/0x820
[  504.152626] Code: ff 48 89 50 18 e9 6f fc ff ff 4c 8b ab 88 00 00 00 45 31 e4 e9 61 fb ff ff 31 c0 48 83 c4 60 5b 5d 41 5c 41 5d 41 5e 41 5f c3 <0f> 0b 49 89 de 49 83 c6 20 0f 84 06 fe ff ff 49 8d 7e e0 e8 fe ee
[  504.154109] RSP: 0000:ffffc900004ebd60 EFLAGS: 00010206
[  504.154535] RAX: 00007fd92ef7e000 RBX: ffff888467af16c0 RCX: ffff888467af16e0
[  504.155104] RDX: ffff888458fd09e0 RSI: 00007fd954021000 RDI: ffff88846bf9e798
[  504.155673] RBP: ffff888467af1480 R08: ffff88845bea2000 R09: 0000000000000000
[  504.156242] R10: 0000000080000000 R11: fefefefefefefeff R12: 0000000000000000
[  504.156810] R13: ffff88846bf9e790 R14: ffff888467af1b70 R15: ffff888467af1b60
[  504.157378] FS:  0000000000000000(0000) GS:ffff88846f700000(0000) knlGS:0000000000000000
[  504.158021] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  504.158483] CR2: 00007f2c56e99000 CR3: 0000000005009001 CR4: 0000000000360ee0
[  504.159054] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  504.159623] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  504.160193] Call Trace:
[  504.160406]  __split_vma+0x16f/0x180
[  504.160706]  ? __switch_to_asm+0x40/0x70
[  504.161024]  __do_munmap+0xfb/0x450
[  504.161308]  sgx_encl_release_worker+0x44/0x70
[  504.161675]  process_one_work+0x200/0x3f0
[  504.162004]  worker_thread+0x2d/0x3d0
[  504.162301]  ? process_one_work+0x3f0/0x3f0
[  504.162645]  kthread+0x113/0x130
[  504.162912]  ? kthread_park+0x90/0x90
[  504.163209]  ret_from_fork+0x35/0x40
[  504.163503] Modules linked in: bridge stp llc
[  504.163866] ---[ end trace 83076139fc25e3e0 ]---
Dr. Greg Dec. 15, 2018, 11:22 p.m. UTC | #55
On Fri, Dec 14, 2018 at 04:06:27PM -0800, Sean Christopherson wrote:

Good afternoon, I hope the weekend is going well for everyone.

> On Fri, Dec 14, 2018 at 05:59:17PM -0600, Dr. Greg wrote:
> > On Wed, Dec 12, 2018 at 08:00:36PM +0200, Jarkko Sakkinen wrote:
> > 
> > Good evening, I hope the week has gone well for everyone.
> > 
> > > On Mon, Dec 10, 2018 at 04:49:08AM -0600, Dr. Greg wrote:
> > > > In the meantime, I wanted to confirm that your jarkko-sgx/master
> > > > branch contains the proposed driver that is headed upstream.
> > > > Before adding the SFLC patches we thought it best to run the
> > > > driver through some testing in order to verify that any problems
> > > > we generated where attributable to our work and not the base
> > > > driver.
> > >
> > > The master branch is by definition unstable at the moment i.e. it
> > > can sometimes (not often) contain unfinished changes. Use next for
> > > testing.  I update next when I consider the master contents "stable
> > > enough".
> > 
> > I noticed in the last day or so that you appeared to sync
> > jarkko-sgx/master with jarkko-sgx/next, so I checked out a local
> > branch against jarkko-sgx/next and ran it against our unit tests.
> > Based on what we are seeing the driver is still experiencing issues
> > with initialization of a non-trivial enclave.

> master branch is broken, looks like the VMA code Jarkko is reworking is
> buggy.  I should be able to help debug this next week.
> 
> [  504.149548] ------------[ cut here ]------------
> [  504.149550] kernel BUG at /home/sean/go/src/kernel.org/linux/mm/mmap.c:669!

Rodger, dodger.

Let us know when you think you have something working pushed up into
one of the branches and we will put it on the bench here in the lab
and see what our runtime is able to do with it.

BTW, your new vDSO work appears to be shaping up well.  Just out of
curiosity though, how are you testing and validating the new vDSO
based exception handler if it isn't possible to initialize and run an
enclave with the new driver?

We will look forward to hearing from you.

Have a good remainder of the weekend.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"Don't worry about people stealing your ideas.  If your ideas are any
 good, you'll have to ram them down people's throats."
                                -- Howard Aiken
Jarkko Sakkinen Dec. 17, 2018, 1:28 p.m. UTC | #56
On Fri, Dec 14, 2018 at 04:06:27PM -0800, Sean Christopherson wrote:
> [  504.149548] ------------[ cut here ]------------
> [  504.149550] kernel BUG at /home/sean/go/src/kernel.org/linux/mm/mmap.c:669!
> [  504.150288] invalid opcode: 0000 [#1] SMP
> [  504.150614] CPU: 2 PID: 237 Comm: kworker/u20:2 Not tainted 4.20.0-rc2+ #267
> [  504.151165] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
> [  504.151818] Workqueue: sgx-encl-wq sgx_encl_release_worker
> [  504.152267] RIP: 0010:__vma_adjust+0x64a/0x820
> [  504.152626] Code: ff 48 89 50 18 e9 6f fc ff ff 4c 8b ab 88 00 00 00 45 31 e4 e9 61 fb ff ff 31 c0 48 83 c4 60 5b 5d 41 5c 41 5d 41 5e 41 5f c3 <0f> 0b 49 89 de 49 83 c6 20 0f 84 06 fe ff ff 49 8d 7e e0 e8 fe ee
> [  504.154109] RSP: 0000:ffffc900004ebd60 EFLAGS: 00010206
> [  504.154535] RAX: 00007fd92ef7e000 RBX: ffff888467af16c0 RCX: ffff888467af16e0
> [  504.155104] RDX: ffff888458fd09e0 RSI: 00007fd954021000 RDI: ffff88846bf9e798
> [  504.155673] RBP: ffff888467af1480 R08: ffff88845bea2000 R09: 0000000000000000
> [  504.156242] R10: 0000000080000000 R11: fefefefefefefeff R12: 0000000000000000
> [  504.156810] R13: ffff88846bf9e790 R14: ffff888467af1b70 R15: ffff888467af1b60
> [  504.157378] FS:  0000000000000000(0000) GS:ffff88846f700000(0000) knlGS:0000000000000000
> [  504.158021] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  504.158483] CR2: 00007f2c56e99000 CR3: 0000000005009001 CR4: 0000000000360ee0
> [  504.159054] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  504.159623] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [  504.160193] Call Trace:
> [  504.160406]  __split_vma+0x16f/0x180
> [  504.160706]  ? __switch_to_asm+0x40/0x70
> [  504.161024]  __do_munmap+0xfb/0x450
> [  504.161308]  sgx_encl_release_worker+0x44/0x70
> [  504.161675]  process_one_work+0x200/0x3f0
> [  504.162004]  worker_thread+0x2d/0x3d0
> [  504.162301]  ? process_one_work+0x3f0/0x3f0
> [  504.162645]  kthread+0x113/0x130
> [  504.162912]  ? kthread_park+0x90/0x90
> [  504.163209]  ret_from_fork+0x35/0x40
> [  504.163503] Modules linked in: bridge stp llc
> [  504.163866] ---[ end trace 83076139fc25e3e0 ]---

There was a race with release and swapping code that I thought I fixed,
and this is looks like a race there. Have to recheck what I did not
consider. Anyway, though to share this if you have time to look at it.
That is the part where something is now unsync most probably.

/Jarkko
Jarkko Sakkinen Dec. 17, 2018, 1:39 p.m. UTC | #57
On Mon, Dec 17, 2018 at 03:28:59PM +0200, Jarkko Sakkinen wrote:
> On Fri, Dec 14, 2018 at 04:06:27PM -0800, Sean Christopherson wrote:
> > [  504.149548] ------------[ cut here ]------------
> > [  504.149550] kernel BUG at /home/sean/go/src/kernel.org/linux/mm/mmap.c:669!
> > [  504.150288] invalid opcode: 0000 [#1] SMP
> > [  504.150614] CPU: 2 PID: 237 Comm: kworker/u20:2 Not tainted 4.20.0-rc2+ #267
> > [  504.151165] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
> > [  504.151818] Workqueue: sgx-encl-wq sgx_encl_release_worker
> > [  504.152267] RIP: 0010:__vma_adjust+0x64a/0x820
> > [  504.152626] Code: ff 48 89 50 18 e9 6f fc ff ff 4c 8b ab 88 00 00 00 45 31 e4 e9 61 fb ff ff 31 c0 48 83 c4 60 5b 5d 41 5c 41 5d 41 5e 41 5f c3 <0f> 0b 49 89 de 49 83 c6 20 0f 84 06 fe ff ff 49 8d 7e e0 e8 fe ee
> > [  504.154109] RSP: 0000:ffffc900004ebd60 EFLAGS: 00010206
> > [  504.154535] RAX: 00007fd92ef7e000 RBX: ffff888467af16c0 RCX: ffff888467af16e0
> > [  504.155104] RDX: ffff888458fd09e0 RSI: 00007fd954021000 RDI: ffff88846bf9e798
> > [  504.155673] RBP: ffff888467af1480 R08: ffff88845bea2000 R09: 0000000000000000
> > [  504.156242] R10: 0000000080000000 R11: fefefefefefefeff R12: 0000000000000000
> > [  504.156810] R13: ffff88846bf9e790 R14: ffff888467af1b70 R15: ffff888467af1b60
> > [  504.157378] FS:  0000000000000000(0000) GS:ffff88846f700000(0000) knlGS:0000000000000000
> > [  504.158021] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [  504.158483] CR2: 00007f2c56e99000 CR3: 0000000005009001 CR4: 0000000000360ee0
> > [  504.159054] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > [  504.159623] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > [  504.160193] Call Trace:
> > [  504.160406]  __split_vma+0x16f/0x180
> > [  504.160706]  ? __switch_to_asm+0x40/0x70
> > [  504.161024]  __do_munmap+0xfb/0x450
> > [  504.161308]  sgx_encl_release_worker+0x44/0x70
> > [  504.161675]  process_one_work+0x200/0x3f0
> > [  504.162004]  worker_thread+0x2d/0x3d0
> > [  504.162301]  ? process_one_work+0x3f0/0x3f0
> > [  504.162645]  kthread+0x113/0x130
> > [  504.162912]  ? kthread_park+0x90/0x90
> > [  504.163209]  ret_from_fork+0x35/0x40
> > [  504.163503] Modules linked in: bridge stp llc
> > [  504.163866] ---[ end trace 83076139fc25e3e0 ]---
> 
> There was a race with release and swapping code that I thought I fixed,
> and this is looks like a race there. Have to recheck what I did not
> consider. Anyway, though to share this if you have time to look at it.
> That is the part where something is now unsync most probably.

I think I found it. I was careless to make sgx_encl_release() to use
sgx_invalidate(), which does not delete pages in the case when enclave
is already marked as dead. This was after I had fixed the race that I
had there in the first place. That is why I was puzzled why it suddenly
reappeared.

Would be nice to use sgx_invalidate() also in release for consistency in
semantics sake so maybe just delete this:

	if (encl->flags & SGX_ENCL_DEAD)
		return;

?

/Jarkko
Jarkko Sakkinen Dec. 17, 2018, 2:08 p.m. UTC | #58
On Mon, Dec 17, 2018 at 03:39:28PM +0200, Jarkko Sakkinen wrote:
> On Mon, Dec 17, 2018 at 03:28:59PM +0200, Jarkko Sakkinen wrote:
> > On Fri, Dec 14, 2018 at 04:06:27PM -0800, Sean Christopherson wrote:
> > > [  504.149548] ------------[ cut here ]------------
> > > [  504.149550] kernel BUG at /home/sean/go/src/kernel.org/linux/mm/mmap.c:669!
> > > [  504.150288] invalid opcode: 0000 [#1] SMP
> > > [  504.150614] CPU: 2 PID: 237 Comm: kworker/u20:2 Not tainted 4.20.0-rc2+ #267
> > > [  504.151165] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
> > > [  504.151818] Workqueue: sgx-encl-wq sgx_encl_release_worker
> > > [  504.152267] RIP: 0010:__vma_adjust+0x64a/0x820
> > > [  504.152626] Code: ff 48 89 50 18 e9 6f fc ff ff 4c 8b ab 88 00 00 00 45 31 e4 e9 61 fb ff ff 31 c0 48 83 c4 60 5b 5d 41 5c 41 5d 41 5e 41 5f c3 <0f> 0b 49 89 de 49 83 c6 20 0f 84 06 fe ff ff 49 8d 7e e0 e8 fe ee
> > > [  504.154109] RSP: 0000:ffffc900004ebd60 EFLAGS: 00010206
> > > [  504.154535] RAX: 00007fd92ef7e000 RBX: ffff888467af16c0 RCX: ffff888467af16e0
> > > [  504.155104] RDX: ffff888458fd09e0 RSI: 00007fd954021000 RDI: ffff88846bf9e798
> > > [  504.155673] RBP: ffff888467af1480 R08: ffff88845bea2000 R09: 0000000000000000
> > > [  504.156242] R10: 0000000080000000 R11: fefefefefefefeff R12: 0000000000000000
> > > [  504.156810] R13: ffff88846bf9e790 R14: ffff888467af1b70 R15: ffff888467af1b60
> > > [  504.157378] FS:  0000000000000000(0000) GS:ffff88846f700000(0000) knlGS:0000000000000000
> > > [  504.158021] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > [  504.158483] CR2: 00007f2c56e99000 CR3: 0000000005009001 CR4: 0000000000360ee0
> > > [  504.159054] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > > [  504.159623] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > > [  504.160193] Call Trace:
> > > [  504.160406]  __split_vma+0x16f/0x180
> > > [  504.160706]  ? __switch_to_asm+0x40/0x70
> > > [  504.161024]  __do_munmap+0xfb/0x450
> > > [  504.161308]  sgx_encl_release_worker+0x44/0x70
> > > [  504.161675]  process_one_work+0x200/0x3f0
> > > [  504.162004]  worker_thread+0x2d/0x3d0
> > > [  504.162301]  ? process_one_work+0x3f0/0x3f0
> > > [  504.162645]  kthread+0x113/0x130
> > > [  504.162912]  ? kthread_park+0x90/0x90
> > > [  504.163209]  ret_from_fork+0x35/0x40
> > > [  504.163503] Modules linked in: bridge stp llc
> > > [  504.163866] ---[ end trace 83076139fc25e3e0 ]---
> > 
> > There was a race with release and swapping code that I thought I fixed,
> > and this is looks like a race there. Have to recheck what I did not
> > consider. Anyway, though to share this if you have time to look at it.
> > That is the part where something is now unsync most probably.
> 
> I think I found it. I was careless to make sgx_encl_release() to use
> sgx_invalidate(), which does not delete pages in the case when enclave
> is already marked as dead. This was after I had fixed the race that I
> had there in the first place. That is why I was puzzled why it suddenly
> reappeared.
> 
> Would be nice to use sgx_invalidate() also in release for consistency in
> semantics sake so maybe just delete this:
> 
> 	if (encl->flags & SGX_ENCL_DEAD)
> 		return;

Updated master, not at this point next.

/Jarkko
Jarkko Sakkinen Dec. 17, 2018, 2:13 p.m. UTC | #59
On Mon, Dec 17, 2018 at 04:08:11PM +0200, Jarkko Sakkinen wrote:
> On Mon, Dec 17, 2018 at 03:39:28PM +0200, Jarkko Sakkinen wrote:
> > On Mon, Dec 17, 2018 at 03:28:59PM +0200, Jarkko Sakkinen wrote:
> > > On Fri, Dec 14, 2018 at 04:06:27PM -0800, Sean Christopherson wrote:
> > > > [  504.149548] ------------[ cut here ]------------
> > > > [  504.149550] kernel BUG at /home/sean/go/src/kernel.org/linux/mm/mmap.c:669!
> > > > [  504.150288] invalid opcode: 0000 [#1] SMP
> > > > [  504.150614] CPU: 2 PID: 237 Comm: kworker/u20:2 Not tainted 4.20.0-rc2+ #267
> > > > [  504.151165] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
> > > > [  504.151818] Workqueue: sgx-encl-wq sgx_encl_release_worker
> > > > [  504.152267] RIP: 0010:__vma_adjust+0x64a/0x820
> > > > [  504.152626] Code: ff 48 89 50 18 e9 6f fc ff ff 4c 8b ab 88 00 00 00 45 31 e4 e9 61 fb ff ff 31 c0 48 83 c4 60 5b 5d 41 5c 41 5d 41 5e 41 5f c3 <0f> 0b 49 89 de 49 83 c6 20 0f 84 06 fe ff ff 49 8d 7e e0 e8 fe ee
> > > > [  504.154109] RSP: 0000:ffffc900004ebd60 EFLAGS: 00010206
> > > > [  504.154535] RAX: 00007fd92ef7e000 RBX: ffff888467af16c0 RCX: ffff888467af16e0
> > > > [  504.155104] RDX: ffff888458fd09e0 RSI: 00007fd954021000 RDI: ffff88846bf9e798
> > > > [  504.155673] RBP: ffff888467af1480 R08: ffff88845bea2000 R09: 0000000000000000
> > > > [  504.156242] R10: 0000000080000000 R11: fefefefefefefeff R12: 0000000000000000
> > > > [  504.156810] R13: ffff88846bf9e790 R14: ffff888467af1b70 R15: ffff888467af1b60
> > > > [  504.157378] FS:  0000000000000000(0000) GS:ffff88846f700000(0000) knlGS:0000000000000000
> > > > [  504.158021] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > > [  504.158483] CR2: 00007f2c56e99000 CR3: 0000000005009001 CR4: 0000000000360ee0
> > > > [  504.159054] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > > > [  504.159623] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > > > [  504.160193] Call Trace:
> > > > [  504.160406]  __split_vma+0x16f/0x180
> > > > [  504.160706]  ? __switch_to_asm+0x40/0x70
> > > > [  504.161024]  __do_munmap+0xfb/0x450
> > > > [  504.161308]  sgx_encl_release_worker+0x44/0x70
> > > > [  504.161675]  process_one_work+0x200/0x3f0
> > > > [  504.162004]  worker_thread+0x2d/0x3d0
> > > > [  504.162301]  ? process_one_work+0x3f0/0x3f0
> > > > [  504.162645]  kthread+0x113/0x130
> > > > [  504.162912]  ? kthread_park+0x90/0x90
> > > > [  504.163209]  ret_from_fork+0x35/0x40
> > > > [  504.163503] Modules linked in: bridge stp llc
> > > > [  504.163866] ---[ end trace 83076139fc25e3e0 ]---
> > > 
> > > There was a race with release and swapping code that I thought I fixed,
> > > and this is looks like a race there. Have to recheck what I did not
> > > consider. Anyway, though to share this if you have time to look at it.
> > > That is the part where something is now unsync most probably.
> > 
> > I think I found it. I was careless to make sgx_encl_release() to use
> > sgx_invalidate(), which does not delete pages in the case when enclave
> > is already marked as dead. This was after I had fixed the race that I
> > had there in the first place. That is why I was puzzled why it suddenly
> > reappeared.
> > 
> > Would be nice to use sgx_invalidate() also in release for consistency in
> > semantics sake so maybe just delete this:
> > 
> > 	if (encl->flags & SGX_ENCL_DEAD)
> > 		return;
> 
> Updated master, not at this point next.

If I checked this right was that mmu_notifier_unregister() cause DEAD
to set, and thus when sgx_invalidate() is executed, it returns without
doing anything...

/Jarkko
Sean Christopherson Dec. 17, 2018, 2:27 p.m. UTC | #60
On Sat, Dec 15, 2018 at 05:22:31PM -0600, Dr. Greg wrote:
> On Fri, Dec 14, 2018 at 04:06:27PM -0800, Sean Christopherson wrote:
> 
> Good afternoon, I hope the weekend is going well for everyone.
> 
> > On Fri, Dec 14, 2018 at 05:59:17PM -0600, Dr. Greg wrote:
> > > On Wed, Dec 12, 2018 at 08:00:36PM +0200, Jarkko Sakkinen wrote:
> > > 
> > > Good evening, I hope the week has gone well for everyone.
> > > 
> > > > On Mon, Dec 10, 2018 at 04:49:08AM -0600, Dr. Greg wrote:
> > > > > In the meantime, I wanted to confirm that your jarkko-sgx/master
> > > > > branch contains the proposed driver that is headed upstream.
> > > > > Before adding the SFLC patches we thought it best to run the
> > > > > driver through some testing in order to verify that any problems
> > > > > we generated where attributable to our work and not the base
> > > > > driver.
> > > >
> > > > The master branch is by definition unstable at the moment i.e. it
> > > > can sometimes (not often) contain unfinished changes. Use next for
> > > > testing.  I update next when I consider the master contents "stable
> > > > enough".
> > > 
> > > I noticed in the last day or so that you appeared to sync
> > > jarkko-sgx/master with jarkko-sgx/next, so I checked out a local
> > > branch against jarkko-sgx/next and ran it against our unit tests.
> > > Based on what we are seeing the driver is still experiencing issues
> > > with initialization of a non-trivial enclave.
> 
> > master branch is broken, looks like the VMA code Jarkko is reworking is
> > buggy.  I should be able to help debug this next week.
> > 
> > [  504.149548] ------------[ cut here ]------------
> > [  504.149550] kernel BUG at /home/sean/go/src/kernel.org/linux/mm/mmap.c:669!
> 
> Rodger, dodger.
> 
> Let us know when you think you have something working pushed up into
> one of the branches and we will put it on the bench here in the lab
> and see what our runtime is able to do with it.
> 
> BTW, your new vDSO work appears to be shaping up well.  Just out of
> curiosity though, how are you testing and validating the new vDSO
> based exception handler if it isn't possible to initialize and run an
> enclave with the new driver?

Cherry-pick the patches to a stable version of the driver, the vDSO code
doesn't use any of the SGX APIs.
Dr. Greg Dec. 17, 2018, 4:34 p.m. UTC | #61
On Mon, Dec 17, 2018 at 04:13:15PM +0200, Jarkko Sakkinen wrote:

Good morning to everyone.

> On Mon, Dec 17, 2018 at 04:08:11PM +0200, Jarkko Sakkinen wrote:
> > On Mon, Dec 17, 2018 at 03:39:28PM +0200, Jarkko Sakkinen wrote:
> > > On Mon, Dec 17, 2018 at 03:28:59PM +0200, Jarkko Sakkinen wrote:
> > > > On Fri, Dec 14, 2018 at 04:06:27PM -0800, Sean Christopherson wrote:
> > > > > [  504.149548] ------------[ cut here ]------------
> > > > > [  504.149550] kernel BUG at /home/sean/go/src/kernel.org/linux/mm/mmap.c:669!
> > > > > [  504.150288] invalid opcode: 0000 [#1] SMP
> > > > > [  504.150614] CPU: 2 PID: 237 Comm: kworker/u20:2 Not tainted 4.20.0-rc2+ #267
> > > > > [  504.151165] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
> > > > > [  504.151818] Workqueue: sgx-encl-wq sgx_encl_release_worker
> > > > > [  504.152267] RIP: 0010:__vma_adjust+0x64a/0x820
> > > > > [  504.152626] Code: ff 48 89 50 18 e9 6f fc ff ff 4c 8b ab 88 00 00 00 45 31 e4 e9 61 fb ff ff 31 c0 48 83 c4 60 5b 5d 41 5c 41 5d 41 5e 41 5f c3 <0f> 0b 49 89 de 49 83 c6 20 0f 84 06 fe ff ff 49 8d 7e e0 e8 fe ee
> > > > > [  504.154109] RSP: 0000:ffffc900004ebd60 EFLAGS: 00010206
> > > > > [  504.154535] RAX: 00007fd92ef7e000 RBX: ffff888467af16c0 RCX: ffff888467af16e0
> > > > > [  504.155104] RDX: ffff888458fd09e0 RSI: 00007fd954021000 RDI: ffff88846bf9e798
> > > > > [  504.155673] RBP: ffff888467af1480 R08: ffff88845bea2000 R09: 0000000000000000
> > > > > [  504.156242] R10: 0000000080000000 R11: fefefefefefefeff R12: 0000000000000000
> > > > > [  504.156810] R13: ffff88846bf9e790 R14: ffff888467af1b70 R15: ffff888467af1b60
> > > > > [  504.157378] FS:  0000000000000000(0000) GS:ffff88846f700000(0000) knlGS:0000000000000000
> > > > > [  504.158021] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > > > [  504.158483] CR2: 00007f2c56e99000 CR3: 0000000005009001 CR4: 0000000000360ee0
> > > > > [  504.159054] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > > > > [  504.159623] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > > > > [  504.160193] Call Trace:
> > > > > [  504.160406]  __split_vma+0x16f/0x180
> > > > > [  504.160706]  ? __switch_to_asm+0x40/0x70
> > > > > [  504.161024]  __do_munmap+0xfb/0x450
> > > > > [  504.161308]  sgx_encl_release_worker+0x44/0x70
> > > > > [  504.161675]  process_one_work+0x200/0x3f0
> > > > > [  504.162004]  worker_thread+0x2d/0x3d0
> > > > > [  504.162301]  ? process_one_work+0x3f0/0x3f0
> > > > > [  504.162645]  kthread+0x113/0x130
> > > > > [  504.162912]  ? kthread_park+0x90/0x90
> > > > > [  504.163209]  ret_from_fork+0x35/0x40
> > > > > [  504.163503] Modules linked in: bridge stp llc
> > > > > [  504.163866] ---[ end trace 83076139fc25e3e0 ]---

> > > > There was a race with release and swapping code that I thought
> > > > I fixed, and this is looks like a race there. Have to recheck
> > > > what I did not consider. Anyway, though to share this if you
> > > > have time to look at it.  That is the part where something is
> > > > now unsync most probably.

> > > I think I found it. I was careless to make sgx_encl_release() to
> > > use sgx_invalidate(), which does not delete pages in the case
> > > when enclave is already marked as dead. This was after I had
> > > fixed the race that I had there in the first place. That is why
> > > I was puzzled why it suddenly reappeared.

> > > Would be nice to use sgx_invalidate() also in release for consistency in
> > > semantics sake so maybe just delete this:
> > > 
> > > 	if (encl->flags & SGX_ENCL_DEAD)
> > > 		return;
> > 
> > Updated master, not at this point next.

> If I checked this right was that mmu_notifier_unregister() cause
> DEAD to set, and thus when sgx_invalidate() is executed, it returns
> without doing anything...

On a pristine jarkko-sgx/next local branch we commented out the 'if
(encl->flags & SGX_ENCL_DEAD) return' clause in the following
file/function:

arch/x86/kernel/cpu/sgx/driver/encl.c:sgx_invalidate()

And tested the kernel.

This fix seems to prevent the memory manager from getting
catastrophically corrupted but the EINIT ioctl still fails.

On the first invocation after a fresh boot the EINIT ioctl returns -1.

On subsequent invocations of the loader it returns EBUSY.  Every 8-10
invocations we get the -1 (EPERM -?) from the EINIT call and then it
returns to issueing EBUSY.

Here is a representative call trace from the loader utility:

---------------------------------------------------------------------------
address: 7ff5cbe00000, create address: 7ff5cbe00000
Non-token initialization requested.
EINIT retn: -1 / No error information
[SGXenclave.c,init_enclave,652]: Error location.
[sgx-load.c,main,180]: Error location.

address: 7f4255200000, create address: 7f4255200000
Non-token initialization requested.
EINIT retn: 16 / Resource busy
[SGXenclave.c,init_enclave,652]: Error location.
[sgx-load.c,main,180]: Error location.
---------------------------------------------------------------------------

It looks like I spoke too soon about the patch completely hardening
the machine.  We just got a segmentation fault on EINIT and the
process is hung in 'D' state with the following WCHAN value:

__flush_work.isra.43

Any further attempts to run the loader causes those processes to hang
as well.

Here is everything we have been able to get out of the machine with respect to a stack trace after the initial fault:

---------------------------------------------------------------------------
Dec 17 10:03:00 nuc2 kernel: general protection fault: 0000 [#1] SMP PTI
Dec 17 10:03:00 nuc2 kernel: CPU: 1 PID: 1249 Comm: kworker/u8:3 Not tainted 4.20.0-rc2-sgx-nuc2+ #13
Dec 17 10:03:00 nuc2 kernel: Hardware name: Intel Corporation NUC7CJYH/NUC7JYB, BIOS JYGLKCPX.86A.0046.2018.1103.1316 11/03/2018
Dec 17 10:03:00 nuc2 kernel: Workqueue: sgx-encl-wq sgx_encl_release_worker
Dec 17 10:03:00 nuc2 kernel: RIP: 0010:__mmu_notifier_invalidate_range_start+0x38/0xc5
Dec 17 10:03:00 nuc2 kernel: Code: 54 49 89 fc 48 c7 c7 d0 6f c3 ad 53 31 db 48 83 ec 18 48 89 75 c8 48 89 55 c0 e8 67 97 f7 ff 89 45 d4 49 8b 84 24 a0 03 00 00 <4c> 8b 30 41 0f b6 c5 89 45 d0 4d 85 f6 74 5e 49 8b 46 10 48 8b 40
Dec 17 10:03:00 nuc2 kernel: RSP: 0018:ffffa51d4238bc98 EFLAGS: 00010246
Dec 17 10:03:00 nuc2 kernel: RAX: dead000000000100 RBX: 0000000000000000 RCX: 0000000000000000
Dec 17 10:03:00 nuc2 kernel: RDX: 000000000001b640 RSI: 00007f51607ee000 RDI: ffffffffadc36fd0
Dec 17 10:03:00 nuc2 kernel: RBP: ffffa51d4238bcd8 R08: 00007f5160a00000 R09: 0000000000000000
Dec 17 10:03:00 nuc2 kernel: R10: ffffa51d4238bce8 R11: fefefefefefefeff R12: ffffa17a3aa68c00
Dec 17 10:03:00 nuc2 kernel: R13: ffffa17a3aa68c01 R14: 00007f51607ee000 R15: ffffa51d4238bd28
Dec 17 10:03:00 nuc2 kernel: FS:  0000000000000000(0000) GS:ffffa17a3be80000(0000) knlGS:0000000000000000
Dec 17 10:03:00 nuc2 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Dec 17 10:03:00 nuc2 kernel: CR2: 000000000878ed68 CR3: 000000017adc4000 CR4: 0000000000340ee0
Dec 17 10:03:00 nuc2 kernel: Call Trace:
Dec 17 10:03:00 nuc2 kernel:  unmap_vmas+0x3a/0x83
Dec 17 10:03:00 nuc2 kernel:  unmap_region+0xab/0xfc
Dec 17 10:03:00 nuc2 kernel:  ? __vma_rb_erase+0x189/0x1c4
Dec 17 10:03:00 nuc2 kernel:  __do_munmap+0x246/0x2d5
Dec 17 10:03:00 nuc2 kernel:  do_munmap+0xc/0xe
Dec 17 10:03:00 nuc2 kernel:  sgx_encl_release_worker+0x44/0x6e
Dec 17 10:03:00 nuc2 kernel:  process_one_work+0x183/0x271
Dec 17 10:03:00 nuc2 kernel:  worker_thread+0x1e5/0x2b4
Dec 17 10:03:00 nuc2 kernel:  ? cancel_delayed_work_sync+0x10/0x10
Dec 17 10:03:00 nuc2 kernel:  kthread+0x116/0x11e
Dec 17 10:03:00 nuc2 kernel:  ? kthread_park+0x7e/0x7e
Dec 17 10:03:00 nuc2 kernel:  ret_from_fork+0x1f/0x40
Dec 17 10:03:00 nuc2 kernel: Modules linked in:
Dec 17 10:03:00 nuc2 kernel: ---[ end trace 07fc74730017fedb ]---
Dec 17 10:03:00 nuc2 kernel: RIP: 0010:__mmu_notifier_invalidate_range_start+0x38/0xc5
Dec 17 10:03:00 nuc2 kernel: Code: 54 49 89 fc 48 c7 c7 d0 6f c3 ad 53 31 db 48 83 ec 18 48 89 75 c8 48 89 55 c0 e8 67 97 f7 ff 89 45 d4 49 8b 84 24 a0 03 00 00 <4c> 8b 30 41 0f b6 c5 89 45 d0 4d 85 f6 74 5e 49 8b 46 10 48 8b 40
Dec 17 10:03:00 nuc2 kernel: RSP: 0018:ffffa51d4238bc98 EFLAGS: 00010246
Dec 17 10:03:00 nuc2 kernel: RAX: dead000000000100 RBX: 0000000000000000 RCX: 0000000000000000
Dec 17 10:03:00 nuc2 kernel: RDX: 000000000001b640 RSI: 00007f51607ee000 RDI: ffffffffadc36fd0
Dec 17 10:03:00 nuc2 kernel: RBP: ffffa51d4238bcd8 R08: 00007f5160a00000 R09: 0000000000000000
Dec 17 10:03:00 nuc2 kernel: R10: ffffa51d4238bce8 R11: fefefefefefefeff R12: ffffa17a3aa68c00
Dec 17 10:03:00 nuc2 kernel: R13: ffffa17a3aa68c01 R14: 00007f51607ee000 R15: ffffa51d4238bd28
Dec 17 10:03:00 nuc2 kernel: FS:  0000000000000000(0000) GS:ffffa17a3be80000(0000) knlGS:0000000000000000
Dec 17 10:03:00 nuc2 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Dec 17 10:03:00 nuc2 kernel: CR2: 000000000878ed68 CR3: 000000017adc4000 CR4: 0000000000340ee0
---------------------------------------------------------------------------

So far the box still appears to be largely intact except for every
invocation of the enclave loader hanging.

> /Jarkko

Let us know how we can help.

Have a good afternoon.

Dr. Greg

As always,
Dr. Greg Wettstein, Ph.D, Worker
IDfusion, LLC
4206 N. 19th Ave.           Implementing measured information privacy
Fargo, ND  58102            and integrity architectures.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: gw@idfusion.org
------------------------------------------------------------------------------
"... remember that innovation is saying 'no' to 1000 things."
                                -- Moxie Marlinspike
Sean Christopherson Dec. 17, 2018, 5:31 p.m. UTC | #62
On Mon, Dec 17, 2018 at 04:08:11PM +0200, Jarkko Sakkinen wrote:
> On Mon, Dec 17, 2018 at 03:39:28PM +0200, Jarkko Sakkinen wrote:
> > On Mon, Dec 17, 2018 at 03:28:59PM +0200, Jarkko Sakkinen wrote:
> > > On Fri, Dec 14, 2018 at 04:06:27PM -0800, Sean Christopherson wrote:
> > > > [  504.149548] ------------[ cut here ]------------
> > > > [  504.149550] kernel BUG at /home/sean/go/src/kernel.org/linux/mm/mmap.c:669!
> > > > [  504.150288] invalid opcode: 0000 [#1] SMP
> > > > [  504.150614] CPU: 2 PID: 237 Comm: kworker/u20:2 Not tainted 4.20.0-rc2+ #267
> > > > [  504.151165] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
> > > > [  504.151818] Workqueue: sgx-encl-wq sgx_encl_release_worker
> > > > [  504.152267] RIP: 0010:__vma_adjust+0x64a/0x820
> > > > [  504.152626] Code: ff 48 89 50 18 e9 6f fc ff ff 4c 8b ab 88 00 00 00 45 31 e4 e9 61 fb ff ff 31 c0 48 83 c4 60 5b 5d 41 5c 41 5d 41 5e 41 5f c3 <0f> 0b 49 89 de 49 83 c6 20 0f 84 06 fe ff ff 49 8d 7e e0 e8 fe ee
> > > > [  504.154109] RSP: 0000:ffffc900004ebd60 EFLAGS: 00010206
> > > > [  504.154535] RAX: 00007fd92ef7e000 RBX: ffff888467af16c0 RCX: ffff888467af16e0
> > > > [  504.155104] RDX: ffff888458fd09e0 RSI: 00007fd954021000 RDI: ffff88846bf9e798
> > > > [  504.155673] RBP: ffff888467af1480 R08: ffff88845bea2000 R09: 0000000000000000
> > > > [  504.156242] R10: 0000000080000000 R11: fefefefefefefeff R12: 0000000000000000
> > > > [  504.156810] R13: ffff88846bf9e790 R14: ffff888467af1b70 R15: ffff888467af1b60
> > > > [  504.157378] FS:  0000000000000000(0000) GS:ffff88846f700000(0000) knlGS:0000000000000000
> > > > [  504.158021] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > > [  504.158483] CR2: 00007f2c56e99000 CR3: 0000000005009001 CR4: 0000000000360ee0
> > > > [  504.159054] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > > > [  504.159623] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > > > [  504.160193] Call Trace:
> > > > [  504.160406]  __split_vma+0x16f/0x180
> > > > [  504.160706]  ? __switch_to_asm+0x40/0x70
> > > > [  504.161024]  __do_munmap+0xfb/0x450
> > > > [  504.161308]  sgx_encl_release_worker+0x44/0x70
> > > > [  504.161675]  process_one_work+0x200/0x3f0
> > > > [  504.162004]  worker_thread+0x2d/0x3d0
> > > > [  504.162301]  ? process_one_work+0x3f0/0x3f0
> > > > [  504.162645]  kthread+0x113/0x130
> > > > [  504.162912]  ? kthread_park+0x90/0x90
> > > > [  504.163209]  ret_from_fork+0x35/0x40
> > > > [  504.163503] Modules linked in: bridge stp llc
> > > > [  504.163866] ---[ end trace 83076139fc25e3e0 ]---
> > > 
> > > There was a race with release and swapping code that I thought I fixed,
> > > and this is looks like a race there. Have to recheck what I did not
> > > consider. Anyway, though to share this if you have time to look at it.
> > > That is the part where something is now unsync most probably.
> > 
> > I think I found it. I was careless to make sgx_encl_release() to use
> > sgx_invalidate(), which does not delete pages in the case when enclave
> > is already marked as dead. This was after I had fixed the race that I
> > had there in the first place. That is why I was puzzled why it suddenly
> > reappeared.
> > 
> > Would be nice to use sgx_invalidate() also in release for consistency in
> > semantics sake so maybe just delete this:
> > 
> > 	if (encl->flags & SGX_ENCL_DEAD)
> > 		return;

This doesn't work as-is.  sgx_encl_release() needs to use sgx_free_page()
and not __sgx_free_page() so that we get a WARN() if the page can't be
freed.  sgx_invalidate() needs to use __sgx_free_page() as freeing a page
can fail due to running concurrently with reclaim.  I'll play around with
the code a bit, there's probably a fairly clean way to share code between
the two flows.

> 
> Updated master, not at this point next.

Still broken (as Greg's parallel email points out).

sgx_encl_release_worker() calls do_unmap() without checking the validity
of the page tables[1].  As is, the code doesn't even guarantee mm_struct
itself is valid.

The easiest fix I can think of is to add a SGX_ENCL_MM_RELEASED flag
that is set along with SGX_ENCL_DEAD in sgx_mmu_notifier_release(), and
only call do_unmap() if SGX_ENCL_MM_RELEASED is false.  Note that this
means we cant unregister the mmu_notifier until after do_unmap(), but
that's true no matter what since we're relying on the mmu_notifier to
hold a reference to mm_struct.  Patch attached.

[1] https://www.spinics.net/lists/dri-devel/msg186827.html
From 7cfdf34ec5b70392216b24853d6b8cc5e3192a92 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 17 Dec 2018 09:21:14 -0800
Subject: [PATCH] x86/sgx: Do not attempt to unmap enclave VMAs if mm_struct is
 defunct

Add a flag, SGX_ENCL_MM_RELEASED, to explicitly track the lifecycle of
the enclave's associated mm_struct.  Simply ensuring the mm_struct
itself is valid is not sufficient as the VMAs and page tables can be
removed after sgx_mmu_notifier_release() is invoked[1].

Note that this means mmu_notifier can't be unregistered until after
do_unmap(), but that's true no matter what since the mmu_notifier
holds the enclave's reference to mm_struct, i.e. this also fixes a
potential use-after-free bug of the mm_struct.

[1] https://www.spinics.net/lists/dri-devel/msg186827.html

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
---
 arch/x86/kernel/cpu/sgx/driver/driver.h |  1 +
 arch/x86/kernel/cpu/sgx/driver/encl.c   | 18 ++++++++++--------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/driver/driver.h b/arch/x86/kernel/cpu/sgx/driver/driver.h
index 56f45cd433dd..d7c51284ef36 100644
--- a/arch/x86/kernel/cpu/sgx/driver/driver.h
+++ b/arch/x86/kernel/cpu/sgx/driver/driver.h
@@ -89,6 +89,7 @@ enum sgx_encl_flags {
 	SGX_ENCL_DEBUG		= BIT(1),
 	SGX_ENCL_SUSPEND	= BIT(2),
 	SGX_ENCL_DEAD		= BIT(3),
+	SGX_ENCL_MM_RELEASED	= BIT(4),
 };
 
 struct sgx_encl {
diff --git a/arch/x86/kernel/cpu/sgx/driver/encl.c b/arch/x86/kernel/cpu/sgx/driver/encl.c
index 923e31eb6552..77c5e65533fb 100644
--- a/arch/x86/kernel/cpu/sgx/driver/encl.c
+++ b/arch/x86/kernel/cpu/sgx/driver/encl.c
@@ -311,7 +311,7 @@ static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
 		container_of(mn, struct sgx_encl, mmu_notifier);
 
 	mutex_lock(&encl->lock);
-	encl->flags |= SGX_ENCL_DEAD;
+	encl->flags |= SGX_ENCL_DEAD | SGX_ENCL_MM_RELEASED;
 	mutex_unlock(&encl->lock);
 }
 
@@ -967,10 +967,15 @@ static void sgx_encl_release_worker(struct work_struct *work)
 	struct sgx_encl *encl = container_of(work, struct sgx_encl, work);
 	unsigned long backing_size = encl->size + PAGE_SIZE;
 
-	down_write(&encl->mm->mmap_sem);
-	do_munmap(encl->mm, (unsigned long)encl->backing, backing_size +
-		  (backing_size >> 5), NULL);
-	up_write(&encl->mm->mmap_sem);
+	if (!(encl->flags & SGX_ENCL_MM_RELEASED)) {
+		down_write(&encl->mm->mmap_sem);
+		do_munmap(encl->mm, (unsigned long)encl->backing,
+			  backing_size + (backing_size >> 5), NULL);
+		up_write(&encl->mm->mmap_sem);
+	}
+
+	if (encl->mmu_notifier.ops)
+		mmu_notifier_unregister(&encl->mmu_notifier, encl->mm);
 
 	if (encl->tgid)
 		put_pid(encl->tgid);
@@ -990,9 +995,6 @@ void sgx_encl_release(struct kref *ref)
 {
 	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
 
-	if (encl->mmu_notifier.ops)
-		mmu_notifier_unregister(&encl->mmu_notifier, encl->mm);
-
 	if (encl->pm_notifier.notifier_call)
 		unregister_pm_notifier(&encl->pm_notifier);
Dave Hansen Dec. 17, 2018, 5:45 p.m. UTC | #63
> +struct sgx_encl *sgx_encl_alloc(struct sgx_secs *secs)
> +{
...
> +	kref_init(&encl->refcount);
> +	INIT_LIST_HEAD(&encl->add_page_reqs);
> +	INIT_RADIX_TREE(&encl->page_tree, GFP_KERNEL);
> +	mutex_init(&encl->lock);
> +	INIT_WORK(&encl->add_page_work, sgx_add_page_worker);
> +
> +	encl->mm = current->mm;  <---------------------------------> +	encl->base = secs->base;
> +	encl->size = secs->size;
> +	encl->ssaframesize = secs->ssa_frame_size;
> +	encl->backing = backing;
> +
> +	return encl;
> +}

How is this OK without taking a reference on the mm?

I have a feeling a bunch of your bugs with the mmu notifiers and so
forth are because the refcounting is wrong here.

Sean's SGX_ENCL_MM_RELEASED would, I think be unnecessary if you just
take a refcount here and release it when the enclave is destroyed.
Jarkko Sakkinen Dec. 17, 2018, 5:49 p.m. UTC | #64
On Mon, Dec 17, 2018 at 09:31:06AM -0800, Sean Christopherson wrote:
> This doesn't work as-is.  sgx_encl_release() needs to use sgx_free_page()
> and not __sgx_free_page() so that we get a WARN() if the page can't be
> freed.  sgx_invalidate() needs to use __sgx_free_page() as freeing a page
> can fail due to running concurrently with reclaim.  I'll play around with
> the code a bit, there's probably a fairly clean way to share code between
> the two flows.

Hmm... but why issue a warning in that case? It should be legit
behaviour.

> sgx_encl_release_worker() calls do_unmap() without checking the validity
> of the page tables[1].  As is, the code doesn't even guarantee mm_struct
> itself is valid.
> 
> The easiest fix I can think of is to add a SGX_ENCL_MM_RELEASED flag
> that is set along with SGX_ENCL_DEAD in sgx_mmu_notifier_release(), and
> only call do_unmap() if SGX_ENCL_MM_RELEASED is false.  Note that this
> means we cant unregister the mmu_notifier until after do_unmap(), but
> that's true no matter what since we're relying on the mmu_notifier to
> hold a reference to mm_struct.  Patch attached.

OK, the fix change makes sense but I'm thinking that would it be a
better idea just to set mm NULL and check that instead?

/Jarkko
Jarkko Sakkinen Dec. 17, 2018, 6:01 p.m. UTC | #65
On Mon, Dec 17, 2018 at 09:45:40AM -0800, Dave Hansen wrote:
> > +struct sgx_encl *sgx_encl_alloc(struct sgx_secs *secs)
> > +{
> ...
> > +	kref_init(&encl->refcount);
> > +	INIT_LIST_HEAD(&encl->add_page_reqs);
> > +	INIT_RADIX_TREE(&encl->page_tree, GFP_KERNEL);
> > +	mutex_init(&encl->lock);
> > +	INIT_WORK(&encl->add_page_work, sgx_add_page_worker);
> > +
> > +	encl->mm = current->mm;  <---------------------------------> +	encl->base = secs->base;
> > +	encl->size = secs->size;
> > +	encl->ssaframesize = secs->ssa_frame_size;
> > +	encl->backing = backing;
> > +
> > +	return encl;
> > +}
> 
> How is this OK without taking a reference on the mm?
> 
> I have a feeling a bunch of your bugs with the mmu notifiers and so
> forth are because the refcounting is wrong here.
> 
> Sean's SGX_ENCL_MM_RELEASED would, I think be unnecessary if you just
> take a refcount here and release it when the enclave is destroyed.

Right, atomic_inc(encl->mm->count) here and once when releasing.

The we would not even need the whole mmu notifier in the first place.

/Jarkko
Dave Hansen Dec. 17, 2018, 6:07 p.m. UTC | #66
On 12/17/18 10:01 AM, Jarkko Sakkinen wrote:
>>> +	encl->mm = current->mm;  <---------------------------------> +	encl->base = secs->base;
>>> +	encl->size = secs->size;
>>> +	encl->ssaframesize = secs->ssa_frame_size;
>>> +	encl->backing = backing;
>>> +
>>> +	return encl;
>>> +}
>> How is this OK without taking a reference on the mm?
>>
>> I have a feeling a bunch of your bugs with the mmu notifiers and so
>> forth are because the refcounting is wrong here.
>>
>> Sean's SGX_ENCL_MM_RELEASED would, I think be unnecessary if you just
>> take a refcount here and release it when the enclave is destroyed.
> Right, atomic_inc(encl->mm->count) here and once when releasing.
> 
> The we would not even need the whole mmu notifier in the first place.

Please use mmget()/mmput().
Sean Christopherson Dec. 17, 2018, 6:09 p.m. UTC | #67
On Mon, Dec 17, 2018 at 07:49:35PM +0200, Jarkko Sakkinen wrote:
> On Mon, Dec 17, 2018 at 09:31:06AM -0800, Sean Christopherson wrote:
> > This doesn't work as-is.  sgx_encl_release() needs to use sgx_free_page()
> > and not __sgx_free_page() so that we get a WARN() if the page can't be
> > freed.  sgx_invalidate() needs to use __sgx_free_page() as freeing a page
> > can fail due to running concurrently with reclaim.  I'll play around with
> > the code a bit, there's probably a fairly clean way to share code between
> > the two flows.
> 
> Hmm... but why issue a warning in that case? It should be legit
> behaviour.

No, EREMOVE should never fail if the enclave is being released, i.e. all
references to the enclave are gone.  And failure during sgx_encl_release()
means we leaked an EPC page, which warrants a WARN.

The only legitimate reason __sgx_free_page() can fail in sgx_invalidate()
is because a page might be in the process of being reclaimed.  We could
theoretically WARN on EREMOVE failure in that case, but it'd make the code
a little fragile and it's not "fatal" in the sense that we get a second
chance to free the page during sgx_encl_release().

And unless I missed something, using sgx_invalidate() means were' leaking
all sgx_encl_page structs as well as the radix tree entries.
 
> > sgx_encl_release_worker() calls do_unmap() without checking the validity
> > of the page tables[1].  As is, the code doesn't even guarantee mm_struct
> > itself is valid.
> > 
> > The easiest fix I can think of is to add a SGX_ENCL_MM_RELEASED flag
> > that is set along with SGX_ENCL_DEAD in sgx_mmu_notifier_release(), and
> > only call do_unmap() if SGX_ENCL_MM_RELEASED is false.  Note that this
> > means we cant unregister the mmu_notifier until after do_unmap(), but
> > that's true no matter what since we're relying on the mmu_notifier to
> > hold a reference to mm_struct.  Patch attached.
> 
> OK, the fix change makes sense but I'm thinking that would it be a
> better idea just to set mm NULL and check that instead?

That makes sense.
Jarkko Sakkinen Dec. 17, 2018, 6:23 p.m. UTC | #68
On Mon, Dec 17, 2018 at 10:09:57AM -0800, Sean Christopherson wrote:
> No, EREMOVE should never fail if the enclave is being released, i.e. all
> references to the enclave are gone.  And failure during sgx_encl_release()
> means we leaked an EPC page, which warrants a WARN.

Right that what I was suspecting as swapper should hold a ref to the
enclave while it is working on it. It is a programming error when this
happens.

Maybe change the boolean parameter to flags parameter have a flag to
use sgx_free_page()?

> That makes sense.

What do you think of Dave's proposal?

/Jarkko
Jarkko Sakkinen Dec. 17, 2018, 6:31 p.m. UTC | #69
On Mon, Dec 17, 2018 at 10:07:08AM -0800, Dave Hansen wrote:
> On 12/17/18 10:01 AM, Jarkko Sakkinen wrote:
> >>> +	encl->mm = current->mm;  <---------------------------------> +	encl->base = secs->base;
> >>> +	encl->size = secs->size;
> >>> +	encl->ssaframesize = secs->ssa_frame_size;
> >>> +	encl->backing = backing;
> >>> +
> >>> +	return encl;
> >>> +}
> >> How is this OK without taking a reference on the mm?
> >>
> >> I have a feeling a bunch of your bugs with the mmu notifiers and so
> >> forth are because the refcounting is wrong here.
> >>
> >> Sean's SGX_ENCL_MM_RELEASED would, I think be unnecessary if you just
> >> take a refcount here and release it when the enclave is destroyed.
> > Right, atomic_inc(encl->mm->count) here and once when releasing.
> > 
> > The we would not even need the whole mmu notifier in the first place.
> 
> Please use mmget()/mmput().

There's now a patch to test on top of the master.

/Jarkko
Sean Christopherson Dec. 17, 2018, 6:36 p.m. UTC | #70
On Mon, Dec 17, 2018 at 08:01:02PM +0200, Jarkko Sakkinen wrote:
> On Mon, Dec 17, 2018 at 09:45:40AM -0800, Dave Hansen wrote:
> > > +struct sgx_encl *sgx_encl_alloc(struct sgx_secs *secs)
> > > +{
> > ...
> > > +	kref_init(&encl->refcount);
> > > +	INIT_LIST_HEAD(&encl->add_page_reqs);
> > > +	INIT_RADIX_TREE(&encl->page_tree, GFP_KERNEL);
> > > +	mutex_init(&encl->lock);
> > > +	INIT_WORK(&encl->add_page_work, sgx_add_page_worker);
> > > +
> > > +	encl->mm = current->mm;  <---------------------------------> +	encl->base = secs->base;
> > > +	encl->size = secs->size;
> > > +	encl->ssaframesize = secs->ssa_frame_size;
> > > +	encl->backing = backing;
> > > +
> > > +	return encl;
> > > +}
> > 
> > How is this OK without taking a reference on the mm?

It's subtle and the ordering is all kinds of weird, but technically we
are taking a reference on mm when the mmu_notifier is registered in
sgx_encl_create().  sgx_encl_alloc() and sgx_encl_create() are always
called in tandem and with mm->mm_users > 0, so we'll never use encl->mm
without holding a reference to mm.  We need to comment the weirdness
or maybe register the notifier before

> > I have a feeling a bunch of your bugs with the mmu notifiers and so
> > forth are because the refcounting is wrong here.

Eh, not really.  Maybe the mmu_notifier is more subtle, e.g. calling
do_unmap() after mmput() would be quite obvious, but there's no
fundamental bug, we just haven't needed to touch VMAs during release
prior to moving away from shmem.

> > Sean's SGX_ENCL_MM_RELEASED would, I think be unnecessary if you just
> > take a refcount here and release it when the enclave is destroyed.
> 
> Right, atomic_inc(encl->mm->count) here and once when releasing.
> 
> The we would not even need the whole mmu notifier in the first place.

I'm pretty sure doing mmget() would result in circular dependencies and
a zombie enclave.  In the do_exit() case where a task is abruptly killed:
 
  - __mmput() is never called because the enclave holds a ref
  - sgx_encl_release() is never be called because its VMAs hold refs
  - sgx_vma_close() is never called because __mmput()->exit_mmap() is
    blocked and the process itself is dead, i.e. won't unmap anything.
Jarkko Sakkinen Dec. 17, 2018, 6:43 p.m. UTC | #71
On Mon, Dec 17, 2018 at 10:36:13AM -0800, Sean Christopherson wrote:
> I'm pretty sure doing mmget() would result in circular dependencies and
> a zombie enclave.  In the do_exit() case where a task is abruptly killed:
>  
>   - __mmput() is never called because the enclave holds a ref
>   - sgx_encl_release() is never be called because its VMAs hold refs
>   - sgx_vma_close() is never called because __mmput()->exit_mmap() is
>     blocked and the process itself is dead, i.e. won't unmap anything.

Right, it does, you are absolutely right. Tried it and removed the
commit already.

Well, what we came up from your suggestion i.e. setting mm to NULL
and checking that is very subtle change and does not have any such
circular dependencies. We'll go with that.

/Jarkko
Sean Christopherson Dec. 17, 2018, 6:46 p.m. UTC | #72
On Mon, Dec 17, 2018 at 08:23:19PM +0200, Jarkko Sakkinen wrote:
> On Mon, Dec 17, 2018 at 10:09:57AM -0800, Sean Christopherson wrote:
> > No, EREMOVE should never fail if the enclave is being released, i.e. all
> > references to the enclave are gone.  And failure during sgx_encl_release()
> > means we leaked an EPC page, which warrants a WARN.
> 
> Right that what I was suspecting as swapper should hold a ref to the
> enclave while it is working on it. It is a programming error when this
> happens.
> 
> Maybe change the boolean parameter to flags parameter have a flag to
> use sgx_free_page()?

I tried that approach when I first split it to __sgx_free_page() and
sgx_free_page(), but IMO the code is more difficult to read and harder
to maintain since sgx_free_page() should be used except under special
circumstances, e.g. race with reclaim or the freeing is "untrusted",
i.e. requested by userspace via sgx_ioc_enclave_remove_pages().

> 
> > That makes sense.
> 
> What do you think of Dave's proposal?
> 
> /Jarkko
Dave Hansen Dec. 17, 2018, 6:47 p.m. UTC | #73
On 12/17/18 10:43 AM, Jarkko Sakkinen wrote:
> On Mon, Dec 17, 2018 at 10:36:13AM -0800, Sean Christopherson wrote:
>> I'm pretty sure doing mmget() would result in circular dependencies and
>> a zombie enclave.  In the do_exit() case where a task is abruptly killed:
>>  
>>   - __mmput() is never called because the enclave holds a ref
>>   - sgx_encl_release() is never be called because its VMAs hold refs
>>   - sgx_vma_close() is never called because __mmput()->exit_mmap() is
>>     blocked and the process itself is dead, i.e. won't unmap anything.
> Right, it does, you are absolutely right. Tried it and removed the
> commit already.
> 
> Well, what we came up from your suggestion i.e. setting mm to NULL
> and checking that is very subtle change and does not have any such
> circular dependencies. We'll go with that.

This all screams that you need to break out this code from the massive
"18" patch and get the mm interactions reviewed more thoroughly.

Also, no matter what method you go with, you have a bunch of commenting
and changelogging to do here.
Sean Christopherson Dec. 17, 2018, 6:48 p.m. UTC | #74
On Mon, Dec 17, 2018 at 08:43:33PM +0200, Jarkko Sakkinen wrote:
> On Mon, Dec 17, 2018 at 10:36:13AM -0800, Sean Christopherson wrote:
> > I'm pretty sure doing mmget() would result in circular dependencies and
> > a zombie enclave.  In the do_exit() case where a task is abruptly killed:
> >  
> >   - __mmput() is never called because the enclave holds a ref
> >   - sgx_encl_release() is never be called because its VMAs hold refs
> >   - sgx_vma_close() is never called because __mmput()->exit_mmap() is
> >     blocked and the process itself is dead, i.e. won't unmap anything.
> 
> Right, it does, you are absolutely right. Tried it and removed the
> commit already.
> 
> Well, what we came up from your suggestion i.e. setting mm to NULL
> and checking that is very subtle change and does not have any such
> circular dependencies. We'll go with that.

We can't set mm to NULL as we need it to unregister the notifier, and
I'm fairly certain attempting to unregister in the release callback
will deadlock.
Dave Hansen Dec. 17, 2018, 7:09 p.m. UTC | #75
On 12/17/18 10:48 AM, Sean Christopherson wrote:
> We can't set mm to NULL as we need it to unregister the notifier, and
> I'm fairly certain attempting to unregister in the release callback
> will deadlock.

Suggestion:

It looks like you only expect one VMA per enclave.  Things go bonkers if
this is not true.  So, instead of storing encl->mm, don't.  You can get
the mm from vma->vm_mm and you could just store encl->vma instead.

Doing that, you could even axe encl->base and encl->size, I think
because you just get those from the VMA itself.

That makes the relationship clearer: 1 VMA per enclave.  We also
implicitly understand that if you have a VMA, you implicitly have a ref
to the mm *and* the VMA is immutable.

If there were ever a path where encl->vma wasn't immutable, we'd have a
bug (or load of bugs) somewhere, right?
Andy Lutomirski Dec. 17, 2018, 7:12 p.m. UTC | #76
On Mon, Dec 17, 2018 at 10:47 AM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 12/17/18 10:43 AM, Jarkko Sakkinen wrote:
> > On Mon, Dec 17, 2018 at 10:36:13AM -0800, Sean Christopherson wrote:
> >> I'm pretty sure doing mmget() would result in circular dependencies and
> >> a zombie enclave.  In the do_exit() case where a task is abruptly killed:
> >>
> >>   - __mmput() is never called because the enclave holds a ref
> >>   - sgx_encl_release() is never be called because its VMAs hold refs
> >>   - sgx_vma_close() is never called because __mmput()->exit_mmap() is
> >>     blocked and the process itself is dead, i.e. won't unmap anything.
> > Right, it does, you are absolutely right. Tried it and removed the
> > commit already.
> >
> > Well, what we came up from your suggestion i.e. setting mm to NULL
> > and checking that is very subtle change and does not have any such
> > circular dependencies. We'll go with that.
>
> This all screams that you need to break out this code from the massive
> "18" patch and get the mm interactions reviewed more thoroughly.
>
> Also, no matter what method you go with, you have a bunch of commenting
> and changelogging to do here.

I'm going to ask an obnoxious high-level question: why does an enclave
even refer to a specific mm?

If I were designing this thing, and if I hadn't started trying to
implement it, my first thought would be that an enclave tracks its
linear address range, which is just a pair of numbers, and also keeps
track of a whole bunch of physical EPC pages, data structures, etc.
And that mmap() gets rejected unless the requested virtual address
matches the linear address range that the enclave wants and, aside
from that, just creates a VMA that keeps a reference to the enclave.
(And, for convenience, I suppose that the first mmap() call done
before any actual enclave setup happens could choose any address and
then cause the enclave to lock itself to that address, although a
regular anonymous PROT_NONE MAP_NORESERVE mapping would do just fine,
too.)  And the driver would explicitly allow multiple different mms to
have the same enclave mapped.  More importantly, a daemon could set up
an enclave without necessarily mapping it at all and then SCM_RIGHTS
the enclave over to the process that plans to run it.

Now I'm sure this has all kinds of problems, such as the ISA possibly
making it rather obnoxious to add pages to the enclave without having
it mapped.  But these operations could, in principle, be done by
having the enclave own a private mm that's used just for setup.  While
this would be vaguely annoying, Nadav's still-pending-but-nearly-done
text_poke series adds all the infrastructure that's needed for the
kernel to manage little private mms.  But some things get simpler --
invalidating the enclave can presumably use the regular rmap APIs to
zap all the PTEs in all VMAs pointing into the enclave.

So I'm not saying that you shouldn't do it the way you are now, but I
do think that the changelog or at least some emails should explain
*why* the enclave needs to keep a pointer to the creating process's
mm.  And, if you do keep the current model, it would be nice to
understand what happens if you do something awful like mremap()ing an
enclave, or calling madvise on it, or otherwise abusing the vma.  Or
doing fork(), for that matter.

I also find it suspicious that the various ioctl handlers
systematically ignore their "filep" parameters and instead use
find_vma() to find the relevant mm data structures.  That seems
backwards.
Dave Hansen Dec. 17, 2018, 7:17 p.m. UTC | #77
On 12/17/18 11:12 AM, Andy Lutomirski wrote:
> So I'm not saying that you shouldn't do it the way you are now, but I
> do think that the changelog or at least some emails should explain
> *why* the enclave needs to keep a pointer to the creating process's
> mm.  And, if you do keep the current model, it would be nice to
> understand what happens if you do something awful like mremap()ing an
> enclave, or calling madvise on it, or otherwise abusing the vma.  Or
> doing fork(), for that matter.

Yeah, the code is built to have one VMA and only one VMA per enclave.
You need to go over the origin of this restriction and what enforces this.
Andy Lutomirski Dec. 17, 2018, 7:25 p.m. UTC | #78
On Mon, Dec 17, 2018 at 11:17 AM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 12/17/18 11:12 AM, Andy Lutomirski wrote:
> > So I'm not saying that you shouldn't do it the way you are now, but I
> > do think that the changelog or at least some emails should explain
> > *why* the enclave needs to keep a pointer to the creating process's
> > mm.  And, if you do keep the current model, it would be nice to
> > understand what happens if you do something awful like mremap()ing an
> > enclave, or calling madvise on it, or otherwise abusing the vma.  Or
> > doing fork(), for that matter.
>
> Yeah, the code is built to have one VMA and only one VMA per enclave.
> You need to go over the origin of this restriction and what enforces this.

There is a sad historical reason that you may regret keeping this
restriction.  There are plenty of pieces of code out there that think
it's reasonable to spawn a subprocess by calling fork() and then
execve().  (This is *not* a sensible thing to do.  One should use
posix_spawn() or some CLONE_VM variant.  But even fairly recent
posix_spawn() implementations will fork().  So the driver has to do
*something* sensible on fork() or a bunch of things that use SGX
unsuspectingly via, for example, PKCS #11, are going to be very sad.
I suppose you could make enclaves just not show up in the fork()ed
children, but then you have a different problem: creating an enclave
and then doing daemon() won't work.

Yes, POSIX traditions are rather silly.
Jarkko Sakkinen Dec. 17, 2018, 7:33 p.m. UTC | #79
On Mon, Dec 17, 2018 at 10:48:58AM -0800, Sean Christopherson wrote:
> On Mon, Dec 17, 2018 at 08:43:33PM +0200, Jarkko Sakkinen wrote:
> > On Mon, Dec 17, 2018 at 10:36:13AM -0800, Sean Christopherson wrote:
> > > I'm pretty sure doing mmget() would result in circular dependencies and
> > > a zombie enclave.  In the do_exit() case where a task is abruptly killed:
> > >  
> > >   - __mmput() is never called because the enclave holds a ref
> > >   - sgx_encl_release() is never be called because its VMAs hold refs
> > >   - sgx_vma_close() is never called because __mmput()->exit_mmap() is
> > >     blocked and the process itself is dead, i.e. won't unmap anything.
> > 
> > Right, it does, you are absolutely right. Tried it and removed the
> > commit already.
> > 
> > Well, what we came up from your suggestion i.e. setting mm to NULL
> > and checking that is very subtle change and does not have any such
> > circular dependencies. We'll go with that.
> 
> We can't set mm to NULL as we need it to unregister the notifier, and
> I'm fairly certain attempting to unregister in the release callback
> will deadlock.

Noticed that too. mmu_notifier_unregister() requires a valid mm.

/Jarkko
Jarkko Sakkinen Dec. 17, 2018, 7:36 p.m. UTC | #80
On Mon, Dec 17, 2018 at 10:46:25AM -0800, Sean Christopherson wrote:
> On Mon, Dec 17, 2018 at 08:23:19PM +0200, Jarkko Sakkinen wrote:
> > On Mon, Dec 17, 2018 at 10:09:57AM -0800, Sean Christopherson wrote:
> > > No, EREMOVE should never fail if the enclave is being released, i.e. all
> > > references to the enclave are gone.  And failure during sgx_encl_release()
> > > means we leaked an EPC page, which warrants a WARN.
> > 
> > Right that what I was suspecting as swapper should hold a ref to the
> > enclave while it is working on it. It is a programming error when this
> > happens.
> > 
> > Maybe change the boolean parameter to flags parameter have a flag to
> > use sgx_free_page()?
> 
> I tried that approach when I first split it to __sgx_free_page() and
> sgx_free_page(), but IMO the code is more difficult to read and harder
> to maintain since sgx_free_page() should be used except under special
> circumstances, e.g. race with reclaim or the freeing is "untrusted",
> i.e. requested by userspace via sgx_ioc_enclave_remove_pages().

I mean inside sgx_invalidate() call either __sgx_free_page() or
sgx_free_page() depending on a flag.

/Jarkko
Jarkko Sakkinen Dec. 17, 2018, 7:37 p.m. UTC | #81
On Mon, Dec 17, 2018 at 11:09:33AM -0800, Dave Hansen wrote:
> On 12/17/18 10:48 AM, Sean Christopherson wrote:
> > We can't set mm to NULL as we need it to unregister the notifier, and
> > I'm fairly certain attempting to unregister in the release callback
> > will deadlock.
> 
> Suggestion:
> 
> It looks like you only expect one VMA per enclave.  Things go bonkers if
> this is not true.  So, instead of storing encl->mm, don't.  You can get
> the mm from vma->vm_mm and you could just store encl->vma instead.

The code actually supports having multiple VMAs per enclave.

/Jarkko
Dave Hansen Dec. 17, 2018, 7:40 p.m. UTC | #82
On 12/17/18 11:37 AM, Jarkko Sakkinen wrote:
>> Suggestion:
>>
>> It looks like you only expect one VMA per enclave.  Things go bonkers if
>> this is not true.  So, instead of storing encl->mm, don't.  You can get
>> the mm from vma->vm_mm and you could just store encl->vma instead.
> The code actually supports having multiple VMAs per enclave.

That seems at least somewhat at odds with this comment:

> static void sgx_vma_open(struct vm_area_struct *vma)
> {
>         struct sgx_encl *encl = vma->vm_private_data;
> 
>         if (!encl)
>                 return;
> 
>         /* kref cannot underflow because ECREATE ioctl checks that there is only
>          * one single VMA for the enclave before proceeding.
>          */
>         kref_get(&encl->refcount);
> }
Jarkko Sakkinen Dec. 17, 2018, 7:49 p.m. UTC | #83
On Mon, Dec 17, 2018 at 11:17:49AM -0800, Dave Hansen wrote:
> On 12/17/18 11:12 AM, Andy Lutomirski wrote:
> > So I'm not saying that you shouldn't do it the way you are now, but I
> > do think that the changelog or at least some emails should explain
> > *why* the enclave needs to keep a pointer to the creating process's
> > mm.  And, if you do keep the current model, it would be nice to
> > understand what happens if you do something awful like mremap()ing an
> > enclave, or calling madvise on it, or otherwise abusing the vma.  Or
> > doing fork(), for that matter.
> 
> Yeah, the code is built to have one VMA and only one VMA per enclave.
> You need to go over the origin of this restriction and what enforces this.

It is before ECREATE but after that you can split it with mprotect().

Lets take an example. I'm not sure how we would acquire mm efficiently
in sgx_encl_page_reclaim() otherwise than having it as a field in encl.

/Jarkko
Dave Hansen Dec. 17, 2018, 7:53 p.m. UTC | #84
On 12/17/18 11:49 AM, Jarkko Sakkinen wrote:
>> Yeah, the code is built to have one VMA and only one VMA per enclave.
>> You need to go over the origin of this restriction and what enforces this.
> It is before ECREATE but after that you can split it with mprotect().
> 
> Lets take an example. I'm not sure how we would acquire mm efficiently
> in sgx_encl_page_reclaim() otherwise than having it as a field in encl.

You're effectively rebuilding reverse-mapping infrastructure here.  It's
a frequent thing for the core VM to need to go from 'struct page' back
to the page tables mapping it.  For that we go (logically)
page->{anon_vma,mapping}->vma->vm_mm->pagetable.

This, on the other hand, is trying to do page->encl->mm->pagetable.  You
could very easily have a VMA analog in there instead of jumping straight
to the mm.
Jarkko Sakkinen Dec. 17, 2018, 7:54 p.m. UTC | #85
On Mon, Dec 17, 2018 at 11:25:47AM -0800, Andy Lutomirski wrote:
> On Mon, Dec 17, 2018 at 11:17 AM Dave Hansen <dave.hansen@intel.com> wrote:
> >
> > On 12/17/18 11:12 AM, Andy Lutomirski wrote:
> > > So I'm not saying that you shouldn't do it the way you are now, but I
> > > do think that the changelog or at least some emails should explain
> > > *why* the enclave needs to keep a pointer to the creating process's
> > > mm.  And, if you do keep the current model, it would be nice to
> > > understand what happens if you do something awful like mremap()ing an
> > > enclave, or calling madvise on it, or otherwise abusing the vma.  Or
> > > doing fork(), for that matter.
> >
> > Yeah, the code is built to have one VMA and only one VMA per enclave.
> > You need to go over the origin of this restriction and what enforces this.
> 
> There is a sad historical reason that you may regret keeping this
> restriction.  There are plenty of pieces of code out there that think
> it's reasonable to spawn a subprocess by calling fork() and then
> execve().  (This is *not* a sensible thing to do.  One should use
> posix_spawn() or some CLONE_VM variant.  But even fairly recent
> posix_spawn() implementations will fork().  So the driver has to do
> *something* sensible on fork() or a bunch of things that use SGX
> unsuspectingly via, for example, PKCS #11, are going to be very sad.
> I suppose you could make enclaves just not show up in the fork()ed
> children, but then you have a different problem: creating an enclave
> and then doing daemon() won't work.
> 
> Yes, POSIX traditions are rather silly.

ATM enclave VMAs are not copied on fork. Not sure how you would
implement COW semantics with enclaves.

/Jarkko
Andy Lutomirski Dec. 17, 2018, 7:55 p.m. UTC | #86
On Mon, Dec 17, 2018 at 11:53 AM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 12/17/18 11:49 AM, Jarkko Sakkinen wrote:
> >> Yeah, the code is built to have one VMA and only one VMA per enclave.
> >> You need to go over the origin of this restriction and what enforces this.
> > It is before ECREATE but after that you can split it with mprotect().
> >
> > Lets take an example. I'm not sure how we would acquire mm efficiently
> > in sgx_encl_page_reclaim() otherwise than having it as a field in encl.
>
> You're effectively rebuilding reverse-mapping infrastructure here.  It's
> a frequent thing for the core VM to need to go from 'struct page' back
> to the page tables mapping it.  For that we go (logically)
> page->{anon_vma,mapping}->vma->vm_mm->pagetable.

This is a bit outside my expertise here, but doesn't
unmap_mapping_range() do exactly what SGX wants?
Dave Hansen Dec. 17, 2018, 8:03 p.m. UTC | #87
On 12/17/18 11:55 AM, Andy Lutomirski wrote:
>> You're effectively rebuilding reverse-mapping infrastructure here.  It's
>> a frequent thing for the core VM to need to go from 'struct page' back
>> to the page tables mapping it.  For that we go (logically)
>> page->{anon_vma,mapping}->vma->vm_mm->pagetable.
> This is a bit outside my expertise here, but doesn't
> unmap_mapping_range() do exactly what SGX wants?

There's no 'struct page' for enclave memory as it stands.  That means no
page cache, and that means there's no 'struct address_space *mapping' in
the first place.

Basically, the choice was made a long time ago to have SGX's memory
management live outside the core VM.  I've waffled back and forth on it,
but I do still think this is the right way to do it.
Andy Lutomirski Dec. 17, 2018, 8:10 p.m. UTC | #88
On Mon, Dec 17, 2018 at 12:03 PM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 12/17/18 11:55 AM, Andy Lutomirski wrote:
> >> You're effectively rebuilding reverse-mapping infrastructure here.  It's
> >> a frequent thing for the core VM to need to go from 'struct page' back
> >> to the page tables mapping it.  For that we go (logically)
> >> page->{anon_vma,mapping}->vma->vm_mm->pagetable.
> > This is a bit outside my expertise here, but doesn't
> > unmap_mapping_range() do exactly what SGX wants?
>
> There's no 'struct page' for enclave memory as it stands.  That means no
> page cache, and that means there's no 'struct address_space *mapping' in
> the first place.
>
> Basically, the choice was made a long time ago to have SGX's memory
> management live outside the core VM.  I've waffled back and forth on it,
> but I do still think this is the right way to do it.

AFAICS a lack of struct page isn't a problem.  The core code seems to
understand that address_space objects might cover non-struct-page
memory.  Morally, enclave memory is a lot like hot-unpluggable PCI
space.
Dave Hansen Dec. 17, 2018, 8:15 p.m. UTC | #89
On 12/17/18 12:10 PM, Andy Lutomirski wrote:
>> There's no 'struct page' for enclave memory as it stands.  That means no
>> page cache, and that means there's no 'struct address_space *mapping' in
>> the first place.
>>
>> Basically, the choice was made a long time ago to have SGX's memory
>> management live outside the core VM.  I've waffled back and forth on it,
>> but I do still think this is the right way to do it.
> AFAICS a lack of struct page isn't a problem.  The core code seems to
> understand that address_space objects might cover non-struct-page
> memory.  Morally, enclave memory is a lot like hot-unpluggable PCI
> space.

Yeah, this is true.  The existing code seems to make it all the way from
unmap_mapping_range() down to zap_page_range() without 'struct page'.

Overall, I think what Andy is saying here is that an open(/dev/sgx)
should give you a "unique" enclave fd.  That fd can end up mapped into
one or more processes either via fork() or the other ways fds end up
getting handed around.  mmap() of this fd would be *required* to be
MAP_SHARED.  That means you don't need to support COW, and the semantics
are the same as any other MAP_SHARED mapping: children and parents and
anybody mmap()'ing it must all coordinate.

This sounds interesting at least.  It might lead to an unholy mess in
the driver, or it might be a great cleanup.  But, it does sound like
something that would both potentially simplify the semantics and the
implementation.
Jarkko Sakkinen Dec. 17, 2018, 8:21 p.m. UTC | #90
On Mon, Dec 17, 2018 at 09:33:22PM +0200, Jarkko Sakkinen wrote:
> On Mon, Dec 17, 2018 at 10:48:58AM -0800, Sean Christopherson wrote:
> > On Mon, Dec 17, 2018 at 08:43:33PM +0200, Jarkko Sakkinen wrote:
> > > On Mon, Dec 17, 2018 at 10:36:13AM -0800, Sean Christopherson wrote:
> > > > I'm pretty sure doing mmget() would result in circular dependencies and
> > > > a zombie enclave.  In the do_exit() case where a task is abruptly killed:
> > > >  
> > > >   - __mmput() is never called because the enclave holds a ref
> > > >   - sgx_encl_release() is never be called because its VMAs hold refs
> > > >   - sgx_vma_close() is never called because __mmput()->exit_mmap() is
> > > >     blocked and the process itself is dead, i.e. won't unmap anything.
> > > 
> > > Right, it does, you are absolutely right. Tried it and removed the
> > > commit already.
> > > 
> > > Well, what we came up from your suggestion i.e. setting mm to NULL
> > > and checking that is very subtle change and does not have any such
> > > circular dependencies. We'll go with that.
> > 
> > We can't set mm to NULL as we need it to unregister the notifier, and
> > I'm fairly certain attempting to unregister in the release callback
> > will deadlock.
> 
> Noticed that too. mmu_notifier_unregister() requires a valid mm.

Both branches updated...

/Jarkko
Sean Christopherson Dec. 17, 2018, 10:20 p.m. UTC | #91
On Mon, Dec 17, 2018 at 11:12:21AM -0800, Andy Lutomirski wrote:
> On Mon, Dec 17, 2018 at 10:47 AM Dave Hansen <dave.hansen@intel.com> wrote:
> >
> > On 12/17/18 10:43 AM, Jarkko Sakkinen wrote:
> > > On Mon, Dec 17, 2018 at 10:36:13AM -0800, Sean Christopherson wrote:
> > >> I'm pretty sure doing mmget() would result in circular dependencies and
> > >> a zombie enclave.  In the do_exit() case where a task is abruptly killed:
> > >>
> > >>   - __mmput() is never called because the enclave holds a ref
> > >>   - sgx_encl_release() is never be called because its VMAs hold refs
> > >>   - sgx_vma_close() is never called because __mmput()->exit_mmap() is
> > >>     blocked and the process itself is dead, i.e. won't unmap anything.
> > > Right, it does, you are absolutely right. Tried it and removed the
> > > commit already.
> > >
> > > Well, what we came up from your suggestion i.e. setting mm to NULL
> > > and checking that is very subtle change and does not have any such
> > > circular dependencies. We'll go with that.
> >
> > This all screams that you need to break out this code from the massive
> > "18" patch and get the mm interactions reviewed more thoroughly.
> >
> > Also, no matter what method you go with, you have a bunch of commenting
> > and changelogging to do here.
> 
> I'm going to ask an obnoxious high-level question: why does an enclave
> even refer to a specific mm?

Primarily because that's what the code has "always" done.  I can't
speak for Jarkko, but I got involved with this joyful project long after
the code was originally written.

> If I were designing this thing, and if I hadn't started trying to
> implement it, my first thought would be that an enclave tracks its
> linear address range, which is just a pair of numbers, and also keeps
> track of a whole bunch of physical EPC pages, data structures, etc.
> And that mmap() gets rejected unless the requested virtual address
> matches the linear address range that the enclave wants and, aside
> from that, just creates a VMA that keeps a reference to the enclave.
> (And, for convenience, I suppose that the first mmap() call done
> before any actual enclave setup happens could choose any address and
> then cause the enclave to lock itself to that address, although a
> regular anonymous PROT_NONE MAP_NORESERVE mapping would do just fine,
> too.)  And the driver would explicitly allow multiple different mms to
> have the same enclave mapped.  More importantly, a daemon could set up
> an enclave without necessarily mapping it at all and then SCM_RIGHTS
> the enclave over to the process that plans to run it.

Hmm, this could work, the obvious weirdness would be ensuring the linear
range is available in the destination mm, but that'd be userspace's
problem.

I don't think we'd need to keep a reference to the enclave in the VMA.
The enclave's ref could be held by the fd.  Assuming the kernel is using
its private mapping to access the enclave, that's all we'd need to be
able to manipulate the enclave, e.g. reclaim EPC pages.  Userspace would
need to keep the fd alive in order to use the VMA, but that sort of goes
without saying.  The mm/VMA juggling today is for zapping/testing the
correct PTEs, but as you pointed out in a different email we can use
unmap_mapping_range(), with the enclave's fd being the source of the
address space passed to unmap_mapping_range().  Removing a VMA simply
means we don't need to zap it or test its age.
 
> Now I'm sure this has all kinds of problems, such as the ISA possibly
> making it rather obnoxious to add pages to the enclave without having
> it mapped.  But these operations could, in principle, be done by
> having the enclave own a private mm that's used just for setup.  While
> this would be vaguely annoying, Nadav's still-pending-but-nearly-done
> text_poke series adds all the infrastructure that's needed for the
> kernel to manage little private mms.  But some things get simpler --
> invalidating the enclave can presumably use the regular rmap APIs to
> zap all the PTEs in all VMAs pointing into the enclave.

We don't even need a private mm, we can (and already do) use the kernel's
translations for ENCLS instructions.  Hardware only enforces the linear
address stuff when it's actually in enclave mode, i.e. executing the
enclave.  ENCLS instructions aren't subject to the ELRANGE checks and
can use any VA->PA combination.

> So I'm not saying that you shouldn't do it the way you are now, but I
> do think that the changelog or at least some emails should explain
> *why* the enclave needs to keep a pointer to the creating process's
> mm.  And, if you do keep the current model, it would be nice to
> understand what happens if you do something awful like mremap()ing an
> enclave, or calling madvise on it, or otherwise abusing the vma.  Or
> doing fork(), for that matter.
> 
> I also find it suspicious that the various ioctl handlers
> systematically ignore their "filep" parameters and instead use
> find_vma() to find the relevant mm data structures.  That seems
> backwards.

My brain is still sorting out the details, but I generally like the idea
of allocating an anon inode when creating an enclave, and exposing the
other ioctls() via the returned fd.  This is essentially the approach
used by KVM to manage multiple "layers" of ioctls across KVM itself, VMs
and vCPUS.  There are even similarities to accessing physical memory via
multiple disparate domains, e.g. host kernel, host userspace and guest.

The only potential hiccup I can see is the build flow.  Currently,
EADD+EEXTEND is done via a work queue to avoid major performance issues
(10x regression) when userspace is building multiple enclaves in parallel
using goroutines to wrap Cgo (the issue might apply to any M:N scheduler,
but I've only confirmed the Golang case).  The issue is that allocating
an EPC page acts like a blocking syscall when the EPC is under pressure,
i.e. an EPC page isn't immediately available.  This causes Go's scheduler
to thrash and tank performance[1].

That being said, I think we could simply do mmgrab()/mmdrop() for each
page to be added, and then do mmget_not_zero()/mmput() when actually
inserting into the mm's page tables.  Conceptually that seems cleaner
than implicitly relying on the mmu_notifier to guarantee the lifecycle
of the mm.

Alternatively, we could change the EADD+EEXTEND flow to not insert the
added page's PFN into the owner's process space, i.e. force userspace to
fault when it runs the enclave.  But that only delays the issue because
eventually we'll want to account EPC pages, i.e. add a cgroup, at which
point we'll likely need current->mm anyways.

[1] https://github.com/golang/go/issues/19574
Sean Christopherson Dec. 17, 2018, 10:36 p.m. UTC | #92
On Mon, Dec 17, 2018 at 12:15:47PM -0800, Dave Hansen wrote:
> On 12/17/18 12:10 PM, Andy Lutomirski wrote:
> >> There's no 'struct page' for enclave memory as it stands.  That means no
> >> page cache, and that means there's no 'struct address_space *mapping' in
> >> the first place.
> >>
> >> Basically, the choice was made a long time ago to have SGX's memory
> >> management live outside the core VM.  I've waffled back and forth on it,
> >> but I do still think this is the right way to do it.
> > AFAICS a lack of struct page isn't a problem.  The core code seems to
> > understand that address_space objects might cover non-struct-page
> > memory.  Morally, enclave memory is a lot like hot-unpluggable PCI
> > space.
> 
> Yeah, this is true.  The existing code seems to make it all the way from
> unmap_mapping_range() down to zap_page_range() without 'struct page'.
> 
> Overall, I think what Andy is saying here is that an open(/dev/sgx)
> should give you a "unique" enclave fd.  That fd can end up mapped into
> one or more processes either via fork() or the other ways fds end up
> getting handed around.  mmap() of this fd would be *required* to be
> MAP_SHARED.  That means you don't need to support COW, and the semantics
> are the same as any other MAP_SHARED mapping: children and parents and
> anybody mmap()'ing it must all coordinate.
> 
> This sounds interesting at least.  It might lead to an unholy mess in
> the driver, or it might be a great cleanup.  But, it does sound like
> something that would both potentially simplify the semantics and the
> implementation.

It's very similar to KVM's model, which has proven to be fairly robust,
so I don't think it'll be an unholy mess (famous last words).  It
probably won't be a "great" cleanup per se, but it definitely should
make the code more maintainable in the long run.

The other interesting aspect of the enclave fd approach is that it would
allow userspace to *execute* an enclave from multiple processes, so long
as it did the necessary multiplexing of pthreads to enclave threads.  I
think SGX2 instructions (dynamic EPC management) would even allow adding
new enclave threads on-demand.
Jarkko Sakkinen Dec. 18, 2018, 1:17 a.m. UTC | #93
On Mon, Dec 17, 2018 at 11:12:21AM -0800, Andy Lutomirski wrote:
> I'm going to ask an obnoxious high-level question: why does an enclave
> even refer to a specific mm?

The reason is that it has not been yet in focus in the review process
and there has been other concerns.

At least the code is fairly stable i.e. working code is usually good
starting point for making something different (ignoring the recent
regression caused by the shmem to VMA migration).

> If I were designing this thing, and if I hadn't started trying to
> implement it, my first thought would be that an enclave tracks its
> linear address range, which is just a pair of numbers, and also keeps
> track of a whole bunch of physical EPC pages, data structures, etc.
> And that mmap() gets rejected unless the requested virtual address
> matches the linear address range that the enclave wants and, aside
> from that, just creates a VMA that keeps a reference to the enclave.
> (And, for convenience, I suppose that the first mmap() call done
> before any actual enclave setup happens could choose any address and
> then cause the enclave to lock itself to that address, although a
> regular anonymous PROT_NONE MAP_NORESERVE mapping would do just fine,
> too.)  And the driver would explicitly allow multiple different mms to
> have the same enclave mapped.  More importantly, a daemon could set up
> an enclave without necessarily mapping it at all and then SCM_RIGHTS
> the enclave over to the process that plans to run it.

The current SGX_IOC_ENCLAVE_CREATE ioctl would be trivial to change to
use this approach. Instead looking up VMA with an enclave instance it
would create a new enclave instance.

Then we could have SGX_IOC_ENCLAVE_ATTACH to attach an enclave to a VMA.

This does not sound too complicated.

> Now I'm sure this has all kinds of problems, such as the ISA possibly
> making it rather obnoxious to add pages to the enclave without having
> it mapped.  But these operations could, in principle, be done by

We do EADD in a kthread. What this would require to put current->mm
into a request that it is processed by that thread. This would be
doable with mmget().

The deadlock that Sean mentioned would not exist since closing VMAs
is not bounded to the enclave life-cycle anymore.

So at least non-swapping ISA is easy to fit to this framework. I can
rework this for v19.

/Jarkko
Jarkko Sakkinen Dec. 18, 2018, 1:31 a.m. UTC | #94
On Tue, Dec 18, 2018 at 03:17:25AM +0200, Jarkko Sakkinen wrote:
> On Mon, Dec 17, 2018 at 11:12:21AM -0800, Andy Lutomirski wrote:
> > I'm going to ask an obnoxious high-level question: why does an enclave
> > even refer to a specific mm?
> 
> The reason is that it has not been yet in focus in the review process
> and there has been other concerns.
> 
> At least the code is fairly stable i.e. working code is usually good
> starting point for making something different (ignoring the recent
> regression caused by the shmem to VMA migration).
> 
> > If I were designing this thing, and if I hadn't started trying to
> > implement it, my first thought would be that an enclave tracks its
> > linear address range, which is just a pair of numbers, and also keeps
> > track of a whole bunch of physical EPC pages, data structures, etc.
> > And that mmap() gets rejected unless the requested virtual address
> > matches the linear address range that the enclave wants and, aside
> > from that, just creates a VMA that keeps a reference to the enclave.
> > (And, for convenience, I suppose that the first mmap() call done
> > before any actual enclave setup happens could choose any address and
> > then cause the enclave to lock itself to that address, although a
> > regular anonymous PROT_NONE MAP_NORESERVE mapping would do just fine,
> > too.)  And the driver would explicitly allow multiple different mms to
> > have the same enclave mapped.  More importantly, a daemon could set up
> > an enclave without necessarily mapping it at all and then SCM_RIGHTS
> > the enclave over to the process that plans to run it.
> 
> The current SGX_IOC_ENCLAVE_CREATE ioctl would be trivial to change to
> use this approach. Instead looking up VMA with an enclave instance it
> would create a new enclave instance.
> 
> Then we could have SGX_IOC_ENCLAVE_ATTACH to attach an enclave to a VMA.
> 
> This does not sound too complicated.
> 
> > Now I'm sure this has all kinds of problems, such as the ISA possibly
> > making it rather obnoxious to add pages to the enclave without having
> > it mapped.  But these operations could, in principle, be done by
> 
> We do EADD in a kthread. What this would require to put current->mm
> into a request that it is processed by that thread. This would be
> doable with mmget().

Correction here. We need mm just for vm_insert_pfn(), which would be
removed, no need to pass mm.

/Jarkko
Jarkko Sakkinen Dec. 18, 2018, 1:39 a.m. UTC | #95
On Mon, Dec 17, 2018 at 02:20:48PM -0800, Sean Christopherson wrote:
> The only potential hiccup I can see is the build flow.  Currently,
> EADD+EEXTEND is done via a work queue to avoid major performance issues
> (10x regression) when userspace is building multiple enclaves in parallel
> using goroutines to wrap Cgo (the issue might apply to any M:N scheduler,
> but I've only confirmed the Golang case).  The issue is that allocating
> an EPC page acts like a blocking syscall when the EPC is under pressure,
> i.e. an EPC page isn't immediately available.  This causes Go's scheduler
> to thrash and tank performance[1].

I don't see any major issues having that kthread. All the code that
maps the enclave would be removed.

I would only allow to map enclave to process address space after the
enclave has been initialized i.e. SGX_IOC_ENCLAVE_ATTACH.

/Jarkko
Jarkko Sakkinen Dec. 18, 2018, 1:40 a.m. UTC | #96
On Mon, Dec 17, 2018 at 12:10:17PM -0800, Andy Lutomirski wrote:
> On Mon, Dec 17, 2018 at 12:03 PM Dave Hansen <dave.hansen@intel.com> wrote:
> >
> > On 12/17/18 11:55 AM, Andy Lutomirski wrote:
> > >> You're effectively rebuilding reverse-mapping infrastructure here.  It's
> > >> a frequent thing for the core VM to need to go from 'struct page' back
> > >> to the page tables mapping it.  For that we go (logically)
> > >> page->{anon_vma,mapping}->vma->vm_mm->pagetable.
> > > This is a bit outside my expertise here, but doesn't
> > > unmap_mapping_range() do exactly what SGX wants?
> >
> > There's no 'struct page' for enclave memory as it stands.  That means no
> > page cache, and that means there's no 'struct address_space *mapping' in
> > the first place.
> >
> > Basically, the choice was made a long time ago to have SGX's memory
> > management live outside the core VM.  I've waffled back and forth on it,
> > but I do still think this is the right way to do it.
> 
> AFAICS a lack of struct page isn't a problem.  The core code seems to
> understand that address_space objects might cover non-struct-page
> memory.  Morally, enclave memory is a lot like hot-unpluggable PCI
> space.

I'm fine using it if it works. Will try it for v19.

/Jarkko
Jarkko Sakkinen Dec. 18, 2018, 3:27 a.m. UTC | #97
On Tue, Dec 18, 2018 at 03:39:18AM +0200, Jarkko Sakkinen wrote:
> On Mon, Dec 17, 2018 at 02:20:48PM -0800, Sean Christopherson wrote:
> > The only potential hiccup I can see is the build flow.  Currently,
> > EADD+EEXTEND is done via a work queue to avoid major performance issues
> > (10x regression) when userspace is building multiple enclaves in parallel
> > using goroutines to wrap Cgo (the issue might apply to any M:N scheduler,
> > but I've only confirmed the Golang case).  The issue is that allocating
> > an EPC page acts like a blocking syscall when the EPC is under pressure,
> > i.e. an EPC page isn't immediately available.  This causes Go's scheduler
> > to thrash and tank performance[1].
> 
> I don't see any major issues having that kthread. All the code that
> maps the enclave would be removed.
> 
> I would only allow to map enclave to process address space after the
> enclave has been initialized i.e. SGX_IOC_ENCLAVE_ATTACH.

Some refined thoughts.

PTE insertion can done in the #PF handler. In fact, we can PoC this
already with the current architecture (and I will right after sending
v18).

The backing space is a bit more nasty issue in the add pager thread.
The previous shmem swapping would have been a better fit. Maybe that
should be reconsidered?

If shmem was used, all the commits up to "SGX Enclave Driver" could
be reworked to the new model.

When we think about the swapping code, there uprises some difficulties.
Namely, when a page is swapped, the enclave must unmap the PTE from all
processes that have it mapped.

I have a one compromise solution for the problem above: make enclaves
shared BUT mutually exclusive. When you attach an enclave it gets
detached from the previous process that had it. This would still fully
implement the daemon example that Andy gave in earlier response.

/Jarkko
Andy Lutomirski Dec. 18, 2018, 4:55 a.m. UTC | #98
On Mon, Dec 17, 2018 at 5:39 PM Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> On Mon, Dec 17, 2018 at 02:20:48PM -0800, Sean Christopherson wrote:
> > The only potential hiccup I can see is the build flow.  Currently,
> > EADD+EEXTEND is done via a work queue to avoid major performance issues
> > (10x regression) when userspace is building multiple enclaves in parallel
> > using goroutines to wrap Cgo (the issue might apply to any M:N scheduler,
> > but I've only confirmed the Golang case).  The issue is that allocating
> > an EPC page acts like a blocking syscall when the EPC is under pressure,
> > i.e. an EPC page isn't immediately available.  This causes Go's scheduler
> > to thrash and tank performance[1].
>
> I don't see any major issues having that kthread. All the code that
> maps the enclave would be removed.
>
> I would only allow to map enclave to process address space after the
> enclave has been initialized i.e. SGX_IOC_ENCLAVE_ATTACH.
>

What's SGX_IOC_ENCLAVE_ATTACH?  Why would it be needed at all?  I
would imagine that all pages would be faulted in as needed (or
prefaulted as an optimization) and the enclave would just work in any
process.
Andy Lutomirski Dec. 18, 2018, 4:59 a.m. UTC | #99
On Mon, Dec 17, 2018 at 2:20 PM Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>

> My brain is still sorting out the details, but I generally like the idea
> of allocating an anon inode when creating an enclave, and exposing the
> other ioctls() via the returned fd.  This is essentially the approach
> used by KVM to manage multiple "layers" of ioctls across KVM itself, VMs
> and vCPUS.  There are even similarities to accessing physical memory via
> multiple disparate domains, e.g. host kernel, host userspace and guest.
>

In my mind, opening /dev/sgx would give you the requisite inode.  I'm
not 100% sure that the chardev infrastructure allows this, but I think
it does.

> The only potential hiccup I can see is the build flow.  Currently,
> EADD+EEXTEND is done via a work queue to avoid major performance issues
> (10x regression) when userspace is building multiple enclaves in parallel
> using goroutines to wrap Cgo (the issue might apply to any M:N scheduler,
> but I've only confirmed the Golang case).  The issue is that allocating
> an EPC page acts like a blocking syscall when the EPC is under pressure,
> i.e. an EPC page isn't immediately available.  This causes Go's scheduler
> to thrash and tank performance[1].

What's the issue, and how does a workqueue help?  I'm wondering if a
nicer solution would be an ioctl to add lots of pages in a single
call.

>
> Alternatively, we could change the EADD+EEXTEND flow to not insert the
> added page's PFN into the owner's process space, i.e. force userspace to
> fault when it runs the enclave.  But that only delays the issue because
> eventually we'll want to account EPC pages, i.e. add a cgroup, at which
> point we'll likely need current->mm anyways.

You should be able to account the backing pages to a cgroup without
actually sticking them into the EPC, no?  Or am I misunderstanding?  I
guess we'll eventually want a cgroup to limit use of the limited EPC
resources.
Andy Lutomirski Dec. 18, 2018, 5:02 a.m. UTC | #100
On Mon, Dec 17, 2018 at 7:27 PM Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> On Tue, Dec 18, 2018 at 03:39:18AM +0200, Jarkko Sakkinen wrote:
> > On Mon, Dec 17, 2018 at 02:20:48PM -0800, Sean Christopherson wrote:
> > > The only potential hiccup I can see is the build flow.  Currently,
> > > EADD+EEXTEND is done via a work queue to avoid major performance issues
> > > (10x regression) when userspace is building multiple enclaves in parallel
> > > using goroutines to wrap Cgo (the issue might apply to any M:N scheduler,
> > > but I've only confirmed the Golang case).  The issue is that allocating
> > > an EPC page acts like a blocking syscall when the EPC is under pressure,
> > > i.e. an EPC page isn't immediately available.  This causes Go's scheduler
> > > to thrash and tank performance[1].
> >
> > I don't see any major issues having that kthread. All the code that
> > maps the enclave would be removed.
> >
> > I would only allow to map enclave to process address space after the
> > enclave has been initialized i.e. SGX_IOC_ENCLAVE_ATTACH.
>
> Some refined thoughts.
>
> PTE insertion can done in the #PF handler. In fact, we can PoC this
> already with the current architecture (and I will right after sending
> v18).
>
> The backing space is a bit more nasty issue in the add pager thread.
> The previous shmem swapping would have been a better fit. Maybe that
> should be reconsidered?
>
> If shmem was used, all the commits up to "SGX Enclave Driver" could
> be reworked to the new model.
>
> When we think about the swapping code, there uprises some difficulties.
> Namely, when a page is swapped, the enclave must unmap the PTE from all
> processes that have it mapped.

That's what unmap_mapping_range(), etc do for you, no?  IOW make a
struct address_space that represents the logical enclave address
space, i.e. address 0 is the start and the pages count up from there.
You can unmap pages whenever you want, and the core mm code will take
care of zapping the pages from all vmas referencing that
address_space.
Andy Lutomirski Dec. 18, 2018, 5:55 a.m. UTC | #101
On Thu, Nov 15, 2018 at 5:08 PM Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
> can be used by applications to set aside private regions of code and
> data. The code outside the enclave is disallowed to access the memory
> inside the enclave by the CPU access control.

This is a very partial review.

> +int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
> +                 struct vm_area_struct **vma)
> +{
> +       struct vm_area_struct *result;
> +       struct sgx_encl *encl;
> +
> +       result = find_vma(mm, addr);
> +       if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start)
> +               return -EINVAL;
> +
> +       encl = result->vm_private_data;
> +       *vma = result;
> +
> +       return encl ? 0 : -ENOENT;
> +}

I realize that this function may go away entirely but, if you keep it:
what are the locking rules?  What, if anything, prevents another
thread from destroying the enclave after sgx_encl_find() returns?

> +static int sgx_validate_secs(const struct sgx_secs *secs,
> +                            unsigned long ssaframesize)
> +{

...

> +       if (secs->attributes & SGX_ATTR_MODE64BIT) {
> +               if (secs->size > sgx_encl_size_max_64)
> +                       return -EINVAL;
> +       } else {
> +               /* On 64-bit architecture allow 32-bit encls only in
> +                * the compatibility mode.
> +                */
> +               if (!test_thread_flag(TIF_ADDR32))
> +                       return -EINVAL;
> +               if (secs->size > sgx_encl_size_max_32)
> +                       return -EINVAL;
> +       }

Why do we need the 32-bit-on-64-bit check?  In general, anything that
checks per-task or per-mm flags like TIF_ADDR32 is IMO likely to be
problematic.  You're allowing 64-bit enclaves in 32-bit tasks, so I'm
guessing you could just delete the check.

> +
> +       if (!(secs->xfrm & XFEATURE_MASK_FP) ||
> +           !(secs->xfrm & XFEATURE_MASK_SSE) ||
> +           (((secs->xfrm >> XFEATURE_BNDREGS) & 1) !=
> +            ((secs->xfrm >> XFEATURE_BNDCSR) & 1)) ||
> +           (secs->xfrm & ~sgx_xfrm_mask))
> +               return -EINVAL;

Do we need to check that the enclave doesn't use xfeatures that the
kernel doesn't know about?  Or are they all safe by design in enclave
mode?

> +static int sgx_encl_pm_notifier(struct notifier_block *nb,
> +                               unsigned long action, void *data)
> +{
> +       struct sgx_encl *encl = container_of(nb, struct sgx_encl, pm_notifier);
> +
> +       if (action != PM_SUSPEND_PREPARE && action != PM_HIBERNATION_PREPARE)
> +               return NOTIFY_DONE;

Hmm.  There's an argument to made that omitting this would better
exercise the code that handles fully asynchronous loss of an enclave.
Also, I think you're unnecessarily killing enclaves when suspend is
attempted but fails.

> +
> +static int sgx_get_key_hash(const void *modulus, void *hash)
> +{
> +       struct crypto_shash *tfm;
> +       int ret;
> +
> +       tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC);
> +       if (IS_ERR(tfm))
> +               return PTR_ERR(tfm);
> +
> +       ret = __sgx_get_key_hash(tfm, modulus, hash);
> +
> +       crypto_free_shash(tfm);
> +       return ret;
> +}
> +

I'm so sorry you had to deal with this API.  Once Zinc lands, you
could clean this up :)


> +static int sgx_encl_get(unsigned long addr, struct sgx_encl **encl)
> +{
> +       struct mm_struct *mm = current->mm;
> +       struct vm_area_struct *vma;
> +       int ret;
> +
> +       if (addr & (PAGE_SIZE - 1))
> +               return -EINVAL;
> +
> +       down_read(&mm->mmap_sem);
> +
> +       ret = sgx_encl_find(mm, addr, &vma);
> +       if (!ret) {
> +               *encl = vma->vm_private_data;
> +
> +               if ((*encl)->flags & SGX_ENCL_SUSPEND)
> +                       ret = SGX_POWER_LOST_ENCLAVE;
> +               else
> +                       kref_get(&(*encl)->refcount);
> +       }

Hmm.  This version has explicit refcounting.

> +static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +       vma->vm_ops = &sgx_vm_ops;
> +       vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO |
> +                        VM_DONTCOPY;
> +
> +       return 0;
> +}
> +
> +static unsigned long sgx_get_unmapped_area(struct file *file,
> +                                          unsigned long addr,
> +                                          unsigned long len,
> +                                          unsigned long pgoff,
> +                                          unsigned long flags)
> +{
> +       if (len < 2 * PAGE_SIZE || (len & (len - 1)))
> +               return -EINVAL;
> +
> +       if (len > sgx_encl_size_max_64)
> +               return -EINVAL;
> +
> +       if (len > sgx_encl_size_max_32 && test_thread_flag(TIF_ADDR32))
> +               return -EINVAL;

Generally speaking, this type of check wants to be
in_compat_syscall().  But I'm not sure I understand why you need it at
all.

> +static void sgx_ipi_cb(void *info)
> +{
> +}
> +
> +void sgx_flush_cpus(struct sgx_encl *encl)
> +{
> +       on_each_cpu_mask(mm_cpumask(encl->mm), sgx_ipi_cb, NULL, 1);
> +}

Please add a comment explaining what this promises to do.
Jarkko Sakkinen Dec. 18, 2018, 1:11 p.m. UTC | #102
On Mon, Dec 17, 2018 at 08:59:54PM -0800, Andy Lutomirski wrote:
> On Mon, Dec 17, 2018 at 2:20 PM Sean Christopherson
> <sean.j.christopherson@intel.com> wrote:
> >
> 
> > My brain is still sorting out the details, but I generally like the idea
> > of allocating an anon inode when creating an enclave, and exposing the
> > other ioctls() via the returned fd.  This is essentially the approach
> > used by KVM to manage multiple "layers" of ioctls across KVM itself, VMs
> > and vCPUS.  There are even similarities to accessing physical memory via
> > multiple disparate domains, e.g. host kernel, host userspace and guest.
> >
> 
> In my mind, opening /dev/sgx would give you the requisite inode.  I'm
> not 100% sure that the chardev infrastructure allows this, but I think
> it does.

Yes, this is what I was thinking too i.e.

enclave_fd = open("/dev/sgx/", O_RDWR);

After this enclave_fd "is" the enclave up until the file is closed.

> > The only potential hiccup I can see is the build flow.  Currently,
> > EADD+EEXTEND is done via a work queue to avoid major performance issues
> > (10x regression) when userspace is building multiple enclaves in parallel
> > using goroutines to wrap Cgo (the issue might apply to any M:N scheduler,
> > but I've only confirmed the Golang case).  The issue is that allocating
> > an EPC page acts like a blocking syscall when the EPC is under pressure,
> > i.e. an EPC page isn't immediately available.  This causes Go's scheduler
> > to thrash and tank performance[1].
> 
> What's the issue, and how does a workqueue help?  I'm wondering if a
> nicer solution would be an ioctl to add lots of pages in a single
> call.

I don't think this really is an issue as long as the thread does not
depend on any VMAs.

/Jarkko
Jarkko Sakkinen Dec. 18, 2018, 1:13 p.m. UTC | #103
On Mon, Dec 17, 2018 at 10:21:49PM +0200, Jarkko Sakkinen wrote:
> On Mon, Dec 17, 2018 at 09:33:22PM +0200, Jarkko Sakkinen wrote:
> > On Mon, Dec 17, 2018 at 10:48:58AM -0800, Sean Christopherson wrote:
> > > On Mon, Dec 17, 2018 at 08:43:33PM +0200, Jarkko Sakkinen wrote:
> > > > On Mon, Dec 17, 2018 at 10:36:13AM -0800, Sean Christopherson wrote:
> > > > > I'm pretty sure doing mmget() would result in circular dependencies and
> > > > > a zombie enclave.  In the do_exit() case where a task is abruptly killed:
> > > > >  
> > > > >   - __mmput() is never called because the enclave holds a ref
> > > > >   - sgx_encl_release() is never be called because its VMAs hold refs
> > > > >   - sgx_vma_close() is never called because __mmput()->exit_mmap() is
> > > > >     blocked and the process itself is dead, i.e. won't unmap anything.
> > > > 
> > > > Right, it does, you are absolutely right. Tried it and removed the
> > > > commit already.
> > > > 
> > > > Well, what we came up from your suggestion i.e. setting mm to NULL
> > > > and checking that is very subtle change and does not have any such
> > > > circular dependencies. We'll go with that.
> > > 
> > > We can't set mm to NULL as we need it to unregister the notifier, and
> > > I'm fairly certain attempting to unregister in the release callback
> > > will deadlock.
> > 
> > Noticed that too. mmu_notifier_unregister() requires a valid mm.
> 
> Both branches updated...

I'm not still seeing why you would want to call sgx_free_page() from
sgx_invalidate(). Kind of resistant to adding extra logging just for
checking for programming errors. What I would do if I had to debug
there a leak would be simply put kretprobe on __sgx_free_page().

/Jarkko
Jarkko Sakkinen Dec. 18, 2018, 1:18 p.m. UTC | #104
On Mon, Dec 17, 2018 at 08:55:02PM -0800, Andy Lutomirski wrote:
> On Mon, Dec 17, 2018 at 5:39 PM Jarkko Sakkinen
> <jarkko.sakkinen@linux.intel.com> wrote:
> >
> > On Mon, Dec 17, 2018 at 02:20:48PM -0800, Sean Christopherson wrote:
> > > The only potential hiccup I can see is the build flow.  Currently,
> > > EADD+EEXTEND is done via a work queue to avoid major performance issues
> > > (10x regression) when userspace is building multiple enclaves in parallel
> > > using goroutines to wrap Cgo (the issue might apply to any M:N scheduler,
> > > but I've only confirmed the Golang case).  The issue is that allocating
> > > an EPC page acts like a blocking syscall when the EPC is under pressure,
> > > i.e. an EPC page isn't immediately available.  This causes Go's scheduler
> > > to thrash and tank performance[1].
> >
> > I don't see any major issues having that kthread. All the code that
> > maps the enclave would be removed.
> >
> > I would only allow to map enclave to process address space after the
> > enclave has been initialized i.e. SGX_IOC_ENCLAVE_ATTACH.
> >
> 
> What's SGX_IOC_ENCLAVE_ATTACH?  Why would it be needed at all?  I
> would imagine that all pages would be faulted in as needed (or
> prefaulted as an optimization) and the enclave would just work in any
> process.

The way I see it the efficient way to implement this is to have the
enclave attached to a single process address space at a time.

#PF handler is trivial with multiple address spaces but swapping is
a bit tedious as you would need to zap N processes.

/Jarkko
Jarkko Sakkinen Dec. 18, 2018, 1:27 p.m. UTC | #105
On Mon, Dec 17, 2018 at 09:02:03PM -0800, Andy Lutomirski wrote:
> That's what unmap_mapping_range(), etc do for you, no?  IOW make a
> struct address_space that represents the logical enclave address
> space, i.e. address 0 is the start and the pages count up from there.
> You can unmap pages whenever you want, and the core mm code will take
> care of zapping the pages from all vmas referencing that
> address_space.

OK, so it does. Did not have time to look at it last night (about
3AM) :-) Yes, we could use that to do the N process zapping.

Based on this discussion I can take the first steps with the swapping
code.

And yeah, I don't think we need anon inode for this one. Can just use
the dev inode (did not check in detail but on the surface looks like
it).

/Jarkko
Sean Christopherson Dec. 18, 2018, 3:44 p.m. UTC | #106
On Mon, Dec 17, 2018 at 08:59:54PM -0800, Andy Lutomirski wrote:
> On Mon, Dec 17, 2018 at 2:20 PM Sean Christopherson
> <sean.j.christopherson@intel.com> wrote:
> >
> 
> > My brain is still sorting out the details, but I generally like the idea
> > of allocating an anon inode when creating an enclave, and exposing the
> > other ioctls() via the returned fd.  This is essentially the approach
> > used by KVM to manage multiple "layers" of ioctls across KVM itself, VMs
> > and vCPUS.  There are even similarities to accessing physical memory via
> > multiple disparate domains, e.g. host kernel, host userspace and guest.
> >
> 
> In my mind, opening /dev/sgx would give you the requisite inode.  I'm
> not 100% sure that the chardev infrastructure allows this, but I think
> it does.

My fd/inode knowledge is lacking, to say the least.  Whatever works, so
long as we have a way to uniquely identify enclaves.

> > The only potential hiccup I can see is the build flow.  Currently,
> > EADD+EEXTEND is done via a work queue to avoid major performance issues
> > (10x regression) when userspace is building multiple enclaves in parallel
> > using goroutines to wrap Cgo (the issue might apply to any M:N scheduler,
> > but I've only confirmed the Golang case).  The issue is that allocating
> > an EPC page acts like a blocking syscall when the EPC is under pressure,
> > i.e. an EPC page isn't immediately available.  This causes Go's scheduler
> > to thrash and tank performance[1].
> 
> What's the issue, and how does a workqueue help?  I'm wondering if a
> nicer solution would be an ioctl to add lots of pages in a single
> call.

Adding pages via workqueue makes the ioctl itself fast enough to avoid
triggering Go's rescheduling.  A batched EADD flow would likely help,
I just haven't had the time to rework the userspace side to be able to
test the performance.

> >
> > Alternatively, we could change the EADD+EEXTEND flow to not insert the
> > added page's PFN into the owner's process space, i.e. force userspace to
> > fault when it runs the enclave.  But that only delays the issue because
> > eventually we'll want to account EPC pages, i.e. add a cgroup, at which
> > point we'll likely need current->mm anyways.
> 
> You should be able to account the backing pages to a cgroup without
> actually sticking them into the EPC, no?  Or am I misunderstanding?  I
> guess we'll eventually want a cgroup to limit use of the limited EPC
> resources.

It's the latter, a cgroup to limit EPC.  The mm is used to retrieve the
cgroup without having track e.g. the task_struct.
Sean Christopherson Dec. 18, 2018, 3:46 p.m. UTC | #107
On Tue, Dec 18, 2018 at 03:13:11PM +0200, Jarkko Sakkinen wrote:
> On Mon, Dec 17, 2018 at 10:21:49PM +0200, Jarkko Sakkinen wrote:
> > On Mon, Dec 17, 2018 at 09:33:22PM +0200, Jarkko Sakkinen wrote:
> > > On Mon, Dec 17, 2018 at 10:48:58AM -0800, Sean Christopherson wrote:
> > > > On Mon, Dec 17, 2018 at 08:43:33PM +0200, Jarkko Sakkinen wrote:
> > > > > On Mon, Dec 17, 2018 at 10:36:13AM -0800, Sean Christopherson wrote:
> > > > > > I'm pretty sure doing mmget() would result in circular dependencies and
> > > > > > a zombie enclave.  In the do_exit() case where a task is abruptly killed:
> > > > > >  
> > > > > >   - __mmput() is never called because the enclave holds a ref
> > > > > >   - sgx_encl_release() is never be called because its VMAs hold refs
> > > > > >   - sgx_vma_close() is never called because __mmput()->exit_mmap() is
> > > > > >     blocked and the process itself is dead, i.e. won't unmap anything.
> > > > > 
> > > > > Right, it does, you are absolutely right. Tried it and removed the
> > > > > commit already.
> > > > > 
> > > > > Well, what we came up from your suggestion i.e. setting mm to NULL
> > > > > and checking that is very subtle change and does not have any such
> > > > > circular dependencies. We'll go with that.
> > > > 
> > > > We can't set mm to NULL as we need it to unregister the notifier, and
> > > > I'm fairly certain attempting to unregister in the release callback
> > > > will deadlock.
> > > 
> > > Noticed that too. mmu_notifier_unregister() requires a valid mm.
> > 
> > Both branches updated...
> 
> I'm not still seeing why you would want to call sgx_free_page() from
> sgx_invalidate(). Kind of resistant to adding extra logging just for
> checking for programming errors. What I would do if I had to debug
> there a leak would be simply put kretprobe on __sgx_free_page().

The WARN is needed to detect the leak in the first place.  And leaking
pages because EREMOVE fails usually means there's a serious bug.
Sean Christopherson Dec. 18, 2018, 6:53 p.m. UTC | #108
On Tue, Dec 18, 2018 at 07:44:18AM -0800, Sean Christopherson wrote:
> On Mon, Dec 17, 2018 at 08:59:54PM -0800, Andy Lutomirski wrote:
> > On Mon, Dec 17, 2018 at 2:20 PM Sean Christopherson
> > <sean.j.christopherson@intel.com> wrote:
> > >
> > 
> > > My brain is still sorting out the details, but I generally like the idea
> > > of allocating an anon inode when creating an enclave, and exposing the
> > > other ioctls() via the returned fd.  This is essentially the approach
> > > used by KVM to manage multiple "layers" of ioctls across KVM itself, VMs
> > > and vCPUS.  There are even similarities to accessing physical memory via
> > > multiple disparate domains, e.g. host kernel, host userspace and guest.
> > >
> > 
> > In my mind, opening /dev/sgx would give you the requisite inode.  I'm
> > not 100% sure that the chardev infrastructure allows this, but I think
> > it does.
> 
> My fd/inode knowledge is lacking, to say the least.  Whatever works, so
> long as we have a way to uniquely identify enclaves.

Actually, while we're dissecting the interface...

What if we re-organize the ioctls in such a way that we leave open the
possibility of allocating raw EPC for KVM via /dev/sgx?  I'm not 100%
positive this approach will work[1], but conceptually it fits well with
KVM's memory model, e.g. KVM is aware of the GPA<->HVA association but
generally speaking doesn't know what's physically backing each memory
region.

Tangentially related, I think we should support allocating multiple
enclaves from a single /dev/sgx fd, i.e. a process shouldn't have to
open /dev/sgx every time it wants to create a new enclave.

Something like this:

/dev/sgx
  |
  -> mmap() { return -EINVAL; }
  |
  -> unlocked_ioctl()
     |
     -> SGX_CREATE_ENCLAVE: { return alloc_enclave_fd(); }
     |  |
     |   -> mmap() { ... }
     |  | 
     |   -> get_unmapped_area() { 
     |  |           if (enclave->size) {
     |  |                   if (!addr)
     |  |                           addr = enclave->base;
     |  |                   if (addr + len + pgoff > enclave->base + enclave->size)
     |  |                           return -EINVAL;
     |  |           } else {
     |  |                   if (!validate_size(len))
     |  |                           return -EINVAL;
     |  |                   addr = naturally_align(len);
     |  |           }
     |  |   }
     |  |
     |   -> unlocked_ioctl() {
     |              SGX_ENCLAVE_ADD_PAGE: { ... }
     |              SGX_ENCLAVE_INIT: { ... }
     |              SGX_ENCLAVE_REMOVE_PAGES: { ... }
     |              SGX_ENCLAVE_MODIFY_PAGES: { ... }
     |      }
     |
     -> SGX_CREATE_VIRTUAL_EPC: {return alloc_epc_fd(); }
        |
         -> mmap() { ... }
        |
	 -> get_unmapped_area() {<page aligned/sized> }
        |
         -> unlocked_ioctl() {
                    SGX_VIRTUAL_EPC_???:
		    SGX_VIRTUAL_EPC_???:
	    }


[1] Delegating EPC management to /dev/sgx is viable for virtualizing SGX
    without oversubscribing EPC to guests, but oversubscribing EPC in a
    VMM requires handling EPC-related VM-Exits and using instructions
    that will #UD if the CPU is not post-VMXON.  I *think* having KVM
    forward VM-Exits to x86/sgx would work, but it's entirely possible
    it'd be a complete cluster.
Jarkko Sakkinen Dec. 19, 2018, 4:47 a.m. UTC | #109
On Tue, Dec 18, 2018 at 07:44:18AM -0800, Sean Christopherson wrote:
> My fd/inode knowledge is lacking, to say the least.  Whatever works, so
> long as we have a way to uniquely identify enclaves.

I will simply trial and error :-) I think it should work since it does
own an address space, but yeah, testing will tell. We can go also with
anon inode if required.

/Jarkko
Jarkko Sakkinen Dec. 19, 2018, 5 a.m. UTC | #110
On Tue, Dec 18, 2018 at 10:53:49AM -0800, Sean Christopherson wrote:
> What if we re-organize the ioctls in such a way that we leave open the
> possibility of allocating raw EPC for KVM via /dev/sgx?  I'm not 100%
> positive this approach will work[1], but conceptually it fits well with
> KVM's memory model, e.g. KVM is aware of the GPA<->HVA association but
> generally speaking doesn't know what's physically backing each memory
> region.

Why would you want to pass EPC through user space to KVM rather than
KVM allocating it through kernel interfaces?

> Tangentially related, I think we should support allocating multiple
> enclaves from a single /dev/sgx fd, i.e. a process shouldn't have to
> open /dev/sgx every time it wants to create a new enclave.

I'm fine with this. It just requires to create anon inode. I'll just
add a new field called 'enclave_fd' to struct sgx_enclave_create and
that's all.

I think I have otherwise ingredients for v19 ready except where to swap.

/Jarkko
Jarkko Sakkinen Dec. 19, 2018, 5:13 a.m. UTC | #111
On Wed, Dec 19, 2018 at 07:00:47AM +0200, Jarkko Sakkinen wrote:
> On Tue, Dec 18, 2018 at 10:53:49AM -0800, Sean Christopherson wrote:
> > What if we re-organize the ioctls in such a way that we leave open the
> > possibility of allocating raw EPC for KVM via /dev/sgx?  I'm not 100%
> > positive this approach will work[1], but conceptually it fits well with
> > KVM's memory model, e.g. KVM is aware of the GPA<->HVA association but
> > generally speaking doesn't know what's physically backing each memory
> > region.
> 
> Why would you want to pass EPC through user space to KVM rather than
> KVM allocating it through kernel interfaces?
> 
> > Tangentially related, I think we should support allocating multiple
> > enclaves from a single /dev/sgx fd, i.e. a process shouldn't have to
> > open /dev/sgx every time it wants to create a new enclave.
> 
> I'm fine with this. It just requires to create anon inode. I'll just
> add a new field called 'enclave_fd' to struct sgx_enclave_create and
> that's all.
> 
> I think I have otherwise ingredients for v19 ready except where to swap.

If I follow your proposal here and allow to create multiple enclaves
(i.e. with anon inodes for each) with one descriptor, is that sufficient
API to later add what you want to KVM?

/Jarkko
Jarkko Sakkinen Dec. 19, 2018, 5:22 a.m. UTC | #112
On Mon, Dec 17, 2018 at 09:55:08PM -0800, Andy Lutomirski wrote:
> On Thu, Nov 15, 2018 at 5:08 PM Jarkko Sakkinen
> <jarkko.sakkinen@linux.intel.com> wrote:
> >
> > Intel Software Guard eXtensions (SGX) is a set of CPU instructions that
> > can be used by applications to set aside private regions of code and
> > data. The code outside the enclave is disallowed to access the memory
> > inside the enclave by the CPU access control.
> 
> This is a very partial review.

Thank you, appreciate it.

> > +int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
> > +                 struct vm_area_struct **vma)
> > +{
> > +       struct vm_area_struct *result;
> > +       struct sgx_encl *encl;
> > +
> > +       result = find_vma(mm, addr);
> > +       if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start)
> > +               return -EINVAL;
> > +
> > +       encl = result->vm_private_data;
> > +       *vma = result;
> > +
> > +       return encl ? 0 : -ENOENT;
> > +}
> 
> I realize that this function may go away entirely but, if you keep it:
> what are the locking rules?  What, if anything, prevents another
> thread from destroying the enclave after sgx_encl_find() returns?

The kref inside the enclave is used to manage this but this function
directly does not prevent it (see for example sgx_encl_get). But yes,
this function does not give any guarantees (should probably have
a documentation stating this).

> > +static int sgx_validate_secs(const struct sgx_secs *secs,
> > +                            unsigned long ssaframesize)
> > +{
> 
> ...
> 
> > +       if (secs->attributes & SGX_ATTR_MODE64BIT) {
> > +               if (secs->size > sgx_encl_size_max_64)
> > +                       return -EINVAL;
> > +       } else {
> > +               /* On 64-bit architecture allow 32-bit encls only in
> > +                * the compatibility mode.
> > +                */
> > +               if (!test_thread_flag(TIF_ADDR32))
> > +                       return -EINVAL;
> > +               if (secs->size > sgx_encl_size_max_32)
> > +                       return -EINVAL;
> > +       }
> 
> Why do we need the 32-bit-on-64-bit check?  In general, anything that
> checks per-task or per-mm flags like TIF_ADDR32 is IMO likely to be
> problematic.  You're allowing 64-bit enclaves in 32-bit tasks, so I'm
> guessing you could just delete the check.

I guess you are right. I can remove this.


> 
> > +
> > +       if (!(secs->xfrm & XFEATURE_MASK_FP) ||
> > +           !(secs->xfrm & XFEATURE_MASK_SSE) ||
> > +           (((secs->xfrm >> XFEATURE_BNDREGS) & 1) !=
> > +            ((secs->xfrm >> XFEATURE_BNDCSR) & 1)) ||
> > +           (secs->xfrm & ~sgx_xfrm_mask))
> > +               return -EINVAL;
> 
> Do we need to check that the enclave doesn't use xfeatures that the
> kernel doesn't know about?  Or are they all safe by design in enclave
> mode?

Really good catch BTW. We don't check what kernel doesn't know about.

I'm not sure what harm this would have as the enclave cannot have much
effect to the outside world. Is there easy way to get similar mask
of kernel supported features as sgx_xfrm_mask? The safe play would
be to use such here as I don't have definitive answer to your second
question.

> 
> > +static int sgx_encl_pm_notifier(struct notifier_block *nb,
> > +                               unsigned long action, void *data)
> > +{
> > +       struct sgx_encl *encl = container_of(nb, struct sgx_encl, pm_notifier);
> > +
> > +       if (action != PM_SUSPEND_PREPARE && action != PM_HIBERNATION_PREPARE)
> > +               return NOTIFY_DONE;
> 
> Hmm.  There's an argument to made that omitting this would better
> exercise the code that handles fully asynchronous loss of an enclave.
> Also, I think you're unnecessarily killing enclaves when suspend is
> attempted but fails.

Are you proposing to not do anything at all to the enclaves? There was
is a problem that it could lead to infinite #PF loop if we don't do
it.


> 
> > +
> > +static int sgx_get_key_hash(const void *modulus, void *hash)
> > +{
> > +       struct crypto_shash *tfm;
> > +       int ret;
> > +
> > +       tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC);
> > +       if (IS_ERR(tfm))
> > +               return PTR_ERR(tfm);
> > +
> > +       ret = __sgx_get_key_hash(tfm, modulus, hash);
> > +
> > +       crypto_free_shash(tfm);
> > +       return ret;
> > +}
> > +
> 
> I'm so sorry you had to deal with this API.  Once Zinc lands, you
> could clean this up :)
> 
> 
> > +static int sgx_encl_get(unsigned long addr, struct sgx_encl **encl)
> > +{
> > +       struct mm_struct *mm = current->mm;
> > +       struct vm_area_struct *vma;
> > +       int ret;
> > +
> > +       if (addr & (PAGE_SIZE - 1))
> > +               return -EINVAL;
> > +
> > +       down_read(&mm->mmap_sem);
> > +
> > +       ret = sgx_encl_find(mm, addr, &vma);
> > +       if (!ret) {
> > +               *encl = vma->vm_private_data;
> > +
> > +               if ((*encl)->flags & SGX_ENCL_SUSPEND)
> > +                       ret = SGX_POWER_LOST_ENCLAVE;
> > +               else
> > +                       kref_get(&(*encl)->refcount);
> > +       }
> 
> Hmm.  This version has explicit refcounting.
> 
> > +static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > +       vma->vm_ops = &sgx_vm_ops;
> > +       vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO |
> > +                        VM_DONTCOPY;
> > +
> > +       return 0;
> > +}
> > +
> > +static unsigned long sgx_get_unmapped_area(struct file *file,
> > +                                          unsigned long addr,
> > +                                          unsigned long len,
> > +                                          unsigned long pgoff,
> > +                                          unsigned long flags)
> > +{
> > +       if (len < 2 * PAGE_SIZE || (len & (len - 1)))
> > +               return -EINVAL;
> > +
> > +       if (len > sgx_encl_size_max_64)
> > +               return -EINVAL;
> > +
> > +       if (len > sgx_encl_size_max_32 && test_thread_flag(TIF_ADDR32))
> > +               return -EINVAL;
> 
> Generally speaking, this type of check wants to be
> in_compat_syscall().  But I'm not sure I understand why you need it at
> all.

I'll remove it.

> 
> > +static void sgx_ipi_cb(void *info)
> > +{
> > +}
> > +
> > +void sgx_flush_cpus(struct sgx_encl *encl)
> > +{
> > +       on_each_cpu_mask(mm_cpumask(encl->mm), sgx_ipi_cb, NULL, 1);
> > +}
> 
> Please add a comment explaining what this promises to do.

Will do.

/Jarkko
Jarkko Sakkinen Dec. 19, 2018, 5:24 a.m. UTC | #113
On Wed, Dec 19, 2018 at 06:47:32AM +0200, Jarkko Sakkinen wrote:
> On Tue, Dec 18, 2018 at 07:44:18AM -0800, Sean Christopherson wrote:
> > My fd/inode knowledge is lacking, to say the least.  Whatever works, so
> > long as we have a way to uniquely identify enclaves.
> 
> I will simply trial and error :-) I think it should work since it does
> own an address space, but yeah, testing will tell. We can go also with
> anon inode if required.

I think this can be concluded with the fact that it is nice to be able
to multiplex the dev fd. That is the key reason for using anon inode.
You KVM comment locks the decision here.

/Jarkko
Sean Christopherson Dec. 21, 2018, 6:28 p.m. UTC | #114
On Wed, Dec 19, 2018 at 07:00:47AM +0200, Jarkko Sakkinen wrote:
> On Tue, Dec 18, 2018 at 10:53:49AM -0800, Sean Christopherson wrote:
> > What if we re-organize the ioctls in such a way that we leave open the
> > possibility of allocating raw EPC for KVM via /dev/sgx?  I'm not 100%
> > positive this approach will work[1], but conceptually it fits well with
> > KVM's memory model, e.g. KVM is aware of the GPA<->HVA association but
> > generally speaking doesn't know what's physically backing each memory
> > region.
> 
> Why would you want to pass EPC through user space to KVM rather than
> KVM allocating it through kernel interfaces?

Delegating EPC management to userspace fits better with KVM's existing
memory ABI.  KVM provides a single ioctl(), KVM_SET_USER_MEMORY_REGION[1],
that allows userspace to create, move, modify and delete memory regions.

Skipping over a lot of details, there are essentially three options for
exposing EPC to a KVM guest:

 1) Provide a dedicated KVM ioctl() to manage EPC without routing it
    through KVM_SET_USER_MEMORY_REGION.

 2) Add a flag to 'struct kvm_userspace_memory_region' that denotes an
    EPC memory region and mmap() / allocate EPC in KVM.

 3) Provide an ABI to allocate raw EPC and let userspace manage it like
    any other memory region.

Option (1) requires duplicating all of KVM_SET_USER_MEMORY_REGION's
functionality unless the ioctl() is severly restricted.

Option (2) is an ugly abuse of KVM_SET_USER_MEMORY_REGION since the EPC
flag would have completely different semantics than all other usage of
KVM_SET_USER_MEMORY_REGION.

Thus, option (3).

Probably a better question to answer is why provide the ABI through
/dev/sgx and not /dev/kvm.  IMO /dev/sgx is a more logical way to
advertise support to userspace, e.g. userspace can simply check if
/dev/sgx (or /dev/sgx/epc) exists vs. probing a KVM capability.

Without EPC oversubscription in KVM, /dev/sgx is easily the best fit
since the EPC management would reside completely in x86/sgx, i.e. KVM
would essentially have zero code related to EPC management.

EPC oversubscription complicates things because the architecture forces
aspects of VMM oversubscription into the KVM domain, e.g. requires a
post-VMXON instruction (ENCLV) and a VM-Exit handler.   I still think
/dev/sgx is a better fit, my only concern is that the oversubscription
code would be even more heinous due to splitting responsibilities.
But, Andy's idea of having /dev/sgx/enclave vs. /dev/sgx/epc might help
avoid that entirely.
Jarkko Sakkinen Dec. 22, 2018, 12:01 a.m. UTC | #115
On Fri, Dec 21, 2018 at 10:28:09AM -0800, Sean Christopherson wrote:
> > Why would you want to pass EPC through user space to KVM rather than
> > KVM allocating it through kernel interfaces?
> 
> Delegating EPC management to userspace fits better with KVM's existing
> memory ABI.  KVM provides a single ioctl(), KVM_SET_USER_MEMORY_REGION[1],
> that allows userspace to create, move, modify and delete memory regions.
> 
> Skipping over a lot of details, there are essentially three options for
> exposing EPC to a KVM guest:
> 
>  1) Provide a dedicated KVM ioctl() to manage EPC without routing it
>     through KVM_SET_USER_MEMORY_REGION.
> 
>  2) Add a flag to 'struct kvm_userspace_memory_region' that denotes an
>     EPC memory region and mmap() / allocate EPC in KVM.
> 
>  3) Provide an ABI to allocate raw EPC and let userspace manage it like
>     any other memory region.
> 
> Option (1) requires duplicating all of KVM_SET_USER_MEMORY_REGION's
> functionality unless the ioctl() is severly restricted.
> 
> Option (2) is an ugly abuse of KVM_SET_USER_MEMORY_REGION since the EPC
> flag would have completely different semantics than all other usage of
> KVM_SET_USER_MEMORY_REGION.
> 
> Thus, option (3).

OK, thank you for patience explaining this.

> Probably a better question to answer is why provide the ABI through
> /dev/sgx and not /dev/kvm.  IMO /dev/sgx is a more logical way to
> advertise support to userspace, e.g. userspace can simply check if
> /dev/sgx (or /dev/sgx/epc) exists vs. probing a KVM capability.

You have to understand that for a KVM non-expert like me it was really
important to get the context, which you kindly gave. I have never used
KVM's memory management API but now that I know how it works all of this
makes perfect sense. This is not a better question but it is definitely
a good follow up question :-)

I don't really understand you deduction here, however. If SGX was not
supported, why couldn't the hypothetical /dev/kvm functionality just
return an error?

For me it sounds a bit messy that KVM functionality, which is a client
to the SGX functionality, places some of its functionality to the SGX
core.

/Jarkko
diff mbox series

Patch

diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
new file mode 100644
index 000000000000..aadf9c76e360
--- /dev/null
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -0,0 +1,59 @@ 
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/**
+ * Copyright(c) 2016-18 Intel Corporation.
+ */
+#ifndef _UAPI_ASM_X86_SGX_H
+#define _UAPI_ASM_X86_SGX_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define SGX_MAGIC 0xA4
+
+#define SGX_IOC_ENCLAVE_CREATE \
+	_IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create)
+#define SGX_IOC_ENCLAVE_ADD_PAGE \
+	_IOW(SGX_MAGIC, 0x01, struct sgx_enclave_add_page)
+#define SGX_IOC_ENCLAVE_INIT \
+	_IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
+
+/* IOCTL return values */
+#define SGX_POWER_LOST_ENCLAVE		0x40000000
+
+/**
+ * struct sgx_enclave_create - parameter structure for the
+ *                             %SGX_IOC_ENCLAVE_CREATE ioctl
+ * @src:	address for the SECS page data
+ */
+struct sgx_enclave_create  {
+	__u64	src;
+};
+
+/**
+ * struct sgx_enclave_add_page - parameter structure for the
+ *                               %SGX_IOC_ENCLAVE_ADD_PAGE ioctl
+ * @addr:	address within the ELRANGE
+ * @src:	address for the page data
+ * @secinfo:	address for the SECINFO data
+ * @mrmask:	bitmask for the measured 256 byte chunks
+ */
+struct sgx_enclave_add_page {
+	__u64	addr;
+	__u64	src;
+	__u64	secinfo;
+	__u16	mrmask;
+} __attribute__((__packed__));
+
+
+/**
+ * struct sgx_enclave_init - parameter structure for the
+ *                           %SGX_IOC_ENCLAVE_INIT ioctl
+ * @addr:	address within the ELRANGE
+ * @sigstruct:	address for the SIGSTRUCT data
+ */
+struct sgx_enclave_init {
+	__u64	addr;
+	__u64	sigstruct;
+};
+
+#endif /* _UAPI_ASM_X86_SGX_H */
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 54f6a40c75c6..e7c8d7898434 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -1288,6 +1288,8 @@  config INTEL_ATOMISP2_PM
 	  To compile this driver as a module, choose M here: the module
 	  will be called intel_atomisp2_pm.
 
+source "drivers/platform/x86/intel_sgx/Kconfig"
+
 endif # X86_PLATFORM_DEVICES
 
 config PMC_ATOM
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 39ae94135406..a826ab3d7987 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -96,3 +96,4 @@  obj-$(CONFIG_INTEL_TURBO_MAX_3) += intel_turbo_max_3.o
 obj-$(CONFIG_INTEL_CHTDC_TI_PWRBTN)	+= intel_chtdc_ti_pwrbtn.o
 obj-$(CONFIG_I2C_MULTI_INSTANTIATE)	+= i2c-multi-instantiate.o
 obj-$(CONFIG_INTEL_ATOMISP2_PM)	+= intel_atomisp2_pm.o
+obj-$(CONFIG_INTEL_SGX) += intel_sgx/
diff --git a/drivers/platform/x86/intel_sgx/Kconfig b/drivers/platform/x86/intel_sgx/Kconfig
new file mode 100644
index 000000000000..7d22d44acce9
--- /dev/null
+++ b/drivers/platform/x86/intel_sgx/Kconfig
@@ -0,0 +1,20 @@ 
+#
+# Intel SGX
+#
+
+config INTEL_SGX
+	tristate "Intel(R) SGX Driver"
+	depends on X86_64 && CPU_SUP_INTEL && INTEL_SGX_CORE
+	select MMU_NOTIFIER
+	select CRYPTO
+	select CRYPTO_SHA256
+	help
+	Intel(R) SGX is a set of CPU instructions that can be used by
+	applications to set aside private regions of code and data.  The code
+	outside the enclave is disallowed to access the memory inside the
+	enclave by the CPU access control.
+
+	The firmware uses PRMRR registers to reserve an area of physical memory
+	called Enclave Page Cache (EPC). There is a hardware unit in the
+	processor called Memory Encryption Engine. The MEE encrypts and decrypts
+	the EPC pages as they enter and leave the processor package.
diff --git a/drivers/platform/x86/intel_sgx/Makefile b/drivers/platform/x86/intel_sgx/Makefile
new file mode 100644
index 000000000000..117e97effeff
--- /dev/null
+++ b/drivers/platform/x86/intel_sgx/Makefile
@@ -0,0 +1,12 @@ 
+#
+# Intel SGX
+#
+
+obj-$(CONFIG_INTEL_SGX) += intel_sgx.o
+
+intel_sgx-$(CONFIG_INTEL_SGX) += \
+	sgx_encl.o \
+	sgx_ioctl.o \
+	sgx_main.o \
+	sgx_util.o \
+	sgx_vma.o \
diff --git a/drivers/platform/x86/intel_sgx/sgx.h b/drivers/platform/x86/intel_sgx/sgx.h
new file mode 100644
index 000000000000..67bd8ea1d53d
--- /dev/null
+++ b/drivers/platform/x86/intel_sgx/sgx.h
@@ -0,0 +1,180 @@ 
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/**
+ * Copyright(c) 2016-18 Intel Corporation.
+ */
+#ifndef __ARCH_INTEL_SGX_H__
+#define __ARCH_INTEL_SGX_H__
+
+#include <crypto/hash.h>
+#include <linux/kref.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mmu_notifier.h>
+#include <linux/radix-tree.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#define sgx_pr(level, encl, fmt, ...)				\
+	pr_ ## level("sgx: [%d:0x%p] " fmt, pid_nr((encl)->tgid),	\
+		     (void *)(encl)->base, ##__VA_ARGS__)
+#define sgx_dbg(encl, fmt, ...) \
+	sgx_pr(debug, encl, fmt, ##__VA_ARGS__)
+#define sgx_info(encl, fmt, ...) \
+	sgx_pr(info, encl, fmt, ##__VA_ARGS__)
+#define sgx_warn(encl, fmt, ...) \
+	sgx_pr(warn, encl, fmt, ##__VA_ARGS__)
+#define sgx_err(encl, fmt, ...) \
+	sgx_pr(err, encl, fmt, ##__VA_ARGS__)
+#define sgx_crit(encl, fmt, ...) \
+	sgx_pr(crit, encl, fmt, ##__VA_ARGS__)
+
+#define SGX_EINIT_SPIN_COUNT	20
+#define SGX_EINIT_SLEEP_COUNT	50
+#define SGX_EINIT_SLEEP_TIME	20
+
+/**
+ * enum sgx_encl_page_desc - defines bits for an enclave page's descriptor
+ * %SGX_ENCL_PAGE_TCS:			The page is a TCS page.
+ * %SGX_ENCL_PAGE_LOADED:		The page is not swapped.
+ * %SGX_ENCL_PAGE_ADDR_MASK:		Holds the virtual address of the page.
+ */
+enum sgx_encl_page_desc {
+	SGX_ENCL_PAGE_TCS		= BIT(0),
+	SGX_ENCL_PAGE_LOADED		= BIT(1),
+	/* Bits 11:3 are available when the page is not swapped. */
+	SGX_ENCL_PAGE_ADDR_MASK		= PAGE_MASK,
+};
+
+#define SGX_ENCL_PAGE_ADDR(encl_page) \
+	((encl_page)->desc & SGX_ENCL_PAGE_ADDR_MASK)
+#define SGX_ENCL_PAGE_VA_OFFSET(encl_page) \
+	((encl_page)->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK)
+
+struct sgx_encl_page {
+	unsigned long desc;
+	union {
+		struct sgx_epc_page *epc_page;
+		struct sgx_va_page *va_page;
+	};
+	struct sgx_encl *encl;
+};
+
+enum sgx_encl_flags {
+	SGX_ENCL_INITIALIZED	= BIT(0),
+	SGX_ENCL_DEBUG		= BIT(1),
+	SGX_ENCL_SUSPEND	= BIT(2),
+	SGX_ENCL_DEAD		= BIT(3),
+};
+
+struct sgx_encl {
+	unsigned int flags;
+	uint64_t attributes;
+	uint64_t xfrm;
+	unsigned int page_cnt;
+	unsigned int secs_child_cnt;
+	struct mutex lock;
+	struct mm_struct *mm;
+	struct file *backing;
+	struct kref refcount;
+	unsigned long base;
+	unsigned long size;
+	unsigned long ssaframesize;
+	struct radix_tree_root page_tree;
+	struct list_head add_page_reqs;
+	struct work_struct add_page_work;
+	struct sgx_encl_page secs;
+	struct pid *tgid;
+	struct mmu_notifier mmu_notifier;
+	struct notifier_block pm_notifier;
+};
+
+static inline pgoff_t sgx_encl_page_backing_index(struct sgx_encl_page *page,
+						  struct sgx_encl *encl)
+{
+	/* The backing page for SECS is located after the enclave pages. */
+	if (!PFN_DOWN(page->desc))
+		return PFN_DOWN(encl->size);
+
+	return PFN_DOWN(page->desc - encl->base);
+}
+
+extern struct workqueue_struct *sgx_add_page_wq;
+extern u64 sgx_encl_size_max_32;
+extern u64 sgx_encl_size_max_64;
+extern u64 sgx_xfrm_mask;
+extern u32 sgx_misc_reserved;
+extern u32 sgx_xsave_size_tbl[64];
+extern int sgx_epcm_trapnr;
+
+extern const struct vm_operations_struct sgx_vm_ops;
+
+int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
+		  struct vm_area_struct **vma);
+void sgx_invalidate(struct sgx_encl *encl, bool flush_cpus);
+
+/**
+ * SGX_INVD - invalidate an enclave on failure, i.e. if ret != 0
+ *
+ * @ret:	a return code to check
+ * @encl:	pointer to an enclave
+ * @fmt:	message for WARN if failure is detected
+ * @...:	optional arguments used by @fmt
+ *
+ * SGX_INVD is used in flows where an error, i.e. @ret is non-zero, is
+ * indicative of a driver bug.  Invalidate @encl if @ret indicates an
+ * error and WARN on error unless the error was due to a fault signaled
+ * by the EPCM.
+ *
+ * Faults from the EPCM occur in normal kernel operation, e.g. due to
+ * misonfigured mprotect() from userspace or because the EPCM invalidated
+ * all EPC pages.  The EPCM invalidates the EPC on transitions to S3 or
+ * lower sleep states, and VMMs emulate loss of EPC when migrating VMs.
+ *
+ * Defined as a macro instead of a function so that WARN can provide a
+ * more precise trace.
+ */
+#define SGX_INVD(ret, encl, fmt, ...)					  \
+do {									  \
+	if (unlikely(ret)) {						  \
+		int trapnr = IS_ENCLS_FAULT(ret) ? ENCLS_TRAPNR(ret) : 0; \
+		WARN(trapnr != sgx_epcm_trapnr, fmt, ##__VA_ARGS__);	  \
+		sgx_invalidate(encl, true);				  \
+	}								  \
+} while (0)
+
+struct sgx_encl *sgx_encl_alloc(struct sgx_secs *secs);
+int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs);
+struct sgx_encl_page *sgx_encl_alloc_page(struct sgx_encl *encl,
+					  unsigned long addr);
+void sgx_encl_free_page(struct sgx_encl_page *encl_page);
+int sgx_encl_add_page(struct sgx_encl *encl, unsigned long addr, void *data,
+		      struct sgx_secinfo *secinfo, unsigned int mrmask);
+int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
+		  struct sgx_einittoken *einittoken);
+void sgx_encl_block(struct sgx_encl_page *encl_page);
+void sgx_encl_track(struct sgx_encl *encl);
+int sgx_encl_load_page(struct sgx_encl_page *encl_page,
+		       struct sgx_epc_page *epc_page);
+void sgx_encl_release(struct kref *ref);
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+#ifdef CONFIG_COMPAT
+long sgx_compat_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+#endif
+
+struct sgx_encl_page *sgx_fault_page(struct vm_area_struct *vma,
+				     unsigned long addr,
+				     bool do_reserve);
+
+int sgx_test_and_clear_young(struct sgx_encl_page *page);
+void sgx_flush_cpus(struct sgx_encl *encl);
+void sgx_set_page_loaded(struct sgx_encl_page *encl_page,
+			 struct sgx_epc_page *epc_page);
+struct page *sgx_get_backing(struct file *file, pgoff_t index);
+void sgx_put_backing(struct page *backing_page, bool write);
+
+#endif /* __ARCH_X86_INTEL_SGX_H__ */
diff --git a/drivers/platform/x86/intel_sgx/sgx_encl.c b/drivers/platform/x86/intel_sgx/sgx_encl.c
new file mode 100644
index 000000000000..6bed944c2f92
--- /dev/null
+++ b/drivers/platform/x86/intel_sgx/sgx_encl.c
@@ -0,0 +1,784 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-18 Intel Corporation.
+
+#include <asm/mman.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/hashtable.h>
+#include <linux/highmem.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/signal.h>
+#include <linux/shmem_fs.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include "sgx.h"
+
+struct sgx_add_page_req {
+	struct sgx_encl *encl;
+	struct sgx_encl_page *encl_page;
+	struct sgx_secinfo secinfo;
+	unsigned long mrmask;
+	struct list_head list;
+};
+
+/**
+ * sgx_encl_find - find an enclave
+ * @mm:		mm struct of the current process
+ * @addr:	address in the ELRANGE
+ * @vma:	the resulting VMA
+ *
+ * Finds an enclave identified by the given address. Gives back the VMA, that
+ * is part of the enclave, located in that address. The VMA is given back if it
+ * is a proper enclave VMA even if an &sgx_encl instance does not exist
+ * yet (enclave creation has not been performed).
+ *
+ * Return:
+ *   0 on success,
+ *   -EINVAL if an enclave was not found,
+ *   -ENOENT if the enclave has not been created yet
+ */
+int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
+		  struct vm_area_struct **vma)
+{
+	struct vm_area_struct *result;
+	struct sgx_encl *encl;
+
+	result = find_vma(mm, addr);
+	if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start)
+		return -EINVAL;
+
+	encl = result->vm_private_data;
+	*vma = result;
+
+	return encl ? 0 : -ENOENT;
+}
+
+/**
+ * sgx_invalidate - kill an enclave
+ * @encl:	an &sgx_encl instance
+ * @flush_cpus	Set if there can be active threads inside the enclave.
+ *
+ * Mark the enclave as dead and immediately free its EPC pages (but not
+ * its resources).  For active enclaves, the entry points to the enclave
+ * are destroyed first and hardware threads are kicked out so that the
+ * EPC pages can be safely manipulated.
+ */
+void sgx_invalidate(struct sgx_encl *encl, bool flush_cpus)
+{
+	struct sgx_encl_page *entry;
+	struct radix_tree_iter iter;
+	struct vm_area_struct *vma;
+	unsigned long addr;
+	void **slot;
+
+	if (encl->flags & SGX_ENCL_DEAD)
+		return;
+
+	encl->flags |= SGX_ENCL_DEAD;
+	if (flush_cpus && (encl->flags & SGX_ENCL_INITIALIZED)) {
+		radix_tree_for_each_slot(slot, &encl->page_tree, &iter, 0) {
+			entry = *slot;
+			addr = SGX_ENCL_PAGE_ADDR(entry);
+			if ((entry->desc & SGX_ENCL_PAGE_LOADED) &&
+			    (entry->desc & SGX_ENCL_PAGE_TCS) &&
+			    !sgx_encl_find(encl->mm, addr, &vma))
+				zap_vma_ptes(vma, addr, PAGE_SIZE);
+		}
+		sgx_flush_cpus(encl);
+	}
+	radix_tree_for_each_slot(slot, &encl->page_tree, &iter, 0) {
+		entry = *slot;
+		if (entry->desc & SGX_ENCL_PAGE_LOADED) {
+			if (!__sgx_free_page(entry->epc_page)) {
+				encl->secs_child_cnt--;
+				entry->desc &= ~SGX_ENCL_PAGE_LOADED;
+			}
+		}
+	}
+
+	if (!encl->secs_child_cnt &&
+	    (encl->secs.desc & SGX_ENCL_PAGE_LOADED)) {
+		encl->secs.desc &= ~SGX_ENCL_PAGE_LOADED;
+		sgx_free_page(encl->secs.epc_page);
+	}
+}
+
+static bool sgx_process_add_page_req(struct sgx_add_page_req *req,
+				     struct sgx_epc_page *epc_page)
+{
+	struct sgx_encl_page *encl_page = req->encl_page;
+	struct sgx_encl *encl = req->encl;
+	struct sgx_secinfo secinfo;
+	struct sgx_pageinfo pginfo;
+	struct vm_area_struct *vma;
+	pgoff_t backing_index;
+	struct page *backing;
+	unsigned long addr;
+	int ret;
+	int i;
+
+	if (encl->flags & (SGX_ENCL_SUSPEND | SGX_ENCL_DEAD))
+		return false;
+
+	addr = SGX_ENCL_PAGE_ADDR(encl_page);
+	ret = sgx_encl_find(encl->mm, addr, &vma);
+	if (ret)
+		return false;
+
+	backing_index = sgx_encl_page_backing_index(encl_page, encl);
+	backing = sgx_get_backing(encl->backing, backing_index);
+	if (IS_ERR(backing))
+		return false;
+
+	ret = vmf_insert_pfn(vma, addr, PFN_DOWN(epc_page->desc));
+	if (ret != VM_FAULT_NOPAGE) {
+		sgx_put_backing(backing, false);
+		return false;
+	}
+
+	/*
+	 * The SECINFO field must be 64-byte aligned, copy it to a local
+	 * variable that is guaranteed to be aligned as req->secinfo may
+	 * or may not be 64-byte aligned, e.g. req may have been allocated
+	 * via kzalloc which is not aware of __aligned attributes.
+	 */
+	memcpy(&secinfo, &req->secinfo, sizeof(secinfo));
+
+	pginfo.secs = (unsigned long)sgx_epc_addr(encl->secs.epc_page);
+	pginfo.addr = addr;
+	pginfo.metadata = (unsigned long)&secinfo;
+	pginfo.contents = (unsigned long)kmap_atomic(backing);
+	ret = __eadd(&pginfo, sgx_epc_addr(epc_page));
+	kunmap_atomic((void *)(unsigned long)pginfo.contents);
+
+	sgx_put_backing(backing, false);
+	if (ret) {
+		SGX_INVD(ret, encl, "EADD returned %d (0x%x)", ret, ret);
+		zap_vma_ptes(vma, addr, PAGE_SIZE);
+		return false;
+	}
+
+	for_each_set_bit(i, &req->mrmask, 16) {
+		ret = __eextend(sgx_epc_addr(encl->secs.epc_page),
+				sgx_epc_addr(epc_page) + (i * 0x100));
+		if (ret) {
+			SGX_INVD(ret, encl, "EEXTEND returned %d (0x%x)", ret, ret);
+			zap_vma_ptes(vma, addr, PAGE_SIZE);
+			return ret;
+		}
+	}
+
+	encl_page->encl = encl;
+	encl->secs_child_cnt++;
+	sgx_set_page_loaded(encl_page, epc_page);
+	sgx_test_and_clear_young(encl_page);
+	return true;
+}
+
+static void sgx_add_page_worker(struct work_struct *work)
+{
+	struct sgx_add_page_req *req;
+	bool skip_rest = false;
+	bool is_empty = false;
+	struct sgx_encl *encl;
+	struct sgx_epc_page *epc_page;
+
+	encl = container_of(work, struct sgx_encl, add_page_work);
+
+	do {
+		schedule();
+
+		mutex_lock(&encl->lock);
+		if (encl->flags & SGX_ENCL_DEAD)
+			skip_rest = true;
+
+		req = list_first_entry(&encl->add_page_reqs,
+				       struct sgx_add_page_req, list);
+		list_del(&req->list);
+		is_empty = list_empty(&encl->add_page_reqs);
+		mutex_unlock(&encl->lock);
+
+		if (skip_rest)
+			goto next;
+
+		epc_page = sgx_alloc_page();
+		down_read(&encl->mm->mmap_sem);
+		mutex_lock(&encl->lock);
+
+		if (IS_ERR(epc_page)) {
+			sgx_invalidate(encl, false);
+			skip_rest = true;
+		} else	if (!sgx_process_add_page_req(req, epc_page)) {
+			sgx_free_page(epc_page);
+			sgx_invalidate(encl, false);
+			skip_rest = true;
+		}
+
+		mutex_unlock(&encl->lock);
+		up_read(&encl->mm->mmap_sem);
+
+next:
+		kfree(req);
+	} while (!kref_put(&encl->refcount, sgx_encl_release) && !is_empty);
+}
+
+static u32 sgx_calc_ssaframesize(u32 miscselect, u64 xfrm)
+{
+	u32 size_max = PAGE_SIZE;
+	u32 size;
+	int i;
+
+	for (i = 2; i < 64; i++) {
+		if (!((1 << i) & xfrm))
+			continue;
+
+		size = SGX_SSA_GPRS_SIZE + sgx_xsave_size_tbl[i];
+		if (miscselect & SGX_MISC_EXINFO)
+			size += SGX_SSA_MISC_EXINFO_SIZE;
+
+		if (size > size_max)
+			size_max = size;
+	}
+
+	return PFN_UP(size_max);
+}
+
+static int sgx_validate_secs(const struct sgx_secs *secs,
+			     unsigned long ssaframesize)
+{
+	if (secs->size < (2 * PAGE_SIZE) || !is_power_of_2(secs->size))
+		return -EINVAL;
+
+	if (secs->base & (secs->size - 1))
+		return -EINVAL;
+
+	if (secs->attributes & SGX_ATTR_RESERVED_MASK ||
+	    secs->miscselect & sgx_misc_reserved)
+		return -EINVAL;
+
+	if (secs->attributes & SGX_ATTR_MODE64BIT) {
+		if (secs->size > sgx_encl_size_max_64)
+			return -EINVAL;
+	} else {
+		/* On 64-bit architecture allow 32-bit encls only in
+		 * the compatibility mode.
+		 */
+		if (!test_thread_flag(TIF_ADDR32))
+			return -EINVAL;
+		if (secs->size > sgx_encl_size_max_32)
+			return -EINVAL;
+	}
+
+	if (!(secs->xfrm & XFEATURE_MASK_FP) ||
+	    !(secs->xfrm & XFEATURE_MASK_SSE) ||
+	    (((secs->xfrm >> XFEATURE_BNDREGS) & 1) !=
+	     ((secs->xfrm >> XFEATURE_BNDCSR) & 1)) ||
+	    (secs->xfrm & ~sgx_xfrm_mask))
+		return -EINVAL;
+
+	if (!secs->ssa_frame_size || ssaframesize > secs->ssa_frame_size)
+		return -EINVAL;
+
+	if (memchr_inv(secs->reserved1, 0, SGX_SECS_RESERVED1_SIZE) ||
+	    memchr_inv(secs->reserved2, 0, SGX_SECS_RESERVED2_SIZE) ||
+	    memchr_inv(secs->reserved3, 0, SGX_SECS_RESERVED3_SIZE) ||
+	    memchr_inv(secs->reserved4, 0, SGX_SECS_RESERVED4_SIZE))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
+				     struct mm_struct *mm)
+{
+	struct sgx_encl *encl =
+		container_of(mn, struct sgx_encl, mmu_notifier);
+
+	mutex_lock(&encl->lock);
+	encl->flags |= SGX_ENCL_DEAD;
+	mutex_unlock(&encl->lock);
+}
+
+static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
+	.release	= sgx_mmu_notifier_release,
+};
+
+/**
+ * sgx_encl_alloc - allocate memory for an enclave and set attributes
+ *
+ * @secs:	SECS data (must be page aligned)
+ *
+ * Allocates a new &sgx_encl instance. Validates SECS attributes, creates
+ * backing storage for the enclave and sets enclave attributes to sane initial
+ * values.
+ *
+ * Return:
+ *   an &sgx_encl instance,
+ *   -errno otherwise
+ */
+struct sgx_encl *sgx_encl_alloc(struct sgx_secs *secs)
+{
+	unsigned long ssaframesize;
+	struct sgx_encl *encl;
+	struct file *backing;
+
+	ssaframesize = sgx_calc_ssaframesize(secs->miscselect, secs->xfrm);
+	if (sgx_validate_secs(secs, ssaframesize))
+		return ERR_PTR(-EINVAL);
+
+	backing = shmem_file_setup("[dev/sgx]", secs->size + PAGE_SIZE,
+				   VM_NORESERVE);
+	if (IS_ERR(backing))
+		return ERR_CAST(backing);
+
+	encl = kzalloc(sizeof(*encl), GFP_KERNEL);
+	if (!encl) {
+		fput(backing);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	encl->attributes = secs->attributes;
+	encl->xfrm = secs->xfrm;
+
+	kref_init(&encl->refcount);
+	INIT_LIST_HEAD(&encl->add_page_reqs);
+	INIT_RADIX_TREE(&encl->page_tree, GFP_KERNEL);
+	mutex_init(&encl->lock);
+	INIT_WORK(&encl->add_page_work, sgx_add_page_worker);
+
+	encl->mm = current->mm;
+	encl->base = secs->base;
+	encl->size = secs->size;
+	encl->ssaframesize = secs->ssa_frame_size;
+	encl->backing = backing;
+
+	return encl;
+}
+
+static int sgx_encl_pm_notifier(struct notifier_block *nb,
+				unsigned long action, void *data)
+{
+	struct sgx_encl *encl = container_of(nb, struct sgx_encl, pm_notifier);
+
+	if (action != PM_SUSPEND_PREPARE && action != PM_HIBERNATION_PREPARE)
+		return NOTIFY_DONE;
+
+	mutex_lock(&encl->lock);
+	sgx_invalidate(encl, false);
+	encl->flags |= SGX_ENCL_SUSPEND;
+	mutex_unlock(&encl->lock);
+	flush_work(&encl->add_page_work);
+	return NOTIFY_DONE;
+}
+
+/**
+ * sgx_encl_create - create an enclave
+ *
+ * @encl:	an enclave
+ * @secs:	page aligned SECS data
+ *
+ * Validates SECS attributes, allocates an EPC page for the SECS and creates
+ * the enclave by performing ECREATE.
+ *
+ * Return:
+ *   0 on success,
+ *   -errno otherwise
+ */
+int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
+{
+	struct vm_area_struct *vma;
+	struct sgx_pageinfo pginfo;
+	struct sgx_secinfo secinfo;
+	struct sgx_epc_page *secs_epc;
+	long ret;
+
+	secs_epc = sgx_alloc_page();
+	if (IS_ERR(secs_epc)) {
+		ret = PTR_ERR(secs_epc);
+		return ret;
+	}
+
+	sgx_set_page_loaded(&encl->secs, secs_epc);
+	encl->secs.encl = encl;
+	encl->tgid = get_pid(task_tgid(current));
+
+	pginfo.addr = 0;
+	pginfo.contents = (unsigned long)secs;
+	pginfo.metadata = (unsigned long)&secinfo;
+	pginfo.secs = 0;
+	memset(&secinfo, 0, sizeof(secinfo));
+	ret = __ecreate((void *)&pginfo, sgx_epc_addr(secs_epc));
+
+	if (ret) {
+		sgx_dbg(encl, "ECREATE returned %ld\n", ret);
+		return ret;
+	}
+
+	if (secs->attributes & SGX_ATTR_DEBUG)
+		encl->flags |= SGX_ENCL_DEBUG;
+
+	encl->mmu_notifier.ops = &sgx_mmu_notifier_ops;
+	ret = mmu_notifier_register(&encl->mmu_notifier, encl->mm);
+	if (ret) {
+		if (ret == -EINTR)
+			ret = -ERESTARTSYS;
+		encl->mmu_notifier.ops = NULL;
+		return ret;
+	}
+
+	encl->pm_notifier.notifier_call = &sgx_encl_pm_notifier;
+	ret = register_pm_notifier(&encl->pm_notifier);
+	if (ret) {
+		encl->pm_notifier.notifier_call = NULL;
+		return ret;
+	}
+
+	down_read(&current->mm->mmap_sem);
+	ret = sgx_encl_find(current->mm, secs->base, &vma);
+	if (ret != -ENOENT) {
+		if (!ret)
+			ret = -EINVAL;
+		up_read(&current->mm->mmap_sem);
+		return ret;
+	}
+
+	if (vma->vm_start != secs->base ||
+	    vma->vm_end != (secs->base + secs->size) ||
+	    vma->vm_pgoff != 0) {
+		ret = -EINVAL;
+		up_read(&current->mm->mmap_sem);
+		return ret;
+	}
+
+	vma->vm_private_data = encl;
+	up_read(&current->mm->mmap_sem);
+	return 0;
+}
+
+static int sgx_validate_secinfo(struct sgx_secinfo *secinfo)
+{
+	u64 page_type = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK;
+	u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK;
+	int i;
+
+	if ((secinfo->flags & SGX_SECINFO_RESERVED_MASK) ||
+	    ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R)) ||
+	    (page_type != SGX_SECINFO_TCS &&
+	     page_type != SGX_SECINFO_REG))
+		return -EINVAL;
+
+	for (i = 0; i < SGX_SECINFO_RESERVED_SIZE; i++)
+		if (secinfo->reserved[i])
+			return -EINVAL;
+
+	return 0;
+}
+
+static bool sgx_validate_offset(struct sgx_encl *encl, unsigned long offset)
+{
+	if (offset & (PAGE_SIZE - 1))
+		return false;
+
+	if (offset >= encl->size)
+		return false;
+
+	return true;
+}
+
+static int sgx_validate_tcs(struct sgx_encl *encl, struct sgx_tcs *tcs)
+{
+	int i;
+
+	if (tcs->flags & SGX_TCS_RESERVED_MASK)
+		return -EINVAL;
+
+	if (tcs->flags & SGX_TCS_DBGOPTIN)
+		return -EINVAL;
+
+	if (!sgx_validate_offset(encl, tcs->ssa_offset))
+		return -EINVAL;
+
+	if (!sgx_validate_offset(encl, tcs->fs_offset))
+		return -EINVAL;
+
+	if (!sgx_validate_offset(encl, tcs->gs_offset))
+		return -EINVAL;
+
+	if ((tcs->fs_limit & 0xFFF) != 0xFFF)
+		return -EINVAL;
+
+	if ((tcs->gs_limit & 0xFFF) != 0xFFF)
+		return -EINVAL;
+
+	for (i = 0; i < SGX_TCS_RESERVED_SIZE; i++)
+		if (tcs->reserved[i])
+			return -EINVAL;
+
+	return 0;
+}
+
+static int __sgx_encl_add_page(struct sgx_encl *encl,
+			       struct sgx_encl_page *encl_page,
+			       void *data,
+			       struct sgx_secinfo *secinfo,
+			       unsigned int mrmask)
+{
+	u64 page_type = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK;
+	struct sgx_add_page_req *req = NULL;
+	pgoff_t backing_index;
+	struct page *backing;
+	void *backing_ptr;
+	int empty;
+
+	req = kzalloc(sizeof(*req), GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	backing_index = sgx_encl_page_backing_index(encl_page, encl);
+	backing = sgx_get_backing(encl->backing, backing_index);
+	if (IS_ERR(backing)) {
+		kfree(req);
+		return PTR_ERR(backing);
+	}
+	backing_ptr = kmap(backing);
+	memcpy(backing_ptr, data, PAGE_SIZE);
+	kunmap(backing);
+	if (page_type == SGX_SECINFO_TCS)
+		encl_page->desc |= SGX_ENCL_PAGE_TCS;
+	memcpy(&req->secinfo, secinfo, sizeof(*secinfo));
+	req->encl = encl;
+	req->encl_page = encl_page;
+	req->mrmask = mrmask;
+	empty = list_empty(&encl->add_page_reqs);
+	kref_get(&encl->refcount);
+	list_add_tail(&req->list, &encl->add_page_reqs);
+	if (empty)
+		queue_work(sgx_add_page_wq, &encl->add_page_work);
+	sgx_put_backing(backing, true /* write */);
+	return 0;
+}
+
+/**
+ * sgx_encl_alloc_page - allocate a new enclave page
+ * @encl:	an enclave
+ * @addr:	page address in the ELRANGE
+ *
+ * Return:
+ *   an &sgx_encl_page instance on success,
+ *   -errno otherwise
+ */
+struct sgx_encl_page *sgx_encl_alloc_page(struct sgx_encl *encl,
+					  unsigned long addr)
+{
+	struct sgx_encl_page *encl_page;
+	int ret;
+
+	if (radix_tree_lookup(&encl->page_tree, PFN_DOWN(addr)))
+		return ERR_PTR(-EEXIST);
+	encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
+	if (!encl_page)
+		return ERR_PTR(-ENOMEM);
+	encl_page->desc = addr;
+	encl_page->encl = encl;
+	ret = radix_tree_insert(&encl->page_tree, PFN_DOWN(encl_page->desc),
+				encl_page);
+	if (ret) {
+		kfree(encl_page);
+		return ERR_PTR(ret);
+	}
+	return encl_page;
+}
+
+/**
+ * sgx_encl_free_page - free an enclave page
+ * @encl_page:	an enclave page
+ */
+void sgx_encl_free_page(struct sgx_encl_page *encl_page)
+{
+	radix_tree_delete(&encl_page->encl->page_tree,
+			  PFN_DOWN(encl_page->desc));
+	if (encl_page->desc & SGX_ENCL_PAGE_LOADED)
+		sgx_free_page(encl_page->epc_page);
+	kfree(encl_page);
+}
+
+/**
+ * sgx_encl_add_page - add a page to the enclave
+ *
+ * @encl:	an enclave
+ * @addr:	page address in the ELRANGE
+ * @data:	page data
+ * @secinfo:	page permissions
+ * @mrmask:	bitmask to select the 256 byte chunks to be measured
+ *
+ * Creates a new enclave page and enqueues an EADD operation that will be
+ * processed by a worker thread later on.
+ *
+ * Return:
+ *   0 on success,
+ *   -errno otherwise
+ */
+int sgx_encl_add_page(struct sgx_encl *encl, unsigned long addr, void *data,
+		      struct sgx_secinfo *secinfo, unsigned int mrmask)
+{
+	u64 page_type = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK;
+	struct sgx_encl_page *encl_page;
+	int ret;
+
+	if (sgx_validate_secinfo(secinfo))
+		return -EINVAL;
+	if (page_type == SGX_SECINFO_TCS) {
+		ret = sgx_validate_tcs(encl, data);
+		if (ret)
+			return ret;
+	}
+	mutex_lock(&encl->lock);
+	if (encl->flags & (SGX_ENCL_INITIALIZED | SGX_ENCL_DEAD)) {
+		mutex_unlock(&encl->lock);
+		return -EINVAL;
+	}
+	encl_page = sgx_encl_alloc_page(encl, addr);
+	if (IS_ERR(encl_page)) {
+		mutex_unlock(&encl->lock);
+		return PTR_ERR(encl_page);
+	}
+	ret = __sgx_encl_add_page(encl, encl_page, data, secinfo, mrmask);
+	if (ret)
+		sgx_encl_free_page(encl_page);
+	mutex_unlock(&encl->lock);
+	return ret;
+}
+
+static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus,
+			      void *hash)
+{
+	SHASH_DESC_ON_STACK(shash, tfm);
+
+	shash->tfm = tfm;
+	shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash);
+}
+
+static int sgx_get_key_hash(const void *modulus, void *hash)
+{
+	struct crypto_shash *tfm;
+	int ret;
+
+	tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	ret = __sgx_get_key_hash(tfm, modulus, hash);
+
+	crypto_free_shash(tfm);
+	return ret;
+}
+
+/**
+ * sgx_encl_init - perform EINIT for the given enclave
+ *
+ * @encl:	an enclave
+ * @sigstruct:	SIGSTRUCT for the enclave
+ * @token:	EINITTOKEN for the enclave
+ *
+ * Retries a few times in order to perform EINIT operation on an enclave
+ * because there could be potentially an interrupt storm.
+ *
+ * Return:
+ *   0 on success,
+ *   SGX error code on EINIT failure,
+ *   -errno otherwise
+ */
+int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
+		  struct sgx_einittoken *token)
+{
+	u64 mrsigner[4];
+	int ret;
+	int i;
+	int j;
+
+	ret = sgx_get_key_hash(sigstruct->modulus, mrsigner);
+	if (ret)
+		return ret;
+
+	flush_work(&encl->add_page_work);
+
+	mutex_lock(&encl->lock);
+
+	if (encl->flags & SGX_ENCL_INITIALIZED) {
+		mutex_unlock(&encl->lock);
+		return 0;
+	}
+	if (encl->flags & SGX_ENCL_DEAD) {
+		mutex_unlock(&encl->lock);
+		return -EFAULT;
+	}
+
+	for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) {
+		for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) {
+			ret = sgx_einit(sigstruct, token, encl->secs.epc_page,
+					mrsigner);
+			if (ret == SGX_UNMASKED_EVENT)
+				continue;
+			else
+				break;
+		}
+
+		if (ret != SGX_UNMASKED_EVENT)
+			break;
+
+		msleep_interruptible(SGX_EINIT_SLEEP_TIME);
+		if (signal_pending(current)) {
+			mutex_unlock(&encl->lock);
+			return -ERESTARTSYS;
+		}
+	}
+
+	if (unlikely(IS_ENCLS_FAULT(ret)))
+		SGX_INVD(ret, encl, "EINIT returned %d (%x)", ret, ret);
+	else if (ret > 0)
+		sgx_dbg(encl, "EINIT returned %d\n", ret);
+	else if (!ret)
+		encl->flags |= SGX_ENCL_INITIALIZED;
+	mutex_unlock(&encl->lock);
+
+	return ret;
+}
+
+/**
+ * sgx_encl_release - destroy an enclave instance
+ * @kref:	address of a kref inside &sgx_encl
+ *
+ * Used together with kref_put(). Frees all the resources associated with the
+ * enclave and the instance itself.
+ */
+void sgx_encl_release(struct kref *ref)
+{
+	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
+	struct sgx_encl_page *entry;
+	struct radix_tree_iter iter;
+	void **slot;
+
+	if (encl->mmu_notifier.ops)
+		mmu_notifier_unregister(&encl->mmu_notifier, encl->mm);
+
+	if (encl->pm_notifier.notifier_call)
+		unregister_pm_notifier(&encl->pm_notifier);
+
+	radix_tree_for_each_slot(slot, &encl->page_tree, &iter, 0) {
+		entry = *slot;
+		sgx_encl_free_page(entry);
+	}
+
+	if (encl->tgid)
+		put_pid(encl->tgid);
+
+	if (encl->secs.desc & SGX_ENCL_PAGE_LOADED)
+		sgx_free_page(encl->secs.epc_page);
+
+	if (encl->backing)
+		fput(encl->backing);
+
+	kfree(encl);
+}
diff --git a/drivers/platform/x86/intel_sgx/sgx_ioctl.c b/drivers/platform/x86/intel_sgx/sgx_ioctl.c
new file mode 100644
index 000000000000..4edf1cc956b1
--- /dev/null
+++ b/drivers/platform/x86/intel_sgx/sgx_ioctl.c
@@ -0,0 +1,234 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-18 Intel Corporation.
+
+#include <asm/mman.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/hashtable.h>
+#include <linux/highmem.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include "sgx.h"
+
+static int sgx_encl_get(unsigned long addr, struct sgx_encl **encl)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int ret;
+
+	if (addr & (PAGE_SIZE - 1))
+		return -EINVAL;
+
+	down_read(&mm->mmap_sem);
+
+	ret = sgx_encl_find(mm, addr, &vma);
+	if (!ret) {
+		*encl = vma->vm_private_data;
+
+		if ((*encl)->flags & SGX_ENCL_SUSPEND)
+			ret = SGX_POWER_LOST_ENCLAVE;
+		else
+			kref_get(&(*encl)->refcount);
+	}
+
+	up_read(&mm->mmap_sem);
+	return ret;
+}
+
+/**
+ * sgx_ioc_enclave_create - handler for %SGX_IOC_ENCLAVE_CREATE
+ * @filep:	open file to /dev/sgx
+ * @cmd:	the command value
+ * @arg:	pointer to an &sgx_enclave_create instance
+ *
+ * Validates SECS attributes, allocates an EPC page for the SECS and performs
+ * ECREATE.
+ *
+ * Return:
+ *   0 on success,
+ *   -errno otherwise
+ */
+static long sgx_ioc_enclave_create(struct file *filep, unsigned int cmd,
+				   unsigned long arg)
+{
+	struct sgx_enclave_create *createp = (struct sgx_enclave_create *)arg;
+	struct page *secs_page;
+	struct sgx_secs *secs;
+	struct sgx_encl *encl;
+	int ret;
+
+	secs_page = alloc_page(GFP_HIGHUSER);
+	if (!secs_page)
+		return -ENOMEM;
+
+	secs = kmap(secs_page);
+	ret = copy_from_user(secs, (void __user *)createp->src, sizeof(*secs));
+	if (ret)
+		goto out;
+
+	encl = sgx_encl_alloc(secs);
+	if (IS_ERR(encl)) {
+		ret = PTR_ERR(encl);
+		goto out;
+	}
+
+	ret = sgx_encl_create(encl, secs);
+	if (ret)
+		kref_put(&encl->refcount, sgx_encl_release);
+
+out:
+	kunmap(secs_page);
+	__free_page(secs_page);
+	return ret;
+}
+
+/**
+ * sgx_ioc_enclave_add_page - handler for %SGX_IOC_ENCLAVE_ADD_PAGE
+ *
+ * @filep:	open file to /dev/sgx
+ * @cmd:	the command value
+ * @arg:	pointer to an &sgx_enclave_add_page instance
+ *
+ * Creates a new enclave page and enqueues an EADD operation that will be
+ * processed by a worker thread later on.
+ *
+ * Return:
+ *   0 on success,
+ *   -errno otherwise
+ */
+static long sgx_ioc_enclave_add_page(struct file *filep, unsigned int cmd,
+				     unsigned long arg)
+{
+	struct sgx_enclave_add_page *addp = (void *)arg;
+	struct sgx_secinfo secinfo;
+	struct sgx_encl *encl;
+	struct page *data_page;
+	void *data;
+	int ret;
+
+	ret = sgx_encl_get(addp->addr, &encl);
+	if (ret)
+		return ret;
+
+	if (copy_from_user(&secinfo, (void __user *)addp->secinfo,
+			   sizeof(secinfo))) {
+		kref_put(&encl->refcount, sgx_encl_release);
+		return -EFAULT;
+	}
+
+	data_page = alloc_page(GFP_HIGHUSER);
+	if (!data_page) {
+		kref_put(&encl->refcount, sgx_encl_release);
+		return -ENOMEM;
+	}
+
+	data = kmap(data_page);
+
+	ret = copy_from_user((void *)data, (void __user *)addp->src, PAGE_SIZE);
+	if (ret)
+		goto out;
+
+	ret = sgx_encl_add_page(encl, addp->addr, data, &secinfo, addp->mrmask);
+	if (ret)
+		goto out;
+
+out:
+	kref_put(&encl->refcount, sgx_encl_release);
+	kunmap(data_page);
+	__free_page(data_page);
+	return ret;
+}
+
+/**
+ * sgx_ioc_enclave_init - handler for %SGX_IOC_ENCLAVE_INIT
+ *
+ * @filep:	open file to /dev/sgx
+ * @cmd:	the command value
+ * @arg:	pointer to an &sgx_enclave_init instance
+ *
+ * Flushes the remaining enqueued EADD operations and performs EINIT. Does not
+ * allow the EINITTOKENKEY attribute for an enclave.
+ *
+ * Return:
+ *   0 on success,
+ *   SGX error code on EINIT failure,
+ *   -errno otherwise
+ */
+static long sgx_ioc_enclave_init(struct file *filep, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct sgx_enclave_init *initp = (struct sgx_enclave_init *)arg;
+	struct sgx_sigstruct *sigstruct;
+	struct sgx_einittoken *einittoken;
+	struct sgx_encl *encl;
+	struct page *initp_page;
+	int ret;
+
+	initp_page = alloc_page(GFP_HIGHUSER);
+	if (!initp_page)
+		return -ENOMEM;
+
+	sigstruct = kmap(initp_page);
+	einittoken = (struct sgx_einittoken *)
+		((unsigned long)sigstruct + PAGE_SIZE / 2);
+	memset(einittoken, 0, sizeof(*einittoken));
+
+	ret = copy_from_user(sigstruct, (void __user *)initp->sigstruct,
+			     sizeof(*sigstruct));
+	if (ret)
+		goto out;
+	if (sigstruct->attributes & SGX_ATTR_EINITTOKENKEY) {
+		ret = EINVAL;
+		goto out;
+	}
+
+	ret = sgx_encl_get(initp->addr, &encl);
+	if (ret)
+		goto out;
+
+	ret = sgx_encl_init(encl, sigstruct, einittoken);
+
+	kref_put(&encl->refcount, sgx_encl_release);
+
+out:
+	kunmap(initp_page);
+	__free_page(initp_page);
+	return ret;
+}
+
+typedef long (*sgx_ioc_t)(struct file *filep, unsigned int cmd,
+			  unsigned long arg);
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	char data[256];
+	sgx_ioc_t handler = NULL;
+	long ret;
+
+	switch (cmd) {
+	case SGX_IOC_ENCLAVE_CREATE:
+		handler = sgx_ioc_enclave_create;
+		break;
+	case SGX_IOC_ENCLAVE_ADD_PAGE:
+		handler = sgx_ioc_enclave_add_page;
+		break;
+	case SGX_IOC_ENCLAVE_INIT:
+		handler = sgx_ioc_enclave_init;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	if (copy_from_user(data, (void __user *)arg, _IOC_SIZE(cmd)))
+		return -EFAULT;
+
+	ret = handler(filep, cmd, (unsigned long)((void *)data));
+	if (!ret && (cmd & IOC_OUT)) {
+		if (copy_to_user((void __user *)arg, data, _IOC_SIZE(cmd)))
+			return -EFAULT;
+	}
+	if (IS_ENCLS_FAULT(ret))
+		return -EFAULT;
+	return ret;
+}
diff --git a/drivers/platform/x86/intel_sgx/sgx_main.c b/drivers/platform/x86/intel_sgx/sgx_main.c
new file mode 100644
index 000000000000..4312eab29775
--- /dev/null
+++ b/drivers/platform/x86/intel_sgx/sgx_main.c
@@ -0,0 +1,267 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-18 Intel Corporation.
+
+#include <linux/acpi.h>
+#include <linux/cdev.h>
+#include <linux/platform_device.h>
+#include <linux/suspend.h>
+#include <asm/traps.h>
+#include "sgx.h"
+
+MODULE_DESCRIPTION("Intel SGX Driver");
+MODULE_AUTHOR("Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct workqueue_struct *sgx_add_page_wq;
+u64 sgx_encl_size_max_32;
+u64 sgx_encl_size_max_64;
+u64 sgx_xfrm_mask = 0x3;
+u32 sgx_misc_reserved;
+u32 sgx_xsave_size_tbl[64];
+int sgx_epcm_trapnr;
+
+#ifdef CONFIG_COMPAT
+long sgx_compat_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	return sgx_ioctl(filep, cmd, arg);
+}
+#endif
+
+static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &sgx_vm_ops;
+	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO |
+			 VM_DONTCOPY;
+
+	return 0;
+}
+
+static unsigned long sgx_get_unmapped_area(struct file *file,
+					   unsigned long addr,
+					   unsigned long len,
+					   unsigned long pgoff,
+					   unsigned long flags)
+{
+	if (len < 2 * PAGE_SIZE || (len & (len - 1)))
+		return -EINVAL;
+
+	if (len > sgx_encl_size_max_64)
+		return -EINVAL;
+
+	if (len > sgx_encl_size_max_32 && test_thread_flag(TIF_ADDR32))
+		return -EINVAL;
+
+	addr = current->mm->get_unmapped_area(file, addr, 2 * len, pgoff,
+					      flags);
+	if (IS_ERR_VALUE(addr))
+		return addr;
+
+	addr = (addr + (len - 1)) & ~(len - 1);
+
+	return addr;
+}
+
+static const struct file_operations sgx_fops = {
+	.owner			= THIS_MODULE,
+	.unlocked_ioctl		= sgx_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl		= sgx_compat_ioctl,
+#endif
+	.mmap			= sgx_mmap,
+	.get_unmapped_area	= sgx_get_unmapped_area,
+};
+
+static struct bus_type sgx_bus_type = {
+	.name	= "sgx",
+};
+
+struct sgx_context {
+	struct device dev;
+	struct cdev cdev;
+};
+
+static dev_t sgx_devt;
+
+static void sgx_dev_release(struct device *dev)
+{
+	struct sgx_context *ctx = container_of(dev, struct sgx_context, dev);
+
+	kfree(ctx);
+}
+
+static struct sgx_context *sgx_ctx_alloc(struct device *parent)
+{
+	struct sgx_context *ctx;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	device_initialize(&ctx->dev);
+
+	ctx->dev.bus = &sgx_bus_type;
+	ctx->dev.parent = parent;
+	ctx->dev.devt = MKDEV(MAJOR(sgx_devt), 0);
+	ctx->dev.release = sgx_dev_release;
+
+	dev_set_name(&ctx->dev, "sgx");
+
+	cdev_init(&ctx->cdev, &sgx_fops);
+	ctx->cdev.owner = THIS_MODULE;
+
+	dev_set_drvdata(parent, ctx);
+
+	return ctx;
+}
+
+static struct sgx_context *sgxm_ctx_alloc(struct device *parent)
+{
+	struct sgx_context *ctx;
+	int rc;
+
+	ctx = sgx_ctx_alloc(parent);
+	if (IS_ERR(ctx))
+		return ctx;
+
+	rc = devm_add_action_or_reset(parent, (void (*)(void *))put_device,
+				      &ctx->dev);
+	if (rc) {
+		kfree(ctx);
+		return ERR_PTR(rc);
+	}
+
+	return ctx;
+}
+
+static int sgx_dev_init(struct device *parent)
+{
+	struct sgx_context *sgx_dev;
+	unsigned int eax;
+	unsigned int ebx;
+	unsigned int ecx;
+	unsigned int edx;
+	int ret;
+	int i;
+
+	sgx_dev = sgxm_ctx_alloc(parent);
+
+	cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
+	/* Only allow misc bits supported by the driver. */
+	sgx_misc_reserved = ~ebx | SGX_MISC_RESERVED_MASK;
+	sgx_encl_size_max_64 = 1ULL << ((edx >> 8) & 0xFF);
+	sgx_encl_size_max_32 = 1ULL << (edx & 0xFF);
+
+	if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+		cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx);
+		sgx_xfrm_mask = (((u64)edx) << 32) + (u64)ecx;
+
+		for (i = 2; i < 64; i++) {
+			cpuid_count(0x0D, i, &eax, &ebx, &ecx, &edx);
+			if ((1 << i) & sgx_xfrm_mask)
+				sgx_xsave_size_tbl[i] = eax + ebx;
+		}
+	}
+
+	sgx_epcm_trapnr = boot_cpu_has(X86_FEATURE_SGX2) ? X86_TRAP_PF :
+							   X86_TRAP_GP;
+
+	sgx_add_page_wq = alloc_workqueue("intel_sgx-add-page-wq",
+					  WQ_UNBOUND | WQ_FREEZABLE, 1);
+	if (!sgx_add_page_wq)
+		return -ENOMEM;
+
+	ret = cdev_device_add(&sgx_dev->cdev, &sgx_dev->dev);
+	if (ret)
+		goto out_workqueue;
+
+	return 0;
+out_workqueue:
+	destroy_workqueue(sgx_add_page_wq);
+	return ret;
+}
+
+static int sgx_drv_probe(struct platform_device *pdev)
+{
+	if (!boot_cpu_has(X86_FEATURE_SGX))
+		return -ENODEV;
+
+	if (!boot_cpu_has(X86_FEATURE_SGX_LC)) {
+		pr_warn("sgx: IA32_SGXLEPUBKEYHASHx MSRs are not writable\n");
+		return -ENODEV;
+	}
+
+	return sgx_dev_init(&pdev->dev);
+}
+
+static int sgx_drv_remove(struct platform_device *pdev)
+{
+	struct sgx_context *ctx = dev_get_drvdata(&pdev->dev);
+
+	cdev_device_del(&ctx->cdev, &ctx->dev);
+	destroy_workqueue(sgx_add_page_wq);
+
+	return 0;
+}
+
+#ifdef CONFIG_ACPI
+static struct acpi_device_id sgx_device_ids[] = {
+	{"INT0E0C", 0},
+	{"", 0},
+};
+MODULE_DEVICE_TABLE(acpi, sgx_device_ids);
+#endif
+
+static struct platform_driver sgx_drv = {
+	.probe = sgx_drv_probe,
+	.remove = sgx_drv_remove,
+	.driver = {
+		.name			= "intel_sgx",
+		.acpi_match_table	= ACPI_PTR(sgx_device_ids),
+	},
+};
+
+static int __init sgx_drv_subsys_init(void)
+{
+	int ret;
+
+	ret = bus_register(&sgx_bus_type);
+	if (ret)
+		return ret;
+
+	ret = alloc_chrdev_region(&sgx_devt, 0, 1, "sgx");
+	if (ret < 0) {
+		bus_unregister(&sgx_bus_type);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void sgx_drv_subsys_exit(void)
+{
+	bus_unregister(&sgx_bus_type);
+	unregister_chrdev_region(sgx_devt, 1);
+}
+
+static int __init sgx_drv_init(void)
+{
+	int ret;
+
+	ret = sgx_drv_subsys_init();
+	if (ret)
+		return ret;
+
+	ret = platform_driver_register(&sgx_drv);
+	if (ret)
+		sgx_drv_subsys_exit();
+
+	return ret;
+}
+module_init(sgx_drv_init);
+
+static void __exit sgx_drv_exit(void)
+{
+	platform_driver_unregister(&sgx_drv);
+	sgx_drv_subsys_exit();
+}
+module_exit(sgx_drv_exit);
diff --git a/drivers/platform/x86/intel_sgx/sgx_util.c b/drivers/platform/x86/intel_sgx/sgx_util.c
new file mode 100644
index 000000000000..cbea4c0e794b
--- /dev/null
+++ b/drivers/platform/x86/intel_sgx/sgx_util.c
@@ -0,0 +1,85 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-18 Intel Corporation.
+
+#include <linux/mm.h>
+#include <linux/shmem_fs.h>
+#include "sgx.h"
+
+static int sgx_test_and_clear_young_cb(pte_t *ptep, pgtable_t token,
+				       unsigned long addr, void *data)
+{
+	pte_t pte;
+	int ret;
+
+	ret = pte_young(*ptep);
+	if (ret) {
+		pte = pte_mkold(*ptep);
+		set_pte_at((struct mm_struct *)data, addr, ptep, pte);
+	}
+
+	return ret;
+}
+
+/**
+ * sgx_test_and_clear_young() - Test and reset the accessed bit
+ * @page:	enclave page to be tested for recent access
+ *
+ * Checks the Access (A) bit from the PTE corresponding to the
+ * enclave page and clears it.  Returns 1 if the page has been
+ * recently accessed and 0 if not.
+ */
+int sgx_test_and_clear_young(struct sgx_encl_page *page)
+{
+	unsigned long addr = SGX_ENCL_PAGE_ADDR(page);
+	struct sgx_encl *encl = page->encl;
+	struct vm_area_struct *vma;
+	int ret;
+
+	ret = sgx_encl_find(encl->mm, addr, &vma);
+	if (ret)
+		return 0;
+
+	if (encl != vma->vm_private_data)
+		return 0;
+
+	return apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
+				   sgx_test_and_clear_young_cb, vma->vm_mm);
+}
+
+static void sgx_ipi_cb(void *info)
+{
+}
+
+void sgx_flush_cpus(struct sgx_encl *encl)
+{
+	on_each_cpu_mask(mm_cpumask(encl->mm), sgx_ipi_cb, NULL, 1);
+}
+
+/**
+ * sgx_set_page_loaded - associate an EPC page with an enclave page
+ * @encl_page:	an enclave page
+ * @epc_page:	the EPC page to attach to @encl_page
+ */
+void sgx_set_page_loaded(struct sgx_encl_page *encl_page,
+			 struct sgx_epc_page *epc_page)
+{
+	encl_page->desc |= SGX_ENCL_PAGE_LOADED;
+	encl_page->epc_page = epc_page;
+}
+
+struct page *sgx_get_backing(struct file *file, pgoff_t index)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	gfp_t gfpmask = mapping_gfp_mask(mapping);
+
+	return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
+}
+
+void sgx_put_backing(struct page *backing_page, bool write)
+{
+	if (write)
+		set_page_dirty(backing_page);
+
+	put_page(backing_page);
+}
diff --git a/drivers/platform/x86/intel_sgx/sgx_vma.c b/drivers/platform/x86/intel_sgx/sgx_vma.c
new file mode 100644
index 000000000000..17e95a0c734c
--- /dev/null
+++ b/drivers/platform/x86/intel_sgx/sgx_vma.c
@@ -0,0 +1,43 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-18 Intel Corporation.
+
+#include <asm/mman.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/hashtable.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include "sgx.h"
+
+static void sgx_vma_open(struct vm_area_struct *vma)
+{
+	struct sgx_encl *encl = vma->vm_private_data;
+
+	if (!encl)
+		return;
+
+	/* kref cannot underflow because ECREATE ioctl checks that there is only
+	 * one single VMA for the enclave before proceeding.
+	 */
+	kref_get(&encl->refcount);
+}
+
+static void sgx_vma_close(struct vm_area_struct *vma)
+{
+	struct sgx_encl *encl = vma->vm_private_data;
+
+	if (!encl)
+		return;
+
+	mutex_lock(&encl->lock);
+	sgx_invalidate(encl, true);
+	mutex_unlock(&encl->lock);
+	kref_put(&encl->refcount, sgx_encl_release);
+}
+
+const struct vm_operations_struct sgx_vm_ops = {
+	.close = sgx_vma_close,
+	.open = sgx_vma_open,
+};