diff mbox series

[v13,07/13] x86/sgx: Add data structures for tracking the EPC pages

Message ID 20180827185507.17087-8-jarkko.sakkinen@linux.intel.com (mailing list archive)
State Deferred, archived
Headers show
Series Intel SGX1 support | expand

Commit Message

Jarkko Sakkinen Aug. 27, 2018, 6:53 p.m. UTC
Add data structures to track Enclave Page Cache (EPC) pages.  EPC is
divided into multiple banks (1-N) of which addresses and sizes can be
enumerated with CPUID by the OS.

On NUMA systems a node can have at most one bank. A bank can be at most part of
two nodes. SGX supports both nodes with a single memory controller and also
sub-cluster nodes with several memory controllers on a single die.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Co-developed-by: Serge Ayoun <serge.ayoun@intel.com>
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Serge Ayoun <serge.ayoun@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
---
 arch/x86/include/asm/sgx.h      |  60 ++++++++++++++++++
 arch/x86/kernel/cpu/intel_sgx.c | 106 +++++++++++++++++++++++++++++++-
 2 files changed, 164 insertions(+), 2 deletions(-)

Comments

Dave Hansen Aug. 27, 2018, 9:07 p.m. UTC | #1
On 08/27/2018 11:53 AM, Jarkko Sakkinen wrote:
> Add data structures to track Enclave Page Cache (EPC) pages.  EPC is
> divided into multiple banks (1-N) of which addresses and sizes can be
> enumerated with CPUID by the OS.
> 
> On NUMA systems a node can have at most bank. A bank can be at most part of
> two nodes. SGX supports both nodes with a single memory controller and also
> sub-cluster nodes with severals memory controllers on a single die.
> 
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> Co-developed-by: Serge Ayoun <serge.ayoun@intel.com>
> Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Serge Ayoun <serge.ayoun@intel.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> ---
>  arch/x86/include/asm/sgx.h      |  60 ++++++++++++++++++
>  arch/x86/kernel/cpu/intel_sgx.c | 106 +++++++++++++++++++++++++++++++-
>  2 files changed, 164 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
> index 2130e639ab49..17b7b3aa66bf 100644
> --- a/arch/x86/include/asm/sgx.h
> +++ b/arch/x86/include/asm/sgx.h
> @@ -4,9 +4,69 @@
>  #ifndef _ASM_X86_SGX_H
>  #define _ASM_X86_SGX_H
>  
> +#include <linux/bitops.h>
> +#include <linux/err.h>
> +#include <linux/rwsem.h>
>  #include <linux/types.h>
> +#include <asm/sgx_arch.h>
> +#include <asm/asm.h>
> +
> +#define SGX_MAX_EPC_BANKS 8

This is _still_ missing a meaningful description of what a bank is and
whether it is a hardware or software structure.

It would also help us to determine whether your bit packing below is
really required.

> +struct sgx_epc_page {
> +	unsigned long desc;
> +	struct list_head list;
> +};
> +
> +struct sgx_epc_bank {
> +	unsigned long pa;
> +	void *va;
> +	unsigned long size;

Please add units.  size could be bytes or pages, or who knows what.  I
can't tell you how many bugs I've tripped over in the past from simple
unit conversions

> +	struct sgx_epc_page *pages_data;
> +	struct sgx_epc_page **pages;
> +	unsigned long free_cnt;
> +	spinlock_t lock;
> +};
>  
>  extern bool sgx_enabled;
>  extern bool sgx_lc_enabled;
> +extern struct sgx_epc_bank sgx_epc_banks[SGX_MAX_EPC_BANKS];
> +
> +/*
> + * enum sgx_epc_page_desc - defines bits and masks for an EPC page's desc

Why are you bothering packing these bits?  This seems a rather
convoluted way to store two integers.

> +static __init int sgx_init_epc_bank(u64 addr, u64 size, unsigned long index,
> +				    struct sgx_epc_bank *bank)
> +{
> +	unsigned long nr_pages = size >> PAGE_SHIFT;
> +	struct sgx_epc_page *pages_data;
> +	unsigned long i;
> +	void *va;
> +
> +	va = ioremap_cache(addr, size);
> +	if (!va)
> +		return -ENOMEM;
> +
> +	pages_data = kcalloc(nr_pages, sizeof(struct sgx_epc_page), GFP_KERNEL);
> +	if (!pages_data)
> +		goto out_iomap;

This looks like you're roughly limited by the page allocator to a bank
size of ~1.4GB which seems kinda small.  Is this really OK?

> +	bank->pages = kcalloc(nr_pages, sizeof(struct sgx_epc_page *),
> +			      GFP_KERNEL);
> +	if (!bank->pages)
> +		goto out_pdata;
> +
> +	for (i = 0; i < nr_pages; i++) {
> +		bank->pages[i] = &pages_data[i];
> +		bank->pages[i]->desc = (addr + (i << PAGE_SHIFT)) | index;
> +	}
> +
> +	bank->pa = addr;
> +	bank->size = size;
> +	bank->va = va;
> +	bank->free_cnt = nr_pages;
> +	bank->pages_data = pages_data;
> +	spin_lock_init(&bank->lock);
> +	return 0;
> +out_pdata:
> +	kfree(pages_data);
> +out_iomap:
> +	iounmap(va);
> +	return -ENOMEM;
> +}
> +
> +static __init void sgx_page_cache_teardown(void)
> +{
> +	struct sgx_epc_bank *bank;
> +	int i;
> +
> +	for (i = 0; i < sgx_nr_epc_banks; i++) {
> +		bank = &sgx_epc_banks[i];
> +		iounmap((void *)bank->va);
> +		kfree(bank->pages);
> +		kfree(bank->pages_data);
> +	}
> +}
> +
> +static inline u64 sgx_combine_bank_regs(u64 low, u64 high)
> +{
> +	return (low & 0xFFFFF000) + ((high & 0xFFFFF) << 32);
> +}

-ENOCOMMENT for a rather weird looking calculation

> +static __init int sgx_page_cache_init(void)
> +{
> +	u32 eax, ebx, ecx, edx;
> +	u64 pa, size;
> +	int ret;
> +	int i;
> +
> +	for (i = 0; i < SGX_MAX_EPC_BANKS; i++) {
> +		cpuid_count(SGX_CPUID, 2 + i, &eax, &ebx, &ecx, &edx);
> +		if (!(eax & 0xF))
> +			break;

So, we have random data coming out of a random CPUID leaf being called
'eax' and then being tested against a random hard-coded mask.  This
seems rather unfortunate for someone trying to understand the code.  Can
we do better?

> +		pa = sgx_combine_bank_regs(eax, ebx);
> +		size = sgx_combine_bank_regs(ecx, edx);
> +		pr_info("EPC bank 0x%llx-0x%llx\n", pa, pa + size - 1);
> +		ret = sgx_init_epc_bank(pa, size, i, &sgx_epc_banks[i]);
> +		if (ret) {
> +			sgx_page_cache_teardown();
> +			return ret;
> +		}

So if one bank fails, we tear down all banks, yet leave sgx_nr_epc_banks
incremented?  That sounds troublesome.

> +		sgx_nr_epc_banks++;
> +	}
> +
> +	if (!sgx_nr_epc_banks) {
> +		pr_err("There are zero EPC banks.\n");
> +		return -ENODEV;
> +	}
> +
> +	return 0;
> +}

Does this support hot-addition of a bank?  If not, why not?
Jarkko Sakkinen Aug. 28, 2018, 10:30 a.m. UTC | #2
On Mon, Aug 27, 2018 at 02:07:53PM -0700, Dave Hansen wrote:
> On 08/27/2018 11:53 AM, Jarkko Sakkinen wrote:
> > Add data structures to track Enclave Page Cache (EPC) pages.  EPC is
> > divided into multiple banks (1-N) of which addresses and sizes can be
> > enumerated with CPUID by the OS.
> > 
> > On NUMA systems a node can have at most bank. A bank can be at most part of
> > two nodes. SGX supports both nodes with a single memory controller and also
> > sub-cluster nodes with severals memory controllers on a single die.
> > 
> > Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> > Co-developed-by: Serge Ayoun <serge.ayoun@intel.com>
> > Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > Signed-off-by: Serge Ayoun <serge.ayoun@intel.com>
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > ---
> >  arch/x86/include/asm/sgx.h      |  60 ++++++++++++++++++
> >  arch/x86/kernel/cpu/intel_sgx.c | 106 +++++++++++++++++++++++++++++++-
> >  2 files changed, 164 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
> > index 2130e639ab49..17b7b3aa66bf 100644
> > --- a/arch/x86/include/asm/sgx.h
> > +++ b/arch/x86/include/asm/sgx.h
> > @@ -4,9 +4,69 @@
> >  #ifndef _ASM_X86_SGX_H
> >  #define _ASM_X86_SGX_H
> >  
> > +#include <linux/bitops.h>
> > +#include <linux/err.h>
> > +#include <linux/rwsem.h>
> >  #include <linux/types.h>
> > +#include <asm/sgx_arch.h>
> > +#include <asm/asm.h>
> > +
> > +#define SGX_MAX_EPC_BANKS 8
> 
> This is _still_ missing a meaningful description of what a bank is and
> whether it is a hardware or software structure.
> 
> It would also help us to determine whether your bit packing below is
> really required.

I think a better name would be EPC section as this is what the SDM uses
in the Table 3-8 when describing subleaves of EAX=0x12 (SGX specific
leaf) starting from ECX=0x02. It is a software structure that contains
the information given by these subleaves.

These sections constitute the physical pages that are part of the EPC.

> > +struct sgx_epc_page {
> > +	unsigned long desc;
> > +	struct list_head list;
> > +};
> > +
> > +struct sgx_epc_bank {
> > +	unsigned long pa;
> > +	void *va;
> > +	unsigned long size;
> 
> Please add units.  size could be bytes or pages, or who knows what.  I
> can't tell you how many bugs I've tripped over in the past from simple
> unit conversions

Will do.

> > +	struct sgx_epc_page *pages_data;
> > +	struct sgx_epc_page **pages;
> > +	unsigned long free_cnt;
> > +	spinlock_t lock;
> > +};
> >  
> >  extern bool sgx_enabled;
> >  extern bool sgx_lc_enabled;
> > +extern struct sgx_epc_bank sgx_epc_banks[SGX_MAX_EPC_BANKS];
> > +
> > +/*
> > + * enum sgx_epc_page_desc - defines bits and masks for an EPC page's desc
> 
> Why are you bothering packing these bits?  This seems a rather
> convoluted way to store two integers.

To keep struct sgx_epc_page 64 bytes.

> > +static __init int sgx_init_epc_bank(u64 addr, u64 size, unsigned long index,
> > +				    struct sgx_epc_bank *bank)
> > +{
> > +	unsigned long nr_pages = size >> PAGE_SHIFT;
> > +	struct sgx_epc_page *pages_data;
> > +	unsigned long i;
> > +	void *va;
> > +
> > +	va = ioremap_cache(addr, size);
> > +	if (!va)
> > +		return -ENOMEM;
> > +
> > +	pages_data = kcalloc(nr_pages, sizeof(struct sgx_epc_page), GFP_KERNEL);
> > +	if (!pages_data)
> > +		goto out_iomap;
> 
> This looks like you're roughly limited by the page allocator to a bank
> size of ~1.4GB which seems kinda small.  Is this really OK?

Where does this limitation come from?

> 
> > +	bank->pages = kcalloc(nr_pages, sizeof(struct sgx_epc_page *),
> > +			      GFP_KERNEL);
> > +	if (!bank->pages)
> > +		goto out_pdata;
> > +
> > +	for (i = 0; i < nr_pages; i++) {
> > +		bank->pages[i] = &pages_data[i];
> > +		bank->pages[i]->desc = (addr + (i << PAGE_SHIFT)) | index;
> > +	}
> > +
> > +	bank->pa = addr;
> > +	bank->size = size;
> > +	bank->va = va;
> > +	bank->free_cnt = nr_pages;
> > +	bank->pages_data = pages_data;
> > +	spin_lock_init(&bank->lock);
> > +	return 0;
> > +out_pdata:
> > +	kfree(pages_data);
> > +out_iomap:
> > +	iounmap(va);
> > +	return -ENOMEM;
> > +}
> > +
> > +static __init void sgx_page_cache_teardown(void)
> > +{
> > +	struct sgx_epc_bank *bank;
> > +	int i;
> > +
> > +	for (i = 0; i < sgx_nr_epc_banks; i++) {
> > +		bank = &sgx_epc_banks[i];
> > +		iounmap((void *)bank->va);
> > +		kfree(bank->pages);
> > +		kfree(bank->pages_data);
> > +	}
> > +}
> > +
> > +static inline u64 sgx_combine_bank_regs(u64 low, u64 high)
> > +{
> > +	return (low & 0xFFFFF000) + ((high & 0xFFFFF) << 32);
> > +}
> 
> -ENOCOMMENT for a rather weird looking calculation

Yea, totally agreed... I'll think about how to make this cleaner. Maybe
it would be anyway better idea to open code this to the call sites and
explain the calculation in a comment.

> > +static __init int sgx_page_cache_init(void)
> > +{
> > +	u32 eax, ebx, ecx, edx;
> > +	u64 pa, size;
> > +	int ret;
> > +	int i;
> > +
> > +	for (i = 0; i < SGX_MAX_EPC_BANKS; i++) {
> > +		cpuid_count(SGX_CPUID, 2 + i, &eax, &ebx, &ecx, &edx);
> > +		if (!(eax & 0xF))
> > +			break;
> 
> So, we have random data coming out of a random CPUID leaf being called
> 'eax' and then being tested against a random hard-coded mask.  This
> seems rather unfortunate for someone trying to understand the code.  Can
> we do better?

Should probably do something along the lines:

#define SGX_CPUID_SECTION(i) (2 + (i))

enum sgx_section {
	SGX_CPUID_SECTION_INVALID	= 0x00,
	SGX_CPUID_SECTION_VALID		= 0x1B,
	SGX_CPUID_SECTION_MASK		= 0xFF,
};


for (i = 0; i < SGX_MAX_EPC_BANKS; i++) {
	cpuid_count(SGX_CPUID, SGX_CPUID_SECTION(i), &eax, &ebx, &ecx, &edx);

	section = eax & SGX_SECTION_MASK;
	if (section != SGX_CPUID_SECTION_VALID) {
		if (section != SGX_CPUID_SECTION_INVALID) {
			/* Maybe a warning here for any other value as
			 * they are reserved according to the SDM?
			 */
		}

		continue;
	}

	/* ... */
}

> > +		pa = sgx_combine_bank_regs(eax, ebx);
> > +		size = sgx_combine_bank_regs(ecx, edx);
> > +		pr_info("EPC bank 0x%llx-0x%llx\n", pa, pa + size - 1);
> > +		ret = sgx_init_epc_bank(pa, size, i, &sgx_epc_banks[i]);
> > +		if (ret) {
> > +			sgx_page_cache_teardown();
> > +			return ret;
> > +		}
> 
> So if one bank fails, we tear down all banks, yet leave sgx_nr_epc_banks
> incremented?  That sounds troublesome.

It is. Thanks for spotting that out.

> > +		sgx_nr_epc_banks++;
> > +	}
> > +
> > +	if (!sgx_nr_epc_banks) {
> > +		pr_err("There are zero EPC banks.\n");
> > +		return -ENODEV;
> > +	}
> > +
> > +	return 0;
> > +}
> 
> Does this support hot-addition of a bank?  If not, why not?

This is the DSDT for this data from my GLK NUC:

    Scope (_SB)
    {
        Device (EPC)
        {
            Name (_HID, EisaId ("INT0E0C"))  // _HID: Hardware ID
            Name (_STR, Unicode ("Enclave Page Cache 1.0"))  // _STR: Description String
            Name (_MLS, Package (0x01)  // _MLS: Multiple Language String
            {
                Package (0x02)
                {
                    "en", 
                    Unicode ("Enclave Page Cache 1.0")
                }
            })
            Name (RBUF, ResourceTemplate ()
            {
                QWordMemory (ResourceConsumer, PosDecode, MinNotFixed, MaxNotFixed, NonCacheable, ReadWrite,
                    0x0000000000000000, // Granularity
                    0x0000000000000000, // Range Minimum
                    0x0000000000000000, // Range Maximum
                    0x0000000000000000, // Translation Offset
                    0x0000000000000001, // Length
                    ,, _Y18, AddressRangeMemory, TypeStatic)
            })
            Method (_CRS, 0, NotSerialized)  // _CRS: Current Resource Settings
            {
                CreateQWordField (RBUF, \_SB.EPC._Y18._MIN, EMIN)  // _MIN: Minimum Base Address
                CreateQWordField (RBUF, \_SB.EPC._Y18._MAX, EMAX)  // _MAX: Maximum Base Address
                CreateQWordField (RBUF, \_SB.EPC._Y18._LEN, ELEN)  // _LEN: Length
                EMIN = EMNA /* External reference */
                ELEN = ELNG /* External reference */
                EMAX = ((EMNA + ELNG) - One)
                Return (RBUF) /* \_SB_.EPC_.RBUF */
            }

            Method (_STA, 0, NotSerialized)  // _STA: Status
            {
                If ((EPCS != Zero))
                {
                    Return (0x0F)
                }

                Return (Zero)
            }
        }
    }

I'm not aware that we would have an ACPI specification for SGX so this
is all I have at the moment (does not show any ACPI event for
hotplugging).

/Jarkko
Dave Hansen Aug. 28, 2018, 4:53 p.m. UTC | #3
>>>  extern bool sgx_enabled;
>>>  extern bool sgx_lc_enabled;
>>> +extern struct sgx_epc_bank sgx_epc_banks[SGX_MAX_EPC_BANKS];
>>> +
>>> +/*
>>> + * enum sgx_epc_page_desc - defines bits and masks for an EPC page's desc
>>
>> Why are you bothering packing these bits?  This seems a rather
>> convoluted way to store two integers.
> 
> To keep struct sgx_epc_page 64 bytes.

It's a list_head and a ulong now.  That doesn't add up to 64.

If you properly describe the bounds and limits of banks we can possibly
help you find a nice solution.  As it stands, they are totally opaque
and we have no idea what is going on.

>>> +static __init int sgx_init_epc_bank(u64 addr, u64 size, unsigned long index,
>>> +				    struct sgx_epc_bank *bank)
>>> +{
>>> +	unsigned long nr_pages = size >> PAGE_SHIFT;
>>> +	struct sgx_epc_page *pages_data;
>>> +	unsigned long i;
>>> +	void *va;
>>> +
>>> +	va = ioremap_cache(addr, size);
>>> +	if (!va)
>>> +		return -ENOMEM;
>>> +
>>> +	pages_data = kcalloc(nr_pages, sizeof(struct sgx_epc_page), GFP_KERNEL);
>>> +	if (!pages_data)
>>> +		goto out_iomap;
>>
>> This looks like you're roughly limited by the page allocator to a bank
>> size of ~1.4GB which seems kinda small.  Is this really OK?
> 
> Where does this limitation come from?

The page allocator can only do 4MB at a time.  Using your 64 byte
numbers: 4MB/64 = 64k sgx_epc_pages.  64k*PAGE_SIZE = 256MB.  So you can
only handle 256MB banks with this code.

BTW, if you only have 64k worth of pages, you can use a u16 for the index.

>>> +	u32 eax, ebx, ecx, edx;
>>> +	u64 pa, size;
>>> +	int ret;
>>> +	int i;
>>> +
>>> +	for (i = 0; i < SGX_MAX_EPC_BANKS; i++) {
>>> +		cpuid_count(SGX_CPUID, 2 + i, &eax, &ebx, &ecx, &edx);
>>> +		if (!(eax & 0xF))
>>> +			break;
>>
>> So, we have random data coming out of a random CPUID leaf being called
>> 'eax' and then being tested against a random hard-coded mask.  This
>> seems rather unfortunate for someone trying to understand the code.  Can
>> we do better?
> 
> Should probably do something along the lines:
> 
> #define SGX_CPUID_SECTION(i) (2 + (i))
> 
> enum sgx_section {
> 	SGX_CPUID_SECTION_INVALID	= 0x00,
> 	SGX_CPUID_SECTION_VALID		= 0x1B,
> 	SGX_CPUID_SECTION_MASK		= 0xFF,
> };

Plus comments, that would be nice.

>>> +		sgx_nr_epc_banks++;
>>> +	}
>>> +
>>> +	if (!sgx_nr_epc_banks) {
>>> +		pr_err("There are zero EPC banks.\n");
>>> +		return -ENODEV;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>
>> Does this support hot-addition of a bank?  If not, why not?
...
> I'm not aware that we would have an ACPI specification for SGX so this
> is all I have at the moment (does not show any ACPI event for
> hotplugging).

So you're saying the one platform you looked at doesn't support hotplug.
I was looking for a more broad statement about SGX.
Sean Christopherson Aug. 28, 2018, 9:34 p.m. UTC | #4
On Tue, Aug 28, 2018 at 09:53:11AM -0700, Dave Hansen wrote:
> >>> +		sgx_nr_epc_banks++;
> >>> +	}
> >>> +
> >>> +	if (!sgx_nr_epc_banks) {
> >>> +		pr_err("There are zero EPC banks.\n");
> >>> +		return -ENODEV;
> >>> +	}
> >>> +
> >>> +	return 0;
> >>> +}
> >>
> >> Does this support hot-addition of a bank?  If not, why not?
> ...
> > I'm not aware that we would have an ACPI specification for SGX so this
> > is all I have at the moment (does not show any ACPI event for
> > hotplugging).
> 
> So you're saying the one platform you looked at don't support hotplug.
> I was looking for a more broad statement about SGX.

Hardware doesn't support hotplug of EPC as the EPC size and location
is locked during activation of SGX.  And IIRC, activation of SGX must
be synchronized across all CPUs in a multi-socket platform, e.g. you
can't late-enable SGX on a socket and do hotplugging that way.

In a virtualized environment there are no such restrictions.  I am not
aware of any explicit requirements or use cases for supporting hotplug
of EPC, but that's probably only because virtualization of SGX is
fairly nascent.
Jarkko Sakkinen Aug. 31, 2018, 11:10 a.m. UTC | #5
On Tue, Aug 28, 2018 at 09:53:11AM -0700, Dave Hansen wrote:
> >>>  extern bool sgx_enabled;
> >>>  extern bool sgx_lc_enabled;
> >>> +extern struct sgx_epc_bank sgx_epc_banks[SGX_MAX_EPC_BANKS];
> >>> +
> >>> +/*
> >>> + * enum sgx_epc_page_desc - defines bits and masks for an EPC page's desc
> >>
> >> Why are you bothering packing these bits?  This seems a rather
> >> convoluted way to store two integers.
> > 
> > To keep struct sgx_epc_page 64 bytes.
> 
> It's a list_head and a ulong now.  That doesn't add up to 64.

Ah, there used to be one more variable in it.

> If you properly describe the bounds and limits of banks we can possibly
> help you find a nice solution.  As it stands, they are totally opaque
> and we have no idea what is going on.

Great, I see what I can do. I understand now better what you are looking
for. Thanks Dave.

/Jarkko
Jarkko Sakkinen Aug. 31, 2018, 11:13 a.m. UTC | #6
On Tue, Aug 28, 2018 at 02:34:32PM -0700, Sean Christopherson wrote:
> On Tue, Aug 28, 2018 at 09:53:11AM -0700, Dave Hansen wrote:
> > >>> +		sgx_nr_epc_banks++;
> > >>> +	}
> > >>> +
> > >>> +	if (!sgx_nr_epc_banks) {
> > >>> +		pr_err("There are zero EPC banks.\n");
> > >>> +		return -ENODEV;
> > >>> +	}
> > >>> +
> > >>> +	return 0;
> > >>> +}
> > >>
> > >> Does this support hot-addition of a bank?  If not, why not?
> > ...
> > > I'm not aware that we would have an ACPI specification for SGX so this
> > > is all I have at the moment (does not show any ACPI event for
> > > hotplugging).
> > 
> > So you're saying the one platform you looked at don't support hotplug.
> > I was looking for a more broad statement about SGX.
> 
> Hardware doesn't support hotplug of EPC as the EPC size and location
> is locked during activation of SGX.  And IIRC, activation of SGX must
> be synchronized across all CPUs in a multi-socket platform, e.g. you
> can't late-enable SGX on a socket and due hotplugging that way.

Makes me wonder how this would work on a multisocket platform anyway,
given that they have different fused keys?

> In a virtualized environment there are no such restrictions.  I am not
> aware of any explicit requirements or use cases for supporting hotplug
> of EPC, but that's probably only because virtualization of SGX is
> fairly nascent.

/Jarkko
Andy Shevchenko Sept. 3, 2018, 2:41 p.m. UTC | #7
On Mon, Aug 27, 2018 at 9:58 PM Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> Add data structures to track Enclave Page Cache (EPC) pages.  EPC is
> divided into multiple banks (1-N) of which addresses and sizes can be
> enumerated with CPUID by the OS.
>
> On NUMA systems a node can have at most bank. A bank can be at most part of
> two nodes. SGX supports both nodes with a single memory controller and also
> sub-cluster nodes with severals memory controllers on a single die.

> -#include <asm/sgx.h>
> -#include <asm/sgx_pr.h>
>  #include <linux/freezer.h>
>  #include <linux/highmem.h>
>  #include <linux/kthread.h>
> +#include <linux/pagemap.h>
>  #include <linux/ratelimit.h>
>  #include <linux/sched/signal.h>
> +#include <linux/shmem_fs.h>
>  #include <linux/slab.h>
> +#include <asm/sgx.h>
> +#include <asm/sgx_pr.h>

Squash issues?

> +       va = ioremap_cache(addr, size);
> +       if (!va)
> +               return -ENOMEM;

I'm not sure this is a right API. Do we operate with memory? Does it
have I/O side effects?
If no, memremap() would be better to use.
Jarkko Sakkinen Sept. 4, 2018, 9:59 a.m. UTC | #8
On Mon, Sep 03, 2018 at 05:41:53PM +0300, Andy Shevchenko wrote:
> On Mon, Aug 27, 2018 at 9:58 PM Jarkko Sakkinen
> <jarkko.sakkinen@linux.intel.com> wrote:
> >
> > Add data structures to track Enclave Page Cache (EPC) pages.  EPC is
> > divided into multiple banks (1-N) of which addresses and sizes can be
> > enumerated with CPUID by the OS.
> >
> > On NUMA systems a node can have at most bank. A bank can be at most part of
> > two nodes. SGX supports both nodes with a single memory controller and also
> > sub-cluster nodes with severals memory controllers on a single die.
> 
> > -#include <asm/sgx.h>
> > -#include <asm/sgx_pr.h>
> >  #include <linux/freezer.h>
> >  #include <linux/highmem.h>
> >  #include <linux/kthread.h>
> > +#include <linux/pagemap.h>
> >  #include <linux/ratelimit.h>
> >  #include <linux/sched/signal.h>
> > +#include <linux/shmem_fs.h>
> >  #include <linux/slab.h>
> > +#include <asm/sgx.h>
> > +#include <asm/sgx_pr.h>
> 
> Squash issues?

Yes :-/

> > +       va = ioremap_cache(addr, size);
> > +       if (!va)
> > +               return -ENOMEM;
> 
> I'm not sure this is a right API. Do we operate with memory? Does it
> have I/O side effects?
> If no, memremap() would be better to use.

Had this idea a long time ago but had forgotten it. EPC is from caching
perspective like regular memory.

> -- 
> With Best Regards,
> Andy Shevchenko
> 

/Jarkko
Sean Christopherson Sept. 4, 2018, 5:49 p.m. UTC | #9
On Mon, Sep 03, 2018 at 05:41:53PM +0300, Andy Shevchenko wrote:
> On Mon, Aug 27, 2018 at 9:58 PM Jarkko Sakkinen
> <jarkko.sakkinen@linux.intel.com> wrote:
> 
> > +       va = ioremap_cache(addr, size);
> > +       if (!va)
> > +               return -ENOMEM;
> 
> I'm not sure this is a right API. Do we operate with memory? Does it
> have I/O side effects?
> If no, memremap() would be better to use.

Preserving __iomem is desirable.  There aren't side effects per se,
but direct non-enclave accesses to the EPC get abort page semantics so
the kernel shouldn't be directly dereferencing a pointer to the EPC.
Though by that argument, sgx_epc_bank.va, sgx_epc_addr's return and
all ENCLS helpers should be tagged __iomem.

For documentation purposes, maybe it would be better to use __private
or "#define __sgx_epc __iomem" and use that?
Andy Shevchenko Sept. 4, 2018, 6:01 p.m. UTC | #10
On Tue, Sep 4, 2018, Sean Christopherson wrote:

> > > +       va = ioremap_cache(addr, size);
> > > +       if (!va)
> > > +               return -ENOMEM;
> >
> > I'm not sure this is a right API. Do we operate with memory? Does it
> > have I/O side effects?
> > If no, memremap() would be better to use.
>
> Preserving __iomem is desirable.  There aren't side effects per se,
> but direct non-enclave accesses to the EPC get abort page semantics so
> the kernel shouldn't be directly dereferencing a pointer to the EPC.
> Though by that argument, sgx_epc_bank.va, sgx_epc_addr's return and
> all ENCLS helpers should be tagged __iomem.

Why?
Does it related to *any* I/O?
Sean Christopherson Sept. 4, 2018, 6:17 p.m. UTC | #11
On Tue, Sep 04, 2018 at 09:01:15PM +0300, Andy Shevchenko wrote:
> On Tue, Sep 4, 2018 a> +/**
> 
> > > > +       va = ioremap_cache(addr, size);
> > > > +       if (!va)
> > > > +               return -ENOMEM;
> > >
> > > I'm not sure this is a right API. Do we operate with memory? Does it
> > > have I/O side effects?
> > > If no, memremap() would be better to use.
> >
> > Preserving __iomem is desirable.  There aren't side effects per se,
> > but direct non-enclave accesses to the EPC get abort page semantics so
> > the kernel shouldn't be directly dereferencing a pointer to the EPC.
> > Though by that argument, sgx_epc_bank.va, sgx_epc_addr's return and
> > all ENCLS helpers should be tagged __iomem.
> 
> Why?
> Does it related to *any* I/O?

No, hence my other comment that __private or a new tag altogether may
be more appropriate.  The noderef attribute is what we truly care
about.
Jarkko Sakkinen Sept. 5, 2018, 5:36 p.m. UTC | #12
On Tue, Sep 04, 2018 at 11:17:35AM -0700, Sean Christopherson wrote:
> On Tue, Sep 04, 2018 at 09:01:15PM +0300, Andy Shevchenko wrote:
> > On Tue, Sep 4, 2018 a> +/**
> > 
> > > > > +       va = ioremap_cache(addr, size);
> > > > > +       if (!va)
> > > > > +               return -ENOMEM;
> > > >
> > > > I'm not sure this is a right API. Do we operate with memory? Does it
> > > > have I/O side effects?
> > > > If no, memremap() would be better to use.
> > >
> > > Preserving __iomem is desirable.  There aren't side effects per se,
> > > but direct non-enclave accesses to the EPC get abort page semantics so
> > > the kernel shouldn't be directly dereferencing a pointer to the EPC.
> > > Though by that argument, sgx_epc_bank.va, sgx_epc_addr's return and
> > > all ENCLS helpers should be tagged __iomem.
> > 
> > Why?
> > Does it related to *any* I/O?
> 
> No, hence my other comment that __private or a new tag altogether may
> be more appropriate.  The noderef attribute is what we truly care
> about.

My proposal is that we go with memremap() and use

#define __sgx_epc __attribute__((noderef))

It makes sense to check that direct EPC pointers are not passed to
functions when they are not supposed to.

/Jarkko
diff mbox series

Patch

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 2130e639ab49..17b7b3aa66bf 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -4,9 +4,69 @@ 
 #ifndef _ASM_X86_SGX_H
 #define _ASM_X86_SGX_H
 
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/rwsem.h>
 #include <linux/types.h>
+#include <asm/sgx_arch.h>
+#include <asm/asm.h>
+
+#define SGX_MAX_EPC_BANKS 8
+
+struct sgx_epc_page {
+	unsigned long desc;
+	struct list_head list;
+};
+
+struct sgx_epc_bank {
+	unsigned long pa;
+	void *va;
+	unsigned long size;
+	struct sgx_epc_page *pages_data;
+	struct sgx_epc_page **pages;
+	unsigned long free_cnt;
+	spinlock_t lock;
+};
 
 extern bool sgx_enabled;
 extern bool sgx_lc_enabled;
+extern struct sgx_epc_bank sgx_epc_banks[SGX_MAX_EPC_BANKS];
+
+/*
+ * enum sgx_epc_page_desc - defines bits and masks for an EPC page's desc
+ * @SGX_EPC_BANK_MASK:	      SGX allows a system to multiple EPC banks (at
+ *			      different physical locations).  The index of a
+ *			      page's bank in its desc so that we can do a quick
+ *			      lookup of its virtual address (EPC is mapped via
+ *			      ioremap_cache() because it's non-standard memory).
+ *			      Current and near-future hardware defines at most
+ *			      eight banks, hence three bits to hold the bank.
+ *			      sgx_page_cache_init() asserts that the max bank
+ *			      index doesn't exceed SGX_EPC_BANK_MASK.
+ * @SGX_EPC_PAGE_RECLAIMABLE: When set, indicates a page is reclaimable.  Used
+ *			      when freeing a page to know that we also need to
+ *			      remove the page from the active page list.
+ *
+ * Defines the layout of the desc field in the &struct sgx_epc_page, which
+ * contains EPC bank number, physical address of the page and the page status
+ * flag.
+ */
+enum sgx_epc_page_desc {
+	SGX_EPC_BANK_MASK			= GENMASK_ULL(3, 0),
+	SGX_EPC_PAGE_RECLAIMABLE		= BIT(4),
+	/* bits 12-63 are reserved for the physical page address of the page */
+};
+
+static inline struct sgx_epc_bank *sgx_epc_bank(struct sgx_epc_page *page)
+{
+	return &sgx_epc_banks[page->desc & SGX_EPC_BANK_MASK];
+}
+
+static inline void *sgx_epc_addr(struct sgx_epc_page *page)
+{
+	struct sgx_epc_bank *bank = sgx_epc_bank(page);
+
+	return (void *)(bank->va + (page->desc & PAGE_MASK) - bank->pa);
+}
 
 #endif /* _ASM_X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/intel_sgx.c b/arch/x86/kernel/cpu/intel_sgx.c
index 17b46bec9c54..53ac172e8006 100644
--- a/arch/x86/kernel/cpu/intel_sgx.c
+++ b/arch/x86/kernel/cpu/intel_sgx.c
@@ -1,23 +1,121 @@ 
 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
 // Copyright(c) 2016-17 Intel Corporation.
 
-#include <asm/sgx.h>
-#include <asm/sgx_pr.h>
 #include <linux/freezer.h>
 #include <linux/highmem.h>
 #include <linux/kthread.h>
+#include <linux/pagemap.h>
 #include <linux/ratelimit.h>
 #include <linux/sched/signal.h>
+#include <linux/shmem_fs.h>
 #include <linux/slab.h>
+#include <asm/sgx.h>
+#include <asm/sgx_pr.h>
 
 bool sgx_enabled __ro_after_init;
 EXPORT_SYMBOL_GPL(sgx_enabled);
 bool sgx_lc_enabled __ro_after_init;
 EXPORT_SYMBOL_GPL(sgx_lc_enabled);
+struct sgx_epc_bank sgx_epc_banks[SGX_MAX_EPC_BANKS];
+EXPORT_SYMBOL_GPL(sgx_epc_banks);
+
+static int sgx_nr_epc_banks;
+
+static __init int sgx_init_epc_bank(u64 addr, u64 size, unsigned long index,
+				    struct sgx_epc_bank *bank)
+{
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct sgx_epc_page *pages_data;
+	unsigned long i;
+	void *va;
+
+	va = ioremap_cache(addr, size);
+	if (!va)
+		return -ENOMEM;
+
+	pages_data = kcalloc(nr_pages, sizeof(struct sgx_epc_page), GFP_KERNEL);
+	if (!pages_data)
+		goto out_iomap;
+
+	bank->pages = kcalloc(nr_pages, sizeof(struct sgx_epc_page *),
+			      GFP_KERNEL);
+	if (!bank->pages)
+		goto out_pdata;
+
+	for (i = 0; i < nr_pages; i++) {
+		bank->pages[i] = &pages_data[i];
+		bank->pages[i]->desc = (addr + (i << PAGE_SHIFT)) | index;
+	}
+
+	bank->pa = addr;
+	bank->size = size;
+	bank->va = va;
+	bank->free_cnt = nr_pages;
+	bank->pages_data = pages_data;
+	spin_lock_init(&bank->lock);
+	return 0;
+out_pdata:
+	kfree(pages_data);
+out_iomap:
+	iounmap(va);
+	return -ENOMEM;
+}
+
+static __init void sgx_page_cache_teardown(void)
+{
+	struct sgx_epc_bank *bank;
+	int i;
+
+	for (i = 0; i < sgx_nr_epc_banks; i++) {
+		bank = &sgx_epc_banks[i];
+		iounmap((void *)bank->va);
+		kfree(bank->pages);
+		kfree(bank->pages_data);
+	}
+}
+
+static inline u64 sgx_combine_bank_regs(u64 low, u64 high)
+{
+	return (low & 0xFFFFF000) + ((high & 0xFFFFF) << 32);
+}
+
+static __init int sgx_page_cache_init(void)
+{
+	u32 eax, ebx, ecx, edx;
+	u64 pa, size;
+	int ret;
+	int i;
+
+	for (i = 0; i < SGX_MAX_EPC_BANKS; i++) {
+		cpuid_count(SGX_CPUID, 2 + i, &eax, &ebx, &ecx, &edx);
+		if (!(eax & 0xF))
+			break;
+
+		pa = sgx_combine_bank_regs(eax, ebx);
+		size = sgx_combine_bank_regs(ecx, edx);
+		pr_info("EPC bank 0x%llx-0x%llx\n", pa, pa + size - 1);
+
+		ret = sgx_init_epc_bank(pa, size, i, &sgx_epc_banks[i]);
+		if (ret) {
+			sgx_page_cache_teardown();
+			return ret;
+		}
+
+		sgx_nr_epc_banks++;
+	}
+
+	if (!sgx_nr_epc_banks) {
+		pr_err("There are zero EPC banks.\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
 
 static __init int sgx_init(void)
 {
 	unsigned long fc;
+	int ret;
 
 	if (!boot_cpu_has(X86_FEATURE_SGX))
 		return false;
@@ -39,6 +137,10 @@  static __init int sgx_init(void)
 	if (!(fc & FEATURE_CONTROL_SGX_LE_WR))
 		pr_info("IA32_SGXLEPUBKEYHASHn MSRs are not writable\n");
 
+	ret = sgx_page_cache_init();
+	if (ret)
+		return ret;
+
 	sgx_enabled = true;
 	sgx_lc_enabled = !!(fc & FEATURE_CONTROL_SGX_LE_WR);
 	return 0;