
[RFC,45/62] mm: Add the encrypt_mprotect() system call for MKTME

Message ID 20190508144422.13171-46-kirill.shutemov@linux.intel.com (mailing list archive)
State New, archived
Series Intel MKTME enabling

Commit Message

Kirill A. Shutemov May 8, 2019, 2:44 p.m. UTC
From: Alison Schofield <alison.schofield@intel.com>

Implement memory encryption for MKTME (Multi-Key Total Memory
Encryption) with a new system call that is an extension of the
legacy mprotect() system call.

In encrypt_mprotect the caller must pass a handle to a previously
allocated and programmed MKTME encryption key. The key can be
obtained through the kernel key service type "mktme". The caller
must have KEY_NEED_VIEW permission on the key.

MKTME places an additional restriction on the protected data:
The length of the data must be page aligned. This is in addition
to the existing mprotect restriction that the addr must be page
aligned.

encrypt_mprotect() will look up the hardware keyid for the given
userspace key. It will use previously defined helpers to insert
that keyid into the VMAs during legacy mprotect() execution.
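
For illustration, a userspace caller might use the new system call
roughly as follows. This is only a sketch: the "mktme" key payload
options and the __NR_encrypt_mprotect number are illustrative
assumptions, not definitions made by this patch, and error handling
is omitted.

	#include <string.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <keyutils.h>

	/* len must be page aligned, just like the address. */
	static long encrypt_buffer(size_t len)
	{
		const char *opts = "algorithm=aes-xts-128 type=cpu";
		key_serial_t key;
		void *buf;

		/* Allocate and program an MKTME key via the "mktme" key service. */
		key = add_key("mktme", "example-key", opts, strlen(opts),
			      KEY_SPEC_THREAD_KEYRING);

		buf = mmap(NULL, len, PROT_NONE,
			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

		/* Set the encryption key and the final protections in one step. */
		return syscall(__NR_encrypt_mprotect, buf, len,
			       PROT_READ | PROT_WRITE, key);
	}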

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 fs/exec.c          |  4 +--
 include/linux/mm.h |  3 +-
 mm/mprotect.c      | 68 +++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 65 insertions(+), 10 deletions(-)

Comments

Peter Zijlstra June 14, 2019, 11:47 a.m. UTC | #1
On Wed, May 08, 2019 at 05:44:05PM +0300, Kirill A. Shutemov wrote:
> diff --git a/fs/exec.c b/fs/exec.c
> index 2e0033348d8e..695c121b34b3 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -755,8 +755,8 @@ int setup_arg_pages(struct linux_binprm *bprm,
>  	vm_flags |= mm->def_flags;
>  	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
>  
> -	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
> -			vm_flags);
> +	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags,
> +			     -1);

You added a nice NO_KEY helper a few patches back, maybe use it?

>  	if (ret)
>  		goto out_unlock;
>  	BUG_ON(prev != vma);
Peter Zijlstra June 14, 2019, 11:51 a.m. UTC | #2
On Wed, May 08, 2019 at 05:44:05PM +0300, Kirill A. Shutemov wrote:

> @@ -347,7 +348,8 @@ static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
>  
>  int
>  mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
> -	unsigned long start, unsigned long end, unsigned long newflags)
> +	       unsigned long start, unsigned long end, unsigned long newflags,
> +	       int newkeyid)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	unsigned long oldflags = vma->vm_flags;
> @@ -357,7 +359,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
>  	int error;
>  	int dirty_accountable = 0;
>  
> -	if (newflags == oldflags) {
> +	/*
> +	 * Flags match and Keyids match or we have NO_KEY.
> +	 * This _fixup is usually called from do_mprotect_ext() except
> +	 * for one special case: caller fs/exec.c/setup_arg_pages()
> +	 * In that case, newkeyid is passed as -1 (NO_KEY).
> +	 */
> +	if (newflags == oldflags &&
> +	    (newkeyid == vma_keyid(vma) || newkeyid == NO_KEY)) {
>  		*pprev = vma;
>  		return 0;
>  	}
> @@ -423,6 +432,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
>  	}
>  
>  success:
> +	if (newkeyid != NO_KEY)
> +		mprotect_set_encrypt(vma, newkeyid, start, end);
>  	/*
>  	 * vm_flags and vm_page_prot are protected by the mmap_sem
>  	 * held in write mode.
> @@ -454,10 +465,15 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
>  }
>  
>  /*
> - * When pkey==NO_KEY we get legacy mprotect behavior here.
> + * do_mprotect_ext() supports the legacy mprotect behavior plus extensions
> + * for Protection Keys and Memory Encryption Keys. These extensions are
> + * mutually exclusive and the behavior is:
> + *	(pkey==NO_KEY && keyid==NO_KEY) ==> legacy mprotect
> + *	(pkey is valid)  ==> legacy mprotect plus Protection Key extensions
> + *	(keyid is valid) ==> legacy mprotect plus Encryption Key extensions
>   */
>  static int do_mprotect_ext(unsigned long start, size_t len,
> -		unsigned long prot, int pkey)
> +			   unsigned long prot, int pkey, int keyid)
>  {
>  	unsigned long nstart, end, tmp, reqprot;
>  	struct vm_area_struct *vma, *prev;
> @@ -555,7 +571,8 @@ static int do_mprotect_ext(unsigned long start, size_t len,
>  		tmp = vma->vm_end;
>  		if (tmp > end)
>  			tmp = end;
> -		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
> +		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags,
> +				       keyid);
>  		if (error)
>  			goto out;
>  		nstart = tmp;

I've missed the part where pkey && keyid results in a WARN or error or
whatever.
Alison Schofield June 14, 2019, 5:35 p.m. UTC | #3
On Fri, Jun 14, 2019 at 01:47:32PM +0200, Peter Zijlstra wrote:
> On Wed, May 08, 2019 at 05:44:05PM +0300, Kirill A. Shutemov wrote:
> > diff --git a/fs/exec.c b/fs/exec.c
> > index 2e0033348d8e..695c121b34b3 100644
> > --- a/fs/exec.c
> > +++ b/fs/exec.c
> > @@ -755,8 +755,8 @@ int setup_arg_pages(struct linux_binprm *bprm,
> >  	vm_flags |= mm->def_flags;
> >  	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
> >  
> > -	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
> > -			vm_flags);
> > +	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags,
> > +			     -1);
> 
> You added a nice NO_KEY helper a few patches back, maybe use it?

Sure, done.
(I hesitated to define NO_KEY in mm.h initially. Put it there now.
We'll see how that looks in the next round.)

> 
> >  	if (ret)
> >  		goto out_unlock;
> >  	BUG_ON(prev != vma);
Alison Schofield June 15, 2019, 12:32 a.m. UTC | #4
On Fri, Jun 14, 2019 at 01:51:37PM +0200, Peter Zijlstra wrote:
> On Wed, May 08, 2019 at 05:44:05PM +0300, Kirill A. Shutemov wrote:
snip
> >  /*
> > - * When pkey==NO_KEY we get legacy mprotect behavior here.
> > + * do_mprotect_ext() supports the legacy mprotect behavior plus extensions
> > + * for Protection Keys and Memory Encryption Keys. These extensions are
> > + * mutually exclusive and the behavior is:
> > + *	(pkey==NO_KEY && keyid==NO_KEY) ==> legacy mprotect
> > + *	(pkey is valid)  ==> legacy mprotect plus Protection Key extensions
> > + *	(keyid is valid) ==> legacy mprotect plus Encryption Key extensions
> >   */
> >  static int do_mprotect_ext(unsigned long start, size_t len,
> > -		unsigned long prot, int pkey)
> > +			   unsigned long prot, int pkey, int keyid)
> >  {

snip

>
> I've missed the part where pkey && keyid results in a WARN or error or
> whatever.
> 
I wasn't so sure about that since do_mprotect_ext()
is the call 'behind' the system calls. 

legacy mprotect always calls with: NO_KEY, NO_KEY
pkey_mprotect always calls with:  pkey, NO_KEY
encrypt_mprotect always calls with  NO_KEY, keyid

Would a check on those arguments be debug only 
to future proof this?
Peter Zijlstra June 17, 2019, 9:08 a.m. UTC | #5
On Fri, Jun 14, 2019 at 05:32:31PM -0700, Alison Schofield wrote:
> On Fri, Jun 14, 2019 at 01:51:37PM +0200, Peter Zijlstra wrote:
> > On Wed, May 08, 2019 at 05:44:05PM +0300, Kirill A. Shutemov wrote:
> snip
> > >  /*
> > > - * When pkey==NO_KEY we get legacy mprotect behavior here.
> > > + * do_mprotect_ext() supports the legacy mprotect behavior plus extensions
> > > + * for Protection Keys and Memory Encryption Keys. These extensions are
> > > + * mutually exclusive and the behavior is:

Well, here it states that the extensions are mutually exclusive.

> > > + *	(pkey==NO_KEY && keyid==NO_KEY) ==> legacy mprotect
> > > + *	(pkey is valid)  ==> legacy mprotect plus Protection Key extensions
> > > + *	(keyid is valid) ==> legacy mprotect plus Encryption Key extensions
> > >   */
> > >  static int do_mprotect_ext(unsigned long start, size_t len,
> > > -		unsigned long prot, int pkey)
> > > +			   unsigned long prot, int pkey, int keyid)
> > >  {
> 
> snip
> 
> >
> > I've missed the part where pkey && keyid results in a WARN or error or
> > whatever.
> > 
> I wasn't so sure about that since do_mprotect_ext()
> is the call 'behind' the system calls. 
> 
> legacy mprotect always calls with: NO_KEY, NO_KEY
> pkey_mprotect always calls with:  pkey, NO_KEY
> encrypt_mprotect always calls with  NO_KEY, keyid
> 
> Would a check on those arguments be debug only 
> to future proof this?

But you then don't check that, anywhere, afaict.
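
For illustration, the kind of check being asked for could be a one-liner
at the top of do_mprotect_ext(); this is only a sketch, not part of the
posted series, and it assumes the NO_KEY helper discussed earlier:

	static int do_mprotect_ext(unsigned long start, size_t len,
				   unsigned long prot, int pkey, int keyid)
	{
		/*
		 * The pkey and keyid extensions are documented as mutually
		 * exclusive; the three syscall entry points never pass both.
		 * Catch any future caller that does.
		 */
		if (WARN_ON_ONCE(pkey != NO_KEY && keyid != NO_KEY))
			return -EINVAL;

		/* ... rest of do_mprotect_ext() unchanged ... */
	}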
Andy Lutomirski June 17, 2019, 3:07 p.m. UTC | #6
On Wed, May 8, 2019 at 7:44 AM Kirill A. Shutemov
<kirill.shutemov@linux.intel.com> wrote:
>
> From: Alison Schofield <alison.schofield@intel.com>
>
> Implement memory encryption for MKTME (Multi-Key Total Memory
> Encryption) with a new system call that is an extension of the
> legacy mprotect() system call.
>
> In encrypt_mprotect the caller must pass a handle to a previously
> allocated and programmed MKTME encryption key. The key can be
> obtained through the kernel key service type "mktme". The caller
> must have KEY_NEED_VIEW permission on the key.
>
> MKTME places an additional restriction on the protected data:
> The length of the data must be page aligned. This is in addition
> to the existing mprotect restriction that the addr must be page
> aligned.

I still find it bizarre that this is conflated with mprotect().

I also remain entirely unconvinced that MKTME on anonymous memory is
useful in the long run.  There will inevitably be all kinds of fancy
new CPU features that make the underlying MKTME mechanisms much more
useful.  For example, some way to bind a key to a VM, or a way to
*sanely* encrypt persistent memory.  By making this thing a syscall
that does more than just MKTME, you're adding combinatorial complexity
(you forget pkey!) and you're tying other functionality (change of
protection) to this likely-to-be-deprecated interface.

This is part of why I much prefer the idea of making this style of
MKTME a driver or some other non-intrusive interface.  Then, once
everyone gets tired of it, the driver can just get turned off with no
side effects.

--Andy
Dave Hansen June 17, 2019, 3:28 p.m. UTC | #7
On 6/17/19 8:07 AM, Andy Lutomirski wrote:
> I still find it bizarre that this is conflated with mprotect().

This needs to be in the changelog.  But, for better or worse, it's
following the mprotect_pkey() pattern.

Other than the obvious "set the key on this memory", we're looking for
two other properties: atomicity (ensuring there is no transient state
where the memory is usable without the desired properties) and that it
is usable on existing allocations.

For atomicity, we have a model where we can allocate things with
PROT_NONE, then do mprotect_pkey() and mprotect_encrypt() (plus any
future features), then the last mprotect_*() call takes us from
PROT_NONE to the desired end permissions.  We could just require a plain
old mprotect() to do that instead of embedding mprotect()-like behavior
in these, of course, but that isn't the path we're on at the moment with
mprotect_pkey().

So, for this series it's just a matter of whether we do this:

	ptr = mmap(..., PROT_NONE);
	mprotect_pkey(protect_key, ptr, PROT_NONE);
	mprotect_encrypt(encr_key, ptr, PROT_READ|PROT_WRITE);
	// good to go

or this:

	ptr = mmap(..., PROT_NONE);
	mprotect_pkey(protect_key, ptr, PROT_NONE);
	sys_encrypt(key, ptr);
	mprotect(ptr, PROT_READ|PROT_WRITE);
	// good to go

I actually don't care all that much which one we end up with.  It's not
like the extra syscall in the second options means much.

> This is part of why I much prefer the idea of making this style of
> MKTME a driver or some other non-intrusive interface.  Then, once
> everyone gets tired of it, the driver can just get turned off with no
> side effects.

I like the concept, but not where it leads.  I'd call it the 'hugetlbfs
approach". :)  Hugetblfs certainly go us huge pages, but it's continued
to be a parallel set of code with parallel bugs and parallel
implementations of many VM features.  It's not that you can't implement
new things on hugetlbfs, it's that you *need* to.  You never get them
for free.

For instance, if we do a driver, how do we get large pages?  How do we
swap/reclaim the pages?  How do we do NUMA affinity?  How do we
eventually stack it on top of persistent memory filesystems or Device
DAX?  With a driver approach, I think we're stuck basically
reimplementing things or gluing them back together.  Nothing comes for free.

With this approach, we basically start with our normal, full feature set
(modulo weirdo interactions like with KSM).
Andy Lutomirski June 17, 2019, 3:46 p.m. UTC | #8
On Mon, Jun 17, 2019 at 8:28 AM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 6/17/19 8:07 AM, Andy Lutomirski wrote:
> > I still find it bizarre that this is conflated with mprotect().
>
> This needs to be in the changelog.  But, for better or worse, it's
> following the mprotect_pkey() pattern.
>
> Other than the obvious "set the key on this memory", we're looking for
> two other properties: atomicity (ensuring there is no transient state
> where the memory is usable without the desired properties) and that it
> is usable on existing allocations.
>
> For atomicity, we have a model where we can allocate things with
> PROT_NONE, then do mprotect_pkey() and mprotect_encrypt() (plus any
> future features), then the last mprotect_*() call takes us from
> PROT_NONE to the desired end permissions.  We could just require a plain
> old mprotect() to do that instead of embedding mprotect()-like behavior
> in these, of course, but that isn't the path we're on at the moment with
> mprotect_pkey().
>
> So, for this series it's just a matter of whether we do this:
>
>         ptr = mmap(..., PROT_NONE);
>         mprotect_pkey(protect_key, ptr, PROT_NONE);
>         mprotect_encrypt(encr_key, ptr, PROT_READ|PROT_WRITE);
>         // good to go
>
> or this:
>
>         ptr = mmap(..., PROT_NONE);
>         mprotect_pkey(protect_key, ptr, PROT_NONE);
>         sys_encrypt(key, ptr);
>         mprotect(ptr, PROT_READ|PROT_WRITE);
>         // good to go
>
> I actually don't care all that much which one we end up with.  It's not
> like the extra syscall in the second options means much.

The benefit of the second one is that, if sys_encrypt is absent, it
just works.  In the first model, programs need a fallback because
they'll segfault if mprotect_encrypt() gets ENOSYS.

>
> > This is part of why I much prefer the idea of making this style of
> > MKTME a driver or some other non-intrusive interface.  Then, once
> > everyone gets tired of it, the driver can just get turned off with no
> > side effects.
>
> I like the concept, but not where it leads.  I'd call it the 'hugetlbfs
> approach". :)  Hugetblfs certainly go us huge pages, but it's continued
> to be a parallel set of code with parallel bugs and parallel
> implementations of many VM features.  It's not that you can't implement
> new things on hugetlbfs, it's that you *need* to.  You never get them
> for free.

Fair enough, but...

>
> For instance, if we do a driver, how do we get large pages?  How do we
> swap/reclaim the pages?  How do we do NUMA affinity?

Those all make sense.

>  How do we
> eventually stack it on top of persistent memory filesystems or Device
> DAX?

How do we stack anonymous memory on top of persistent memory or Device
DAX?  I'm confused.

Just to throw this out there, what if we had a new device /dev/xpfo
and MKTME were one of its features.  You open /dev/xpfo, optionally do
an ioctl to set a key, and then map it.  The pages you get are
unmapped entirely from the direct map, and you get a PFNMAP VMA with
all its limitations.  This seems much more useful -- it's limited, but
it's limited *because the kernel can't accidentally read it*.
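
For illustration, that flow would look roughly like this from userspace
(the device node, ioctl and names are hypothetical; nothing here exists
in the posted series):

fd = open("/dev/xpfo", O_RDWR);            /* hypothetical device */
ioctl(fd, XPFO_IOC_SET_KEY, key_handle);   /* hypothetical ioctl  */
ptr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
/* ptr is a PFNMAP mapping; the pages never appear in the direct map */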

I think that, in the long run, we're going to have to either expand
the core mm's concept of what "memory" is or just have a whole
parallel set of mechanisms for memory that doesn't work like memory.
We're already accumulating a set of things that are backed by memory
but aren't usable as memory. SGX EPC pages and SEV pages come to mind.
They are faster when they're in big contiguous chunks (well, not SGX
AFAIK, but maybe some day), they have NUMA node affinity, and they
show up in page tables, but the hardware restricts who can read and
write them.  If Intel isn't planning to do something like this with
the MKTME hardware, I'll eat my hat.

I expect that some day normal memory will  be able to be repurposed as
SGX pages on the fly, and that will also look a lot more like SEV or
XPFO than like the this model of MKTME.

So, if we upstream MKTME as anonymous memory with a magic config
syscall, I predict that, in a few years, it will end up inheriting
all downsides of both approaches with few of the upsides.  Programs
like QEMU will need to learn to manipulate pages that can't be
accessed outside the VM without special VM buy-in, so the fact that
MKTME pages are fully functional and can be GUP-ed won't be very
useful.  And the VM will learn about all these things, but MKTME won't
really fit in.

And, one of these days, someone will come up with a version of XPFO
that could actually be upstreamed, and it seems entirely plausible
that it will be totally incompatible with MKTME-as-anonymous-memory
and that users of MKTME will actually get *worse* security.
Dave Hansen June 17, 2019, 6:27 p.m. UTC | #9
Tom Lendacky, could you take a look down in the message to the talk of
SEV?  I want to make sure I'm not misrepresenting what it does today.
...


>> I actually don't care all that much which one we end up with.  It's not
>> like the extra syscall in the second options means much.
> 
> The benefit of the second one is that, if sys_encrypt is absent, it
> just works.  In the first model, programs need a fallback because
> they'll segfault if mprotect_encrypt() gets ENOSYS.

Well, by the time they get here, they would have already had to allocate
and set up the encryption key.  I don't think this would really be the
"normal" malloc() path, for instance.

>>  How do we
>> eventually stack it on top of persistent memory filesystems or Device
>> DAX?
> 
> How do we stack anonymous memory on top of persistent memory or Device
> DAX?  I'm confused.

If our interface to MKTME is:

	fd = open("/dev/mktme");
	ptr = mmap(fd);

Then it's hard to combine with an interface which is:

	fd = open("/dev/dax123");
	ptr = mmap(fd);

Where if we have something like mprotect() (or madvise() or something
else taking pointer), we can just do:

	fd = open("/dev/anything987");
	ptr = mmap(fd);
	sys_encrypt(ptr);

Now, we might not *do* it that way for dax, for instance, but I'm just
saying that if we go the /dev/mktme route, we never get a choice.

> I think that, in the long run, we're going to have to either expand
> the core mm's concept of what "memory" is or just have a whole
> parallel set of mechanisms for memory that doesn't work like memory.
...
> I expect that some day normal memory will  be able to be repurposed as
> SGX pages on the fly, and that will also look a lot more like SEV or
> XPFO than like the this model of MKTME.

I think you're drawing the line at pages where the kernel can manage
contents vs. not manage contents.  I'm not sure that's the right
distinction to make, though.  The thing that is important is whether the
kernel can manage the lifetime and location of the data in the page.

Basically: Can the kernel choose where the page comes from and get the
page back when it wants?

I really don't like the current state of things like with SEV or with
KVM direct device assignment where the physical location is quite locked
down and the kernel really can't manage the memory.  I'm trying really
hard to make sure future hardware is more permissive about such things.
 My hope is that these are a temporary blip and not the new normal.

> So, if we upstream MKTME as anonymous memory with a magic config
> syscall, I predict that, in a few years, it will end up inheriting
> all downsides of both approaches with few of the upsides.  Programs
> like QEMU will need to learn to manipulate pages that can't be
> accessed outside the VM without special VM buy-in, so the fact that
> MKTME pages are fully functional and can be GUP-ed won't be very
> useful.  And the VM will learn about all these things, but MKTME won't
> really fit in.

Kai Huang (who is on cc) has been doing the QEMU enabling and might want
to weigh in.  I'd also love to hear from the AMD folks in case I'm not
grokking some aspect of SEV.

But, my understanding is that, even today, neither QEMU nor the kernel
can see SEV-encrypted guest memory.  So QEMU should already understand
how to not interact with guest memory.  I _assume_ it's also already
doing this with anonymous memory, without needing /dev/sme or something.

> And, one of these days, someone will come up with a version of XPFO
> that could actually be upstreamed, and it seems entirely plausible
> that it will be totally incompatible with MKTME-as-anonymous-memory
> and that users of MKTME will actually get *worse* security.

I'm not following here.  XPFO just means that we don't keep the direct
map around all the time for all memory.  If XPFO and
MKTME-as-anonymous-memory were both in play, I think we'd just be
creating/destroying the MKTME-enlightened direct map instead of a
vanilla one.
Andy Lutomirski June 17, 2019, 7:12 p.m. UTC | #10
On Mon, Jun 17, 2019 at 11:37 AM Dave Hansen <dave.hansen@intel.com> wrote:
>
> Tom Lendacky, could you take a look down in the message to the talk of
> SEV?  I want to make sure I'm not misrepresenting what it does today.
> ...
>
>
> >> I actually don't care all that much which one we end up with.  It's not
> >> like the extra syscall in the second options means much.
> >
> > The benefit of the second one is that, if sys_encrypt is absent, it
> > just works.  In the first model, programs need a fallback because
> > they'll segfault if mprotect_encrypt() gets ENOSYS.
>
> Well, by the time they get here, they would have already had to allocate
> and set up the encryption key.  I don't think this would really be the
> "normal" malloc() path, for instance.
>
> >>  How do we
> >> eventually stack it on top of persistent memory filesystems or Device
> >> DAX?
> >
> > How do we stack anonymous memory on top of persistent memory or Device
> > DAX?  I'm confused.
>
> If our interface to MKTME is:
>
>         fd = open("/dev/mktme");
>         ptr = mmap(fd);
>
> Then it's hard to combine with an interface which is:
>
>         fd = open("/dev/dax123");
>         ptr = mmap(fd);
>
> Where if we have something like mprotect() (or madvise() or something
> else taking pointer), we can just do:
>
>         fd = open("/dev/anything987");
>         ptr = mmap(fd);
>         sys_encrypt(ptr);

I'm having a hard time imagining that ever working -- wouldn't it blow
up if someone did:

fd = open("/dev/anything987");
ptr1 = mmap(fd);
ptr2 = mmap(fd);
sys_encrypt(ptr1);

So I think it really has to be:
fd = open("/dev/anything987");
ioctl(fd, ENCRYPT_ME);
mmap(fd);

But I really expect that the encryption of a DAX device will actually
be a block device setting and won't look like this at all.  It'll be
more like dm-crypt except without device mapper.

>
> Now, we might not *do* it that way for dax, for instance, but I'm just
> saying that if we go the /dev/mktme route, we never get a choice.
>
> > I think that, in the long run, we're going to have to either expand
> > the core mm's concept of what "memory" is or just have a whole
> > parallel set of mechanisms for memory that doesn't work like memory.
> ...
> > I expect that some day normal memory will  be able to be repurposed as
> > SGX pages on the fly, and that will also look a lot more like SEV or
> > XPFO than like the this model of MKTME.
>
> I think you're drawing the line at pages where the kernel can manage
> contents vs. not manage contents.  I'm not sure that's the right
> distinction to make, though.  The thing that is important is whether the
> kernel can manage the lifetime and location of the data in the page.

The kernel can manage the location of EPC pages, for example, but only
under extreme constraints right now.  The draft SGX driver can and
does swap them out and swap them back in, potentially at a different
address.

>
> Basically: Can the kernel choose where the page comes from and get the
> page back when it wants?
>
> I really don't like the current state of things like with SEV or with
> KVM direct device assignment where the physical location is quite locked
> down and the kernel really can't manage the memory.  I'm trying really
> hard to make sure future hardware is more permissive about such things.
>  My hope is that these are a temporary blip and not the new normal.
>
> > So, if we upstream MKTME as anonymous memory with a magic config
> > syscall, I predict that, in a few years, it will end up inheriting
> > all downsides of both approaches with few of the upsides.  Programs
> > like QEMU will need to learn to manipulate pages that can't be
> > accessed outside the VM without special VM buy-in, so the fact that
> > MKTME pages are fully functional and can be GUP-ed won't be very
> > useful.  And the VM will learn about all these things, but MKTME won't
> > really fit in.
>
> Kai Huang (who is on cc) has been doing the QEMU enabling and might want
> to weigh in.  I'd also love to hear from the AMD folks in case I'm not
> grokking some aspect of SEV.
>
> But, my understanding is that, even today, neither QEMU nor the kernel
> can see SEV-encrypted guest memory.  So QEMU should already understand
> how to not interact with guest memory.  I _assume_ it's also already
> doing this with anonymous memory, without needing /dev/sme or something.

Let's find out!

If it's using anonymous memory, it must be very strange, since it
would more or less have to be mmapped PROT_NONE.  The thing that makes
anonymous memory in particular seem so awkward to me is that it's
fundamentally identified by its *address*, which means it makes no
sense if it's not mapped.

>
> > And, one of these days, someone will come up with a version of XPFO
> > that could actually be upstreamed, and it seems entirely plausible
> > that it will be totally incompatible with MKTME-as-anonymous-memory
> > and that users of MKTME will actually get *worse* security.
>
> I'm not following here.  XPFO just means that we don't keep the direct
> map around all the time for all memory.  If XPFO and
> MKTME-as-anonymous-memory were both in play, I think we'd just be
> creating/destroying the MKTME-enlightened direct map instead of a
> vanilla one.

What I'm saying is that I can imagine XPFO also wanting to be
something other than anonymous memory.  I don't think we'll ever want
regular MAP_ANONYMOUS to enable XPFO by default because the
performance will suck.  Doing this seems odd:

ptr = mmap(MAP_ANONYMOUS);
sys_xpfo_a_pointer(ptr);

So I could imagine:

ptr = mmap(MAP_ANONYMOUS | MAP_XPFO);

or

fd = open("/dev/xpfo"); (or fd = memfd_create(..., XPFO);
ptr = mmap(fd);

I'm thinking that XPFO is a *lot* simpler under the hood if we just
straight-up don't support GUP on it.  Maybe we should call this
"strong XPFO".  Similarly, the kinds of things that want MKTME may
also want the memory to be entirely absent from the direct map.  And
the things that use SEV (as I understand it) *can't* usefully use the
memory for normal IO via GUP or copy_to/from_user(), so these things
all have a decent amount in common.

Another down side of anonymous memory (in my head, anyway -- QEMU
people should chime in) is that it seems awkward to use it for IO
techniques in which the back-end isn't in the QEMU process.  If
there's an fd involved, you can pass it around, feed it to things like
vfio, etc.  If there's no fd, it's stuck in the creating process.

And another silly argument: if we had /dev/mktme, then we could
possibly get away with avoiding all the keyring stuff entirely.
Instead, you open /dev/mktme and you get your own key under the hood.
If you want two keys, you open /dev/mktme twice.  If you want some
other program to be able to see your memory, you pass it the fd.
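
As a sketch of that model (again purely hypothetical; no such device
exists in this series):

fd1 = open("/dev/mktme", O_RDWR);   /* implicitly allocates key A */
fd2 = open("/dev/mktme", O_RDWR);   /* second open, second key    */
ptr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
/* to share key A, hand fd1 to the other process, e.g. via SCM_RIGHTS */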

I hope this email isn't too rambling :)

--Andy
Dave Hansen June 17, 2019, 9:36 p.m. UTC | #11
>> Where if we have something like mprotect() (or madvise() or something
>> else taking pointer), we can just do:
>>
>>         fd = open("/dev/anything987");
>>         ptr = mmap(fd);
>>         sys_encrypt(ptr);
> 
> I'm having a hard time imagining that ever working -- wouldn't it blow
> up if someone did:
> 
> fd = open("/dev/anything987");
> ptr1 = mmap(fd);
> ptr2 = mmap(fd);
> sys_encrypt(ptr1);
> 
> So I think it really has to be:
> fd = open("/dev/anything987");
> ioctl(fd, ENCRYPT_ME);
> mmap(fd);

Yeah, shared mappings are annoying. :)

But, let's face it, nobody is going to do what you suggest in the
ptr1/ptr2 example.  It doesn't make any logical sense because it's
effectively asking to read the memory with two different keys.  I
_believe_ fscrypt has similar issues and just punts on them by saying
"don't do that".

We can also quite easily figure out what's going on.  A very simple
rule is to kill a process that tries to fault in a page whose KeyID
doesn't match the VMA under which it is faulted in, and also to require
that no pages are faulted in under VMAs whose key has been changed.
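
As a sketch, such a rule could be a simple check in the fault path,
assuming the page_keyid()/vma_keyid() helpers from earlier patches in
the series (the exact hook point here is illustrative):

	/* Once the page backing this fault is known: */
	if (page_keyid(page) != vma_keyid(vma))
		return VM_FAULT_SIGBUS;	/* kill the faulting process */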


>> Now, we might not *do* it that way for dax, for instance, but I'm just
>> saying that if we go the /dev/mktme route, we never get a choice.
>>
>>> I think that, in the long run, we're going to have to either expand
>>> the core mm's concept of what "memory" is or just have a whole
>>> parallel set of mechanisms for memory that doesn't work like memory.
>> ...
>>> I expect that some day normal memory will  be able to be repurposed as
>>> SGX pages on the fly, and that will also look a lot more like SEV or
>>> XPFO than like the this model of MKTME.
>>
>> I think you're drawing the line at pages where the kernel can manage
>> contents vs. not manage contents.  I'm not sure that's the right
>> distinction to make, though.  The thing that is important is whether the
>> kernel can manage the lifetime and location of the data in the page.
> 
> The kernel can manage the location of EPC pages, for example, but only
> under extreme constraints right now.  The draft SGX driver can and
> does swap them out and swap them back in, potentially at a different
> address.

The kernel can't put arbitrary data in EPC pages and can't use normal
memory for EPC.  To me, that puts them clearly on the side of being
unmanageable by the core mm code.

For instance, there's no way we could mix EPC pages in the same 'struct
zone' with non-EPC pages.  Not only are they not in the direct map, but
they never *can* be, even for a second.

>>> And, one of these days, someone will come up with a version of XPFO
>>> that could actually be upstreamed, and it seems entirely plausible
>>> that it will be totally incompatible with MKTME-as-anonymous-memory
>>> and that users of MKTME will actually get *worse* security.
>>
>> I'm not following here.  XPFO just means that we don't keep the direct
>> map around all the time for all memory.  If XPFO and
>> MKTME-as-anonymous-memory were both in play, I think we'd just be
>> creating/destroying the MKTME-enlightened direct map instead of a
>> vanilla one.
> 
> What I'm saying is that I can imagine XPFO also wanting to be
> something other than anonymous memory.  I don't think we'll ever want
> regular MAP_ANONYMOUS to enable XPFO by default because the
> performance will suck.

It will certainly suck for some things.  But, does it suck if the kernel
never uses the direct map for the XPFO memory?  If it were for KVM guest
memory for a guest using direct device assignment, we might not even
ever notice.

> I'm thinking that XPFO is a *lot* simpler under the hood if we just
> straight-up don't support GUP on it.  Maybe we should call this
> "strong XPFO".  Similarly, the kinds of things that want MKTME may
> also want the memory to be entirely absent from the direct map.  And
> the things that use SEV (as I understand it) *can't* usefully use the
> memory for normal IO via GUP or copy_to/from_user(), so these things
> all have a decent amount in common.

OK, so basically, you're thinking about new memory management
infrastructure that a memory-allocating-app can opt into where they get
a reduced kernel feature set, but also increased security guarantees?
The main insight, though, is that some hardware features *already*
impose (some of) this reduced feature set?

FWIW, I don't think many folks will go for the no-GUP rule.  It's one
thing to say no-GUPs for SGX pages which can't have I/O done on them in
the first place, but it's quite another to tell folks that sendfile() no
longer works without bounce buffers.

MKTME's security guarantees are very different than something like SEV.
 Since the kernel is in the trust boundary, it *can* do fun stuff like
RDMA which is a heck of a lot faster than bounce buffering.  Let's say a
franken-system existed with SEV and MKTME.  It isn't even clear to me
that *everyone* would pick SEV over MKTME.  IOW, I'm not sure the MKTME
model necessarily goes away given the presence of SEV.

> And another silly argument: if we had /dev/mktme, then we could
> possibly get away with avoiding all the keyring stuff entirely.
> Instead, you open /dev/mktme and you get your own key under the hood.
> If you want two keys, you open /dev/mktme twice.  If you want some
> other program to be able to see your memory, you pass it the fd.

We still like the keyring because it's one-stop-shopping as the place
that *owns* the hardware KeyID slots.  Those are global resources and
scream for a single global place to allocate and manage them.  The
hardware slots also need to be shared between any anonymous and
file-based users, no matter what the APIs for the anonymous side.
Kai Huang June 17, 2019, 11:59 p.m. UTC | #12
On Mon, 2019-06-17 at 11:27 -0700, Dave Hansen wrote:
> Tom Lendacky, could you take a look down in the message to the talk of
> SEV?  I want to make sure I'm not misrepresenting what it does today.
> ...
> 
> 
> > > I actually don't care all that much which one we end up with.  It's not
> > > like the extra syscall in the second options means much.
> > 
> > The benefit of the second one is that, if sys_encrypt is absent, it
> > just works.  In the first model, programs need a fallback because
> > they'll segfault if mprotect_encrypt() gets ENOSYS.
> 
> Well, by the time they get here, they would have already had to allocate
> and set up the encryption key.  I don't think this would really be the
> "normal" malloc() path, for instance.
> 
> > >  How do we
> > > eventually stack it on top of persistent memory filesystems or Device
> > > DAX?
> > 
> > How do we stack anonymous memory on top of persistent memory or Device
> > DAX?  I'm confused.
> 
> If our interface to MKTME is:
> 
> 	fd = open("/dev/mktme");
> 	ptr = mmap(fd);
> 
> Then it's hard to combine with an interface which is:
> 
> 	fd = open("/dev/dax123");
> 	ptr = mmap(fd);
> 
> Where if we have something like mprotect() (or madvise() or something
> else taking pointer), we can just do:
> 
> 	fd = open("/dev/anything987");
> 	ptr = mmap(fd);
> 	sys_encrypt(ptr);
> 
> Now, we might not *do* it that way for dax, for instance, but I'm just
> saying that if we go the /dev/mktme route, we never get a choice.
> 
> > I think that, in the long run, we're going to have to either expand
> > the core mm's concept of what "memory" is or just have a whole
> > parallel set of mechanisms for memory that doesn't work like memory.
> 
> ...
> > I expect that some day normal memory will  be able to be repurposed as
> > SGX pages on the fly, and that will also look a lot more like SEV or
> > XPFO than like the this model of MKTME.
> 
> I think you're drawing the line at pages where the kernel can manage
> contents vs. not manage contents.  I'm not sure that's the right
> distinction to make, though.  The thing that is important is whether the
> kernel can manage the lifetime and location of the data in the page.
> 
> Basically: Can the kernel choose where the page comes from and get the
> page back when it wants?
> 
> I really don't like the current state of things like with SEV or with
> KVM direct device assignment where the physical location is quite locked
> down and the kernel really can't manage the memory.  I'm trying really
> hard to make sure future hardware is more permissive about such things.
>  My hope is that these are a temporary blip and not the new normal.
> 
> > So, if we upstream MKTME as anonymous memory with a magic config
> > syscall, I predict that, in a few years, it will end up inheriting
> > all downsides of both approaches with few of the upsides.  Programs
> > like QEMU will need to learn to manipulate pages that can't be
> > accessed outside the VM without special VM buy-in, so the fact that
> > MKTME pages are fully functional and can be GUP-ed won't be very
> > useful.  And the VM will learn about all these things, but MKTME won't
> > really fit in.
> 
> Kai Huang (who is on cc) has been doing the QEMU enabling and might want
> to weigh in.  I'd also love to hear from the AMD folks in case I'm not
> grokking some aspect of SEV.
> 
> But, my understanding is that, even today, neither QEMU nor the kernel
> can see SEV-encrypted guest memory.  So QEMU should already understand
> how to not interact with guest memory.  I _assume_ it's also already
> doing this with anonymous memory, without needing /dev/sme or something.

Correct neither Qemu nor kernel can see SEV-encrypted guest memory. Qemu requires guest's
cooperation when it needs to interact with guest, i.e. to support virtual DMA (of virtual devices
in SEV-guest), qemu requires SEV-guest to setup bounce buffer (which will not be SEV-encrypted
memory, but shared memory can be accessed from host side too), so that guest kernel can copy DMA
data from bounce buffer to its own SEV-encrypted memory after qemu/host kernel puts DMA data to
bounce buffer.

And yes from my reading (better to have AMD guys to confirm) SEV guest uses anonymous memory, but it
also pins all guest memory (by calling GUP from KVM -- SEV specifically introduced 2 KVM ioctls for
this purpose), since SEV architecturally cannot support swapping, migration of SEV-encrypted guest
memory, because SME/SEV also uses physical address as "tweak", and there's no way that kernel can
get or use SEV-guest's memory encryption key. In order to swap/migrate SEV-guest memory, we need SGX
EPC eviction/reload similar thing, which SEV doesn't have today.

From this perspective, I think driver proposal kinda makes sense since we already have security
feature which uses normal memory some kind like "device memory" (no swap, no migration, etc), so it
makes sense that MKTME just follows that (although from HW MKTME can support swap, page migration,
etc). The downside of driver proposal for MKTME I think is, like Dave mentioned, it's hard (or not
sure whether it is possible) to extend to support NVDIMM (and file backed guest memory), since for
virtual NVDIMM, Qemu needs to call mmap against fd of NVDIMM.

Thanks,
-Kai
Kai Huang June 18, 2019, 12:05 a.m. UTC | #13
On Mon, 2019-06-17 at 12:12 -0700, Andy Lutomirski wrote:
> On Mon, Jun 17, 2019 at 11:37 AM Dave Hansen <dave.hansen@intel.com> wrote:
> > 
> > Tom Lendacky, could you take a look down in the message to the talk of
> > SEV?  I want to make sure I'm not misrepresenting what it does today.
> > ...
> > 
> > 
> > > > I actually don't care all that much which one we end up with.  It's not
> > > > like the extra syscall in the second options means much.
> > > 
> > > The benefit of the second one is that, if sys_encrypt is absent, it
> > > just works.  In the first model, programs need a fallback because
> > > they'll segfault if mprotect_encrypt() gets ENOSYS.
> > 
> > Well, by the time they get here, they would have already had to allocate
> > and set up the encryption key.  I don't think this would really be the
> > "normal" malloc() path, for instance.
> > 
> > > >  How do we
> > > > eventually stack it on top of persistent memory filesystems or Device
> > > > DAX?
> > > 
> > > How do we stack anonymous memory on top of persistent memory or Device
> > > DAX?  I'm confused.
> > 
> > If our interface to MKTME is:
> > 
> >         fd = open("/dev/mktme");
> >         ptr = mmap(fd);
> > 
> > Then it's hard to combine with an interface which is:
> > 
> >         fd = open("/dev/dax123");
> >         ptr = mmap(fd);
> > 
> > Where if we have something like mprotect() (or madvise() or something
> > else taking pointer), we can just do:
> > 
> >         fd = open("/dev/anything987");
> >         ptr = mmap(fd);
> >         sys_encrypt(ptr);
> 
> I'm having a hard time imagining that ever working -- wouldn't it blow
> up if someone did:
> 
> fd = open("/dev/anything987");
> ptr1 = mmap(fd);
> ptr2 = mmap(fd);
> sys_encrypt(ptr1);
> 
> So I think it really has to be:
> fd = open("/dev/anything987");
> ioctl(fd, ENCRYPT_ME);
> mmap(fd);

This requires "/dev/anything987" to support ENCRYPT_ME ioctl, right?

So to support NVDIMM (DAX), we need to add ENCRYPT_ME ioctl to DAX?

> 
> But I really expect that the encryption of a DAX device will actually
> be a block device setting and won't look like this at all.  It'll be
> more like dm-crypt except without device mapper.

Are you suggesting not to support MKTME for DAX, or adding MKTME support to dm-crypt?

Thanks,
-Kai
Andy Lutomirski June 18, 2019, 12:15 a.m. UTC | #14
On Mon, Jun 17, 2019 at 5:05 PM Kai Huang <kai.huang@linux.intel.com> wrote:
>
> On Mon, 2019-06-17 at 12:12 -0700, Andy Lutomirski wrote:
> > On Mon, Jun 17, 2019 at 11:37 AM Dave Hansen <dave.hansen@intel.com> wrote:
> > >
> > > Tom Lendacky, could you take a look down in the message to the talk of
> > > SEV?  I want to make sure I'm not misrepresenting what it does today.
> > > ...
> > >
> > >
> > > > > I actually don't care all that much which one we end up with.  It's not
> > > > > like the extra syscall in the second options means much.
> > > >
> > > > The benefit of the second one is that, if sys_encrypt is absent, it
> > > > just works.  In the first model, programs need a fallback because
> > > > they'll segfault if mprotect_encrypt() gets ENOSYS.
> > >
> > > Well, by the time they get here, they would have already had to allocate
> > > and set up the encryption key.  I don't think this would really be the
> > > "normal" malloc() path, for instance.
> > >
> > > > >  How do we
> > > > > eventually stack it on top of persistent memory filesystems or Device
> > > > > DAX?
> > > >
> > > > How do we stack anonymous memory on top of persistent memory or Device
> > > > DAX?  I'm confused.
> > >
> > > If our interface to MKTME is:
> > >
> > >         fd = open("/dev/mktme");
> > >         ptr = mmap(fd);
> > >
> > > Then it's hard to combine with an interface which is:
> > >
> > >         fd = open("/dev/dax123");
> > >         ptr = mmap(fd);
> > >
> > > Where if we have something like mprotect() (or madvise() or something
> > > else taking pointer), we can just do:
> > >
> > >         fd = open("/dev/anything987");
> > >         ptr = mmap(fd);
> > >         sys_encrypt(ptr);
> >
> > I'm having a hard time imagining that ever working -- wouldn't it blow
> > up if someone did:
> >
> > fd = open("/dev/anything987");
> > ptr1 = mmap(fd);
> > ptr2 = mmap(fd);
> > sys_encrypt(ptr1);
> >
> > So I think it really has to be:
> > fd = open("/dev/anything987");
> > ioctl(fd, ENCRYPT_ME);
> > mmap(fd);
>
> This requires "/dev/anything987" to support ENCRYPT_ME ioctl, right?
>
> So to support NVDIMM (DAX), we need to add ENCRYPT_ME ioctl to DAX?

Yes and yes, or we do it with layers -- see below.

I don't see how we can credibly avoid this.  If we try to do MKTME
behind the DAX driver's back, aren't we going to end up with cache
coherence problems?

>
> >
> > But I really expect that the encryption of a DAX device will actually
> > be a block device setting and won't look like this at all.  It'll be
> > more like dm-crypt except without device mapper.
>
> Are you suggesting not to support MKTME for DAX, or adding MKTME support to dm-crypt?

I'm proposing exposing it by an interface that looks somewhat like
dm-crypt.  Either we could have a way to create a device layered on
top of the DAX devices that exposes a decrypted view or we add a way
to tell the DAX device to kindly use MKTME with such-and-such key.

If there is demand for a way to have an fscrypt-like thing on top of
DAX where different files use different keys, I suppose that could be
done too, but it will need filesystem or VFS help.
Kai Huang June 18, 2019, 12:48 a.m. UTC | #15
> 
> > And another silly argument: if we had /dev/mktme, then we could
> > possibly get away with avoiding all the keyring stuff entirely.
> > Instead, you open /dev/mktme and you get your own key under the hood.
> > If you want two keys, you open /dev/mktme twice.  If you want some
> > other program to be able to see your memory, you pass it the fd.
> 
> We still like the keyring because it's one-stop-shopping as the place
> that *owns* the hardware KeyID slots.  Those are global resources and
> scream for a single global place to allocate and manage them.  The
> hardware slots also need to be shared between any anonymous and
> file-based users, no matter what the APIs for the anonymous side.

MKTME driver (who creates /dev/mktme) can also be the one-stop-shopping. I think whether to choose
keyring to manage MKTME key should be based on whether we need/should take advantage of existing key
retention service functionalities. For example, with key retention service we can
revoke/invalidate/set expiry for a key (not sure whether MKTME needs those although), and we have
several keyrings -- thread specific keyring, process specific keyring, user specific keyring, etc,
thus we can control who can/cannot find the key, etc. I think managing MKTME key in MKTME driver
doesn't have those advantages.

Thanks,
-Kai
Tom Lendacky June 18, 2019, 1:34 a.m. UTC | #16
On 6/17/19 6:59 PM, Kai Huang wrote:
> On Mon, 2019-06-17 at 11:27 -0700, Dave Hansen wrote:
>> Tom Lendacky, could you take a look down in the message to the talk of
>> SEV?  I want to make sure I'm not misrepresenting what it does today.

Sorry, I'm traveling this week, so responses will be delayed...

>> ...
>>
>>
>>>> I actually don't care all that much which one we end up with.  It's not
>>>> like the extra syscall in the second options means much.
>>>
>>> The benefit of the second one is that, if sys_encrypt is absent, it
>>> just works.  In the first model, programs need a fallback because
>>> they'll segfault if mprotect_encrypt() gets ENOSYS.
>>
>> Well, by the time they get here, they would have already had to allocate
>> and set up the encryption key.  I don't think this would really be the
>> "normal" malloc() path, for instance.
>>
>>>>  How do we
>>>> eventually stack it on top of persistent memory filesystems or Device
>>>> DAX?
>>>
>>> How do we stack anonymous memory on top of persistent memory or Device
>>> DAX?  I'm confused.
>>
>> If our interface to MKTME is:
>>
>> 	fd = open("/dev/mktme");
>> 	ptr = mmap(fd);
>>
>> Then it's hard to combine with an interface which is:
>>
>> 	fd = open("/dev/dax123");
>> 	ptr = mmap(fd);
>>
>> Where if we have something like mprotect() (or madvise() or something
>> else taking pointer), we can just do:
>>
>> 	fd = open("/dev/anything987");
>> 	ptr = mmap(fd);
>> 	sys_encrypt(ptr);
>>
>> Now, we might not *do* it that way for dax, for instance, but I'm just
>> saying that if we go the /dev/mktme route, we never get a choice.
>>
>>> I think that, in the long run, we're going to have to either expand
>>> the core mm's concept of what "memory" is or just have a whole
>>> parallel set of mechanisms for memory that doesn't work like memory.
>>
>> ...
>>> I expect that some day normal memory will  be able to be repurposed as
>>> SGX pages on the fly, and that will also look a lot more like SEV or
>>> XPFO than like the this model of MKTME.
>>
>> I think you're drawing the line at pages where the kernel can manage
>> contents vs. not manage contents.  I'm not sure that's the right
>> distinction to make, though.  The thing that is important is whether the
>> kernel can manage the lifetime and location of the data in the page.
>>
>> Basically: Can the kernel choose where the page comes from and get the
>> page back when it wants?
>>
>> I really don't like the current state of things like with SEV or with
>> KVM direct device assignment where the physical location is quite locked
>> down and the kernel really can't manage the memory.  I'm trying really
>> hard to make sure future hardware is more permissive about such things.
>>  My hope is that these are a temporary blip and not the new normal.
>>
>>> So, if we upstream MKTME as anonymous memory with a magic config
>>> syscall, I predict that, in a few years, it will end up inheriting
>>> all downsides of both approaches with few of the upsides.  Programs
>>> like QEMU will need to learn to manipulate pages that can't be
>>> accessed outside the VM without special VM buy-in, so the fact that
>>> MKTME pages are fully functional and can be GUP-ed won't be very
>>> useful.  And the VM will learn about all these things, but MKTME won't
>>> really fit in.
>>
>> Kai Huang (who is on cc) has been doing the QEMU enabling and might want
>> to weigh in.  I'd also love to hear from the AMD folks in case I'm not
>> grokking some aspect of SEV.
>>
>> But, my understanding is that, even today, neither QEMU nor the kernel
>> can see SEV-encrypted guest memory.  So QEMU should already understand
>> how to not interact with guest memory.  I _assume_ it's also already
>> doing this with anonymous memory, without needing /dev/sme or something.
> 
> Correct neither Qemu nor kernel can see SEV-encrypted guest memory. Qemu requires guest's
> cooperation when it needs to interact with guest, i.e. to support virtual DMA (of virtual devices
> in SEV-guest), qemu requires SEV-guest to setup bounce buffer (which will not be SEV-encrypted
> memory, but shared memory can be accessed from host side too), so that guest kernel can copy DMA
> data from bounce buffer to its own SEV-encrypted memory after qemu/host kernel puts DMA data to
> bounce buffer.

That is correct. An SEV guest must use un-encrypted memory if it wishes
for the hypervisor to be able to see it. Also, to support DMA into the
guest, the target memory must be un-encrypted, which SEV does through the
DMA api and SWIOTLB.

SME must also use bounce buffers if the device does not support 48-bit
DMA, since the encryption bit is bit 47. Any device supporting DMA above
the encryption bit position can perform DMA without bounce buffers under
SME.

> 
> And yes from my reading (better to have AMD guys to confirm) SEV guest uses anonymous memory, but it
> also pins all guest memory (by calling GUP from KVM -- SEV specifically introduced 2 KVM ioctls for
> this purpose), since SEV architecturally cannot support swapping, migration of SEV-encrypted guest
> memory, because SME/SEV also uses physical address as "tweak", and there's no way that kernel can
> get or use SEV-guest's memory encryption key. In order to swap/migrate SEV-guest memory, we need SGX
> EPC eviction/reload similar thing, which SEV doesn't have today.

Yes, all the guest memory is currently pinned by calling GUP when creating
an SEV guest. This is to prevent page migration by the kernel. However,
the support to do page migration is available in the 0.17 version of the
SEV API via the COPY command. The COPY command allows for copying of
an encrypted page from one location to another via the PSP. The support
for this is not yet in the Linux kernel, but we are working on it. This
would remove the requirement of having to pin the guest memory.

The SEV API also supports migration of memory via the SEND_* and RECEIVE_*
APIs to support live migration.

Swapping, however, is not yet supported.

Thanks,
Tom

> 
> From this perspective, I think driver proposal kinda makes sense since we already have security
> feature which uses normal memory some kind like "device memory" (no swap, no migration, etc), so it
> makes sense that MKTME just follows that (although from HW MKTME can support swap, page migration,
> etc). The downside of driver proposal for MKTME I think is, like Dave mentioned, it's hard (or not
> sure whether it is possible) to extend to support NVDIMM (and file backed guest memory), since for
> virtual NVDIMM, Qemu needs to call mmap against fd of NVDIMM.
> 
> Thanks,
> -Kai
>
Kai Huang June 18, 2019, 1:35 a.m. UTC | #17
> > > 
> > > I'm having a hard time imagining that ever working -- wouldn't it blow
> > > up if someone did:
> > > 
> > > fd = open("/dev/anything987");
> > > ptr1 = mmap(fd);
> > > ptr2 = mmap(fd);
> > > sys_encrypt(ptr1);
> > > 
> > > So I think it really has to be:
> > > fd = open("/dev/anything987");
> > > ioctl(fd, ENCRYPT_ME);
> > > mmap(fd);
> > 
> > This requires "/dev/anything987" to support ENCRYPT_ME ioctl, right?
> > 
> > So to support NVDIMM (DAX), we need to add ENCRYPT_ME ioctl to DAX?
> 
> Yes and yes, or we do it with layers -- see below.
> 
> I don't see how we can credibly avoid this.  If we try to do MKTME
> behind the DAX driver's back, aren't we going to end up with cache
> coherence problems?

I am not sure whether I understand correctly but how is cache coherence problem related to putting
MKTME concept to different layers? To make MKTME work with DAX/NVDIMM, I think no matter which layer
MKTME concept resides, eventually we need to put keyID into PTE which maps to NVDIMM, and kernel
needs to manage cache coherence for NVDIMM just like for normal memory showed in this series? 

Thanks,
-Kai
Andy Lutomirski June 18, 2019, 1:40 a.m. UTC | #18
On Mon, Jun 17, 2019 at 6:34 PM Lendacky, Thomas
<Thomas.Lendacky@amd.com> wrote:
>
> On 6/17/19 6:59 PM, Kai Huang wrote:
> > On Mon, 2019-06-17 at 11:27 -0700, Dave Hansen wrote:

> >
> > And yes from my reading (better to have AMD guys to confirm) SEV guest uses anonymous memory, but it
> > also pins all guest memory (by calling GUP from KVM -- SEV specifically introduced 2 KVM ioctls for
> > this purpose), since SEV architecturally cannot support swapping, migration of SEV-encrypted guest
> > memory, because SME/SEV also uses physical address as "tweak", and there's no way that kernel can
> > get or use SEV-guest's memory encryption key. In order to swap/migrate SEV-guest memory, we need SGX
> > EPC eviction/reload similar thing, which SEV doesn't have today.
>
> Yes, all the guest memory is currently pinned by calling GUP when creating
> an SEV guest.

Ick.

What happens if QEMU tries to read the memory?  Does it just see
ciphertext?  Is cache coherency lost if QEMU writes it?
Andy Lutomirski June 18, 2019, 1:43 a.m. UTC | #19
On Mon, Jun 17, 2019 at 6:35 PM Kai Huang <kai.huang@linux.intel.com> wrote:
>
>
> > > >
> > > > I'm having a hard time imagining that ever working -- wouldn't it blow
> > > > up if someone did:
> > > >
> > > > fd = open("/dev/anything987");
> > > > ptr1 = mmap(fd);
> > > > ptr2 = mmap(fd);
> > > > sys_encrypt(ptr1);
> > > >
> > > > So I think it really has to be:
> > > > fd = open("/dev/anything987");
> > > > ioctl(fd, ENCRYPT_ME);
> > > > mmap(fd);
> > >
> > > This requires "/dev/anything987" to support ENCRYPT_ME ioctl, right?
> > >
> > > So to support NVDIMM (DAX), we need to add ENCRYPT_ME ioctl to DAX?
> >
> > Yes and yes, or we do it with layers -- see below.
> >
> > I don't see how we can credibly avoid this.  If we try to do MKTME
> > behind the DAX driver's back, aren't we going to end up with cache
> > coherence problems?
>
> I am not sure whether I understand correctly but how is cache coherence problem related to putting
> MKTME concept to different layers? To make MKTME work with DAX/NVDIMM, I think no matter which layer
> MKTME concept resides, eventually we need to put keyID into PTE which maps to NVDIMM, and kernel
> needs to manage cache coherence for NVDIMM just like for normal memory showed in this series?
>

What I mean is that, to avoid cache coherence problems, something has to
prevent user code from mapping the same page with two different key
ids.  If the entire MKTME mechanism purely layers on top of DAX,
something needs to prevent the underlying DAX device from being mapped
at the same time as the MKTME-decrypted view.  This is obviously
doable, but it's not automatic.
Andy Lutomirski June 18, 2019, 1:50 a.m. UTC | #20
On Mon, Jun 17, 2019 at 5:48 PM Kai Huang <kai.huang@linux.intel.com> wrote:
>
>
> >
> > > And another silly argument: if we had /dev/mktme, then we could
> > > possibly get away with avoiding all the keyring stuff entirely.
> > > Instead, you open /dev/mktme and you get your own key under the hood.
> > > If you want two keys, you open /dev/mktme twice.  If you want some
> > > other program to be able to see your memory, you pass it the fd.
> >
> > We still like the keyring because it's one-stop-shopping as the place
> > that *owns* the hardware KeyID slots.  Those are global resources and
> > scream for a single global place to allocate and manage them.  The
> > hardware slots also need to be shared between any anonymous and
> > file-based users, no matter what the APIs for the anonymous side.
>
> MKTME driver (who creates /dev/mktme) can also be the one-stop-shopping. I think whether to choose
> keyring to manage MKTME key should be based on whether we need/should take advantage of existing key
> retention service functionalities. For example, with key retention service we can
> revoke/invalidate/set expiry for a key (not sure whether MKTME needs those although), and we have
> several keyrings -- thread specific keyring, process specific keyring, user specific keyring, etc,
> thus we can control who can/cannot find the key, etc. I think managing MKTME key in MKTME driver
> doesn't have those advantages.
>

Trying to evaluate this with the current proposed code is a bit odd, I
think.  Suppose you create a thread-specific key and then fork().  The
child can presumably still use the key regardless of whether the child
can nominally access the key in the keyring because the PTEs are still
there.

More fundamentally, in some sense, the current code has no semantics.
Associating a key with memory and "encrypting" it doesn't actually do
anything unless you are attacking the memory bus but you haven't
compromised the kernel.  There's no protection against a guest that
can corrupt its EPT tables, there's no protection against kernel bugs
(*especially* if the duplicate direct map design stays), and there
isn't even any fd or other object around by which you can only access
the data if you can see the key.

I'm also wondering whether the kernel will always be able to be a
one-stop shop for key allocation -- if the MKTME hardware gains
interesting new uses down the road, who knows how key allocation will
work?
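
For reference, a rough sketch of how the /dev/mktme model under discussion could look from
userspace. Everything here is hypothetical -- the device, the one-key-per-open() semantics and
the mmap() association do not exist anywhere in this series:

/* Hypothetical /dev/mktme flow -- none of this exists in this series. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2 * 1024 * 1024;

	/* Each open() would allocate its own key under the hood... */
	int key_a = open("/dev/mktme", O_RDWR);
	/* ...so a second open() would mean a second key. */
	int key_b = open("/dev/mktme", O_RDWR);

	if (key_a < 0 || key_b < 0) {
		perror("open /dev/mktme");
		return 1;
	}

	/* Memory would be tied to a key by mapping (or ioctl()ing) its fd. */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, key_a, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Letting another program see this memory would be plain fd passing
	 * (SCM_RIGHTS over a unix socket) -- no keyring lookup involved.
	 */
	return 0;
}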
Tom Lendacky June 18, 2019, 2:02 a.m. UTC | #21
On 6/17/19 8:40 PM, Andy Lutomirski wrote:
> On Mon, Jun 17, 2019 at 6:34 PM Lendacky, Thomas
> <Thomas.Lendacky@amd.com> wrote:
>>
>> On 6/17/19 6:59 PM, Kai Huang wrote:
>>> On Mon, 2019-06-17 at 11:27 -0700, Dave Hansen wrote:
> 
>>>
>>> And yes from my reading (better to have AMD guys to confirm) SEV guest uses anonymous memory, but it
>>> also pins all guest memory (by calling GUP from KVM -- SEV specifically introduced 2 KVM ioctls for
>>> this purpose), since SEV architecturally cannot support swapping, migration of SEV-encrypted guest
>>> memory, because SME/SEV also uses physical address as "tweak", and there's no way that kernel can
>>> get or use SEV-guest's memory encryption key. In order to swap/migrate SEV-guest memory, we need SGX
>>> EPC eviction/reload similar thing, which SEV doesn't have today.
>>
>> Yes, all the guest memory is currently pinned by calling GUP when creating
>> an SEV guest.
> 
> Ick.
> 
> What happens if QEMU tries to read the memory?  Does it just see
> ciphertext?  Is cache coherency lost if QEMU writes it?

If QEMU tries to read the memory it would just see ciphertext. I'll
double check on the write situation, but I think you would end up with
a cache coherency issue because the write by QEMU would be with the
hypervisor key and tagged separately in the cache from the guest cache
entry. SEV provides confidentiality of guest memory from the hypervisor;
it doesn't prevent the hypervisor from trashing the guest memory.


Thanks,
Tom

>
Kai Huang June 18, 2019, 2:11 a.m. UTC | #22
On Mon, 2019-06-17 at 18:50 -0700, Andy Lutomirski wrote:
> On Mon, Jun 17, 2019 at 5:48 PM Kai Huang <kai.huang@linux.intel.com> wrote:
> > 
> > 
> > > 
> > > > And another silly argument: if we had /dev/mktme, then we could
> > > > possibly get away with avoiding all the keyring stuff entirely.
> > > > Instead, you open /dev/mktme and you get your own key under the hood.
> > > > If you want two keys, you open /dev/mktme twice.  If you want some
> > > > other program to be able to see your memory, you pass it the fd.
> > > 
> > > We still like the keyring because it's one-stop-shopping as the place
> > > that *owns* the hardware KeyID slots.  Those are global resources and
> > > scream for a single global place to allocate and manage them.  The
> > > hardware slots also need to be shared between any anonymous and
> > > file-based users, no matter what the APIs for the anonymous side.
> > 
> > MKTME driver (who creates /dev/mktme) can also be the one-stop-shopping. I think whether to
> > choose
> > keyring to manage MKTME key should be based on whether we need/should take advantage of existing
> > key
> > retention service functionalities. For example, with key retention service we can
> > revoke/invalidate/set expiry for a key (not sure whether MKTME needs those although), and we
> > have
> > several keyrings -- thread specific keyring, process specific keyring, user specific keyring,
> > etc,
> > thus we can control who can/cannot find the key, etc. I think managing MKTME key in MKTME driver
> > doesn't have those advantages.
> > 
> 
> Trying to evaluate this with the current proposed code is a bit odd, I
> think.  Suppose you create a thread-specific key and then fork().  The
> child can presumably still use the key regardless of whether the child
> can nominally access the key in the keyring because the PTEs are still
> there.

Right. This is a little bit odd, although virtualization (Qemu, which is the main use case of MKTME
at least so far) doesn't use fork().

> 
> More fundamentally, in some sense, the current code has no semantics.
> Associating a key with memory and "encrypting" it doesn't actually do
> anything unless you are attacking the memory bus but you haven't
> compromised the kernel.  There's no protection against a guest that
> can corrupt its EPT tables, there's no protection against kernel bugs
> (*especially* if the duplicate direct map design stays), and there
> isn't even any fd or other object around by which you can only access
> the data if you can see the key.

I am not saying that managing the MKTME key/keyID in the key retention service is definitely better,
but it seems everything you mentioned is unrelated to whether we choose the key retention service to
manage the MKTME key/keyID? Or are you saying it doesn't matter whether we manage the key/keyID in the
key retention service or in the MKTME driver, since MKTME barely has any security benefit (besides
against physical attacks)?

> 
> I'm also wondering whether the kernel will always be able to be a
> one-stop shop for key allocation -- if the MKTME hardware gains
> interesting new uses down the road, who knows how key allocation will
> work?

So far I don't have any use case that requires managing the key/keyID specifically for its own
purposes rather than letting the kernel manage keyID allocation. Please let us know if you see any potential ones.

Thanks,
-Kai
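
For concreteness, allocating an MKTME key through the key retention service and scoping it with a
keyring might look like the sketch below. The payload option string is a guess -- the real syntax is
defined by the "mktme" key type patches elsewhere in this series -- and the point being illustrated
is only the choice of destination keyring (thread, process, user, ...):

/*
 * Sketch only: allocate an MKTME key via the "mktme" key type using
 * libkeyutils.  The payload format below is assumed, not taken from the
 * actual patches.  Which keyring the key lands in controls who can later
 * find it: KEY_SPEC_THREAD_KEYRING limits it to this thread,
 * KEY_SPEC_PROCESS_KEYRING to this process, KEY_SPEC_USER_KEYRING to the
 * whole user.
 */
#include <keyutils.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *payload = "type=cpu algorithm=aes-xts-128";	/* assumed format */
	key_serial_t key;

	key = add_key("mktme", "qemu-guest-1", payload, strlen(payload),
		      KEY_SPEC_PROCESS_KEYRING);
	if (key < 0) {
		perror("add_key(mktme)");
		return 1;
	}
	printf("mktme key serial: %d\n", key);
	return 0;
}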
Kai Huang June 18, 2019, 2:23 a.m. UTC | #23
On Mon, 2019-06-17 at 18:43 -0700, Andy Lutomirski wrote:
> On Mon, Jun 17, 2019 at 6:35 PM Kai Huang <kai.huang@linux.intel.com> wrote:
> > 
> > 
> > > > > 
> > > > > I'm having a hard time imagining that ever working -- wouldn't it blow
> > > > > up if someone did:
> > > > > 
> > > > > fd = open("/dev/anything987");
> > > > > ptr1 = mmap(fd);
> > > > > ptr2 = mmap(fd);
> > > > > sys_encrypt(ptr1);
> > > > > 
> > > > > So I think it really has to be:
> > > > > fd = open("/dev/anything987");
> > > > > ioctl(fd, ENCRYPT_ME);
> > > > > mmap(fd);
> > > > 
> > > > This requires "/dev/anything987" to support ENCRYPT_ME ioctl, right?
> > > > 
> > > > So to support NVDIMM (DAX), we need to add ENCRYPT_ME ioctl to DAX?
> > > 
> > > Yes and yes, or we do it with layers -- see below.
> > > 
> > > I don't see how we can credibly avoid this.  If we try to do MKTME
> > > behind the DAX driver's back, aren't we going to end up with cache
> > > coherence problems?
> > 
> > I am not sure whether I understand correctly but how is cache coherence problem related to
> > putting
> > MKTME concept to different layers? To make MKTME work with DAX/NVDIMM, I think no matter which
> > layer
> > MKTME concept resides, eventually we need to put keyID into PTE which maps to NVDIMM, and kernel
> > needs to manage cache coherence for NVDIMM just like for normal memory showed in this series?
> > 
> 
> I mean is that, to avoid cache coherence problems, something has to
> prevent user code from mapping the same page with two different key
> ids.  If the entire MKTME mechanism purely layers on top of DAX,
> something needs to prevent the underlying DAX device from being mapped
> at the same time as the MKTME-decrypted view.  This is obviously
> doable, but it's not automatic.

Assuming I am understanding the context correctly, yes from this perspective it seems having
sys_encrypt is annoying, and having ENCRYPT_ME should be better. But Dave said "nobody is going to
do what you suggest in the ptr1/ptr2 example"? 

Thanks,
-Kai
Andy Lutomirski June 18, 2019, 4:19 a.m. UTC | #24
On Mon, Jun 17, 2019 at 6:40 PM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Mon, Jun 17, 2019 at 6:34 PM Lendacky, Thomas
> <Thomas.Lendacky@amd.com> wrote:
> >
> > On 6/17/19 6:59 PM, Kai Huang wrote:
> > > On Mon, 2019-06-17 at 11:27 -0700, Dave Hansen wrote:
>
> > >
> > > And yes from my reading (better to have AMD guys to confirm) SEV guest uses anonymous memory, but it
> > > also pins all guest memory (by calling GUP from KVM -- SEV specifically introduced 2 KVM ioctls for
> > > this purpose), since SEV architecturally cannot support swapping, migration of SEV-encrypted guest
> > > memory, because SME/SEV also uses physical address as "tweak", and there's no way that kernel can
> > > get or use SEV-guest's memory encryption key. In order to swap/migrate SEV-guest memory, we need SGX
> > > EPC eviction/reload similar thing, which SEV doesn't have today.
> >
> > Yes, all the guest memory is currently pinned by calling GUP when creating
> > an SEV guest.
>
> Ick.
>
> What happens if QEMU tries to read the memory?  Does it just see
> ciphertext?  Is cache coherency lost if QEMU writes it?

I should add: is the current interface that SEV uses actually good, or
should the kernel try to do something differently?  I've spent exactly
zero time looking at SEV APIs or at how QEMU manages its memory.
Andy Lutomirski June 18, 2019, 4:24 a.m. UTC | #25
On Mon, Jun 17, 2019 at 7:11 PM Kai Huang <kai.huang@linux.intel.com> wrote:
>
> On Mon, 2019-06-17 at 18:50 -0700, Andy Lutomirski wrote:
> > On Mon, Jun 17, 2019 at 5:48 PM Kai Huang <kai.huang@linux.intel.com> wrote:
> > >
> > >
> > > >
> > > > > And another silly argument: if we had /dev/mktme, then we could
> > > > > possibly get away with avoiding all the keyring stuff entirely.
> > > > > Instead, you open /dev/mktme and you get your own key under the hood.
> > > > > If you want two keys, you open /dev/mktme twice.  If you want some
> > > > > other program to be able to see your memory, you pass it the fd.
> > > >
> > > > We still like the keyring because it's one-stop-shopping as the place
> > > > that *owns* the hardware KeyID slots.  Those are global resources and
> > > > scream for a single global place to allocate and manage them.  The
> > > > hardware slots also need to be shared between any anonymous and
> > > > file-based users, no matter what the APIs for the anonymous side.
> > >
> > > MKTME driver (who creates /dev/mktme) can also be the one-stop-shopping. I think whether to
> > > choose
> > > keyring to manage MKTME key should be based on whether we need/should take advantage of existing
> > > key
> > > retention service functionalities. For example, with key retention service we can
> > > revoke/invalidate/set expiry for a key (not sure whether MKTME needs those although), and we
> > > have
> > > several keyrings -- thread specific keyring, process specific keyring, user specific keyring,
> > > etc,
> > > thus we can control who can/cannot find the key, etc. I think managing MKTME key in MKTME driver
> > > doesn't have those advantages.
> > >
> >
> > Trying to evaluate this with the current proposed code is a bit odd, I
> > think.  Suppose you create a thread-specific key and then fork().  The
> > child can presumably still use the key regardless of whether the child
> > can nominally access the key in the keyring because the PTEs are still
> > there.
>
> Right. This is a little bit odd, although virtualization (Qemu, which is the main use case of MKTME
> at least so far) doesn't use fork().
>
> >
> > More fundamentally, in some sense, the current code has no semantics.
> > Associating a key with memory and "encrypting" it doesn't actually do
> > anything unless you are attacking the memory bus but you haven't
> > compromised the kernel.  There's no protection against a guest that
> > can corrupt its EPT tables, there's no protection against kernel bugs
> > (*especially* if the duplicate direct map design stays), and there
> > isn't even any fd or other object around by which you can only access
> > the data if you can see the key.
>
> I am not saying managing MKTME key/keyID in key retention service is definitely better, but it seems
> all those you mentioned are not related to whether to choose key retention service to manage MKTME
> key/keyID? Or you are saying it doesn't matter we manage key/keyID in key retention service or in
> MKTME driver, since MKTME barely have any security benefits (besides physical attack)?

Mostly the latter.  I think it's very hard to evaluate whether a given
key allocation model makes sense given that MKTME provides such weak
security benefits.  TME has obvious security benefits, as does
encryption of persistent memory, but this giant patch set isn't needed
for plain TME and it doesn't help with persistent memory.


>
> >
> > I'm also wondering whether the kernel will always be able to be a
> > one-stop shop for key allocation -- if the MKTME hardware gains
> > interesting new uses down the road, who knows how key allocation will
> > work?
>
> I by now don't have any use case which requires to manage key/keyID specifically for its own use,
> rather than letting kernel to manage keyID allocation. Please inspire us if you have any potential.
>

Other than compliance, I can't think of much reason that using
multiple keys is useful, regardless of how they're allocated.  The only
thing I've thought of is that, with multiple keys, you can use PCONFIG
to remove one and flush caches and the data is most definitely gone.
On the other hand, you can just zero the memory and the data is just
as gone even without any encryption.
Peter Zijlstra June 18, 2019, 9:12 a.m. UTC | #26
On Tue, Jun 18, 2019 at 02:23:31PM +1200, Kai Huang wrote:
> Assuming I am understanding the context correctly, yes from this perspective it seems having
> sys_encrypt is annoying, and having ENCRYPT_ME should be better. But Dave said "nobody is going to
> do what you suggest in the ptr1/ptr2 example"? 

You have to phrase that as: 'nobody who knows what he's doing is going
to do that', which leaves lots of people and fuzzers.

Murphy states that if it is possible, someone _will_ do it. And this
being something that causes severe data corruption on persistent
storage,...
Dave Hansen June 18, 2019, 2:09 p.m. UTC | #27
On 6/18/19 2:12 AM, Peter Zijlstra wrote:
> On Tue, Jun 18, 2019 at 02:23:31PM +1200, Kai Huang wrote:
>> Assuming I am understanding the context correctly, yes from this perspective it seems having
>> sys_encrypt is annoying, and having ENCRYPT_ME should be better. But Dave said "nobody is going to
>> do what you suggest in the ptr1/ptr2 example"? 
> 
> You have to phrase that as: 'nobody who knows what he's doing is going
> to do that', which leaves lots of people and fuzzers.
> 
> Murphy states that if it is possible, someone _will_ do it. And this
> being something that causes severe data corruption on persistent
> storage,...

I actually think it's not a big deal at all to avoid the corruption that
would occur if it were allowed.  But, if you're even asking to map the
same data with two different keys, you're *asking* for data corruption.
 What we're doing here is continuing to preserve cache coherency and
ensuring an early failure.

We'd need two rules:
1. A page must not be faulted into a VMA if the page's page_keyid()
   is not consistent with the VMA's
2. Upon changing the VMA's KeyID, all underlying PTEs must either be
   checked or zapped.

If the rules are broken, we SIGBUS.  Andy's suggestion has the same
basic requirements.  But, with his scheme, the error can be to the
ioctl() instead of in the form of a SIGBUS.  I guess that makes the
fuzzers' lives a bit easier.

BTW, note that we don't have any problems with the current anonymous
implementation and fork() because we zap at the encryption syscall.
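
A minimal sketch of what rule 1 could look like at fault time, assuming the vma_keyid()/page_keyid()
helpers introduced earlier in the series (illustrative only, not something this patch adds):

/*
 * Illustrative only.  Rule 1: refuse to fault a page into a VMA whose
 * KeyID does not match the page's; the task gets SIGBUS.  Rule 2 is what
 * the series already relies on by zapping the range in the encryption
 * syscall, so stale PTEs with the old KeyID never survive a KeyID change.
 */
static inline bool mktme_keyids_match(struct vm_area_struct *vma,
				      struct page *page)
{
	return page_keyid(page) == vma_keyid(vma);
}

	/* In the fault path, before the PTE is installed: */
	if (!mktme_keyids_match(vma, page))
		return VM_FAULT_SIGBUS;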
Dave Hansen June 18, 2019, 2:13 p.m. UTC | #28
On 6/17/19 5:15 PM, Andy Lutomirski wrote:
>>> But I really expect that the encryption of a DAX device will actually
>>> be a block device setting and won't look like this at all.  It'll be
>>> more like dm-crypt except without device mapper.
>> Are you suggesting not to support MKTME for DAX, or adding MKTME support to dm-crypt?
> I'm proposing exposing it by an interface that looks somewhat like
> dm-crypt.  Either we could have a way to create a device layered on
> top of the DAX devices that exposes a decrypted view or we add a way
> to tell the DAX device to kindly use MKTME with such-and-such key.

I think this basically implies that we need to settle on (or at least
present) an interface for storage (FS-DAX, Device DAX, page cache)
before we merge one for anonymous memory.

That sounds like a reasonable exercise.
Dave Hansen June 18, 2019, 2:19 p.m. UTC | #29
On 6/17/19 6:50 PM, Andy Lutomirski wrote:
> I'm also wondering whether the kernel will always be able to be a
> one-stop shop for key allocation -- if the MKTME hardware gains
> interesting new uses down the road, who knows how key allocation will
> work?

I can't share all the details on LKML, of course, but I can at least say
that this model of allocating KeyID slots will continue to be used for a
number of generations.
Kirill A . Shutemov June 18, 2019, 4:15 p.m. UTC | #30
On Tue, Jun 18, 2019 at 07:09:36AM -0700, Dave Hansen wrote:
> On 6/18/19 2:12 AM, Peter Zijlstra wrote:
> > On Tue, Jun 18, 2019 at 02:23:31PM +1200, Kai Huang wrote:
> >> Assuming I am understanding the context correctly, yes from this perspective it seems having
> >> sys_encrypt is annoying, and having ENCRYPT_ME should be better. But Dave said "nobody is going to
> >> do what you suggest in the ptr1/ptr2 example"? 
> > 
> > You have to phrase that as: 'nobody who knows what he's doing is going
> > to do that', which leaves lots of people and fuzzers.
> > 
> > Murphy states that if it is possible, someone _will_ do it. And this
> > being something that causes severe data corruption on persistent
> > storage,...
> 
> I actually think it's not a big deal at all to avoid the corruption that
> would occur if it were allowed.  But, if you're even asking to map the
> same data with two different keys, you're *asking* for data corruption.
>  What we're doing here is continuing to  preserve cache coherency and
> ensuring an early failure.
> 
> We'd need two rules:
> 1. A page must not be faulted into a VMA if the page's page_keyid()
>    is not consistent with the VMA's
> 2. Upon changing the VMA's KeyID, all underlying PTEs must either be
>    checked or zapped.
> 
> If the rules are broken, we SIGBUS.  Andy's suggestion has the same
> basic requirements.  But, with his scheme, the error can be to the
> ioctl() instead of in the form of a SIGBUS.  I guess that makes the
> fuzzers' lives a bit easier.

I see a problem with the scheme: if we don't have a way to decide whether the
key is right for the file, a user without access to the right key is able to
prevent a legitimate user from accessing the file. An attacker just needs read
access to the encrypted file to prevent any legitimate user from accessing it.

The problem applies to ioctl() too.

To make sense of it we must have a way to distinguish the right key from a
wrong one. I don't see an obvious solution with the current hardware design.
Dave Hansen June 18, 2019, 4:22 p.m. UTC | #31
On 6/18/19 9:15 AM, Kirill A. Shutemov wrote:
>> We'd need two rules:
>> 1. A page must not be faulted into a VMA if the page's page_keyid()
>>    is not consistent with the VMA's
>> 2. Upon changing the VMA's KeyID, all underlying PTEs must either be
>>    checked or zapped.
>>
>> If the rules are broken, we SIGBUS.  Andy's suggestion has the same
>> basic requirements.  But, with his scheme, the error can be to the
>> ioctl() instead of in the form of a SIGBUS.  I guess that makes the
>> fuzzers' lives a bit easier.
> I see a problem with the scheme: if we don't have a way to decide if the
> key is right for the file, user without access to the right key is able to
> prevent legitimate user from accessing the file. Attacker just need read
> access to the encrypted file to prevent any legitimate use to access it.

I think you're bringing up a separate issue.

We were talking about how you resolve a conflict when someone attempts
to use two *different* keyids to decrypt the data in the API and what
the resulting API interaction looks like.

You're describing the situation where one of those is the wrong *key*
(not keyid).  That's a subtly different scenario and requires different
handling (or no handling IMNHO).
Andy Lutomirski June 18, 2019, 4:36 p.m. UTC | #32
> On Jun 18, 2019, at 9:22 AM, Dave Hansen <dave.hansen@intel.com> wrote:
> 
> On 6/18/19 9:15 AM, Kirill A. Shutemov wrote:
>>> We'd need two rules:
>>> 1. A page must not be faulted into a VMA if the page's page_keyid()
>>>   is not consistent with the VMA's
>>> 2. Upon changing the VMA's KeyID, all underlying PTEs must either be
>>>   checked or zapped.
>>> 
>>> If the rules are broken, we SIGBUS.  Andy's suggestion has the same
>>> basic requirements.  But, with his scheme, the error can be to the
>>> ioctl() instead of in the form of a SIGBUS.  I guess that makes the
>>> fuzzers' lives a bit easier.
>> I see a problem with the scheme: if we don't have a way to decide if the
>> key is right for the file, user without access to the right key is able to
>> prevent legitimate user from accessing the file. Attacker just need read
>> access to the encrypted file to prevent any legitimate use to access it.
> 
> I think you're bringing up a separate issue.
> 
> We were talking about how you resolve a conflict when someone attempts
> to use two *different* keyids to decrypt the data in the API and what
> the resulting API interaction looks like.
> 
> You're describing the situation where one of those is the wrong *key*
> (not keyid).  That's a subtly different scenario and requires different
> handling (or no handling IMNHO).

I think we’re quibbling over details before we look at the big questions:

Should MKTME+DAX encrypt the entire volume or should it encrypt individual files?  Or both?

If it encrypts individual files, should the fs be involved at all?  Should there be metadata that can check whether a given key is the correct key?

If it encrypts individual files, is it even conceptually possible to avoid corruption if the fs is not involved?  After all, many filesystems think that they can move data blocks, compute checksums, journal data, etc.

I think Dave is right that there should at least be a somewhat credible proposal for how this could fit together.
Dave Hansen June 18, 2019, 4:48 p.m. UTC | #33
On 6/18/19 9:36 AM, Andy Lutomirski wrote:
> Should MKTME+DAX encrypt the entire volume or should it encrypt individual files?  Or both?

Our current thought is that there should be two modes: One for entire
DAX namespaces and one for filesystem DAX that would allow it to be at
the file level.  More likely, we would mirror fscrypt and do it at the
directory level.

> If it encrypts individual files, should the fs be involved at all?
> Should there be metadata that can check whether a given key is the
> correct key?
FWIW, this is a question for the fs guys.  Their guidance so far has
been "do what fscrypt does", and fscrypt does not protect against
incorrect keys being specified.  See:

	https://www.kernel.org/doc/html/v5.1/filesystems/fscrypt.html

Which says:

> Currently, fscrypt does not prevent a user from maliciously providing
> an incorrect key for another user’s existing encrypted files. A
> protection against this is planned.

> If it encrypts individual files, is it even conceptually possible to
> avoid corruption if the fs is not involved?  After all, many
> filesystems think that they can move data blocks, compute checksums,
> journal data, etc.

Yes, exactly.  Thankfully, fscrypt has thought about this already and
has infrastructure for this.  For instance:

> Online defragmentation of encrypted files is not supported. The
> EXT4_IOC_MOVE_EXT and F2FS_IOC_MOVE_RANGE ioctls will fail with
> EOPNOTSUPP.
diff mbox series

Patch

diff --git a/fs/exec.c b/fs/exec.c
index 2e0033348d8e..695c121b34b3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -755,8 +755,8 @@  int setup_arg_pages(struct linux_binprm *bprm,
 	vm_flags |= mm->def_flags;
 	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
 
-	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
-			vm_flags);
+	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags,
+			     -1);
 	if (ret)
 		goto out_unlock;
 	BUG_ON(prev != vma);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c027044de9bf..a7f52d053826 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1634,7 +1634,8 @@  extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long
 			      int dirty_accountable, int prot_numa);
 extern int mprotect_fixup(struct vm_area_struct *vma,
 			  struct vm_area_struct **pprev, unsigned long start,
-			  unsigned long end, unsigned long newflags);
+			  unsigned long end, unsigned long newflags,
+			  int newkeyid);
 
 /*
  * doesn't attempt to fault and will return short.
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 23e680f4b1d5..38d766b5cc20 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -28,6 +28,7 @@ 
 #include <linux/ksm.h>
 #include <linux/uaccess.h>
 #include <linux/mm_inline.h>
+#include <linux/key.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
@@ -347,7 +348,8 @@  static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
 
 int
 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
-	unsigned long start, unsigned long end, unsigned long newflags)
+	       unsigned long start, unsigned long end, unsigned long newflags,
+	       int newkeyid)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long oldflags = vma->vm_flags;
@@ -357,7 +359,14 @@  mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	int error;
 	int dirty_accountable = 0;
 
-	if (newflags == oldflags) {
+	/*
+	 * Flags match and Keyids match or we have NO_KEY.
+	 * This _fixup is usually called from do_mprotect_ext() except
+	 * for one special case: caller fs/exec.c/setup_arg_pages()
+	 * In that case, newkeyid is passed as -1 (NO_KEY).
+	 */
+	if (newflags == oldflags &&
+	    (newkeyid == vma_keyid(vma) || newkeyid == NO_KEY)) {
 		*pprev = vma;
 		return 0;
 	}
@@ -423,6 +432,8 @@  mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	}
 
 success:
+	if (newkeyid != NO_KEY)
+		mprotect_set_encrypt(vma, newkeyid, start, end);
 	/*
 	 * vm_flags and vm_page_prot are protected by the mmap_sem
 	 * held in write mode.
@@ -454,10 +465,15 @@  mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 }
 
 /*
- * When pkey==NO_KEY we get legacy mprotect behavior here.
+ * do_mprotect_ext() supports the legacy mprotect behavior plus extensions
+ * for Protection Keys and Memory Encryption Keys. These extensions are
+ * mutually exclusive and the behavior is:
+ *	(pkey==NO_KEY && keyid==NO_KEY) ==> legacy mprotect
+ *	(pkey is valid)  ==> legacy mprotect plus Protection Key extensions
+ *	(keyid is valid) ==> legacy mprotect plus Encryption Key extensions
  */
 static int do_mprotect_ext(unsigned long start, size_t len,
-		unsigned long prot, int pkey)
+			   unsigned long prot, int pkey, int keyid)
 {
 	unsigned long nstart, end, tmp, reqprot;
 	struct vm_area_struct *vma, *prev;
@@ -555,7 +571,8 @@  static int do_mprotect_ext(unsigned long start, size_t len,
 		tmp = vma->vm_end;
 		if (tmp > end)
 			tmp = end;
-		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
+		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags,
+				       keyid);
 		if (error)
 			goto out;
 		nstart = tmp;
@@ -580,7 +597,7 @@  static int do_mprotect_ext(unsigned long start, size_t len,
 SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 		unsigned long, prot)
 {
-	return do_mprotect_ext(start, len, prot, NO_KEY);
+	return do_mprotect_ext(start, len, prot, NO_KEY, NO_KEY);
 }
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
@@ -588,7 +605,7 @@  SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
 		unsigned long, prot, int, pkey)
 {
-	return do_mprotect_ext(start, len, prot, pkey);
+	return do_mprotect_ext(start, len, prot, pkey, NO_KEY);
 }
 
 SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
@@ -637,3 +654,40 @@  SYSCALL_DEFINE1(pkey_free, int, pkey)
 }
 
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+
+#ifdef CONFIG_X86_INTEL_MKTME
+
+extern int mktme_keyid_from_key(struct key *key);
+
+SYSCALL_DEFINE4(encrypt_mprotect, unsigned long, start, size_t, len,
+		unsigned long, prot, key_serial_t, serial)
+{
+	key_ref_t key_ref;
+	struct key *key;
+	int ret, keyid;
+
+	/* MKTME restriction */
+	if (!PAGE_ALIGNED(len))
+		return -EINVAL;
+
+	/*
+	 * key_ref prevents the destruction of the key
+	 * while the memory encryption is being set up.
+	 */
+
+	key_ref = lookup_user_key(serial, 0, KEY_NEED_VIEW);
+	if (IS_ERR(key_ref))
+		return PTR_ERR(key_ref);
+
+	key = key_ref_to_ptr(key_ref);
+	keyid = mktme_keyid_from_key(key);
+	if (!keyid) {
+		key_ref_put(key_ref);
+		return -EINVAL;
+	}
+	ret = do_mprotect_ext(start, len, prot, NO_KEY, keyid);
+	key_ref_put(key_ref);
+	return ret;
+}
+
+#endif /* CONFIG_X86_INTEL_MKTME */
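
To make the intended usage concrete, a userspace caller might look like the sketch below. The
syscall number is a placeholder (the syscall table wiring is a separate patch in this series), and
the key serial is assumed to name an already-programmed key of type "mktme" on which the caller
holds KEY_NEED_VIEW permission:

/*
 * Usage sketch, not part of the patch.  __NR_encrypt_mprotect is a
 * placeholder; the real number comes from the syscall table patch.
 * The key serial passed on the command line must refer to an "mktme"
 * key already added through the key retention service.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#ifndef __NR_encrypt_mprotect
#define __NR_encrypt_mprotect 335	/* placeholder */
#endif

int main(int argc, char **argv)
{
	size_t len = 16 * sysconf(_SC_PAGESIZE);	/* must stay page aligned */
	void *buf;
	int serial;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mktme key serial>\n", argv[0]);
		return 1;
	}
	serial = atoi(argv[1]);

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* addr and len are page aligned; the KeyID is looked up from the key. */
	if (syscall(__NR_encrypt_mprotect, buf, len,
		    PROT_READ | PROT_WRITE, serial)) {
		perror("encrypt_mprotect");
		return 1;
	}

	memset(buf, 0xaa, len);		/* now stored encrypted under that KeyID */
	return 0;
}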