Message ID: 20190508144422.13171-46-kirill.shutemov@linux.intel.com (mailing list archive)
State:      New, archived
Series:     Intel MKTME enabling
On Wed, May 08, 2019 at 05:44:05PM +0300, Kirill A. Shutemov wrote:
> diff --git a/fs/exec.c b/fs/exec.c
> index 2e0033348d8e..695c121b34b3 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -755,8 +755,8 @@ int setup_arg_pages(struct linux_binprm *bprm,
>  	vm_flags |= mm->def_flags;
>  	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
>
> -	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
> -			vm_flags);
> +	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags,
> +			     -1);

You added a nice NO_KEY helper a few patches back, maybe use it?

>  	if (ret)
>  		goto out_unlock;
>  	BUG_ON(prev != vma);
On Wed, May 08, 2019 at 05:44:05PM +0300, Kirill A. Shutemov wrote:
> @@ -347,7 +348,8 @@ static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
>
>  int
>  mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
> -	unsigned long start, unsigned long end, unsigned long newflags)
> +	unsigned long start, unsigned long end, unsigned long newflags,
> +	int newkeyid)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	unsigned long oldflags = vma->vm_flags;
> @@ -357,7 +359,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
>  	int error;
>  	int dirty_accountable = 0;
>
> -	if (newflags == oldflags) {
> +	/*
> +	 * Flags match and Keyids match or we have NO_KEY.
> +	 * This _fixup is usually called from do_mprotect_ext() except
> +	 * for one special case: caller fs/exec.c/setup_arg_pages()
> +	 * In that case, newkeyid is passed as -1 (NO_KEY).
> +	 */
> +	if (newflags == oldflags &&
> +	    (newkeyid == vma_keyid(vma) || newkeyid == NO_KEY)) {
>  		*pprev = vma;
>  		return 0;
>  	}
> @@ -423,6 +432,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
>  	}
>
>  success:
> +	if (newkeyid != NO_KEY)
> +		mprotect_set_encrypt(vma, newkeyid, start, end);
>  	/*
>  	 * vm_flags and vm_page_prot are protected by the mmap_sem
>  	 * held in write mode.
> @@ -454,10 +465,15 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
>  }
>
>  /*
> - * When pkey==NO_KEY we get legacy mprotect behavior here.
> + * do_mprotect_ext() supports the legacy mprotect behavior plus extensions
> + * for Protection Keys and Memory Encryption Keys.  These extensions are
> + * mutually exclusive and the behavior is:
> + *	(pkey==NO_KEY && keyid==NO_KEY) ==> legacy mprotect
> + *	(pkey is valid)  ==> legacy mprotect plus Protection Key extensions
> + *	(keyid is valid) ==> legacy mprotect plus Encryption Key extensions
>  */
>  static int do_mprotect_ext(unsigned long start, size_t len,
> -		unsigned long prot, int pkey)
> +		unsigned long prot, int pkey, int keyid)
>  {
>  	unsigned long nstart, end, tmp, reqprot;
>  	struct vm_area_struct *vma, *prev;
> @@ -555,7 +571,8 @@ static int do_mprotect_ext(unsigned long start, size_t len,
>  		tmp = vma->vm_end;
>  		if (tmp > end)
>  			tmp = end;
> -		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
> +		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags,
> +				       keyid);
>  		if (error)
>  			goto out;
>  		nstart = tmp;

I've missed the part where pkey && keyid results in a WARN or error or
whatever.
On Fri, Jun 14, 2019 at 01:47:32PM +0200, Peter Zijlstra wrote:
> On Wed, May 08, 2019 at 05:44:05PM +0300, Kirill A. Shutemov wrote:
> > diff --git a/fs/exec.c b/fs/exec.c
> > index 2e0033348d8e..695c121b34b3 100644
> > --- a/fs/exec.c
> > +++ b/fs/exec.c
> > @@ -755,8 +755,8 @@ int setup_arg_pages(struct linux_binprm *bprm,
> >  	vm_flags |= mm->def_flags;
> >  	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
> >
> > -	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
> > -			vm_flags);
> > +	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags,
> > +			     -1);
>
> You added a nice NO_KEY helper a few patches back, maybe use it?

Sure, done. (I hesitated to define NO_KEY in mm.h initially. Put it there
now. We'll see how that looks in the next round.)

> >  	if (ret)
> >  		goto out_unlock;
> >  	BUG_ON(prev != vma);
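[For readers following the thread, a minimal sketch of what the suggested
cleanup might look like; whether the define ends up in include/linux/mm.h is
the open question mentioned above, and the call site is abbreviated to the
setup_arg_pages() hunk quoted earlier:

	/* Sketch only: "no encryption keyid" sentinel, placement TBD. */
	#define NO_KEY	(-1)

	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags,
			     NO_KEY);
]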
On Fri, Jun 14, 2019 at 01:51:37PM +0200, Peter Zijlstra wrote:
> On Wed, May 08, 2019 at 05:44:05PM +0300, Kirill A. Shutemov wrote:

snip

> >  /*
> > - * When pkey==NO_KEY we get legacy mprotect behavior here.
> > + * do_mprotect_ext() supports the legacy mprotect behavior plus extensions
> > + * for Protection Keys and Memory Encryption Keys.  These extensions are
> > + * mutually exclusive and the behavior is:
> > + *	(pkey==NO_KEY && keyid==NO_KEY) ==> legacy mprotect
> > + *	(pkey is valid)  ==> legacy mprotect plus Protection Key extensions
> > + *	(keyid is valid) ==> legacy mprotect plus Encryption Key extensions
> >  */
> >  static int do_mprotect_ext(unsigned long start, size_t len,
> > -		unsigned long prot, int pkey)
> > +		unsigned long prot, int pkey, int keyid)
> >  {

snip

> I've missed the part where pkey && keyid results in a WARN or error or
> whatever.

I wasn't so sure about that since do_mprotect_ext()
is the call 'behind' the system calls.

	legacy mprotect   always calls with: NO_KEY, NO_KEY
	pkey_mprotect     always calls with: pkey,   NO_KEY
	encrypt_mprotect  always calls with: NO_KEY, keyid

Would a check on those arguments be debug only
to future proof this?
On Fri, Jun 14, 2019 at 05:32:31PM -0700, Alison Schofield wrote:
> On Fri, Jun 14, 2019 at 01:51:37PM +0200, Peter Zijlstra wrote:
> > On Wed, May 08, 2019 at 05:44:05PM +0300, Kirill A. Shutemov wrote:
> snip
> > >  /*
> > > - * When pkey==NO_KEY we get legacy mprotect behavior here.
> > > + * do_mprotect_ext() supports the legacy mprotect behavior plus extensions
> > > + * for Protection Keys and Memory Encryption Keys.  These extensions are
> > > + * mutually exclusive and the behavior is:

Well, here it states that the extensions are mutually exclusive.

> > > + *	(pkey==NO_KEY && keyid==NO_KEY) ==> legacy mprotect
> > > + *	(pkey is valid)  ==> legacy mprotect plus Protection Key extensions
> > > + *	(keyid is valid) ==> legacy mprotect plus Encryption Key extensions
> > >  */
> > >  static int do_mprotect_ext(unsigned long start, size_t len,
> > > -		unsigned long prot, int pkey)
> > > +		unsigned long prot, int pkey, int keyid)
> > >  {
> > snip
> >
> > I've missed the part where pkey && keyid results in a WARN or error or
> > whatever.
> >
> I wasn't so sure about that since do_mprotect_ext()
> is the call 'behind' the system calls.
>
> 	legacy mprotect   always calls with: NO_KEY, NO_KEY
> 	pkey_mprotect     always calls with: pkey,   NO_KEY
> 	encrypt_mprotect  always calls with: NO_KEY, keyid
>
> Would a check on those arguments be debug only
> to future proof this?

But you then don't check that, anywhere, afaict.
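[A minimal sketch of the kind of argument check being asked for, assuming the
NO_KEY sentinel used elsewhere in the series; whether this should be a WARN, a
hard error, or debug-only is exactly the open question in this exchange:

	static int do_mprotect_ext(unsigned long start, size_t len,
				   unsigned long prot, int pkey, int keyid)
	{
		/*
		 * The two extensions are documented as mutually exclusive,
		 * so refuse (and warn about) callers that pass both a
		 * protection key and an encryption keyid.
		 */
		if (WARN_ON_ONCE(pkey != NO_KEY && keyid != NO_KEY))
			return -EINVAL;

		/* ... existing body unchanged ... */
	}
]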
On Wed, May 8, 2019 at 7:44 AM Kirill A. Shutemov <kirill.shutemov@linux.intel.com> wrote: > > From: Alison Schofield <alison.schofield@intel.com> > > Implement memory encryption for MKTME (Multi-Key Total Memory > Encryption) with a new system call that is an extension of the > legacy mprotect() system call. > > In encrypt_mprotect the caller must pass a handle to a previously > allocated and programmed MKTME encryption key. The key can be > obtained through the kernel key service type "mktme". The caller > must have KEY_NEED_VIEW permission on the key. > > MKTME places an additional restriction on the protected data: > The length of the data must be page aligned. This is in addition > to the existing mprotect restriction that the addr must be page > aligned. I still find it bizarre that this is conflated with mprotect(). I also remain entirely unconvinced that MKTME on anonymous memory is useful in the long run. There will inevitably be all kinds of fancy new CPU features that make the underlying MKTME mechanisms much more useful. For example, some way to bind a key to a VM, or a way to *sanely* encrypt persistent memory. By making this thing a syscall that does more than just MKTME, you're adding combinatorial complexity (you forget pkey!) and you're tying other functionality (change of protection) to this likely-to-be-deprecated interface. This is part of why I much prefer the idea of making this style of MKTME a driver or some other non-intrusive interface. Then, once everyone gets tired of it, the driver can just get turned off with no side effects. --Andy
On 6/17/19 8:07 AM, Andy Lutomirski wrote:
> I still find it bizarre that this is conflated with mprotect().

This needs to be in the changelog.  But, for better or worse, it's
following the mprotect_pkey() pattern.

Other than the obvious "set the key on this memory", we're looking for
two other properties: atomicity (ensuring there is no transient state
where the memory is usable without the desired properties) and that it
is usable on existing allocations.

For atomicity, we have a model where we can allocate things with
PROT_NONE, then do mprotect_pkey() and mprotect_encrypt() (plus any
future features), then the last mprotect_*() call takes us from
PROT_NONE to the desired end permissions.  We could just require a plain
old mprotect() to do that instead of embedding mprotect()-like behavior
in these, of course, but that isn't the path we're on at the moment with
mprotect_pkey().

So, for this series it's just a matter of whether we do this:

	ptr = mmap(..., PROT_NONE);
	mprotect_pkey(protect_key, ptr, PROT_NONE);
	mprotect_encrypt(encr_key, ptr, PROT_READ|PROT_WRITE);
	// good to go

or this:

	ptr = mmap(..., PROT_NONE);
	mprotect_pkey(protect_key, ptr, PROT_NONE);
	sys_encrypt(key, ptr);
	mprotect(ptr, PROT_READ|PROT_WRITE);
	// good to go

I actually don't care all that much which one we end up with.  It's not
like the extra syscall in the second option means much.

> This is part of why I much prefer the idea of making this style of
> MKTME a driver or some other non-intrusive interface.  Then, once
> everyone gets tired of it, the driver can just get turned off with no
> side effects.

I like the concept, but not where it leads.  I'd call it the "hugetlbfs
approach". :)  Hugetlbfs certainly got us huge pages, but it's continued
to be a parallel set of code with parallel bugs and parallel
implementations of many VM features.  It's not that you can't implement
new things on hugetlbfs, it's that you *need* to.  You never get them
for free.

For instance, if we do a driver, how do we get large pages?  How do we
swap/reclaim the pages?  How do we do NUMA affinity?  How do we
eventually stack it on top of persistent memory filesystems or Device
DAX?

With a driver approach, I think we're stuck basically reimplementing
things or gluing them back together.  Nothing comes for free.  With
this approach, we basically start with our normal, full feature set
(modulo weirdo interactions like with KSM).
On Mon, Jun 17, 2019 at 8:28 AM Dave Hansen <dave.hansen@intel.com> wrote: > > On 6/17/19 8:07 AM, Andy Lutomirski wrote: > > I still find it bizarre that this is conflated with mprotect(). > > This needs to be in the changelog. But, for better or worse, it's > following the mprotect_pkey() pattern. > > Other than the obvious "set the key on this memory", we're looking for > two other properties: atomicity (ensuring there is no transient state > where the memory is usable without the desired properties) and that it > is usable on existing allocations. > > For atomicity, we have a model where we can allocate things with > PROT_NONE, then do mprotect_pkey() and mprotect_encrypt() (plus any > future features), then the last mprotect_*() call takes us from > PROT_NONE to the desired end permisions. We could just require a plain > old mprotect() to do that instead of embedding mprotect()-like behavior > in these, of course, but that isn't the path we're on at the moment with > mprotect_pkey(). > > So, for this series it's just a matter of whether we do this: > > ptr = mmap(..., PROT_NONE); > mprotect_pkey(protect_key, ptr, PROT_NONE); > mprotect_encrypt(encr_key, ptr, PROT_READ|PROT_WRITE); > // good to go > > or this: > > ptr = mmap(..., PROT_NONE); > mprotect_pkey(protect_key, ptr, PROT_NONE); > sys_encrypt(key, ptr); > mprotect(ptr, PROT_READ|PROT_WRITE); > // good to go > > I actually don't care all that much which one we end up with. It's not > like the extra syscall in the second options means much. The benefit of the second one is that, if sys_encrypt is absent, it just works. In the first model, programs need a fallback because they'll segfault of mprotect_encrypt() gets ENOSYS. > > > This is part of why I much prefer the idea of making this style of > > MKTME a driver or some other non-intrusive interface. Then, once > > everyone gets tired of it, the driver can just get turned off with no > > side effects. > > I like the concept, but not where it leads. I'd call it the 'hugetlbfs > approach". :) Hugetblfs certainly go us huge pages, but it's continued > to be a parallel set of code with parallel bugs and parallel > implementations of many VM features. It's not that you can't implement > new things on hugetlbfs, it's that you *need* to. You never get them > for free. Fair enough, but... > > For instance, if we do a driver, how do we get large pages? How do we > swap/reclaim the pages? How do we do NUMA affinity? Those all make sense. > How do we > eventually stack it on top of persistent memory filesystems or Device > DAX? How do we stack anonymous memory on top of persistent memory or Device DAX? I'm confused. Just to throw this out there, what if we had a new device /dev/xpfo and MKTME were one of its features. You open /dev/xpfo, optionally do an ioctl to set a key, and them map it. The pages you get are unmapped entirely from the direct map, and you get a PFNMAP VMA with all its limitations. This seems much more useful -- it's limited, but it's limited *because the kernel can't accidentally read it*. I think that, in the long run, we're going to have to either expand the core mm's concept of what "memory" is or just have a whole parallel set of mechanisms for memory that doesn't work like memory. We're already accumulating a set of things that are backed by memory but aren't usable as memory. SGX EPC pages and SEV pages come to mind. 
They are faster when they're in big contiguous chunks (well, not SGX AFAIK, but maybe some day), they have NUMA node affinity, and they show up in page tables, but the hardware restricts who can read and write them. If Intel isn't planning to do something like this with the MKTME hardware, I'll eat my hat. I expect that some day normal memory will be able to be repurposed as SGX pages on the fly, and that will also look a lot more like SEV or XPFO than like the this model of MKTME. So, if we upstream MKTME as anonymous memory with a magic config syscall, I predict that, in a few years, it will be end up inheriting all downsides of both approaches with few of the upsides. Programs like QEMU will need to learn to manipulate pages that can't be accessed outside the VM without special VM buy-in, so the fact that MKTME pages are fully functional and can be GUP-ed won't be very useful. And the VM will learn about all these things, but MKTME won't really fit in. And, one of these days, someone will come up with a version of XPFO that could actually be upstreamed, and it seems entirely plausible that it will be totally incompatible with MKTME-as-anonymous-memory and that users of MKTME will actually get *worse* security.
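[To make the ENOSYS point above concrete, a rough userspace sketch, assuming a
hypothetical encrypt_mprotect() libc-style wrapper for the proposed syscall
(returns -1 and sets errno on failure).  Under the first model the program
needs this explicit fallback; under the second model a plain mprotect()
already takes the mapping out of PROT_NONE even if sys_encrypt is absent:

	#include <errno.h>
	#include <stddef.h>
	#include <sys/mman.h>

	/* Hypothetical wrapper around the proposed encrypt_mprotect() syscall. */
	int encrypt_mprotect(void *addr, size_t len, int prot, int keyid);

	static int make_usable(void *ptr, size_t len, int keyid)
	{
		/* Model 1: flip PROT_NONE -> RW and attach the key in one call. */
		if (encrypt_mprotect(ptr, len, PROT_READ | PROT_WRITE, keyid) == 0)
			return 0;
		if (errno != ENOSYS)
			return -1;

		/* Kernel without the new syscall: fall back to plain, unencrypted. */
		return mprotect(ptr, len, PROT_READ | PROT_WRITE);
	}
]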
Tom Lendacky, could you take a look down in the message to the talk of SEV? I want to make sure I'm not misrepresenting what it does today. ... >> I actually don't care all that much which one we end up with. It's not >> like the extra syscall in the second options means much. > > The benefit of the second one is that, if sys_encrypt is absent, it > just works. In the first model, programs need a fallback because > they'll segfault of mprotect_encrypt() gets ENOSYS. Well, by the time they get here, they would have already had to allocate and set up the encryption key. I don't think this would really be the "normal" malloc() path, for instance. >> How do we >> eventually stack it on top of persistent memory filesystems or Device >> DAX? > > How do we stack anonymous memory on top of persistent memory or Device > DAX? I'm confused. If our interface to MKTME is: fd = open("/dev/mktme"); ptr = mmap(fd); Then it's hard to combine with an interface which is: fd = open("/dev/dax123"); ptr = mmap(fd); Where if we have something like mprotect() (or madvise() or something else taking pointer), we can just do: fd = open("/dev/anything987"); ptr = mmap(fd); sys_encrypt(ptr); Now, we might not *do* it that way for dax, for instance, but I'm just saying that if we go the /dev/mktme route, we never get a choice. > I think that, in the long run, we're going to have to either expand > the core mm's concept of what "memory" is or just have a whole > parallel set of mechanisms for memory that doesn't work like memory. ... > I expect that some day normal memory will be able to be repurposed as > SGX pages on the fly, and that will also look a lot more like SEV or > XPFO than like the this model of MKTME. I think you're drawing the line at pages where the kernel can manage contents vs. not manage contents. I'm not sure that's the right distinction to make, though. The thing that is important is whether the kernel can manage the lifetime and location of the data in the page. Basically: Can the kernel choose where the page comes from and get the page back when it wants? I really don't like the current state of things like with SEV or with KVM direct device assignment where the physical location is quite locked down and the kernel really can't manage the memory. I'm trying really hard to make sure future hardware is more permissive about such things. My hope is that these are a temporary blip and not the new normal. > So, if we upstream MKTME as anonymous memory with a magic config > syscall, I predict that, in a few years, it will be end up inheriting > all downsides of both approaches with few of the upsides. Programs > like QEMU will need to learn to manipulate pages that can't be > accessed outside the VM without special VM buy-in, so the fact that > MKTME pages are fully functional and can be GUP-ed won't be very > useful. And the VM will learn about all these things, but MKTME won't > really fit in. Kai Huang (who is on cc) has been doing the QEMU enabling and might want to weigh in. I'd also love to hear from the AMD folks in case I'm not grokking some aspect of SEV. But, my understanding is that, even today, neither QEMU nor the kernel can see SEV-encrypted guest memory. So QEMU should already understand how to not interact with guest memory. I _assume_ it's also already doing this with anonymous memory, without needing /dev/sme or something. 
> And, one of these days, someone will come up with a version of XPFO > that could actually be upstreamed, and it seems entirely plausible > that it will be totally incompatible with MKTME-as-anonymous-memory > and that users of MKTME will actually get *worse* security. I'm not following here. XPFO just means that we don't keep the direct map around all the time for all memory. If XPFO and MKTME-as-anonymous-memory were both in play, I think we'd just be creating/destroying the MKTME-enlightened direct map instead of a vanilla one.
On Mon, Jun 17, 2019 at 11:37 AM Dave Hansen <dave.hansen@intel.com> wrote: > > Tom Lendacky, could you take a look down in the message to the talk of > SEV? I want to make sure I'm not misrepresenting what it does today. > ... > > > >> I actually don't care all that much which one we end up with. It's not > >> like the extra syscall in the second options means much. > > > > The benefit of the second one is that, if sys_encrypt is absent, it > > just works. In the first model, programs need a fallback because > > they'll segfault of mprotect_encrypt() gets ENOSYS. > > Well, by the time they get here, they would have already had to allocate > and set up the encryption key. I don't think this would really be the > "normal" malloc() path, for instance. > > >> How do we > >> eventually stack it on top of persistent memory filesystems or Device > >> DAX? > > > > How do we stack anonymous memory on top of persistent memory or Device > > DAX? I'm confused. > > If our interface to MKTME is: > > fd = open("/dev/mktme"); > ptr = mmap(fd); > > Then it's hard to combine with an interface which is: > > fd = open("/dev/dax123"); > ptr = mmap(fd); > > Where if we have something like mprotect() (or madvise() or something > else taking pointer), we can just do: > > fd = open("/dev/anything987"); > ptr = mmap(fd); > sys_encrypt(ptr); I'm having a hard time imagining that ever working -- wouldn't it blow up if someone did: fd = open("/dev/anything987"); ptr1 = mmap(fd); ptr2 = mmap(fd); sys_encrypt(ptr1); So I think it really has to be: fd = open("/dev/anything987"); ioctl(fd, ENCRYPT_ME); mmap(fd); But I really expect that the encryption of a DAX device will actually be a block device setting and won't look like this at all. It'll be more like dm-crypt except without device mapper. > > Now, we might not *do* it that way for dax, for instance, but I'm just > saying that if we go the /dev/mktme route, we never get a choice. > > > I think that, in the long run, we're going to have to either expand > > the core mm's concept of what "memory" is or just have a whole > > parallel set of mechanisms for memory that doesn't work like memory. > ... > > I expect that some day normal memory will be able to be repurposed as > > SGX pages on the fly, and that will also look a lot more like SEV or > > XPFO than like the this model of MKTME. > > I think you're drawing the line at pages where the kernel can manage > contents vs. not manage contents. I'm not sure that's the right > distinction to make, though. The thing that is important is whether the > kernel can manage the lifetime and location of the data in the page. The kernel can manage the location of EPC pages, for example, but only under extreme constraints right now. The draft SGX driver can and does swap them out and swap them back in, potentially at a different address. > > Basically: Can the kernel choose where the page comes from and get the > page back when it wants? > > I really don't like the current state of things like with SEV or with > KVM direct device assignment where the physical location is quite locked > down and the kernel really can't manage the memory. I'm trying really > hard to make sure future hardware is more permissive about such things. > My hope is that these are a temporary blip and not the new normal. > > > So, if we upstream MKTME as anonymous memory with a magic config > > syscall, I predict that, in a few years, it will be end up inheriting > > all downsides of both approaches with few of the upsides. 
Programs > > like QEMU will need to learn to manipulate pages that can't be > > accessed outside the VM without special VM buy-in, so the fact that > > MKTME pages are fully functional and can be GUP-ed won't be very > > useful. And the VM will learn about all these things, but MKTME won't > > really fit in. > > Kai Huang (who is on cc) has been doing the QEMU enabling and might want > to weigh in. I'd also love to hear from the AMD folks in case I'm not > grokking some aspect of SEV. > > But, my understanding is that, even today, neither QEMU nor the kernel > can see SEV-encrypted guest memory. So QEMU should already understand > how to not interact with guest memory. I _assume_ it's also already > doing this with anonymous memory, without needing /dev/sme or something. Let's find out! If it's using anonymous memory, it must be very strange, since it would more or less have to be mmapped PROT_NONE. The thing that makes anonymous memory in particular seem so awkward to me is that it's fundamentally identified by it's *address*, which means it makes no sense if it's not mapped. > > > And, one of these days, someone will come up with a version of XPFO > > that could actually be upstreamed, and it seems entirely plausible > > that it will be totally incompatible with MKTME-as-anonymous-memory > > and that users of MKTME will actually get *worse* security. > > I'm not following here. XPFO just means that we don't keep the direct > map around all the time for all memory. If XPFO and > MKTME-as-anonymous-memory were both in play, I think we'd just be > creating/destroying the MKTME-enlightened direct map instead of a > vanilla one. What I'm saying is that I can imagine XPFO also wanting to be something other than anonymous memory. I don't think we'll ever want regular MAP_ANONYMOUS to enable XPFO by default because the performance will suck. Doing this seems odd: ptr = mmap(MAP_ANONYMOUS); sys_xpfo_a_pointer(ptr); So I could imagine: ptr = mmap(MAP_ANONYMOUS | MAP_XPFO); or fd = open("/dev/xpfo"); (or fd = memfd_create(..., XPFO); ptr = mmap(fd); I'm thinking that XPFO is a *lot* simpler under the hood if we just straight-up don't support GUP on it. Maybe we should call this "strong XPFO". Similarly, the kinds of things that want MKTME may also want the memory to be entirely absent from the direct map. And the things that use SEV (as I understand it) *can't* usefully use the memory for normal IO via GUP or copy_to/from_user(), so these things all have a decent amount in common. Another down side of anonymous memory (in my head, anyway -- QEMU people should chime in) is that it seems awkward to use it for IO techniques in which the back-end isn't in the QEMU process. If there's an fd involved, you can pass it around, feed it to things like vfio, etc. If there's no fd, it's stuck in the creating process. And another silly argument: if we had /dev/mktme, then we could possibly get away with avoiding all the keyring stuff entirely. Instead, you open /dev/mktme and you get your own key under the hook. If you want two keys, you open /dev/mktme twice. If you want some other program to be able to see your memory, you pass it the fd. I hope this email isn't too rambling :) --Andy
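[A rough sketch of how the hypothetical /dev/mktme model described above might
look from userspace; the device node, the implicit per-open key, and the
fd-passing scheme are illustrative assumptions, not an existing interface:

	#include <fcntl.h>
	#include <stdlib.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		/* Each open() would get its own key under the hood. */
		int fd = open("/dev/mktme", O_RDWR);
		if (fd < 0)
			return 1;

		size_t len = 2 * 1024 * 1024;
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			return 1;

		/*
		 * Sharing: pass fd to another process (e.g. over a unix
		 * socket with SCM_RIGHTS) so it can map the same memory,
		 * or hand it to a backend such as vfio directly.
		 */
		munmap(p, len);
		close(fd);
		return 0;
	}
]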
>> Where if we have something like mprotect() (or madvise() or something >> else taking pointer), we can just do: >> >> fd = open("/dev/anything987"); >> ptr = mmap(fd); >> sys_encrypt(ptr); > > I'm having a hard time imagining that ever working -- wouldn't it blow > up if someone did: > > fd = open("/dev/anything987"); > ptr1 = mmap(fd); > ptr2 = mmap(fd); > sys_encrypt(ptr1); > > So I think it really has to be: > fd = open("/dev/anything987"); > ioctl(fd, ENCRYPT_ME); > mmap(fd); Yeah, shared mappings are annoying. :) But, let's face it, nobody is going to do what you suggest in the ptr1/ptr2 example. It doesn't make any logical sense because it's effectively asking to read the memory with two different keys. I _believe_ fscrypt has similar issues and just punts on them by saying "don't do that". We can also quite easily figure out what's going on. It's a very simple rule to kill a process that tries to fault a page in whose KeyID doesn't match the VMA under which it is faulted in, and also require that no pages are faulted in under VMAs which have their key changed. >> Now, we might not *do* it that way for dax, for instance, but I'm just >> saying that if we go the /dev/mktme route, we never get a choice. >> >>> I think that, in the long run, we're going to have to either expand >>> the core mm's concept of what "memory" is or just have a whole >>> parallel set of mechanisms for memory that doesn't work like memory. >> ... >>> I expect that some day normal memory will be able to be repurposed as >>> SGX pages on the fly, and that will also look a lot more like SEV or >>> XPFO than like the this model of MKTME. >> >> I think you're drawing the line at pages where the kernel can manage >> contents vs. not manage contents. I'm not sure that's the right >> distinction to make, though. The thing that is important is whether the >> kernel can manage the lifetime and location of the data in the page. > > The kernel can manage the location of EPC pages, for example, but only > under extreme constraints right now. The draft SGX driver can and > does swap them out and swap them back in, potentially at a different > address. The kernel can't put arbitrary data in EPC pages and can't use normal memory for EPC. To me, that puts them clearly on the side of being unmanageable by the core mm code. For instance, there's no way we could mix EPC pages in the same 'struct zone' with non-EPC pages. Not only are they not in the direct map, but they never *can* be, even for a second. >>> And, one of these days, someone will come up with a version of XPFO >>> that could actually be upstreamed, and it seems entirely plausible >>> that it will be totally incompatible with MKTME-as-anonymous-memory >>> and that users of MKTME will actually get *worse* security. >> >> I'm not following here. XPFO just means that we don't keep the direct >> map around all the time for all memory. If XPFO and >> MKTME-as-anonymous-memory were both in play, I think we'd just be >> creating/destroying the MKTME-enlightened direct map instead of a >> vanilla one. > > What I'm saying is that I can imagine XPFO also wanting to be > something other than anonymous memory. I don't think we'll ever want > regular MAP_ANONYMOUS to enable XPFO by default because the > performance will suck. It will certainly suck for some things. But, does it suck if the kernel never uses the direct map for the XPFO memory? If it were for KVM guest memory for a guest using direct device assignment, we might not even ever notice. 
> I'm thinking that XPFO is a *lot* simpler under the hood if we just > straight-up don't support GUP on it. Maybe we should call this > "strong XPFO". Similarly, the kinds of things that want MKTME may > also want the memory to be entirely absent from the direct map. And > the things that use SEV (as I understand it) *can't* usefully use the > memory for normal IO via GUP or copy_to/from_user(), so these things > all have a decent amount in common. OK, so basically, you're thinking about new memory management infrastructure that a memory-allocating-app can opt into where they get a reduced kernel feature set, but also increased security guarantees? The main insight thought is that some hardware features *already* impose (some of) this reduced feature set? FWIW, I don't think many folks will go for the no-GUP rule. It's one thing to say no-GUPs for SGX pages which can't have I/O done on them in the first place, but it's quite another to tell folks that sendfile() no longer works without bounce buffers. MKTME's security guarantees are very different than something like SEV. Since the kernel is in the trust boundary, it *can* do fun stuff like RDMA which is a heck of a lot faster than bounce buffering. Let's say a franken-system existed with SEV and MKTME. It isn't even clear to me that *everyone* would pick SEV over MKTME. IOW, I'm not sure the MKTME model necessarily goes away given the presence of SEV. > And another silly argument: if we had /dev/mktme, then we could > possibly get away with avoiding all the keyring stuff entirely. > Instead, you open /dev/mktme and you get your own key under the hook. > If you want two keys, you open /dev/mktme twice. If you want some > other program to be able to see your memory, you pass it the fd. We still like the keyring because it's one-stop-shopping as the place that *owns* the hardware KeyID slots. Those are global resources and scream for a single global place to allocate and manage them. The hardware slots also need to be shared between any anonymous and file-based users, no matter what the APIs for the anonymous side.
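[For contrast with the fd-based idea, this is roughly how the keyring-based
flow in this series is intended to be used from userspace; the "mktme" key
type comes from the series, but the exact option string and the
encrypt_mprotect() wrapper shown here are illustrative rather than a settled
ABI:

	#include <string.h>
	#include <sys/mman.h>
	#include <keyutils.h>

	/* Hypothetical wrapper around the proposed encrypt_mprotect() syscall. */
	int encrypt_mprotect(void *addr, size_t len, int prot, key_serial_t key);

	int use_encrypted_buffer(void *ptr, size_t len)
	{
		/* Option string is illustrative of the proposed "mktme" key type. */
		const char *opts = "algorithm=aes-xts-128 type=cpu";
		key_serial_t key = add_key("mktme", "my-mktme-key", opts,
					   strlen(opts), KEY_SPEC_THREAD_KEYRING);
		if (key < 0)
			return -1;

		/* The key handle, not raw key material, is what the kernel consumes. */
		return encrypt_mprotect(ptr, len, PROT_READ | PROT_WRITE, key);
	}
]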
On Mon, 2019-06-17 at 11:27 -0700, Dave Hansen wrote: > Tom Lendacky, could you take a look down in the message to the talk of > SEV? I want to make sure I'm not misrepresenting what it does today. > ... > > > > > I actually don't care all that much which one we end up with. It's not > > > like the extra syscall in the second options means much. > > > > The benefit of the second one is that, if sys_encrypt is absent, it > > just works. In the first model, programs need a fallback because > > they'll segfault of mprotect_encrypt() gets ENOSYS. > > Well, by the time they get here, they would have already had to allocate > and set up the encryption key. I don't think this would really be the > "normal" malloc() path, for instance. > > > > How do we > > > eventually stack it on top of persistent memory filesystems or Device > > > DAX? > > > > How do we stack anonymous memory on top of persistent memory or Device > > DAX? I'm confused. > > If our interface to MKTME is: > > fd = open("/dev/mktme"); > ptr = mmap(fd); > > Then it's hard to combine with an interface which is: > > fd = open("/dev/dax123"); > ptr = mmap(fd); > > Where if we have something like mprotect() (or madvise() or something > else taking pointer), we can just do: > > fd = open("/dev/anything987"); > ptr = mmap(fd); > sys_encrypt(ptr); > > Now, we might not *do* it that way for dax, for instance, but I'm just > saying that if we go the /dev/mktme route, we never get a choice. > > > I think that, in the long run, we're going to have to either expand > > the core mm's concept of what "memory" is or just have a whole > > parallel set of mechanisms for memory that doesn't work like memory. > > ... > > I expect that some day normal memory will be able to be repurposed as > > SGX pages on the fly, and that will also look a lot more like SEV or > > XPFO than like the this model of MKTME. > > I think you're drawing the line at pages where the kernel can manage > contents vs. not manage contents. I'm not sure that's the right > distinction to make, though. The thing that is important is whether the > kernel can manage the lifetime and location of the data in the page. > > Basically: Can the kernel choose where the page comes from and get the > page back when it wants? > > I really don't like the current state of things like with SEV or with > KVM direct device assignment where the physical location is quite locked > down and the kernel really can't manage the memory. I'm trying really > hard to make sure future hardware is more permissive about such things. > My hope is that these are a temporary blip and not the new normal. > > > So, if we upstream MKTME as anonymous memory with a magic config > > syscall, I predict that, in a few years, it will be end up inheriting > > all downsides of both approaches with few of the upsides. Programs > > like QEMU will need to learn to manipulate pages that can't be > > accessed outside the VM without special VM buy-in, so the fact that > > MKTME pages are fully functional and can be GUP-ed won't be very > > useful. And the VM will learn about all these things, but MKTME won't > > really fit in. > > Kai Huang (who is on cc) has been doing the QEMU enabling and might want > to weigh in. I'd also love to hear from the AMD folks in case I'm not > grokking some aspect of SEV. > > But, my understanding is that, even today, neither QEMU nor the kernel > can see SEV-encrypted guest memory. So QEMU should already understand > how to not interact with guest memory. 
I _assume_ it's also already > doing this with anonymous memory, without needing /dev/sme or something. Correct neither Qemu nor kernel can see SEV-encrypted guest memory. Qemu requires guest's cooperation when it needs to interacts with guest, i.e. to support virtual DMA (of virtual devices in SEV-guest), qemu requires SEV-guest to setup bounce buffer (which will not be SEV-encrypted memory, but shared memory can be accessed from host side too), so that guest kernel can copy DMA data from bounce buffer to its own SEV-encrypted memory after qemu/host kernel puts DMA data to bounce buffer. And yes from my reading (better to have AMD guys to confirm) SEV guest uses anonymous memory, but it also pins all guest memory (by calling GUP from KVM -- SEV specifically introduced 2 KVM ioctls for this purpose), since SEV architecturally cannot support swapping, migraiton of SEV-encrypted guest memory, because SME/SEV also uses physical address as "tweak", and there's no way that kernel can get or use SEV-guest's memory encryption key. In order to swap/migrate SEV-guest memory, we need SGX EPC eviction/reload similar thing, which SEV doesn't have today. From this perspective, I think driver proposal kinda makes sense since we already have security feature which uses normal memory some kind like "device memory" (no swap, no migration, etc), so it makes sense that MKTME just follows that (although from HW MKTME can support swap, page migration, etc). The downside of driver proposal for MKTME I think is, like Dave mentioned, it's hard (or not sure whether it is possible) to extend to support NVDIMM (and file backed guest memory), since for virtual NVDIMM, Qemu needs to call mmap against fd of NVDIMM. Thanks, -Kai
On Mon, 2019-06-17 at 12:12 -0700, Andy Lutomirski wrote: > On Mon, Jun 17, 2019 at 11:37 AM Dave Hansen <dave.hansen@intel.com> wrote: > > > > Tom Lendacky, could you take a look down in the message to the talk of > > SEV? I want to make sure I'm not misrepresenting what it does today. > > ... > > > > > > > > I actually don't care all that much which one we end up with. It's not > > > > like the extra syscall in the second options means much. > > > > > > The benefit of the second one is that, if sys_encrypt is absent, it > > > just works. In the first model, programs need a fallback because > > > they'll segfault of mprotect_encrypt() gets ENOSYS. > > > > Well, by the time they get here, they would have already had to allocate > > and set up the encryption key. I don't think this would really be the > > "normal" malloc() path, for instance. > > > > > > How do we > > > > eventually stack it on top of persistent memory filesystems or Device > > > > DAX? > > > > > > How do we stack anonymous memory on top of persistent memory or Device > > > DAX? I'm confused. > > > > If our interface to MKTME is: > > > > fd = open("/dev/mktme"); > > ptr = mmap(fd); > > > > Then it's hard to combine with an interface which is: > > > > fd = open("/dev/dax123"); > > ptr = mmap(fd); > > > > Where if we have something like mprotect() (or madvise() or something > > else taking pointer), we can just do: > > > > fd = open("/dev/anything987"); > > ptr = mmap(fd); > > sys_encrypt(ptr); > > I'm having a hard time imagining that ever working -- wouldn't it blow > up if someone did: > > fd = open("/dev/anything987"); > ptr1 = mmap(fd); > ptr2 = mmap(fd); > sys_encrypt(ptr1); > > So I think it really has to be: > fd = open("/dev/anything987"); > ioctl(fd, ENCRYPT_ME); > mmap(fd); This requires "/dev/anything987" to support ENCRYPT_ME ioctl, right? So to support NVDIMM (DAX), we need to add ENCRYPT_ME ioctl to DAX? > > But I really expect that the encryption of a DAX device will actually > be a block device setting and won't look like this at all. It'll be > more like dm-crypt except without device mapper. Are you suggesting not to support MKTME for DAX, or adding MKTME support to dm-crypt? Thanks, -Kai
On Mon, Jun 17, 2019 at 5:05 PM Kai Huang <kai.huang@linux.intel.com> wrote: > > On Mon, 2019-06-17 at 12:12 -0700, Andy Lutomirski wrote: > > On Mon, Jun 17, 2019 at 11:37 AM Dave Hansen <dave.hansen@intel.com> wrote: > > > > > > Tom Lendacky, could you take a look down in the message to the talk of > > > SEV? I want to make sure I'm not misrepresenting what it does today. > > > ... > > > > > > > > > > > I actually don't care all that much which one we end up with. It's not > > > > > like the extra syscall in the second options means much. > > > > > > > > The benefit of the second one is that, if sys_encrypt is absent, it > > > > just works. In the first model, programs need a fallback because > > > > they'll segfault of mprotect_encrypt() gets ENOSYS. > > > > > > Well, by the time they get here, they would have already had to allocate > > > and set up the encryption key. I don't think this would really be the > > > "normal" malloc() path, for instance. > > > > > > > > How do we > > > > > eventually stack it on top of persistent memory filesystems or Device > > > > > DAX? > > > > > > > > How do we stack anonymous memory on top of persistent memory or Device > > > > DAX? I'm confused. > > > > > > If our interface to MKTME is: > > > > > > fd = open("/dev/mktme"); > > > ptr = mmap(fd); > > > > > > Then it's hard to combine with an interface which is: > > > > > > fd = open("/dev/dax123"); > > > ptr = mmap(fd); > > > > > > Where if we have something like mprotect() (or madvise() or something > > > else taking pointer), we can just do: > > > > > > fd = open("/dev/anything987"); > > > ptr = mmap(fd); > > > sys_encrypt(ptr); > > > > I'm having a hard time imagining that ever working -- wouldn't it blow > > up if someone did: > > > > fd = open("/dev/anything987"); > > ptr1 = mmap(fd); > > ptr2 = mmap(fd); > > sys_encrypt(ptr1); > > > > So I think it really has to be: > > fd = open("/dev/anything987"); > > ioctl(fd, ENCRYPT_ME); > > mmap(fd); > > This requires "/dev/anything987" to support ENCRYPT_ME ioctl, right? > > So to support NVDIMM (DAX), we need to add ENCRYPT_ME ioctl to DAX? Yes and yes, or we do it with layers -- see below. I don't see how we can credibly avoid this. If we try to do MKTME behind the DAX driver's back, aren't we going to end up with cache coherence problems? > > > > > But I really expect that the encryption of a DAX device will actually > > be a block device setting and won't look like this at all. It'll be > > more like dm-crypt except without device mapper. > > Are you suggesting not to support MKTME for DAX, or adding MKTME support to dm-crypt? I'm proposing exposing it by an interface that looks somewhat like dm-crypt. Either we could have a way to create a device layered on top of the DAX devices that exposes a decrypted view or we add a way to tell the DAX device to kindly use MKTME with such-and-such key. If there is demand for a way to have an fscrypt-like thing on top of DAX where different files use different keys, I suppose that could be done too, but it will need filesystem or VFS help.
> > > And another silly argument: if we had /dev/mktme, then we could > > possibly get away with avoiding all the keyring stuff entirely. > > Instead, you open /dev/mktme and you get your own key under the hook. > > If you want two keys, you open /dev/mktme twice. If you want some > > other program to be able to see your memory, you pass it the fd. > > We still like the keyring because it's one-stop-shopping as the place > that *owns* the hardware KeyID slots. Those are global resources and > scream for a single global place to allocate and manage them. The > hardware slots also need to be shared between any anonymous and > file-based users, no matter what the APIs for the anonymous side. MKTME driver (who creates /dev/mktme) can also be the one-stop-shopping. I think whether to choose keyring to manage MKTME key should be based on whether we need/should take advantage of existing key retention service functionalities. For example, with key retention service we can revoke/invalidate/set expiry for a key (not sure whether MKTME needs those although), and we have several keyrings -- thread specific keyring, process specific keyring, user specific keyring, etc, thus we can control who can/cannot find the key, etc. I think managing MKTME key in MKTME driver doesn't have those advantages. Thanks, -Kai
On 6/17/19 6:59 PM, Kai Huang wrote: > On Mon, 2019-06-17 at 11:27 -0700, Dave Hansen wrote: >> Tom Lendacky, could you take a look down in the message to the talk of >> SEV? I want to make sure I'm not misrepresenting what it does today. Sorry, I'm traveling this week, so responses will be delayed... >> ... >> >> >>>> I actually don't care all that much which one we end up with. It's not >>>> like the extra syscall in the second options means much. >>> >>> The benefit of the second one is that, if sys_encrypt is absent, it >>> just works. In the first model, programs need a fallback because >>> they'll segfault of mprotect_encrypt() gets ENOSYS. >> >> Well, by the time they get here, they would have already had to allocate >> and set up the encryption key. I don't think this would really be the >> "normal" malloc() path, for instance. >> >>>> How do we >>>> eventually stack it on top of persistent memory filesystems or Device >>>> DAX? >>> >>> How do we stack anonymous memory on top of persistent memory or Device >>> DAX? I'm confused. >> >> If our interface to MKTME is: >> >> fd = open("/dev/mktme"); >> ptr = mmap(fd); >> >> Then it's hard to combine with an interface which is: >> >> fd = open("/dev/dax123"); >> ptr = mmap(fd); >> >> Where if we have something like mprotect() (or madvise() or something >> else taking pointer), we can just do: >> >> fd = open("/dev/anything987"); >> ptr = mmap(fd); >> sys_encrypt(ptr); >> >> Now, we might not *do* it that way for dax, for instance, but I'm just >> saying that if we go the /dev/mktme route, we never get a choice. >> >>> I think that, in the long run, we're going to have to either expand >>> the core mm's concept of what "memory" is or just have a whole >>> parallel set of mechanisms for memory that doesn't work like memory. >> >> ... >>> I expect that some day normal memory will be able to be repurposed as >>> SGX pages on the fly, and that will also look a lot more like SEV or >>> XPFO than like the this model of MKTME. >> >> I think you're drawing the line at pages where the kernel can manage >> contents vs. not manage contents. I'm not sure that's the right >> distinction to make, though. The thing that is important is whether the >> kernel can manage the lifetime and location of the data in the page. >> >> Basically: Can the kernel choose where the page comes from and get the >> page back when it wants? >> >> I really don't like the current state of things like with SEV or with >> KVM direct device assignment where the physical location is quite locked >> down and the kernel really can't manage the memory. I'm trying really >> hard to make sure future hardware is more permissive about such things. >> My hope is that these are a temporary blip and not the new normal. >> >>> So, if we upstream MKTME as anonymous memory with a magic config >>> syscall, I predict that, in a few years, it will be end up inheriting >>> all downsides of both approaches with few of the upsides. Programs >>> like QEMU will need to learn to manipulate pages that can't be >>> accessed outside the VM without special VM buy-in, so the fact that >>> MKTME pages are fully functional and can be GUP-ed won't be very >>> useful. And the VM will learn about all these things, but MKTME won't >>> really fit in. >> >> Kai Huang (who is on cc) has been doing the QEMU enabling and might want >> to weigh in. I'd also love to hear from the AMD folks in case I'm not >> grokking some aspect of SEV. 
>> >> But, my understanding is that, even today, neither QEMU nor the kernel >> can see SEV-encrypted guest memory. So QEMU should already understand >> how to not interact with guest memory. I _assume_ it's also already >> doing this with anonymous memory, without needing /dev/sme or something. > > Correct neither Qemu nor kernel can see SEV-encrypted guest memory. Qemu requires guest's > cooperation when it needs to interacts with guest, i.e. to support virtual DMA (of virtual devices > in SEV-guest), qemu requires SEV-guest to setup bounce buffer (which will not be SEV-encrypted > memory, but shared memory can be accessed from host side too), so that guest kernel can copy DMA > data from bounce buffer to its own SEV-encrypted memory after qemu/host kernel puts DMA data to > bounce buffer. That is correct. an SEV guest must use un-encrypted memory if it wishes for the hypervisor to be able to see it. Also, to support DMA into the guest, the target memory must be un-encrypted, which SEV does through the DMA api and SWIOTLB. SME must also use bounce buffers if the device does not support 48-bit DMA, since the encryption bit is bit 47. Any device supporting DMA above the encryption bit position can perform DMA without bounce buffers under SME. > > And yes from my reading (better to have AMD guys to confirm) SEV guest uses anonymous memory, but it > also pins all guest memory (by calling GUP from KVM -- SEV specifically introduced 2 KVM ioctls for > this purpose), since SEV architecturally cannot support swapping, migraiton of SEV-encrypted guest > memory, because SME/SEV also uses physical address as "tweak", and there's no way that kernel can > get or use SEV-guest's memory encryption key. In order to swap/migrate SEV-guest memory, we need SGX > EPC eviction/reload similar thing, which SEV doesn't have today. Yes, all the guest memory is currently pinned by calling GUP when creating an SEV guest. This is to prevent page migration by the kernel. However, the support to do page migration is available in the 0.17 version of the SEV API via the COPY commmand. The COPY commannd allows for copying of an encrypted page from one location to another via the PSP. The support for this is not yet in the Linux kernel, but we are working on it. This would remove the requirement of having to pin the guest memory. The SEV API also supports migration of memory via the SEND_* and RECEIVE_* APIs to support live migration. Swapping, however, is not yet supported. Thanks, Tom > > From this perspective, I think driver proposal kinda makes sense since we already have security > feature which uses normal memory some kind like "device memory" (no swap, no migration, etc), so it > makes sense that MKTME just follows that (although from HW MKTME can support swap, page migration, > etc). The downside of driver proposal for MKTME I think is, like Dave mentioned, it's hard (or not > sure whether it is possible) to extend to support NVDIMM (and file backed guest memory), since for > virtual NVDIMM, Qemu needs to call mmap against fd of NVDIMM. > > Thanks, > -Kai >
> > > > > > I'm having a hard time imagining that ever working -- wouldn't it blow > > > up if someone did: > > > > > > fd = open("/dev/anything987"); > > > ptr1 = mmap(fd); > > > ptr2 = mmap(fd); > > > sys_encrypt(ptr1); > > > > > > So I think it really has to be: > > > fd = open("/dev/anything987"); > > > ioctl(fd, ENCRYPT_ME); > > > mmap(fd); > > > > This requires "/dev/anything987" to support ENCRYPT_ME ioctl, right? > > > > So to support NVDIMM (DAX), we need to add ENCRYPT_ME ioctl to DAX? > > Yes and yes, or we do it with layers -- see below. > > I don't see how we can credibly avoid this. If we try to do MKTME > behind the DAX driver's back, aren't we going to end up with cache > coherence problems? I am not sure whether I understand correctly but how is cache coherence problem related to putting MKTME concept to different layers? To make MKTME work with DAX/NVDIMM, I think no matter which layer MKTME concept resides, eventually we need to put keyID into PTE which maps to NVDIMM, and kernel needs to manage cache coherence for NVDIMM just like for normal memory showed in this series? Thanks, -Kai
On Mon, Jun 17, 2019 at 6:34 PM Lendacky, Thomas <Thomas.Lendacky@amd.com> wrote: > > On 6/17/19 6:59 PM, Kai Huang wrote: > > On Mon, 2019-06-17 at 11:27 -0700, Dave Hansen wrote: > > > > And yes from my reading (better to have AMD guys to confirm) SEV guest uses anonymous memory, but it > > also pins all guest memory (by calling GUP from KVM -- SEV specifically introduced 2 KVM ioctls for > > this purpose), since SEV architecturally cannot support swapping, migraiton of SEV-encrypted guest > > memory, because SME/SEV also uses physical address as "tweak", and there's no way that kernel can > > get or use SEV-guest's memory encryption key. In order to swap/migrate SEV-guest memory, we need SGX > > EPC eviction/reload similar thing, which SEV doesn't have today. > > Yes, all the guest memory is currently pinned by calling GUP when creating > an SEV guest. Ick. What happens if QEMU tries to read the memory? Does it just see ciphertext? Is cache coherency lost if QEMU writes it?
On Mon, Jun 17, 2019 at 6:35 PM Kai Huang <kai.huang@linux.intel.com> wrote: > > > > > > > > > > I'm having a hard time imagining that ever working -- wouldn't it blow > > > > up if someone did: > > > > > > > > fd = open("/dev/anything987"); > > > > ptr1 = mmap(fd); > > > > ptr2 = mmap(fd); > > > > sys_encrypt(ptr1); > > > > > > > > So I think it really has to be: > > > > fd = open("/dev/anything987"); > > > > ioctl(fd, ENCRYPT_ME); > > > > mmap(fd); > > > > > > This requires "/dev/anything987" to support ENCRYPT_ME ioctl, right? > > > > > > So to support NVDIMM (DAX), we need to add ENCRYPT_ME ioctl to DAX? > > > > Yes and yes, or we do it with layers -- see below. > > > > I don't see how we can credibly avoid this. If we try to do MKTME > > behind the DAX driver's back, aren't we going to end up with cache > > coherence problems? > > I am not sure whether I understand correctly but how is cache coherence problem related to putting > MKTME concept to different layers? To make MKTME work with DAX/NVDIMM, I think no matter which layer > MKTME concept resides, eventually we need to put keyID into PTE which maps to NVDIMM, and kernel > needs to manage cache coherence for NVDIMM just like for normal memory showed in this series? > I mean is that, to avoid cache coherence problems, something has to prevent user code from mapping the same page with two different key ids. If the entire MKTME mechanism purely layers on top of DAX, something needs to prevent the underlying DAX device from being mapped at the same time as the MKTME-decrypted view. This is obviously doable, but it's not automatic.
On Mon, Jun 17, 2019 at 5:48 PM Kai Huang <kai.huang@linux.intel.com> wrote: > > > > > > > And another silly argument: if we had /dev/mktme, then we could > > > possibly get away with avoiding all the keyring stuff entirely. > > > Instead, you open /dev/mktme and you get your own key under the hook. > > > If you want two keys, you open /dev/mktme twice. If you want some > > > other program to be able to see your memory, you pass it the fd. > > > > We still like the keyring because it's one-stop-shopping as the place > > that *owns* the hardware KeyID slots. Those are global resources and > > scream for a single global place to allocate and manage them. The > > hardware slots also need to be shared between any anonymous and > > file-based users, no matter what the APIs for the anonymous side. > > MKTME driver (who creates /dev/mktme) can also be the one-stop-shopping. I think whether to choose > keyring to manage MKTME key should be based on whether we need/should take advantage of existing key > retention service functionalities. For example, with key retention service we can > revoke/invalidate/set expiry for a key (not sure whether MKTME needs those although), and we have > several keyrings -- thread specific keyring, process specific keyring, user specific keyring, etc, > thus we can control who can/cannot find the key, etc. I think managing MKTME key in MKTME driver > doesn't have those advantages. > Trying to evaluate this with the current proposed code is a bit odd, I think. Suppose you create a thread-specific key and then fork(). The child can presumably still use the key regardless of whether the child can nominally access the key in the keyring because the PTEs are still there. More fundamentally, in some sense, the current code has no semantics. Associating a key with memory and "encrypting" it doesn't actually do anything unless you are attacking the memory bus but you haven't compromised the kernel. There's no protection against a guest that can corrupt its EPT tables, there's no protection against kernel bugs (*especially* if the duplicate direct map design stays), and there isn't even any fd or other object around by which you can only access the data if you can see the key. I'm also wondering whether the kernel will always be able to be a one-stop shop for key allocation -- if the MKTME hardware gains interesting new uses down the road, who knows how key allocation will work?
On 6/17/19 8:40 PM, Andy Lutomirski wrote: > On Mon, Jun 17, 2019 at 6:34 PM Lendacky, Thomas > <Thomas.Lendacky@amd.com> wrote: >> >> On 6/17/19 6:59 PM, Kai Huang wrote: >>> On Mon, 2019-06-17 at 11:27 -0700, Dave Hansen wrote: > >>> >>> And yes from my reading (better to have AMD guys to confirm) SEV guest uses anonymous memory, but it >>> also pins all guest memory (by calling GUP from KVM -- SEV specifically introduced 2 KVM ioctls for >>> this purpose), since SEV architecturally cannot support swapping, migraiton of SEV-encrypted guest >>> memory, because SME/SEV also uses physical address as "tweak", and there's no way that kernel can >>> get or use SEV-guest's memory encryption key. In order to swap/migrate SEV-guest memory, we need SGX >>> EPC eviction/reload similar thing, which SEV doesn't have today. >> >> Yes, all the guest memory is currently pinned by calling GUP when creating >> an SEV guest. > > Ick. > > What happens if QEMU tries to read the memory? Does it just see > ciphertext? Is cache coherency lost if QEMU writes it? If QEMU tries to read the memory is would just see ciphertext. I'll double check on the write situation, but I think you would end up with a cache coherency issue because the write by QEMU would be with the hypervisor key and tagged separately in the cache from the guest cache entry. SEV provides confidentiality of guest memory from the hypervisor, it doesn't prevent the hypervisor from trashing the guest memory. Thanks, Tom >
On Mon, 2019-06-17 at 18:50 -0700, Andy Lutomirski wrote: > On Mon, Jun 17, 2019 at 5:48 PM Kai Huang <kai.huang@linux.intel.com> wrote: > > > > > > > > > > > And another silly argument: if we had /dev/mktme, then we could > > > > possibly get away with avoiding all the keyring stuff entirely. > > > > Instead, you open /dev/mktme and you get your own key under the hook. > > > > If you want two keys, you open /dev/mktme twice. If you want some > > > > other program to be able to see your memory, you pass it the fd. > > > > > > We still like the keyring because it's one-stop-shopping as the place > > > that *owns* the hardware KeyID slots. Those are global resources and > > > scream for a single global place to allocate and manage them. The > > > hardware slots also need to be shared between any anonymous and > > > file-based users, no matter what the APIs for the anonymous side. > > > > MKTME driver (who creates /dev/mktme) can also be the one-stop-shopping. I think whether to > > choose > > keyring to manage MKTME key should be based on whether we need/should take advantage of existing > > key > > retention service functionalities. For example, with key retention service we can > > revoke/invalidate/set expiry for a key (not sure whether MKTME needs those although), and we > > have > > several keyrings -- thread specific keyring, process specific keyring, user specific keyring, > > etc, > > thus we can control who can/cannot find the key, etc. I think managing MKTME key in MKTME driver > > doesn't have those advantages. > > > > Trying to evaluate this with the current proposed code is a bit odd, I > think. Suppose you create a thread-specific key and then fork(). The > child can presumably still use the key regardless of whether the child > can nominally access the key in the keyring because the PTEs are still > there. Right. This is a little bit odd, although virtualization (Qemu, which is the main use case of MKTME at least so far) doesn't use fork(). > > More fundamentally, in some sense, the current code has no semantics. > Associating a key with memory and "encrypting" it doesn't actually do > anything unless you are attacking the memory bus but you haven't > compromised the kernel. There's no protection against a guest that > can corrupt its EPT tables, there's no protection against kernel bugs > (*especially* if the duplicate direct map design stays), and there > isn't even any fd or other object around by which you can only access > the data if you can see the key. I am not saying managing MKTME key/keyID in key retention service is definitely better, but it seems all those you mentioned are not related to whether to choose key retention service to manage MKTME key/keyID? Or you are saying it doesn't matter we manage key/keyID in key retention service or in MKTME driver, since MKTME barely have any security benefits (besides physical attack)? > > I'm also wondering whether the kernel will always be able to be a > one-stop shop for key allocation -- if the MKTME hardware gains > interesting new uses down the road, who knows how key allocation will > work? I by now don't have any use case which requires to manage key/keyID specifically for its own use, rather than letting kernel to manage keyID allocation. Please inspire us if you have any potential. Thanks, -Kai
On Mon, 2019-06-17 at 18:43 -0700, Andy Lutomirski wrote: > On Mon, Jun 17, 2019 at 6:35 PM Kai Huang <kai.huang@linux.intel.com> wrote: > > > > > > > > > > > > > > I'm having a hard time imagining that ever working -- wouldn't it blow > > > > > up if someone did: > > > > > > > > > > fd = open("/dev/anything987"); > > > > > ptr1 = mmap(fd); > > > > > ptr2 = mmap(fd); > > > > > sys_encrypt(ptr1); > > > > > > > > > > So I think it really has to be: > > > > > fd = open("/dev/anything987"); > > > > > ioctl(fd, ENCRYPT_ME); > > > > > mmap(fd); > > > > > > > > This requires "/dev/anything987" to support ENCRYPT_ME ioctl, right? > > > > > > > > So to support NVDIMM (DAX), we need to add ENCRYPT_ME ioctl to DAX? > > > > > > Yes and yes, or we do it with layers -- see below. > > > > > > I don't see how we can credibly avoid this. If we try to do MKTME > > > behind the DAX driver's back, aren't we going to end up with cache > > > coherence problems? > > > > I am not sure whether I understand correctly but how is cache coherence problem related to > > putting > > MKTME concept to different layers? To make MKTME work with DAX/NVDIMM, I think no matter which > > layer > > MKTME concept resides, eventually we need to put keyID into PTE which maps to NVDIMM, and kernel > > needs to manage cache coherence for NVDIMM just like for normal memory showed in this series? > > > > I mean is that, to avoid cache coherence problems, something has to > prevent user code from mapping the same page with two different key > ids. If the entire MKTME mechanism purely layers on top of DAX, > something needs to prevent the underlying DAX device from being mapped > at the same time as the MKTME-decrypted view. This is obviously > doable, but it's not automatic. Assuming I am understanding the context correctly, yes from this perspective it seems having sys_encrypt is annoying, and having ENCRYPT_ME should be better. But Dave said "nobody is going to do what you suggest in the ptr1/ptr2 example"? Thanks, -Kai
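To make the ptr1/ptr2 point concrete, the ordering being argued for would look something like the following from userspace. ENCRYPT_ME and its key argument are placeholders for a DAX ioctl that does not exist today; this is only a sketch of the proposed shape:

#include <err.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#define ENCRYPT_ME 0	/* placeholder; no such DAX ioctl exists today */

int main(void)
{
	size_t len = 2UL << 20;
	long key_serial = 0;	/* placeholder key handle */

	int fd = open("/dev/dax0.0", O_RDWR);

	/* Mark the whole fd encrypted before any mapping exists ... */
	if (ioctl(fd, ENCRYPT_ME, key_serial) < 0)
		err(1, "ENCRYPT_ME");

	/* ... so every later mapping sees the same KeyID and the
	 * "map twice, then encrypt one view" sequence cannot happen. */
	void *ptr1 = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	void *ptr2 = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	(void)ptr1; (void)ptr2;
	return 0;
}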
On Mon, Jun 17, 2019 at 6:40 PM Andy Lutomirski <luto@kernel.org> wrote: > > On Mon, Jun 17, 2019 at 6:34 PM Lendacky, Thomas > <Thomas.Lendacky@amd.com> wrote: > > > > On 6/17/19 6:59 PM, Kai Huang wrote: > > > On Mon, 2019-06-17 at 11:27 -0700, Dave Hansen wrote: > > > > > > > And yes from my reading (better to have AMD guys to confirm) SEV guest uses anonymous memory, but it > > > also pins all guest memory (by calling GUP from KVM -- SEV specifically introduced 2 KVM ioctls for > > > this purpose), since SEV architecturally cannot support swapping, migraiton of SEV-encrypted guest > > > memory, because SME/SEV also uses physical address as "tweak", and there's no way that kernel can > > > get or use SEV-guest's memory encryption key. In order to swap/migrate SEV-guest memory, we need SGX > > > EPC eviction/reload similar thing, which SEV doesn't have today. > > > > Yes, all the guest memory is currently pinned by calling GUP when creating > > an SEV guest. > > Ick. > > What happens if QEMU tries to read the memory? Does it just see > ciphertext? Is cache coherency lost if QEMU writes it? I should add: is the current interface that SEV uses actually good, or should the kernel try to do something differently? I've spent exactly zero time looking at SEV APIs or at how QEMU manages its memory.
On Mon, Jun 17, 2019 at 7:11 PM Kai Huang <kai.huang@linux.intel.com> wrote: > > On Mon, 2019-06-17 at 18:50 -0700, Andy Lutomirski wrote: > > On Mon, Jun 17, 2019 at 5:48 PM Kai Huang <kai.huang@linux.intel.com> wrote: > > > > > > > > > > > > > > > And another silly argument: if we had /dev/mktme, then we could > > > > > possibly get away with avoiding all the keyring stuff entirely. > > > > > Instead, you open /dev/mktme and you get your own key under the hook. > > > > > If you want two keys, you open /dev/mktme twice. If you want some > > > > > other program to be able to see your memory, you pass it the fd. > > > > > > > > We still like the keyring because it's one-stop-shopping as the place > > > > that *owns* the hardware KeyID slots. Those are global resources and > > > > scream for a single global place to allocate and manage them. The > > > > hardware slots also need to be shared between any anonymous and > > > > file-based users, no matter what the APIs for the anonymous side. > > > > > > MKTME driver (who creates /dev/mktme) can also be the one-stop-shopping. I think whether to > > > choose > > > keyring to manage MKTME key should be based on whether we need/should take advantage of existing > > > key > > > retention service functionalities. For example, with key retention service we can > > > revoke/invalidate/set expiry for a key (not sure whether MKTME needs those although), and we > > > have > > > several keyrings -- thread specific keyring, process specific keyring, user specific keyring, > > > etc, > > > thus we can control who can/cannot find the key, etc. I think managing MKTME key in MKTME driver > > > doesn't have those advantages. > > > > > > > Trying to evaluate this with the current proposed code is a bit odd, I > > think. Suppose you create a thread-specific key and then fork(). The > > child can presumably still use the key regardless of whether the child > > can nominally access the key in the keyring because the PTEs are still > > there. > > Right. This is a little bit odd, although virtualization (Qemu, which is the main use case of MKTME > at least so far) doesn't use fork(). > > > > > More fundamentally, in some sense, the current code has no semantics. > > Associating a key with memory and "encrypting" it doesn't actually do > > anything unless you are attacking the memory bus but you haven't > > compromised the kernel. There's no protection against a guest that > > can corrupt its EPT tables, there's no protection against kernel bugs > > (*especially* if the duplicate direct map design stays), and there > > isn't even any fd or other object around by which you can only access > > the data if you can see the key. > > I am not saying managing MKTME key/keyID in key retention service is definitely better, but it seems > all those you mentioned are not related to whether to choose key retention service to manage MKTME > key/keyID? Or you are saying it doesn't matter we manage key/keyID in key retention service or in > MKTME driver, since MKTME barely have any security benefits (besides physical attack)? Mostly the latter. I think it's very hard to evaluate whether a given key allocation model makes sense given that MKTME provides such weak security benefits. TME has obvious security benefits, as does encryption of persistent memory, but this giant patch set isn't needed for plain TME and it doesn't help with persistent memory. 
> >
> > I'm also wondering whether the kernel will always be able to be a
> > one-stop shop for key allocation -- if the MKTME hardware gains
> > interesting new uses down the road, who knows how key allocation will
> > work?
>
> I by now don't have any use case which requires to manage key/keyID specifically for its own use,
> rather than letting kernel to manage keyID allocation. Please inspire us if you have any potential.
>

Other than compliance, I can't think of much reason that using multiple
keys is useful, regardless of how they're allocated.  The only thing I've
thought of is that, with multiple keys, you can use PCONFIG to remove one
and flush caches and the data is most definitely gone.  On the other hand,
you can just zero the memory and the data is just as gone even without
any encryption.
On Tue, Jun 18, 2019 at 02:23:31PM +1200, Kai Huang wrote: > Assuming I am understanding the context correctly, yes from this perspective it seems having > sys_encrypt is annoying, and having ENCRYPT_ME should be better. But Dave said "nobody is going to > do what you suggest in the ptr1/ptr2 example"? You have to phrase that as: 'nobody who knows what he's doing is going to do that', which leaves lots of people and fuzzers. Murphy states that if it is possible, someone _will_ do it. And this being something that causes severe data corruption on persistent storage,...
On 6/18/19 2:12 AM, Peter Zijlstra wrote:
> On Tue, Jun 18, 2019 at 02:23:31PM +1200, Kai Huang wrote:
>> Assuming I am understanding the context correctly, yes from this perspective it seems having
>> sys_encrypt is annoying, and having ENCRYPT_ME should be better. But Dave said "nobody is going to
>> do what you suggest in the ptr1/ptr2 example"?
>
> You have to phrase that as: 'nobody who knows what he's doing is going
> to do that', which leaves lots of people and fuzzers.
>
> Murphy states that if it is possible, someone _will_ do it. And this
> being something that causes severe data corruption on persistent
> storage,...

I actually think it's not a big deal at all to avoid the corruption that
would occur if it were allowed.  But, if you're even asking to map the
same data with two different keys, you're *asking* for data corruption.
What we're doing here is continuing to preserve cache coherency and
ensuring an early failure.

We'd need two rules:
1. A page must not be faulted into a VMA if the page's page_keyid()
   is not consistent with the VMA's
2. Upon changing the VMA's KeyID, all underlying PTEs must either be
   checked or zapped.

If the rules are broken, we SIGBUS.  Andy's suggestion has the same
basic requirements.  But, with his scheme, the error can be to the
ioctl() instead of in the form of a SIGBUS.  I guess that makes the
fuzzers' lives a bit easier.

BTW, note that we don't have any problems with the current anonymous
implementation and fork() because we zap at the encryption syscall.
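A rough sketch of what rule 1 could look like at fault time, assuming the page_keyid()/vma_keyid() helpers from this series; this is illustrative pseudocode, not code from the patches:

/* Rule 1, sketched: refuse to fault a page into a VMA whose KeyID
 * disagrees with the page's, and fail early with SIGBUS. */
static vm_fault_t mktme_check_fault(struct vm_fault *vmf, struct page *page)
{
	if (page_keyid(page) != vma_keyid(vmf->vma))
		return VM_FAULT_SIGBUS;

	return 0;
}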
On 6/17/19 5:15 PM, Andy Lutomirski wrote:
>>> But I really expect that the encryption of a DAX device will actually
>>> be a block device setting and won't look like this at all.  It'll be
>>> more like dm-crypt except without device mapper.
>> Are you suggesting not to support MKTME for DAX, or adding MKTME support to dm-crypt?
> I'm proposing exposing it by an interface that looks somewhat like
> dm-crypt.  Either we could have a way to create a device layered on
> top of the DAX devices that exposes a decrypted view or we add a way
> to tell the DAX device to kindly use MKTME with such-and-such key.

I think this basically implies that we need to settle on (or at least
present) an interface for storage (FS-DAX, Device DAX, page cache) before
we merge one for anonymous memory.  That sounds like a reasonable
exercise.
On 6/17/19 6:50 PM, Andy Lutomirski wrote: > I'm also wondering whether the kernel will always be able to be a > one-stop shop for key allocation -- if the MKTME hardware gains > interesting new uses down the road, who knows how key allocation will > work? I can't share all the details on LKML, of course, but I can at least say that this model of allocating KeyID slots will continue to be used for a number of generations.
On Tue, Jun 18, 2019 at 07:09:36AM -0700, Dave Hansen wrote:
> On 6/18/19 2:12 AM, Peter Zijlstra wrote:
> > On Tue, Jun 18, 2019 at 02:23:31PM +1200, Kai Huang wrote:
> >> Assuming I am understanding the context correctly, yes from this perspective it seems having
> >> sys_encrypt is annoying, and having ENCRYPT_ME should be better. But Dave said "nobody is going to
> >> do what you suggest in the ptr1/ptr2 example"?
> >
> > You have to phrase that as: 'nobody who knows what he's doing is going
> > to do that', which leaves lots of people and fuzzers.
> >
> > Murphy states that if it is possible, someone _will_ do it. And this
> > being something that causes severe data corruption on persistent
> > storage,...
>
> I actually think it's not a big deal at all to avoid the corruption that
> would occur if it were allowed.  But, if you're even asking to map the
> same data with two different keys, you're *asking* for data corruption.
> What we're doing here is continuing to preserve cache coherency and
> ensuring an early failure.
>
> We'd need two rules:
> 1. A page must not be faulted into a VMA if the page's page_keyid()
>    is not consistent with the VMA's
> 2. Upon changing the VMA's KeyID, all underlying PTEs must either be
>    checked or zapped.
>
> If the rules are broken, we SIGBUS.  Andy's suggestion has the same
> basic requirements.  But, with his scheme, the error can be to the
> ioctl() instead of in the form of a SIGBUS.  I guess that makes the
> fuzzers' lives a bit easier.

I see a problem with the scheme: if we don't have a way to decide whether
the key is right for the file, a user without access to the right key can
prevent a legitimate user from accessing the file.  An attacker just needs
read access to the encrypted file to prevent any legitimate user from
accessing it.

The problem applies to ioctl() too.

To make sense of it we must have a way to distinguish the right key from a
wrong one.  I don't see an obvious solution with the current hardware
design.
On 6/18/19 9:15 AM, Kirill A. Shutemov wrote: >> We'd need two rules: >> 1. A page must not be faulted into a VMA if the page's page_keyid() >> is not consistent with the VMA's >> 2. Upon changing the VMA's KeyID, all underlying PTEs must either be >> checked or zapped. >> >> If the rules are broken, we SIGBUS. Andy's suggestion has the same >> basic requirements. But, with his scheme, the error can be to the >> ioctl() instead of in the form of a SIGBUS. I guess that makes the >> fuzzers' lives a bit easier. > I see a problem with the scheme: if we don't have a way to decide if the > key is right for the file, user without access to the right key is able to > prevent legitimate user from accessing the file. Attacker just need read > access to the encrypted file to prevent any legitimate use to access it. I think you're bringing up a separate issue. We were talking about how you resolve a conflict when someone attempts to use two *different* keyids to decrypt the data in the API and what the resulting API interaction looks like. You're describing the situation where one of those is the wrong *key* (not keyid). That's a subtly different scenario and requires different handling (or no handling IMNHO).
> On Jun 18, 2019, at 9:22 AM, Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 6/18/19 9:15 AM, Kirill A. Shutemov wrote:
>>> We'd need two rules:
>>> 1. A page must not be faulted into a VMA if the page's page_keyid()
>>>    is not consistent with the VMA's
>>> 2. Upon changing the VMA's KeyID, all underlying PTEs must either be
>>>    checked or zapped.
>>>
>>> If the rules are broken, we SIGBUS.  Andy's suggestion has the same
>>> basic requirements.  But, with his scheme, the error can be to the
>>> ioctl() instead of in the form of a SIGBUS.  I guess that makes the
>>> fuzzers' lives a bit easier.
>> I see a problem with the scheme: if we don't have a way to decide if the
>> key is right for the file, user without access to the right key is able to
>> prevent legitimate user from accessing the file. Attacker just need read
>> access to the encrypted file to prevent any legitimate use to access it.
>
> I think you're bringing up a separate issue.
>
> We were talking about how you resolve a conflict when someone attempts
> to use two *different* keyids to decrypt the data in the API and what
> the resulting API interaction looks like.
>
> You're describing the situation where one of those is the wrong *key*
> (not keyid).  That's a subtly different scenario and requires different
> handling (or no handling IMNHO).

I think we're quibbling over details before we look at the big questions:

Should MKTME+DAX encrypt the entire volume or should it encrypt individual
files?  Or both?

If it encrypts individual files, should the fs be involved at all?  Should
there be metadata that can check whether a given key is the correct key?

If it encrypts individual files, is it even conceptually possible to avoid
corruption if the fs is not involved?  After all, many filesystems think
that they can move data blocks, compute checksums, journal data, etc.

I think Dave is right that there should at least be a somewhat credible
proposal for how this could fit together.
On 6/18/19 9:36 AM, Andy Lutomirski wrote:
> Should MKTME+DAX encrypt the entire volume or should it encrypt individual files?  Or both?

Our current thought is that there should be two modes: one for entire DAX
namespaces and one for filesystem DAX that would allow it to be at the
file level.  More likely, we would mirror fscrypt and do it at the
directory level.

> If it encrypts individual files, should the fs be involved at all?
> Should there be metadata that can check whether a given key is the
> correct key?

FWIW, this is a question for the fs guys.  Their guidance so far has been
"do what fscrypt does", and fscrypt does not protect against incorrect
keys being specified.  See:

	https://www.kernel.org/doc/html/v5.1/filesystems/fscrypt.html

Which says:

> Currently, fscrypt does not prevent a user from maliciously providing
> an incorrect key for another user's existing encrypted files.  A
> protection against this is planned.

> If it encrypts individual files, is it even conceptually possible to
> avoid corruption if the fs is not involved?  After all, many
> filesystems think that they can move data blocks, compute checksums,
> journal data, etc.

Yes, exactly.  Thankfully, fscrypt has thought about this already and has
infrastructure for this.  For instance:

> Online defragmentation of encrypted files is not supported.  The
> EXT4_IOC_MOVE_EXT and F2FS_IOC_MOVE_RANGE ioctls will fail with
> EOPNOTSUPP.
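Since "do what fscrypt does" keeps coming up, this is roughly how fscrypt's directory-level policy is applied today (v5.1-era UAPI names; the mode and flag values are illustrative). An MKTME file- or directory-level mode would presumably need a similar control point:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* struct fscrypt_policy, FS_IOC_SET_ENCRYPTION_POLICY */

int main(void)
{
	struct fscrypt_policy policy;

	memset(&policy, 0, sizeof(policy));
	policy.version = 0;
	policy.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
	policy.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
	policy.flags = FS_POLICY_FLAGS_PAD_32;
	/* policy.master_key_descriptor: 8-byte descriptor of the master key */

	int dirfd = open("/mnt/encrypted-dir", O_RDONLY);

	/* Everything created under the directory inherits the policy. */
	ioctl(dirfd, FS_IOC_SET_ENCRYPTION_POLICY, &policy);
	return 0;
}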
diff --git a/fs/exec.c b/fs/exec.c
index 2e0033348d8e..695c121b34b3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -755,8 +755,8 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	vm_flags |= mm->def_flags;
 	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
 
-	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
-			vm_flags);
+	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, vm_flags,
+			-1);
 	if (ret)
 		goto out_unlock;
 	BUG_ON(prev != vma);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c027044de9bf..a7f52d053826 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1634,7 +1634,8 @@ extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long
 			      int dirty_accountable, int prot_numa);
 extern int mprotect_fixup(struct vm_area_struct *vma,
 			  struct vm_area_struct **pprev, unsigned long start,
-			  unsigned long end, unsigned long newflags);
+			  unsigned long end, unsigned long newflags,
+			  int newkeyid);
 
 /*
  * doesn't attempt to fault and will return short.
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 23e680f4b1d5..38d766b5cc20 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -28,6 +28,7 @@
 #include <linux/ksm.h>
 #include <linux/uaccess.h>
 #include <linux/mm_inline.h>
+#include <linux/key.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
@@ -347,7 +348,8 @@ static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
 
 int
 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
-	unsigned long start, unsigned long end, unsigned long newflags)
+	unsigned long start, unsigned long end, unsigned long newflags,
+	int newkeyid)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long oldflags = vma->vm_flags;
@@ -357,7 +359,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	int error;
 	int dirty_accountable = 0;
 
-	if (newflags == oldflags) {
+	/*
+	 * Flags match and Keyids match or we have NO_KEY.
+	 * This _fixup is usually called from do_mprotect_ext() except
+	 * for one special case: caller fs/exec.c/setup_arg_pages()
+	 * In that case, newkeyid is passed as -1 (NO_KEY).
+	 */
+	if (newflags == oldflags &&
+	    (newkeyid == vma_keyid(vma) || newkeyid == NO_KEY)) {
 		*pprev = vma;
 		return 0;
 	}
@@ -423,6 +432,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	}
 
 success:
+	if (newkeyid != NO_KEY)
+		mprotect_set_encrypt(vma, newkeyid, start, end);
 	/*
 	 * vm_flags and vm_page_prot are protected by the mmap_sem
 	 * held in write mode.
@@ -454,10 +465,15 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 }
 
 /*
- * When pkey==NO_KEY we get legacy mprotect behavior here.
+ * do_mprotect_ext() supports the legacy mprotect behavior plus extensions
+ * for Protection Keys and Memory Encryption Keys.  These extensions are
+ * mutually exclusive and the behavior is:
+ *	(pkey==NO_KEY && keyid==NO_KEY) ==> legacy mprotect
+ *	(pkey is valid)  ==> legacy mprotect plus Protection Key extensions
+ *	(keyid is valid) ==> legacy mprotect plus Encryption Key extensions
  */
 static int do_mprotect_ext(unsigned long start, size_t len,
-		unsigned long prot, int pkey)
+		unsigned long prot, int pkey, int keyid)
 {
 	unsigned long nstart, end, tmp, reqprot;
 	struct vm_area_struct *vma, *prev;
@@ -555,7 +571,8 @@ static int do_mprotect_ext(unsigned long start, size_t len,
 		tmp = vma->vm_end;
 		if (tmp > end)
 			tmp = end;
-		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
+		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags,
+				keyid);
 		if (error)
 			goto out;
 		nstart = tmp;
@@ -580,7 +597,7 @@ static int do_mprotect_ext(unsigned long start, size_t len,
 SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 		unsigned long, prot)
 {
-	return do_mprotect_ext(start, len, prot, NO_KEY);
+	return do_mprotect_ext(start, len, prot, NO_KEY, NO_KEY);
 }
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
@@ -588,7 +605,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
 		unsigned long, prot, int, pkey)
 {
-	return do_mprotect_ext(start, len, prot, pkey);
+	return do_mprotect_ext(start, len, prot, pkey, NO_KEY);
 }
 
 SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
@@ -637,3 +654,40 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
 }
 
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+
+#ifdef CONFIG_X86_INTEL_MKTME
+
+extern int mktme_keyid_from_key(struct key *key);
+
+SYSCALL_DEFINE4(encrypt_mprotect, unsigned long, start, size_t, len,
+		unsigned long, prot, key_serial_t, serial)
+{
+	key_ref_t key_ref;
+	struct key *key;
+	int ret, keyid;
+
+	/* MKTME restriction */
+	if (!PAGE_ALIGNED(len))
+		return -EINVAL;
+
+	/*
+	 * key_ref prevents the destruction of the key
+	 * while the memory encryption is being set up.
+	 */
+
+	key_ref = lookup_user_key(serial, 0, KEY_NEED_VIEW);
+	if (IS_ERR(key_ref))
+		return PTR_ERR(key_ref);
+
+	key = key_ref_to_ptr(key_ref);
+	keyid = mktme_keyid_from_key(key);
+	if (!keyid) {
+		key_ref_put(key_ref);
+		return -EINVAL;
+	}
+	ret = do_mprotect_ext(start, len, prot, NO_KEY, keyid);
+	key_ref_put(key_ref);
+	return ret;
+}
+
+#endif /* CONFIG_X86_INTEL_MKTME */
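For completeness, userspace consumption of the encrypt_mprotect() path above would look roughly like the following. The "mktme" key type, its payload string, and the syscall number are illustrative assumptions rather than a stable ABI, so treat this purely as a sketch:

#include <keyutils.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_encrypt_mprotect
#define __NR_encrypt_mprotect 428	/* placeholder syscall number */
#endif

int main(void)
{
	size_t len = 2UL << 20;
	const char *payload = "type=cpu algorithm=aes-xts-128";	/* illustrative */

	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* Allocate an MKTME key through the key retention service; the
	 * driver maps it to a hardware KeyID behind the scenes. */
	key_serial_t serial = add_key("mktme", "example-key", payload,
				      strlen(payload), KEY_SPEC_THREAD_KEYRING);

	/* Legacy mprotect semantics plus "encrypt this range with 'serial'". */
	syscall(__NR_encrypt_mprotect, buf, len, PROT_READ | PROT_WRITE, serial);

	return 0;
}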