diff mbox

pkeys: Introduce PKEY_ALLOC_SIGNALINHERIT and change signal semantics

Message ID 20180502132751.05B9F401F3041@oldenburg.str.redhat.com
State New
Headers show

Commit Message

Florian Weimer May 2, 2018, 1:26 p.m. UTC
pkeys support for IBM POWER intends to inherited the access rights of
the current thread in signal handlers.  The advantage is that this
preserves access to memory regions associated with non-default keys,
enabling additional usage scenarios for memory protection keys which
currently do not work on x86 due to the unconditional reset to the
(configurable) default key in signal handlers.

Consequently, this commit updates the x86 implementation to preserve
the PKRU register value of the interrupted context in signal handlers.
If a key is allocated successfully with the PKEY_ALLOC_SIGNALINHERIT
flag, the application can assume this signal inheritance behavior.

This change does not affect the init_pkru optimization because if the
thread's PKRU register is zero due to the init_pkru setting, it will
remain zero in the signal handler through inheritance from the
interrupted context.

After this change, this program:

??=include <sys/syscall.h>
??=include <unistd.h>
??=include <stdio.h>
??=include <err.h>
??=include <signal.h>

??=define PKEY_ALLOC_SIGNALINHERIT 1
??=define PKEY_DISABLE_ACCESS 1
??=define PKEY_DISABLE_WRITE 2

static inline unsigned int
pkey_read (void)
{
  unsigned int result;
  __asm__ volatile (".byte 0x0f, 0x01, 0xee"
                    : "=a" (result) : "c" (0) : "rdx");
  return result;
}

static void
print_pkru (const char *where)
{
  printf ("PKRU (%s): %08x\n", where, pkey_read ());
}

static void
sigusr1 (int signo)
{
  print_pkru ("signal handler");
}

int
main (void)
{
  if (signal (SIGUSR1, sigusr1) == SIG_ERR)
    err (1, "signal");
  print_pkru ("main");
  raise (SIGUSR1);

  puts ("allocating key 1");
  int key1 = syscall (SYS_pkey_alloc, PKEY_ALLOC_SIGNALINHERIT, 0);
  if (key1 < 0)
    err (1, "pkey_alloc");
  print_pkru ("main");
  raise (SIGUSR1);

  puts ("allocating key 2");
  int key2 = syscall (SYS_pkey_alloc, PKEY_ALLOC_SIGNALINHERIT,
                      PKEY_DISABLE_ACCESS);
  if (key2 < 0)
    err (1, "pkey_alloc");
  print_pkru ("main");
  raise (SIGUSR1);

  puts ("allocating key 3");
  int key3 = syscall (SYS_pkey_alloc, PKEY_ALLOC_SIGNALINHERIT,
                      PKEY_DISABLE_WRITE);
  if (key3 < 0)
    err (1, "pkey_alloc");
  print_pkru ("main");
  raise (SIGUSR1);

  return 0;
}

should print:

PKRU (main): 55555554
PKRU (signal handler): 55555554
allocating key 1
PKRU (main): 55555550
PKRU (signal handler): 55555550
allocating key 2
PKRU (main): 55555550
PKRU (signal handler): 55555550
allocating key 3
PKRU (main): 55555590
PKRU (signal handler): 55555590

That is, the PKRU register value in the signal handler matches that of
the main thread, even if the access rights have been changed from the
system default.

Signed-off-by: Florian Weimer <fweimer@redhat.com>
---
 Documentation/x86/protection-keys.txt         |  9 ++++-
 arch/alpha/include/uapi/asm/mman.h            |  2 ++
 arch/mips/include/uapi/asm/mman.h             |  2 ++
 arch/parisc/include/uapi/asm/mman.h           |  2 ++
 arch/x86/include/asm/fpu/internal.h           |  1 +
 arch/x86/kernel/fpu/core.c                    | 47 +++++++++++++++++++++------
 arch/x86/kernel/signal.c                      |  2 +-
 arch/xtensa/include/uapi/asm/mman.h           |  2 ++
 include/uapi/asm-generic/mman-common.h        |  2 ++
 mm/mprotect.c                                 | 11 +++++--
 tools/include/uapi/asm-generic/mman-common.h  |  2 ++
 tools/testing/selftests/x86/protection_keys.c |  4 ++-
 12 files changed, 71 insertions(+), 15 deletions(-)

Comments

Dave Hansen May 2, 2018, 2:30 p.m. UTC | #1
On 05/02/2018 06:26 AM, Florian Weimer wrote:
> pkeys support for IBM POWER intends to inherited the access rights of
> the current thread in signal handlers.  The advantage is that this
> preserves access to memory regions associated with non-default keys,
> enabling additional usage scenarios for memory protection keys which
> currently do not work on x86 due to the unconditional reset to the
> (configurable) default key in signal handlers.

What's the usage scenario that does not work?

> Consequently, this commit updates the x86 implementation to preserve
> the PKRU register value of the interrupted context in signal handlers.
> If a key is allocated successfully with the PKEY_ALLOC_SIGNALINHERIT
> flag, the application can assume this signal inheritance behavior.

I think this is a pretty gross misuse of the API.  Adding an argument to
pkey_alloc() is something that folks would assume would impact the key
being *allocated*, not pkeys behavior across the process as a whole.

> This change does not affect the init_pkru optimization because if the
> thread's PKRU register is zero due to the init_pkru setting, it will
> remain zero in the signal handler through inheritance from the
> interrupted context.

I think you are right, but it's rather convoluted.  It does:

1. Running with PKRU in the init state
2. Kernel saves off init-state-PKRU XSAVE signal buffer
3. Enter signal, kernel XRSTOR (may) set the init state again
4. fpu__clear() does __write_pkru(), takes it out of the init state
5. Signal handler runs, exits
6. fpu__restore_sig() XRSTOR's the state from #2, taking PKRU back to
   the init state

But, about the patch in general:

I'm not a big fan of doing this in such a PKRU-specific way.  It would
be nice to have this available for all XSAVE states.  It would also keep
you from so unnecessarily frobbing with WRPKRU in fpu__clear().  You
could just clear the PKRU bit in the Requested Feature BitMap (RFBM)
passed to XRSTOR.  That would be much straightforward and able to be
more easily extended to more states.

PKRU is now preserved on signal entry, but not signal exit.  Was that
intentional?  That seems like odd behavior, and also differs from the
POWER implementation as I understand it.
Florian Weimer May 2, 2018, 3:12 p.m. UTC | #2
On 05/02/2018 04:30 PM, Dave Hansen wrote:
> On 05/02/2018 06:26 AM, Florian Weimer wrote:
>> pkeys support for IBM POWER intends to inherited the access rights of
>> the current thread in signal handlers.  The advantage is that this
>> preserves access to memory regions associated with non-default keys,
>> enabling additional usage scenarios for memory protection keys which
>> currently do not work on x86 due to the unconditional reset to the
>> (configurable) default key in signal handlers.
> 
> What's the usage scenario that does not work?

Here's what I want to do:

Nick Clifton wrote a binutils patch which puts the .got.plt section on 
separate pages.  We allocate a protection key for it, assign it to all 
such sections in the process image, and change the access rights of the 
main thread to disallow writes via that key during process startup.  In 
_dl_fixup, we enable write access to the GOT, update the GOT entry, and 
then disable it again.

This way, we have a pretty safe form of lazy binding, without having to 
resort to BIND_NOW.

With the current kernel behavior on x86, we cannot do that because 
signal handlers revert to the default (deny) access rights, so the GOT 
turns inaccessible.

>> Consequently, this commit updates the x86 implementation to preserve
>> the PKRU register value of the interrupted context in signal handlers.
>> If a key is allocated successfully with the PKEY_ALLOC_SIGNALINHERIT
>> flag, the application can assume this signal inheritance behavior.
> 
> I think this is a pretty gross misuse of the API.  Adding an argument to
> pkey_alloc() is something that folks would assume would impact the key
> being *allocated*, not pkeys behavior across the process as a whole.

 From the application point of view, only the allocated key is 
affected—it has specific semantics that were undefined before and varied 
between x86 and POWER.

>> This change does not affect the init_pkru optimization because if the
>> thread's PKRU register is zero due to the init_pkru setting, it will
>> remain zero in the signal handler through inheritance from the
>> interrupted context.
> 
> I think you are right, but it's rather convoluted.  It does:
> 
> 1. Running with PKRU in the init state
> 2. Kernel saves off init-state-PKRU XSAVE signal buffer
> 3. Enter signal, kernel XRSTOR (may) set the init state again
> 4. fpu__clear() does __write_pkru(), takes it out of the init state
> 5. Signal handler runs, exits
> 6. fpu__restore_sig() XRSTOR's the state from #2, taking PKRU back to
>     the init state

Isn't that just the cost of not hard-coding the XSAVE area layout?

> But, about the patch in general:
> 
> I'm not a big fan of doing this in such a PKRU-specific way.  It would
> be nice to have this available for all XSAVE states.  It would also keep
> you from so unnecessarily frobbing with WRPKRU in fpu__clear().  You
> could just clear the PKRU bit in the Requested Feature BitMap (RFBM)
> passed to XRSTOR.  That would be much straightforward and able to be
> more easily extended to more states.

I don't see where I could plug this into the current kernel sources. 
Would you please provide some pointers?

> PKRU is now preserved on signal entry, but not signal exit.  Was that
> intentional?  That seems like odd behavior, and also differs from the
> POWER implementation as I understand it.

Ram, would you please comment?

I think it is a bug not restore the access rights to the former value in 
the interrupted context.  In userspace, we have exactly this problem 
with errno, and it can lead to subtle bugs.

Thanks,
Florian
Dave Hansen May 2, 2018, 3:28 p.m. UTC | #3
On 05/02/2018 08:12 AM, Florian Weimer wrote:
> On 05/02/2018 04:30 PM, Dave Hansen wrote:
>> On 05/02/2018 06:26 AM, Florian Weimer wrote:
>>> pkeys support for IBM POWER intends to inherited the access rights of
>>> the current thread in signal handlers.  The advantage is that this
>>> preserves access to memory regions associated with non-default keys,
>>> enabling additional usage scenarios for memory protection keys which
>>> currently do not work on x86 due to the unconditional reset to the
>>> (configurable) default key in signal handlers.
>>
>> What's the usage scenario that does not work?
> 
> Here's what I want to do:
> 
> Nick Clifton wrote a binutils patch which puts the .got.plt section on
> separate pages.  We allocate a protection key for it, assign it to all
> such sections in the process image, and change the access rights of the
> main thread to disallow writes via that key during process startup.  In
> _dl_fixup, we enable write access to the GOT, update the GOT entry, and
> then disable it again.
> 
> This way, we have a pretty safe form of lazy binding, without having to
> resort to BIND_NOW.
> 
> With the current kernel behavior on x86, we cannot do that because
> signal handlers revert to the default (deny) access rights, so the GOT
> turns inaccessible.

cc'ing Andy Lutomirksi...  He was the one that specifically asked for
the deny-by-default behavior that I think is biting you.

The behavior we had before implementing Andy's suggestion was, I think,
the one you would want: we had allow-all as the default PKRU value.

The other option here that I think we discussed in the past was to have
an *explicit* signal PKRU value.  That way, we can be restrictive by
default but allow overrides for special cases like you have.

>>> Consequently, this commit updates the x86 implementation to preserve
>>> the PKRU register value of the interrupted context in signal handlers.
>>> If a key is allocated successfully with the PKEY_ALLOC_SIGNALINHERIT
>>> flag, the application can assume this signal inheritance behavior.
>>
>> I think this is a pretty gross misuse of the API.  Adding an argument to
>> pkey_alloc() is something that folks would assume would impact the key
>> being *allocated*, not pkeys behavior across the process as a whole.
> 
> From the application point of view, only the allocated key is
> affected—it has specific semantics that were undefined before and varied
> between x86 and POWER.

I'd really like to see it in a separate API.

>>> This change does not affect the init_pkru optimization because if the
>>> thread's PKRU register is zero due to the init_pkru setting, it will
>>> remain zero in the signal handler through inheritance from the
>>> interrupted context.
>>
>> I think you are right, but it's rather convoluted.  It does:
>>
>> 1. Running with PKRU in the init state
>> 2. Kernel saves off init-state-PKRU XSAVE signal buffer
>> 3. Enter signal, kernel XRSTOR (may) set the init state again
>> 4. fpu__clear() does __write_pkru(), takes it out of the init state
>> 5. Signal handler runs, exits
>> 6. fpu__restore_sig() XRSTOR's the state from #2, taking PKRU back to
>>     the init state
> 
> Isn't that just the cost of not hard-coding the XSAVE area layout?

No.  You unnecessarily take PKRU out of the init state.  If you don't
want to change its value, just don't do that when you run XRSTOR.

>> But, about the patch in general:
>>
>> I'm not a big fan of doing this in such a PKRU-specific way.  It would
>> be nice to have this available for all XSAVE states.  It would also keep
>> you from so unnecessarily frobbing with WRPKRU in fpu__clear().  You
>> could just clear the PKRU bit in the Requested Feature BitMap (RFBM)
>> passed to XRSTOR.  That would be much straightforward and able to be
>> more easily extended to more states.
> 
> I don't see where I could plug this into the current kernel sources.
> Would you please provide some pointers?

In handle_signal()->fpu__clear()->...__copy_kernel_to_fpregs(), we pass
-1 as the RFBM.  Don't do that.  Pass (~0 & ~XFEATURE_MASK_PKRU).
Andy Lutomirski May 2, 2018, 5:09 p.m. UTC | #4
> On May 2, 2018, at 8:12 AM, Florian Weimer <fweimer@redhat.com> wrote:
> 
>> On 05/02/2018 04:30 PM, Dave Hansen wrote:
>>> On 05/02/2018 06:26 AM, Florian Weimer wrote:
>>> pkeys support for IBM POWER intends to inherited the access rights of
>>> the current thread in signal handlers.  The advantage is that this
>>> preserves access to memory regions associated with non-default keys,
>>> enabling additional usage scenarios for memory protection keys which
>>> currently do not work on x86 due to the unconditional reset to the
>>> (configurable) default key in signal handlers.
>> What's the usage scenario that does not work?
> 
> Here's what I want to do:
> 
> Nick Clifton wrote a binutils patch which puts the .got.plt section on separate pages.  We allocate a protection key for it, assign it to all such sections in the process image, and change the access rights of the main thread to disallow writes via that key during process startup.  In _dl_fixup, we enable write access to the GOT, update the GOT entry, and then disable it again.
> 
> This way, we have a pretty safe form of lazy binding, without having to resort to BIND_NOW.
> 
> With the current kernel behavior on x86, we cannot do that because signal handlers revert to the default (deny) access rights, so the GOT turns inaccessible.

Dave is right: the current behavior was my request, and I still think it’s correct.  The whole point is that, even if something nasty happens like a SIGALRM handler hitting in the middle of _dl_fixup, the SIGALRM handler is preventing from accidentally writing to the protected memory.  When SIGALRM returns, PKRU should get restored

Another way of looking at this is that the kernel would like to approximate what the ISA behavior *should* have been: the whole sequence “modify PKRU; access memory; restore PKRU” should be as atomic as possible.

Florian, what is the actual problematic sequence of events?

—Andy
Florian Weimer May 2, 2018, 5:17 p.m. UTC | #5
On 05/02/2018 07:09 PM, Andy Lutomirski wrote:
>> Nick Clifton wrote a binutils patch which puts the .got.plt section on separate pages.  We allocate a protection key for it, assign it to all such sections in the process image, and change the access rights of the main thread to disallow writes via that key during process startup.  In _dl_fixup, we enable write access to the GOT, update the GOT entry, and then disable it again.
>>
>> This way, we have a pretty safe form of lazy binding, without having to resort to BIND_NOW.
>>
>> With the current kernel behavior on x86, we cannot do that because signal handlers revert to the default (deny) access rights, so the GOT turns inaccessible.

> Dave is right: the current behavior was my request, and I still think it’s correct.  The whole point is that, even if something nasty happens like a SIGALRM handler hitting in the middle of _dl_fixup, the SIGALRM handler is preventing from accidentally writing to the protected memory.  When SIGALRM returns, PKRU should get restored
> 
> Another way of looking at this is that the kernel would like to approximate what the ISA behavior*should*  have been: the whole sequence “modify PKRU; access memory; restore PKRU” should be as atomic as possible.
> 
> Florian, what is the actual problematic sequence of events?

See above.  The signal handler will crash if it calls any non-local 
function through the GOT because with the default access rights, it's 
not readable in the signal handler.

Any use of memory protection keys for basic infrastructure will run into 
this problem, so I think the current kernel behavior is not very useful. 
  It's also x86-specific.

 From a security perspective, the atomic behavior is not very useful 
because you generally want to modify PKRU *before* computing the details 
of the memory access, so that you don't have a general “poke anywhere 
with this access right” primitive in the text segment.  (I called this 
the “suffix problem” in another context.)

For this reason, I plan to add the PKRU modification to the beginning of 
_dl_fixup.

CET will offer a different trade-off here, but we haven't that yet.

Thanks,
Florian
Andy Lutomirski May 2, 2018, 8:41 p.m. UTC | #6
On Wed, May 2, 2018 at 10:17 AM Florian Weimer <fweimer@redhat.com> wrote:

> On 05/02/2018 07:09 PM, Andy Lutomirski wrote:
> >> Nick Clifton wrote a binutils patch which puts the .got.plt section on
separate pages.  We allocate a protection key for it, assign it to all such
sections in the process image, and change the access rights of the main
thread to disallow writes via that key during process startup.  In
_dl_fixup, we enable write access to the GOT, update the GOT entry, and
then disable it again.
> >>
> >> This way, we have a pretty safe form of lazy binding, without having
to resort to BIND_NOW.
> >>
> >> With the current kernel behavior on x86, we cannot do that because
signal handlers revert to the default (deny) access rights, so the GOT
turns inaccessible.

> > Dave is right: the current behavior was my request, and I still think
it’s correct.  The whole point is that, even if something nasty happens
like a SIGALRM handler hitting in the middle of _dl_fixup, the SIGALRM
handler is preventing from accidentally writing to the protected memory.
When SIGALRM returns, PKRU should get restored
> >
> > Another way of looking at this is that the kernel would like to
approximate what the ISA behavior*should*  have been: the whole sequence
“modify PKRU; access memory; restore PKRU” should be as atomic as possible.
> >
> > Florian, what is the actual problematic sequence of events?

> See above.  The signal handler will crash if it calls any non-local
> function through the GOT because with the default access rights, it's
> not readable in the signal handler.

> Any use of memory protection keys for basic infrastructure will run into
> this problem, so I think the current kernel behavior is not very useful.
>    It's also x86-specific.

>   From a security perspective, the atomic behavior is not very useful
> because you generally want to modify PKRU *before* computing the details
> of the memory access, so that you don't have a general “poke anywhere
> with this access right” primitive in the text segment.  (I called this
> the “suffix problem” in another context.)


Ugh, right.  It's been long enough that I forgot about the underlying
issue.  A big part of the problem here is that pkey_alloc() should set the
initial value of the key across all threads, but it *can't*.  There is
literally no way to do it in a multithreaded program that uses RDPKRU and
WRPKRU.

But I think the right fix, at least for your use case, is to have a per-mm
init_pkru variable that starts as "deny all".  We'd add a new pkey_alloc()
flag like PKEY_ALLOC_UPDATE_INITIAL_STATE that causes the specified mode to
update init_pkru.  New threads and delivered signals would get the
init_pkru state instead of the hardcoded default.
Florian Weimer May 2, 2018, 9:06 p.m. UTC | #7
On 05/02/2018 10:41 PM, Andy Lutomirski wrote:

>> See above.  The signal handler will crash if it calls any non-local
>> function through the GOT because with the default access rights, it's
>> not readable in the signal handler.
> 
>> Any use of memory protection keys for basic infrastructure will run into
>> this problem, so I think the current kernel behavior is not very useful.
>>     It's also x86-specific.
> 
>>    From a security perspective, the atomic behavior is not very useful
>> because you generally want to modify PKRU *before* computing the details
>> of the memory access, so that you don't have a general “poke anywhere
>> with this access right” primitive in the text segment.  (I called this
>> the “suffix problem” in another context.)
> 
> 
> Ugh, right.  It's been long enough that I forgot about the underlying
> issue.  A big part of the problem here is that pkey_alloc() should set the
> initial value of the key across all threads, but it *can't*.  There is
> literally no way to do it in a multithreaded program that uses RDPKRU and
> WRPKRU.

The kernel could do *something*, probably along the membarrier system 
call.  I mean, I could implement a reasonable close approximation in 
userspace, via the setxid mechanism in glibc (but I really don't want to).

> But I think the right fix, at least for your use case, is to have a per-mm
> init_pkru variable that starts as "deny all".  We'd add a new pkey_alloc()
> flag like PKEY_ALLOC_UPDATE_INITIAL_STATE that causes the specified mode to
> update init_pkru.  New threads and delivered signals would get the
> init_pkru state instead of the hardcoded default.

I implemented this for signal handlers:

   https://marc.info/?l=linux-api&m=151285420302698&w=2

This does not alter the thread inheritance behavior yet.  I would have 
to investigate how to implement that.

Feedback led to the current patch, though.  I'm not sure what has 
changed since then.

If I recall correctly, the POWER maintainer did express a strong desire 
back then for (what is, I believe) their current semantics, which my 
PKEY_ALLOC_SIGNALINHERIT patch implements for x86, too.

Thanks,
Florian
Florian Weimer May 2, 2018, 9:08 p.m. UTC | #8
On 05/02/2018 05:28 PM, Dave Hansen wrote:
> The other option here that I think we discussed in the past was to have
> an*explicit*  signal PKRU value.  That way, we can be restrictive by
> default but allow overrides for special cases like you have.

That's the patch I posted before (with PKEY_ALLOC_SETSIGNAL).  I'm 
afraid we are going in circles.

Thanks,
Florian
Ram Pai May 2, 2018, 9:12 p.m. UTC | #9
On Wed, May 02, 2018 at 05:12:50PM +0200, Florian Weimer wrote:
> On 05/02/2018 04:30 PM, Dave Hansen wrote:
> >On 05/02/2018 06:26 AM, Florian Weimer wrote:
> >>pkeys support for IBM POWER intends to inherited the access rights of
> >>the current thread in signal handlers.  The advantage is that this
> >>preserves access to memory regions associated with non-default keys,
> >>enabling additional usage scenarios for memory protection keys which
> >>currently do not work on x86 due to the unconditional reset to the
> >>(configurable) default key in signal handlers.
> >
> >What's the usage scenario that does not work?
> 
> Here's what I want to do:
> 
> Nick Clifton wrote a binutils patch which puts the .got.plt section
> on separate pages.  We allocate a protection key for it, assign it
> to all such sections in the process image, and change the access
> rights of the main thread to disallow writes via that key during
> process startup.  In _dl_fixup, we enable write access to the GOT,
> update the GOT entry, and then disable it again.
> 
> This way, we have a pretty safe form of lazy binding, without having
> to resort to BIND_NOW.
> 
> With the current kernel behavior on x86, we cannot do that because
> signal handlers revert to the default (deny) access rights, so the
> GOT turns inaccessible.
> 
> >>Consequently, this commit updates the x86 implementation to preserve
> >>the PKRU register value of the interrupted context in signal handlers.
> >>If a key is allocated successfully with the PKEY_ALLOC_SIGNALINHERIT
> >>flag, the application can assume this signal inheritance behavior.
> >
> >I think this is a pretty gross misuse of the API.  Adding an argument to
> >pkey_alloc() is something that folks would assume would impact the key
> >being *allocated*, not pkeys behavior across the process as a whole.
> 
> From the application point of view, only the allocated key is
> affected—it has specific semantics that were undefined before and
> varied between x86 and POWER.
> 
> >>This change does not affect the init_pkru optimization because if the
> >>thread's PKRU register is zero due to the init_pkru setting, it will
> >>remain zero in the signal handler through inheritance from the
> >>interrupted context.
> >
> >I think you are right, but it's rather convoluted.  It does:
> >
> >1. Running with PKRU in the init state
> >2. Kernel saves off init-state-PKRU XSAVE signal buffer
> >3. Enter signal, kernel XRSTOR (may) set the init state again
> >4. fpu__clear() does __write_pkru(), takes it out of the init state
> >5. Signal handler runs, exits
> >6. fpu__restore_sig() XRSTOR's the state from #2, taking PKRU back to
> >    the init state
> 
> Isn't that just the cost of not hard-coding the XSAVE area layout?
> 
> >But, about the patch in general:
> >
> >I'm not a big fan of doing this in such a PKRU-specific way.  It would
> >be nice to have this available for all XSAVE states.  It would also keep
> >you from so unnecessarily frobbing with WRPKRU in fpu__clear().  You
> >could just clear the PKRU bit in the Requested Feature BitMap (RFBM)
> >passed to XRSTOR.  That would be much straightforward and able to be
> >more easily extended to more states.
> 
> I don't see where I could plug this into the current kernel sources.
> Would you please provide some pointers?
> 
> >PKRU is now preserved on signal entry, but not signal exit.  Was that
> >intentional?  That seems like odd behavior, and also differs from the
> >POWER implementation as I understand it.
> 
> Ram, would you please comment?

on POWER the pkey behavior will remain the same at entry or at exit from
the signal handler.  For eg:  if a key is read-disabled on entry into
the signal handler, and gets read-enabled in the signal handler, than it
will continue to be read-enabled on return from the signal handler.

In other words, changes to key permissions persist across signal
boundaries.

> 
> I think it is a bug not restore the access rights to the former
> value in the interrupted context.  In userspace, we have exactly
> this problem with errno, and it can lead to subtle bugs.
> 
> Thanks,
> Florian
Andy Lutomirski May 2, 2018, 9:18 p.m. UTC | #10
On Wed, May 2, 2018 at 2:13 PM Ram Pai <linuxram@us.ibm.com> wrote:


> > Ram, would you please comment?

> on POWER the pkey behavior will remain the same at entry or at exit from
> the signal handler.  For eg:  if a key is read-disabled on entry into
> the signal handler, and gets read-enabled in the signal handler, than it
> will continue to be read-enabled on return from the signal handler.

> In other words, changes to key permissions persist across signal
> boundaries.

I don't know about POWER's ISA, but this is crappy behavior.  If a thread
temporarily grants itself access to a restrictive memory key and then gets
a signal, the signal handler should *not* have access to that key.
Andy Lutomirski May 2, 2018, 9:23 p.m. UTC | #11
On Wed, May 2, 2018 at 2:06 PM Florian Weimer <fweimer@redhat.com> wrote:

> On 05/02/2018 10:41 PM, Andy Lutomirski wrote:

> >> See above.  The signal handler will crash if it calls any non-local
> >> function through the GOT because with the default access rights, it's
> >> not readable in the signal handler.
> >
> >> Any use of memory protection keys for basic infrastructure will run
into
> >> this problem, so I think the current kernel behavior is not very
useful.
> >>     It's also x86-specific.
> >
> >>    From a security perspective, the atomic behavior is not very useful
> >> because you generally want to modify PKRU *before* computing the
details
> >> of the memory access, so that you don't have a general “poke anywhere
> >> with this access right” primitive in the text segment.  (I called this
> >> the “suffix problem” in another context.)
> >
> >
> > Ugh, right.  It's been long enough that I forgot about the underlying
> > issue.  A big part of the problem here is that pkey_alloc() should set
the
> > initial value of the key across all threads, but it *can't*.  There is
> > literally no way to do it in a multithreaded program that uses RDPKRU
and
> > WRPKRU.

> The kernel could do *something*, probably along the membarrier system
> call.  I mean, I could implement a reasonable close approximation in
> userspace, via the setxid mechanism in glibc (but I really don't want to).

I beg to differ.

Thread A:
old = RDPKRU();
WRPKRU(old & ~3);
...
WRPKRU(old);

Thread B:
pkey_alloc().

If pkey_alloc() happens while thread A is in the ... part, you lose.  It
makes no difference what the kernel does.  The problem is that the WRPKRU
instruction itself is designed incorrectly.



> > But I think the right fix, at least for your use case, is to have a
per-mm
> > init_pkru variable that starts as "deny all".  We'd add a new
pkey_alloc()
> > flag like PKEY_ALLOC_UPDATE_INITIAL_STATE that causes the specified
mode to
> > update init_pkru.  New threads and delivered signals would get the
> > init_pkru state instead of the hardcoded default.

> I implemented this for signal handlers:

>     https://marc.info/?l=linux-api&m=151285420302698&w=2

> This does not alter the thread inheritance behavior yet.  I would have
> to investigate how to implement that.

> Feedback led to the current patch, though.  I'm not sure what has
> changed since then.

What feedback?  I think the old patch was much better than the new patch.
I could point out some issues in the kernel code, and I think it should
deal with thread creation, but otherwise I think it's the right approach.

Keep in mind, though, that it's just not possible to make pkey_alloc() work
on x86 in any sensible way in a multithreaded program.


> If I recall correctly, the POWER maintainer did express a strong desire
> back then for (what is, I believe) their current semantics, which my
> PKEY_ALLOC_SIGNALINHERIT patch implements for x86, too.

Ram, I really really don't like the POWER semantics.  Can you give some
justification for them?  Does POWER at least have an atomic way for
userspace to modify just the key it wants to modify or, even better,
special load and store instructions to use alternate keys?

--Andy
Dave Hansen May 2, 2018, 10:03 p.m. UTC | #12
On 05/02/2018 02:08 PM, Florian Weimer wrote:
> On 05/02/2018 05:28 PM, Dave Hansen wrote:
>> The other option here that I think we discussed in the past was to have
>> an*explicit*  signal PKRU value.  That way, we can be restrictive by
>> default but allow overrides for special cases like you have.
> 
> That's the patch I posted before (with PKEY_ALLOC_SETSIGNAL).  I'm
> afraid we are going in circles.

Could you remind us why you abandoned that approach and its relative
merits versus this new approach?
Dave Hansen May 2, 2018, 10:08 p.m. UTC | #13
On 05/02/2018 02:23 PM, Andy Lutomirski wrote:
>> The kernel could do *something*, probably along the membarrier system
>> call.  I mean, I could implement a reasonable close approximation in
>> userspace, via the setxid mechanism in glibc (but I really don't want to).
> 
> I beg to differ.
> 
> Thread A:
> old = RDPKRU();
> WRPKRU(old & ~3);
> ...
> WRPKRU(old);
> 
> Thread B:
> pkey_alloc().
> 
> If pkey_alloc() happens while thread A is in the ... part, you lose.  It
> makes no difference what the kernel does.  The problem is that the WRPKRU
> instruction itself is designed incorrectly.

Yes, *if* we define pkey_alloc() to be implicitly changing other
threads' PKRU value.

Let's say we go to the hardware guys and ask for a new instruction to
fix this.  We're going to have to make a pretty good case that this is
either impossible or really hard to do in software.

Surely we have the locking to tell another thread that we want its PKRU
value to change without actively going out and having the kernel poke a
new value in.
Andy Lutomirski May 2, 2018, 10:22 p.m. UTC | #14
On Wed, May 2, 2018 at 3:08 PM Dave Hansen <dave.hansen@intel.com> wrote:

> On 05/02/2018 02:23 PM, Andy Lutomirski wrote:
> >> The kernel could do *something*, probably along the membarrier system
> >> call.  I mean, I could implement a reasonable close approximation in
> >> userspace, via the setxid mechanism in glibc (but I really don't want
to).
> >
> > I beg to differ.
> >
> > Thread A:
> > old = RDPKRU();
> > WRPKRU(old & ~3);
> > ...
> > WRPKRU(old);
> >
> > Thread B:
> > pkey_alloc().
> >
> > If pkey_alloc() happens while thread A is in the ... part, you lose.  It
> > makes no difference what the kernel does.  The problem is that the
WRPKRU
> > instruction itself is designed incorrectly.

> Yes, *if* we define pkey_alloc() to be implicitly changing other
> threads' PKRU value.

I think that's the only generally useful behavior.  In general, if
libraries are going to use protection keys, they're going to do this:

int key = pkey_alloc(...);
mmap() or mprotect_key() some memory (for crypto secrets, a database,
whatever).
...
enable_write_access(key);
write to the memory;
disable_write_access(key);

That library wants other threads, signal handlers, and, in general, the
whole rest of the process to be restricted, and that library doesn't want
race conditions.  The problem here is that, to get this right, we either
need the PKRU modifications to be syscalls or to take locks, and the lock
approach is going to be fairly gross.


> Let's say we go to the hardware guys and ask for a new instruction to
> fix this.  We're going to have to make a pretty good case that this is
> either impossible or really hard to do in software.

> Surely we have the locking to tell another thread that we want its PKRU
> value to change without actively going out and having the kernel poke a
> new value in.

I think that doing it in userspace without a new instruction is going to be
slow, ugly, unreliable, or more than one of the above.

Here's my proposal.  We ask the hardware folks for new instructions.  Once
we get tentative agreement, we add new vDSO pkey accessors.  At first those
accessors function will do a syscall.  When we get the new instructions,
they get alternatives.  The vDSO helpers could look like this:

typedef struct { u64 opaque; } pkey_save_t;

pkey_save_t pkey_save_and_set(int key, unsigned int mode);
void pkey_set(int key, unsigned int mode);
void pkey_restore(int key, pkey_save_t prev);

Slight variants are possible.  On new hardware, pkey_set() and
pkey_restore() use the WRPKRU replacement.  pkey_save_and_set() uses the
RDPKRU replacement followed by the WRPKRU replacement.  The usage is:

pkey_save_t prev = pkey_save_and_set(my_key, 0);  /* enable read and write
*/
do something with memory;
pkey_restore(my_key, prev);

or just:

pkey_set(my_key, 0);
...
pkey_set(my_key, 3);  /* or 1 or 2 depending on the application */

And we make it very, very clear to user code that using RDPKRU and WRPKRU
directly is a big no-no.

What do you all think?  This gives us sane, if slow, behavior for current
CPUs and is decently fast on hypothetical new CPUs.
Dave Hansen May 2, 2018, 10:32 p.m. UTC | #15
On 05/02/2018 03:22 PM, Andy Lutomirski wrote:
> That library wants other threads, signal handlers, and, in general, the
> whole rest of the process to be restricted, and that library doesn't want
> race conditions.  The problem here is that, to get this right, we either
> need the PKRU modifications to be syscalls or to take locks, and the lock
> approach is going to be fairly gross.

I totally get the idea that a RDPKRU/WRPKRU is non-atomic and that it
can't be mixed with asynchronous WRPKRU's in that thread.

But, where do those come from in this scenario?  I'm not getting the
secondary mechanism is that *makes* them unsafe.
Andy Lutomirski May 2, 2018, 11:32 p.m. UTC | #16
> On May 2, 2018, at 3:32 PM, Dave Hansen <dave.hansen@intel.com> wrote:
> 
>> On 05/02/2018 03:22 PM, Andy Lutomirski wrote:
>> That library wants other threads, signal handlers, and, in general, the
>> whole rest of the process to be restricted, and that library doesn't want
>> race conditions.  The problem here is that, to get this right, we either
>> need the PKRU modifications to be syscalls or to take locks, and the lock
>> approach is going to be fairly gross.
> 
> I totally get the idea that a RDPKRU/WRPKRU is non-atomic and that it
> can't be mixed with asynchronous WRPKRU's in that thread.
> 
> But, where do those come from in this scenario?  I'm not getting the
> secondary mechanism is that *makes* them unsafe.

pkey_alloc() itself.  If someone tries to allocate a key with a given default mode, unless there’s already a key that already had that value in all threads or pkey_alloc() needs to asynchronously create such a key.

There is a partial hack that glibc could do. DSOs could have a way to statically request a key (e.g. a PT_PKEY segment) and glibc could do all the pkey_alloc() calls before any threads get created. Of course, a DSO like this can’t be dlopened().  We still need a way for pkey_alloc() to update the value for signal delivery, but that’s straightforward.
Ram Pai May 2, 2018, 11:38 p.m. UTC | #17
On Wed, May 02, 2018 at 09:18:11PM +0000, Andy Lutomirski wrote:
> On Wed, May 2, 2018 at 2:13 PM Ram Pai <linuxram@us.ibm.com> wrote:
> 
> 
> > > Ram, would you please comment?
> 
> > on POWER the pkey behavior will remain the same at entry or at exit from
> > the signal handler.  For eg:  if a key is read-disabled on entry into
> > the signal handler, and gets read-enabled in the signal handler, than it
> > will continue to be read-enabled on return from the signal handler.
> 
> > In other words, changes to key permissions persist across signal
> > boundaries.
> 
> I don't know about POWER's ISA, but this is crappy behavior.  If a thread
> temporarily grants itself access to a restrictive memory key and then gets
> a signal, the signal handler should *not* have access to that key.

This is a new requirement that I was not aware off. Its not documented
anywhere AFAICT.  Regardless of how the ISA behaves, its still a kernel
behavior that needs to be clearly defined.
Dave Hansen May 2, 2018, 11:58 p.m. UTC | #18
On 05/02/2018 04:32 PM, Andy Lutomirski wrote:
>> But, where do those come from in this scenario?  I'm not getting
>> the secondary mechanism is that *makes* them unsafe.
> pkey_alloc() itself.  If someone tries to allocate a key with a given
> default mode, unless there’s already a key that already had that
> value in all threads or pkey_alloc() needs to asynchronously create
> such a key.

I think you are saying: If a thread calls pkey_alloc(), all threads
should, by default, implicitly get access.  That
broadcast-to-other-threads is the thing that the current architecture
doesn't do.  In this situation, CPU threads have to go opt-out of
getting access to data protected with a given, allocated key.

Right?
Andy Lutomirski May 3, 2018, 1:14 a.m. UTC | #19
> On May 2, 2018, at 4:58 PM, Dave Hansen <dave.hansen@intel.com> wrote:
> 
> On 05/02/2018 04:32 PM, Andy Lutomirski wrote:
>>> But, where do those come from in this scenario?  I'm not getting
>>> the secondary mechanism is that *makes* them unsafe.
>> pkey_alloc() itself.  If someone tries to allocate a key with a given
>> default mode, unless there’s already a key that already had that
>> value in all threads or pkey_alloc() needs to asynchronously create
>> such a key.
> 
> I think you are saying: If a thread calls pkey_alloc(), all threads
> should, by default, implicitly get access.

No, I’m saying that all threads should get the *requested* access.  If I’m protecting the GOT, I want all threads to get RO access. If I’m writing a crypto library, I probably want all threads to have no access.  If I’m writing a database, I probably want all threads to get RO by default.  If I’m writing some doodad to sandbox some carefully constructed code, I might want all threads to have full access by default.

—Andy
Ram Pai May 3, 2018, 2:10 a.m. UTC | #20
On Wed, May 02, 2018 at 09:23:49PM +0000, Andy Lutomirski wrote:
> 
> > If I recall correctly, the POWER maintainer did express a strong desire
> > back then for (what is, I believe) their current semantics, which my
> > PKEY_ALLOC_SIGNALINHERIT patch implements for x86, too.
> 
> Ram, I really really don't like the POWER semantics.  Can you give some
> justification for them?  Does POWER at least have an atomic way for
> userspace to modify just the key it wants to modify or, even better,
> special load and store instructions to use alternate keys?

I wouldn't call it POWER semantics. The way I implemented it on power
lead to the semantics, given that nothing was explicitly stated 
about how the semantics should work within a signal handler.

As far as power ISA is concerned, there are no special load
and store instructions that can somehow circumvent the permissions
of the key associated with the PTE.

Also unlike x86;  on POWER, pkey-permissions are not administered based
on its run context (user context, signal context).
Hence the default behavior tends to make the key permissions remain the
same regardless of the context.


> Does POWER at least have an atomic way for userspace to modify just the 
> key it wants to modify .. ?

No. just like PKRU, on power its a register which has bits corresponding
to each key. Entire register has to be read and written, which means
multiple keys could get modified in the same write.


adding ppc mailing list to the CC.

> 
> --Andy
Andy Lutomirski May 3, 2018, 4:05 a.m. UTC | #21
On Wed, May 2, 2018 at 7:11 PM Ram Pai <linuxram@us.ibm.com> wrote:

> On Wed, May 02, 2018 at 09:23:49PM +0000, Andy Lutomirski wrote:
> >
> > > If I recall correctly, the POWER maintainer did express a strong
desire
> > > back then for (what is, I believe) their current semantics, which my
> > > PKEY_ALLOC_SIGNALINHERIT patch implements for x86, too.
> >
> > Ram, I really really don't like the POWER semantics.  Can you give some
> > justification for them?  Does POWER at least have an atomic way for
> > userspace to modify just the key it wants to modify or, even better,
> > special load and store instructions to use alternate keys?

> I wouldn't call it POWER semantics. The way I implemented it on power
> lead to the semantics, given that nothing was explicitly stated
> about how the semantics should work within a signal handler.

I think that this is further evidence that we should introduce a new
pkey_alloc() mode and deprecate the old.  To the extent possible, this
thing should work the same way on x86 and POWER.

I think that we, as kernel API designers enabling fancy hardware features,
need to think about them with some care.  Our goal isn't just to expose the
hardware feature to userspace and let userspace run wild with it -- our
goal is to figure out what the use cases are and make the API useful for
those use cases without introducing more footguns that necessary.  For
pkey, this means realizing that user code consists of various loosely
coupled components and that the purpose of pkeys is to allow some userspace
component to prevent other components from *accidentally* clobbering or
leaking data due to bugs.  And I think that the current APIs don't really
achieve this.
Florian Weimer May 3, 2018, 2:37 p.m. UTC | #22
On 05/02/2018 11:23 PM, Andy Lutomirski wrote:
>> The kernel could do*something*, probably along the membarrier system
>> call.  I mean, I could implement a reasonable close approximation in
>> userspace, via the setxid mechanism in glibc (but I really don't want to).
> I beg to differ.
> 
> Thread A:
> old = RDPKRU();
> WRPKRU(old & ~3);
> ...
> WRPKRU(old);
> 
> Thread B:
> pkey_alloc().
> 
> If pkey_alloc() happens while thread A is in the ... part, you lose.  It
> makes no difference what the kernel does.  The problem is that the WRPKRU
> instruction itself is designed incorrectly.

Even that is solvable, as long as the architecture as exact traps: You 
can look at the program counter and patch up the registers accordingly 
if the code is in the critical section.  Of course, this would need 
centralizing PKRU updates in a vDSO or a single (glibc) library 
function.  Certainly not nice and even horrible enough not to do it, but 
I don't think it's actually impossible.

Didn't we discuss this before?

Thanks,
Florian
Dave Hansen May 3, 2018, 2:42 p.m. UTC | #23
On 05/02/2018 06:14 PM, Andy Lutomirski wrote:
>> I think you are saying: If a thread calls pkey_alloc(), all
>> threads should, by default, implicitly get access.
> No, I’m saying that all threads should get the *requested* access.
> If I’m protecting the GOT, I want all threads to get RO access. If
> I’m writing a crypto library, I probably want all threads to have no
> access.  If I’m writing a database, I probably want all threads to
> get RO by default.  If I’m writing some doodad to sandbox some
> carefully constructed code, I might want all threads to have full
> access by default.

OK, fair enough.  I totally agree that the current interface (or
architecture for that matter) is not amenable to use models where we are
implicitly imposing policies on *other* threads.

I don't think that means the current stuff is broken for
multi-threading, though, just the (admittedly useful) cases you are
talking about where you want to poke at a remote thread's PKRU.

So, where do we go from here?
Florian Weimer May 3, 2018, 2:42 p.m. UTC | #24
On 05/03/2018 03:14 AM, Andy Lutomirski wrote:
> No, I’m saying that all threads should get the*requested*  access.  If I’m protecting the GOT, I want all threads to get RO access. If I’m writing a crypto library, I probably want all threads to have no access.  If I’m writing a database, I probably want all threads to get RO by default.  If I’m writing some doodad to sandbox some carefully constructed code, I might want all threads to have full access by default.

Just a clarification: This key allocation issue is *not* a blocker for 
anything related to a safer GOT, or any other use of memory protection 
keys by the C implementation itself.  I agree that there could be 
application issues if threads are created early, but solving this issue 
in a general way appears to be quite costly.

Thanks,
Florian
Florian Weimer May 7, 2018, 9:43 a.m. UTC | #25
On 05/02/2018 11:18 PM, Andy Lutomirski wrote:
> I don't know about POWER's ISA, but this is crappy behavior.  If a thread
> temporarily grants itself access to a restrictive memory key and then gets
> a signal, the signal handler should*not*  have access to that key.

Sorry, that totally depends on what the signal handler does, especially 
for synchronously delivered signals.

Thanks,
Florian
Florian Weimer May 7, 2018, 9:47 a.m. UTC | #26
On 05/03/2018 12:03 AM, Dave Hansen wrote:
> On 05/02/2018 02:08 PM, Florian Weimer wrote:
>> On 05/02/2018 05:28 PM, Dave Hansen wrote:
>>> The other option here that I think we discussed in the past was to have
>>> an*explicit*  signal PKRU value.  That way, we can be restrictive by
>>> default but allow overrides for special cases like you have.
>>
>> That's the patch I posted before (with PKEY_ALLOC_SETSIGNAL).  I'm
>> afraid we are going in circles.
> 
> Could you remind us why you abandoned that approach and its relative
> merits versus this new approach?

Ram argued for the PKEY_ALLOC_SIGNALINHERIT and no one else objected or 
was interested at the time.  I may have misread the consensus.

I'm not sure what do here.  I tried to submit patches for the two 
suggested approaches, and each one resulted in suggests to implement the 
other semantics instead.

Thanks,
Florian
Florian Weimer May 7, 2018, 9:47 a.m. UTC | #27
On 05/03/2018 01:38 AM, Ram Pai wrote:
> This is a new requirement that I was not aware off. Its not documented
> anywhere AFAICT.

Correct.  All inheritance behavior was deliberately left unspecified.

I'm surprised about the reluctance to fix the x86 behavior.  Are there 
any applications at all for the current semantics?

I guess I can implement this particular glibc hardening on POWER only 
for now.

Thanks,
Florian
Florian Weimer May 7, 2018, 9:48 a.m. UTC | #28
On 05/03/2018 06:05 AM, Andy Lutomirski wrote:
> On Wed, May 2, 2018 at 7:11 PM Ram Pai <linuxram@us.ibm.com> wrote:
> 
>> On Wed, May 02, 2018 at 09:23:49PM +0000, Andy Lutomirski wrote:
>>>
>>>> If I recall correctly, the POWER maintainer did express a strong
> desire
>>>> back then for (what is, I believe) their current semantics, which my
>>>> PKEY_ALLOC_SIGNALINHERIT patch implements for x86, too.
>>>
>>> Ram, I really really don't like the POWER semantics.  Can you give some
>>> justification for them?  Does POWER at least have an atomic way for
>>> userspace to modify just the key it wants to modify or, even better,
>>> special load and store instructions to use alternate keys?
> 
>> I wouldn't call it POWER semantics. The way I implemented it on power
>> lead to the semantics, given that nothing was explicitly stated
>> about how the semantics should work within a signal handler.
> 
> I think that this is further evidence that we should introduce a new
> pkey_alloc() mode and deprecate the old.  To the extent possible, this
> thing should work the same way on x86 and POWER.

Do you propose to change POWER or to change x86?

Thanks,
Florian
Andy Lutomirski May 8, 2018, 2:49 a.m. UTC | #29
On Mon, May 7, 2018 at 2:48 AM Florian Weimer <fweimer@redhat.com> wrote:

> On 05/03/2018 06:05 AM, Andy Lutomirski wrote:
> > On Wed, May 2, 2018 at 7:11 PM Ram Pai <linuxram@us.ibm.com> wrote:
> >
> >> On Wed, May 02, 2018 at 09:23:49PM +0000, Andy Lutomirski wrote:
> >>>
> >>>> If I recall correctly, the POWER maintainer did express a strong
> > desire
> >>>> back then for (what is, I believe) their current semantics, which my
> >>>> PKEY_ALLOC_SIGNALINHERIT patch implements for x86, too.
> >>>
> >>> Ram, I really really don't like the POWER semantics.  Can you give
some
> >>> justification for them?  Does POWER at least have an atomic way for
> >>> userspace to modify just the key it wants to modify or, even better,
> >>> special load and store instructions to use alternate keys?
> >
> >> I wouldn't call it POWER semantics. The way I implemented it on power
> >> lead to the semantics, given that nothing was explicitly stated
> >> about how the semantics should work within a signal handler.
> >
> > I think that this is further evidence that we should introduce a new
> > pkey_alloc() mode and deprecate the old.  To the extent possible, this
> > thing should work the same way on x86 and POWER.

> Do you propose to change POWER or to change x86?

Sorry for being slow to reply.  I propose to introduce a new
PKEY_ALLOC_something variant on x86 and POWER and to make the behavior
match on both.  It should at least update the values loaded when a signal
is delivered and it should probably also update it for new threads.

For glibc, for example, I assume that you want signals to be delivered with
write access disabled to the GOT.  Otherwise you would fail to protect
against exploits that occur in signal context.  Glibc controls thread
creation, so the initial state on thread startup doesn't really matter, but
there will be more users than just glibc.

--Andy
Florian Weimer May 8, 2018, 12:40 p.m. UTC | #30
On 05/08/2018 04:49 AM, Andy Lutomirski wrote:
> On Mon, May 7, 2018 at 2:48 AM Florian Weimer <fweimer@redhat.com> wrote:
> 
>> On 05/03/2018 06:05 AM, Andy Lutomirski wrote:
>>> On Wed, May 2, 2018 at 7:11 PM Ram Pai <linuxram@us.ibm.com> wrote:
>>>
>>>> On Wed, May 02, 2018 at 09:23:49PM +0000, Andy Lutomirski wrote:
>>>>>
>>>>>> If I recall correctly, the POWER maintainer did express a strong
>>> desire
>>>>>> back then for (what is, I believe) their current semantics, which my
>>>>>> PKEY_ALLOC_SIGNALINHERIT patch implements for x86, too.
>>>>>
>>>>> Ram, I really really don't like the POWER semantics.  Can you give
> some
>>>>> justification for them?  Does POWER at least have an atomic way for
>>>>> userspace to modify just the key it wants to modify or, even better,
>>>>> special load and store instructions to use alternate keys?
>>>
>>>> I wouldn't call it POWER semantics. The way I implemented it on power
>>>> lead to the semantics, given that nothing was explicitly stated
>>>> about how the semantics should work within a signal handler.
>>>
>>> I think that this is further evidence that we should introduce a new
>>> pkey_alloc() mode and deprecate the old.  To the extent possible, this
>>> thing should work the same way on x86 and POWER.
> 
>> Do you propose to change POWER or to change x86?
> 
> Sorry for being slow to reply.  I propose to introduce a new
> PKEY_ALLOC_something variant on x86 and POWER and to make the behavior
> match on both.

So basically implement PKEY_ALLOC_SETSIGNAL for POWER, and keep the 
existing (different) behavior without the flag?

Ram, would you be okay with that?  Could you give me a hand if 
necessary?  (I assume we have silicon in-house because it's a 
long-standing feature of the POWER platform which was simply dormant on 
Linux until now.)

> It should at least update the values loaded when a signal
> is delivered and it should probably also update it for new threads.

I think we should keep inheritance for new threads and fork.  pkey_alloc 
only has a single access rights argument, which makes it hard to reuse 
this interface if there are two (three) separate sets of access rights.

Is there precedent for process state reverting on fork, besides 
MADV_WIPEONFORK?  My gut feeling is that we should avoid that.

> For glibc, for example, I assume that you want signals to be delivered with
> write access disabled to the GOT.  Otherwise you would fail to protect
> against exploits that occur in signal context.  Glibc controls thread
> creation, so the initial state on thread startup doesn't really matter, but
> there will be more users than just glibc.

glibc does not control thread, or more precisely, subprocess creation. 
Otherwise we wouldn't have face that many issues with our PID cache. 8-/

Thanks,
Florian
Andy Lutomirski May 9, 2018, 2:41 p.m. UTC | #31
On Tue, May 8, 2018 at 5:40 AM Florian Weimer <fweimer@redhat.com> wrote:

> On 05/08/2018 04:49 AM, Andy Lutomirski wrote:
> > On Mon, May 7, 2018 at 2:48 AM Florian Weimer <fweimer@redhat.com>
wrote:
> >
> >> On 05/03/2018 06:05 AM, Andy Lutomirski wrote:
> >>> On Wed, May 2, 2018 at 7:11 PM Ram Pai <linuxram@us.ibm.com> wrote:
> >>>
> >>>> On Wed, May 02, 2018 at 09:23:49PM +0000, Andy Lutomirski wrote:
> >>>>>
> >>>>>> If I recall correctly, the POWER maintainer did express a strong
> >>> desire
> >>>>>> back then for (what is, I believe) their current semantics, which
my
> >>>>>> PKEY_ALLOC_SIGNALINHERIT patch implements for x86, too.
> >>>>>
> >>>>> Ram, I really really don't like the POWER semantics.  Can you give
> > some
> >>>>> justification for them?  Does POWER at least have an atomic way for
> >>>>> userspace to modify just the key it wants to modify or, even better,
> >>>>> special load and store instructions to use alternate keys?
> >>>
> >>>> I wouldn't call it POWER semantics. The way I implemented it on power
> >>>> lead to the semantics, given that nothing was explicitly stated
> >>>> about how the semantics should work within a signal handler.
> >>>
> >>> I think that this is further evidence that we should introduce a new
> >>> pkey_alloc() mode and deprecate the old.  To the extent possible, this
> >>> thing should work the same way on x86 and POWER.
> >
> >> Do you propose to change POWER or to change x86?
> >
> > Sorry for being slow to reply.  I propose to introduce a new
> > PKEY_ALLOC_something variant on x86 and POWER and to make the behavior
> > match on both.

> So basically implement PKEY_ALLOC_SETSIGNAL for POWER, and keep the
> existing (different) behavior without the flag?

> Ram, would you be okay with that?  Could you give me a hand if
> necessary?  (I assume we have silicon in-house because it's a
> long-standing feature of the POWER platform which was simply dormant on
> Linux until now.)

> > It should at least update the values loaded when a signal
> > is delivered and it should probably also update it for new threads.

> I think we should keep inheritance for new threads and fork.  pkey_alloc
> only has a single access rights argument, which makes it hard to reuse
> this interface if there are two (three) separate sets of access rights.


Hmm.  I can get on board with the idea that fork() / clone() /
pthread_create() are all just special cases of the idea that the thread
that *calls* them should have the right pkey values, and the latter is
already busted given our inability to asynchronously propagate the new mode
in pkey_alloc().  So let's so PKEY_ALLOC_SETSIGNAL as a starting point.

One thing we could do, though: the current initual state on process
creation is all access blocked on all keys.  We could change it so that
half the keys are fully blocked and half are read-only.  Then we could add
a PKEY_ALLOC_STRICT or similar that allocates a key with the correct
initial state *and* does the setsignal thing.  If there are no keys left
with the correct initial state, then it fails.
Florian Weimer May 14, 2018, 12:01 p.m. UTC | #32
On 05/09/2018 04:41 PM, Andy Lutomirski wrote:
> Hmm.  I can get on board with the idea that fork() / clone() /
> pthread_create() are all just special cases of the idea that the thread
> that*calls*  them should have the right pkey values, and the latter is
> already busted given our inability to asynchronously propagate the new mode
> in pkey_alloc().  So let's so PKEY_ALLOC_SETSIGNAL as a starting point.

Ram, any suggestions for implementing this on POWER?

> One thing we could do, though: the current initual state on process
> creation is all access blocked on all keys.  We could change it so that
> half the keys are fully blocked and half are read-only.  Then we could add
> a PKEY_ALLOC_STRICT or similar that allocates a key with the correct
> initial state*and*  does the setsignal thing.  If there are no keys left
> with the correct initial state, then it fails.

The initial PKRU value can currently be configured by the system 
administrator.  I fear this approach has too many moving parts to be viable.

Thanks,
Florian
Andy Lutomirski May 14, 2018, 3:32 p.m. UTC | #33
> On May 14, 2018, at 5:01 AM, Florian Weimer <fweimer@redhat.com> wrote:
> 
>> One thing we could do, though: the current initual state on process
>> creation is all access blocked on all keys.  We could change it so that
>> half the keys are fully blocked and half are read-only.  Then we could add
>> a PKEY_ALLOC_STRICT or similar that allocates a key with the correct
>> initial state*and*  does the setsignal thing.  If there are no keys left
>> with the correct initial state, then it fails.
> 
> The initial PKRU value can currently be configured by the system administrator.  I fear this approach has too many moving parts to be viable.
> 
> 

Honestly, I think we should drop that option. I don’t see how we can expect an administrator to do this usefully.
Florian Weimer May 14, 2018, 3:34 p.m. UTC | #34
On 05/14/2018 05:32 PM, Andy Lutomirski wrote:
> 
> 
> 
>> On May 14, 2018, at 5:01 AM, Florian Weimer <fweimer@redhat.com> wrote:
>>
>>> One thing we could do, though: the current initual state on process
>>> creation is all access blocked on all keys.  We could change it so that
>>> half the keys are fully blocked and half are read-only.  Then we could add
>>> a PKEY_ALLOC_STRICT or similar that allocates a key with the correct
>>> initial state*and*  does the setsignal thing.  If there are no keys left
>>> with the correct initial state, then it fails.
>>
>> The initial PKRU value can currently be configured by the system administrator.  I fear this approach has too many moving parts to be viable.
>>
>>
> 
> Honestly, I think we should drop that option. I don’t see how we can expect an administrator to do this usefully.

I don't disagree—it makes things way less predictable in practice.

Thanks,
Florian
Dave Hansen May 16, 2018, 5:01 p.m. UTC | #35
On 05/14/2018 08:34 AM, Florian Weimer wrote:
>>> The initial PKRU value can currently be configured by the system
>>> administrator.  I fear this approach has too many moving parts to be
>>> viable.
>>
>> Honestly, I think we should drop that option. I don’t see how we can
>> expect an administrator to do this usefully.
> 
> I don't disagree—it makes things way less predictable in practice.

I originally put that thing in there to make Andy happy with the initial
permissions, and give us a way to back it out if something went wrong.
I have no objections to removing it either.
Ram Pai May 16, 2018, 8:35 p.m. UTC | #36
On Tue, May 08, 2018 at 02:40:46PM +0200, Florian Weimer wrote:
> On 05/08/2018 04:49 AM, Andy Lutomirski wrote:
> >On Mon, May 7, 2018 at 2:48 AM Florian Weimer <fweimer@redhat.com> wrote:
> >
> >>On 05/03/2018 06:05 AM, Andy Lutomirski wrote:
> >>>On Wed, May 2, 2018 at 7:11 PM Ram Pai <linuxram@us.ibm.com> wrote:
> >>>
> >>>>On Wed, May 02, 2018 at 09:23:49PM +0000, Andy Lutomirski wrote:
> >>>>>
> >>>>>>If I recall correctly, the POWER maintainer did express a strong
> >>>desire
> >>>>>>back then for (what is, I believe) their current semantics, which my
> >>>>>>PKEY_ALLOC_SIGNALINHERIT patch implements for x86, too.
> >>>>>
> >>>>>Ram, I really really don't like the POWER semantics.  Can you give
> >some
> >>>>>justification for them?  Does POWER at least have an atomic way for
> >>>>>userspace to modify just the key it wants to modify or, even better,
> >>>>>special load and store instructions to use alternate keys?
> >>>
> >>>>I wouldn't call it POWER semantics. The way I implemented it on power
> >>>>lead to the semantics, given that nothing was explicitly stated
> >>>>about how the semantics should work within a signal handler.
> >>>
> >>>I think that this is further evidence that we should introduce a new
> >>>pkey_alloc() mode and deprecate the old.  To the extent possible, this
> >>>thing should work the same way on x86 and POWER.
> >
> >>Do you propose to change POWER or to change x86?
> >
> >Sorry for being slow to reply.  I propose to introduce a new
> >PKEY_ALLOC_something variant on x86 and POWER and to make the behavior
> >match on both.
> 
> So basically implement PKEY_ALLOC_SETSIGNAL for POWER, and keep the
> existing (different) behavior without the flag?
> 
> Ram, would you be okay with that?  Could you give me a hand if
> necessary?  (I assume we have silicon in-house because it's a
> long-standing feature of the POWER platform which was simply dormant
> on Linux until now.)

Yes. I can help you with that.

So let me see if I understand the overall idea.

Application can allocate new keys through a new syscall
sys_pkey_alloc_1(flags, init_val, sig_init_val) 

'sig_init_val' is the permission-state of the key in signal context.

The kernel will set the permission of each keys to their
corresponding values when entering the signal handler and revert
on return from the signal handler.

just like init_val, sig_init_val also percolates to children threads.

Is this a correct understanding?
RP
Andy Lutomirski May 16, 2018, 8:37 p.m. UTC | #37
On Wed, May 16, 2018 at 1:35 PM Ram Pai <linuxram@us.ibm.com> wrote:

> On Tue, May 08, 2018 at 02:40:46PM +0200, Florian Weimer wrote:
> > On 05/08/2018 04:49 AM, Andy Lutomirski wrote:
> > >On Mon, May 7, 2018 at 2:48 AM Florian Weimer <fweimer@redhat.com>
wrote:
> > >
> > >>On 05/03/2018 06:05 AM, Andy Lutomirski wrote:
> > >>>On Wed, May 2, 2018 at 7:11 PM Ram Pai <linuxram@us.ibm.com> wrote:
> > >>>
> > >>>>On Wed, May 02, 2018 at 09:23:49PM +0000, Andy Lutomirski wrote:
> > >>>>>
> > >>>>>>If I recall correctly, the POWER maintainer did express a strong
> > >>>desire
> > >>>>>>back then for (what is, I believe) their current semantics, which
my
> > >>>>>>PKEY_ALLOC_SIGNALINHERIT patch implements for x86, too.
> > >>>>>
> > >>>>>Ram, I really really don't like the POWER semantics.  Can you give
> > >some
> > >>>>>justification for them?  Does POWER at least have an atomic way for
> > >>>>>userspace to modify just the key it wants to modify or, even
better,
> > >>>>>special load and store instructions to use alternate keys?
> > >>>
> > >>>>I wouldn't call it POWER semantics. The way I implemented it on
power
> > >>>>lead to the semantics, given that nothing was explicitly stated
> > >>>>about how the semantics should work within a signal handler.
> > >>>
> > >>>I think that this is further evidence that we should introduce a new
> > >>>pkey_alloc() mode and deprecate the old.  To the extent possible,
this
> > >>>thing should work the same way on x86 and POWER.
> > >
> > >>Do you propose to change POWER or to change x86?
> > >
> > >Sorry for being slow to reply.  I propose to introduce a new
> > >PKEY_ALLOC_something variant on x86 and POWER and to make the behavior
> > >match on both.
> >
> > So basically implement PKEY_ALLOC_SETSIGNAL for POWER, and keep the
> > existing (different) behavior without the flag?
> >
> > Ram, would you be okay with that?  Could you give me a hand if
> > necessary?  (I assume we have silicon in-house because it's a
> > long-standing feature of the POWER platform which was simply dormant
> > on Linux until now.)

> Yes. I can help you with that.

> So let me see if I understand the overall idea.

> Application can allocate new keys through a new syscall
> sys_pkey_alloc_1(flags, init_val, sig_init_val)

> 'sig_init_val' is the permission-state of the key in signal context.

> The kernel will set the permission of each keys to their
> corresponding values when entering the signal handler and revert
> on return from the signal handler.

> just like init_val, sig_init_val also percolates to children threads.


I was imagining it would be just pkey_alloc(SOME_NEW_FLAG, init_val); and
the init val would be used for the current thread and for signal handlers.
New threads would inherit their parents' values.  The latter is certainly
up for negotiation, but it's the simplest behavior, and it's not obvious to
be that it's wrong.

--Andy
Ram Pai May 16, 2018, 8:52 p.m. UTC | #38
On Mon, May 14, 2018 at 02:01:23PM +0200, Florian Weimer wrote:
> On 05/09/2018 04:41 PM, Andy Lutomirski wrote:
> >Hmm.  I can get on board with the idea that fork() / clone() /
> >pthread_create() are all just special cases of the idea that the thread
> >that*calls*  them should have the right pkey values, and the latter is
> >already busted given our inability to asynchronously propagate the new mode
> >in pkey_alloc().  So let's so PKEY_ALLOC_SETSIGNAL as a starting point.
> 
> Ram, any suggestions for implementing this on POWER?

I suspect the changes will go in 
restore_user_regs() and save_user_regs().  These are the functions
that save and restore register state before entry and exit into/from
a signal handler.

> 
> >One thing we could do, though: the current initual state on process
> >creation is all access blocked on all keys.  We could change it so that
> >half the keys are fully blocked and half are read-only.  Then we could add
> >a PKEY_ALLOC_STRICT or similar that allocates a key with the correct
> >initial state*and*  does the setsignal thing.  If there are no keys left
> >with the correct initial state, then it fails.
> 
> The initial PKRU value can currently be configured by the system
> administrator.  I fear this approach has too many moving parts to be
> viable.

Sounds like on x86  keys can go active in signal-handler 
without any explicit allocation request by the application.  This is not
the case on power. Is that API requirement? Hope not.

RP
Andy Lutomirski May 16, 2018, 8:54 p.m. UTC | #39
On Wed, May 16, 2018 at 1:52 PM Ram Pai <linuxram@us.ibm.com> wrote:

> On Mon, May 14, 2018 at 02:01:23PM +0200, Florian Weimer wrote:
> > On 05/09/2018 04:41 PM, Andy Lutomirski wrote:
> > >Hmm.  I can get on board with the idea that fork() / clone() /
> > >pthread_create() are all just special cases of the idea that the thread
> > >that*calls*  them should have the right pkey values, and the latter is
> > >already busted given our inability to asynchronously propagate the new
mode
> > >in pkey_alloc().  So let's so PKEY_ALLOC_SETSIGNAL as a starting point.
> >
> > Ram, any suggestions for implementing this on POWER?

> I suspect the changes will go in
> restore_user_regs() and save_user_regs().  These are the functions
> that save and restore register state before entry and exit into/from
> a signal handler.

> >
> > >One thing we could do, though: the current initual state on process
> > >creation is all access blocked on all keys.  We could change it so that
> > >half the keys are fully blocked and half are read-only.  Then we could
add
> > >a PKEY_ALLOC_STRICT or similar that allocates a key with the correct
> > >initial state*and*  does the setsignal thing.  If there are no keys
left
> > >with the correct initial state, then it fails.
> >
> > The initial PKRU value can currently be configured by the system
> > administrator.  I fear this approach has too many moving parts to be
> > viable.

> Sounds like on x86  keys can go active in signal-handler
> without any explicit allocation request by the application.  This is not
> the case on power. Is that API requirement? Hope not.

On x86, signals are currently delivered with all keys locked all the way
down (except for the magic one we use to emulate no-read access).  I would
hesitate to change this for existing applications.
Ram Pai May 16, 2018, 9:07 p.m. UTC | #40
On Wed, May 16, 2018 at 01:37:46PM -0700, Andy Lutomirski wrote:
> On Wed, May 16, 2018 at 1:35 PM Ram Pai <linuxram@us.ibm.com> wrote:
> 
> > On Tue, May 08, 2018 at 02:40:46PM +0200, Florian Weimer wrote:
> > > On 05/08/2018 04:49 AM, Andy Lutomirski wrote:
> > > >On Mon, May 7, 2018 at 2:48 AM Florian Weimer <fweimer@redhat.com>
> wrote:
> > > >
> > > >>On 05/03/2018 06:05 AM, Andy Lutomirski wrote:
> > > >>>On Wed, May 2, 2018 at 7:11 PM Ram Pai <linuxram@us.ibm.com> wrote:
> > > >>>
> > > >>>>On Wed, May 02, 2018 at 09:23:49PM +0000, Andy Lutomirski wrote:
> > > >>>>>
> > > >>>>>>If I recall correctly, the POWER maintainer did express a strong
> > > >>>desire
> > > >>>>>>back then for (what is, I believe) their current semantics, which
> my
> > > >>>>>>PKEY_ALLOC_SIGNALINHERIT patch implements for x86, too.
> > > >>>>>
> > > >>>>>Ram, I really really don't like the POWER semantics.  Can you give
> > > >some
> > > >>>>>justification for them?  Does POWER at least have an atomic way for
> > > >>>>>userspace to modify just the key it wants to modify or, even
> better,
> > > >>>>>special load and store instructions to use alternate keys?
> > > >>>
> > > >>>>I wouldn't call it POWER semantics. The way I implemented it on
> power
> > > >>>>lead to the semantics, given that nothing was explicitly stated
> > > >>>>about how the semantics should work within a signal handler.
> > > >>>
> > > >>>I think that this is further evidence that we should introduce a new
> > > >>>pkey_alloc() mode and deprecate the old.  To the extent possible,
> this
> > > >>>thing should work the same way on x86 and POWER.
> > > >
> > > >>Do you propose to change POWER or to change x86?
> > > >
> > > >Sorry for being slow to reply.  I propose to introduce a new
> > > >PKEY_ALLOC_something variant on x86 and POWER and to make the behavior
> > > >match on both.
> > >
> > > So basically implement PKEY_ALLOC_SETSIGNAL for POWER, and keep the
> > > existing (different) behavior without the flag?
> > >
> > > Ram, would you be okay with that?  Could you give me a hand if
> > > necessary?  (I assume we have silicon in-house because it's a
> > > long-standing feature of the POWER platform which was simply dormant
> > > on Linux until now.)
> 
> > Yes. I can help you with that.
> 
> > So let me see if I understand the overall idea.
> 
> > Application can allocate new keys through a new syscall
> > sys_pkey_alloc_1(flags, init_val, sig_init_val)
> 
> > 'sig_init_val' is the permission-state of the key in signal context.
> 
> > The kernel will set the permission of each keys to their
> > corresponding values when entering the signal handler and revert
> > on return from the signal handler.
> 
> > just like init_val, sig_init_val also percolates to children threads.
> 
> 
> I was imagining it would be just pkey_alloc(SOME_NEW_FLAG, init_val); and
> the init val would be used for the current thread and for signal handlers.

what would change the key-permission-values enforced in signal-handler
context?  Or can it never be changed, ones set through sys_pkey_alloc()?

I suppose key-permission-values change done in non-signal-handler context,
will not apply to those in signal-handler context.

Can the signal handler change the key-permission-values from the
signal-handler context?

RP
Florian Weimer May 17, 2018, 10:09 a.m. UTC | #41
On 05/16/2018 11:07 PM, Ram Pai wrote:

> what would change the key-permission-values enforced in signal-handler
> context?  Or can it never be changed, ones set through sys_pkey_alloc()?

The access rights can only be set by pkey_alloc and are unchanged after 
that (so we do not have to discuss whether the signal handler access 
rights are per-thread or not).

> I suppose key-permission-values change done in non-signal-handler context,
> will not apply to those in signal-handler context.

Correct, that is the plan.

> Can the signal handler change the key-permission-values from the
> signal-handler context?

Yes, changes are possible.  The access rights given to pkey_alloc only 
specify the initial access rights when the signal handler is entered.

We need to decide if we should restore it on exit from the signal 
handler.  There is also the matter of siglongjmp, which currently does 
not restore the current thread's access rights.  In general, this might 
be difficult to implement because of the limited space in jmp_buf.

Thanks,
Florian
Florian Weimer May 17, 2018, 10:11 a.m. UTC | #42
On 05/16/2018 10:35 PM, Ram Pai wrote:
> So let me see if I understand the overall idea.
> 
> Application can allocate new keys through a new syscall
> sys_pkey_alloc_1(flags, init_val, sig_init_val)
> 
> 'sig_init_val' is the permission-state of the key in signal context.

I would keep the existing system call and just add a flag, say 
PKEY_ALLOC_SETSIGNAL.  If the current thread needs different access 
rights, it can set those rights just after pkey_alloc returns.  There is 
no race that matters here, I think.

Thanks,
Florian
diff mbox

Patch

diff --git a/Documentation/x86/protection-keys.txt b/Documentation/x86/protection-keys.txt
index ecb0d2dadfb7..d46d8e501c3a 100644
--- a/Documentation/x86/protection-keys.txt
+++ b/Documentation/x86/protection-keys.txt
@@ -39,11 +39,18 @@  with a key.  In this example WRPKRU is wrapped by a C function
 called pkey_set().
 
 	int real_prot = PROT_READ|PROT_WRITE;
-	pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
+	pkey = pkey_alloc(PKEY_ALLOC_SIGNALINHERIT, PKEY_DISABLE_WRITE);
 	ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
 	ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
 	... application runs here
 
+The PKEY_ALLOC_SIGNALINHERIT flag ensures the that key allocation
+fails if the kernel does not support access rights inheritance for
+signal handlers.  (Some kernel versions implement different semantics
+where signal handlers execute not with the access rights of the
+interrupted thread, but with some unspecified system default access
+rights.)
+
 Now, if the application needs to update the data at 'ptr', it can
 gain access, do the update, then remove its write access:
 
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index f9d4e6b6d4bd..39468bf388a2 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -73,6 +73,8 @@ 
 /* compatibility flags */
 #define MAP_FILE	0
 
+#define PKEY_ALLOC_SIGNALINHERIT 0x1
+
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 3035ca499cd8..dfe6cd82403a 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -100,6 +100,8 @@ 
 /* compatibility flags */
 #define MAP_FILE	0
 
+#define PKEY_ALLOC_SIGNALINHERIT 0x1
+
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 870fbf8c7088..773c130c17db 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -70,6 +70,8 @@ 
 #define MAP_FILE	0
 #define MAP_VARIABLE	0
 
+#define PKEY_ALLOC_SIGNALINHERIT 0x1
+
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index a38bf5a1e37a..a87e99f72be6 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -33,6 +33,7 @@  extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
 extern void fpu__drop(struct fpu *fpu);
 extern int  fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu);
 extern void fpu__clear(struct fpu *fpu);
+extern void fpu__clear_signal(struct fpu *fpu);
 extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);
 extern int  dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate);
 
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index f92a6593de1e..a3b304888af8 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -370,21 +370,16 @@  static inline void copy_init_fpstate_to_fpregs(void)
 		copy_kernel_to_fxregs(&init_fpstate.fxsave);
 	else
 		copy_kernel_to_fregs(&init_fpstate.fsave);
-
-	if (boot_cpu_has(X86_FEATURE_OSPKE))
-		copy_init_pkru_to_fpregs();
 }
 
-/*
- * Clear the FPU state back to init state.
- *
- * Called by sys_execve(), by the signal handler code and by various
- * error paths.
- */
-void fpu__clear(struct fpu *fpu)
+static void __fpu_clear(struct fpu *fpu, bool for_signal)
 {
+	u32 pkru;
+
 	WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
 
+	if (for_signal)
+		pkru = read_pkru();
 	fpu__drop(fpu);
 
 	/*
@@ -395,10 +390,42 @@  void fpu__clear(struct fpu *fpu)
 		fpu__initialize(fpu);
 		user_fpu_begin();
 		copy_init_fpstate_to_fpregs();
+		if (boot_cpu_has(X86_FEATURE_OSPKE)) {
+			/* A signal handler inherits the original PKRU
+			 * value of the interrupted thread.
+			 */
+			if (for_signal)
+				__write_pkru(pkru);
+			else
+				copy_init_pkru_to_fpregs();
+		}
 		preempt_enable();
 	}
 }
 
+/*
+ * Clear the FPU state back to init state.
+ *
+ * Called by sys_execve(), the signal handler return code, and by
+ * various error paths.
+ */
+void fpu__clear(struct fpu *fpu)
+{
+	return __fpu_clear(fpu, false);
+}
+
+/*
+ * Prepare the FPU for invoking a signal handler.
+ *
+ * This is like fpu__clear(), but some CPU registers are inherited
+ * from the current thread and not restored to their initial values,
+ * to match behavior on other architectures.
+ */
+void fpu__clear_signal(struct fpu *fpu)
+{
+	return __fpu_clear(fpu, true);
+}
+
 /*
  * x87 math exception handling:
  */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index da270b95fe4d..b3c1f6f3df66 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -758,7 +758,7 @@  handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 		 * Ensure the signal handler starts with the new fpu state.
 		 */
 		if (fpu->initialized)
-			fpu__clear(fpu);
+			fpu__clear_signal(fpu);
 	}
 	signal_setup_done(failed, ksig, stepping);
 }
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 58f29a9d895d..741b5d39882f 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -112,6 +112,8 @@ 
 /* compatibility flags */
 #define MAP_FILE	0
 
+#define PKEY_ALLOC_SIGNALINHERIT 0x1
+
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index e7ee32861d51..18f6c1ebe2bb 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -69,6 +69,8 @@ 
 /* compatibility flags */
 #define MAP_FILE	0
 
+#define PKEY_ALLOC_SIGNALINHERIT 0x1
+
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 625608bc8962..ec82728774af 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -539,14 +539,21 @@  SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
 	return do_mprotect_pkey(start, len, prot, pkey);
 }
 
+#define PKEY_ALLOC_FLAGS ((unsigned long) (PKEY_ALLOC_SIGNALINHERIT))
+
 SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
 {
 	int pkey;
 	int ret;
 
-	/* No flags supported yet. */
-	if (flags)
+	/* Check for unsupported flags. No further action for
+	 * PKEY_ALLOC_SIGNALINHERIT is required; this flag merely
+	 * provides a way for applications to detect that allocated
+	 * keys support inheriting access rights in signal handler.
+	 */
+	if (flags & ~PKEY_ALLOC_FLAGS)
 		return -EINVAL;
+
 	/* check for unsupported init values */
 	if (init_val & ~PKEY_ACCESS_MASK)
 		return -EINVAL;
diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h
index e7ee32861d51..18f6c1ebe2bb 100644
--- a/tools/include/uapi/asm-generic/mman-common.h
+++ b/tools/include/uapi/asm-generic/mman-common.h
@@ -69,6 +69,8 @@ 
 /* compatibility flags */
 #define MAP_FILE	0
 
+#define PKEY_ALLOC_SIGNALINHERIT 0x1
+
 #define PKEY_DISABLE_ACCESS	0x1
 #define PKEY_DISABLE_WRITE	0x2
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index f15aa5a76fe3..e651c83e09aa 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -393,6 +393,8 @@  pid_t fork_lazy_child(void)
 	return forkret;
 }
 
+#define PKEY_ALLOC_SIGNALINHERIT 0x1
+
 #define PKEY_DISABLE_ACCESS    0x1
 #define PKEY_DISABLE_WRITE     0x2
 
@@ -560,7 +562,7 @@  int alloc_pkey(void)
 
 	dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n",
 			__LINE__, __rdpkru(), shadow_pkru);
-	ret = sys_pkey_alloc(0, init_val);
+	ret = sys_pkey_alloc(PKEY_ALLOC_SIGNALINHERIT, init_val);
 	/*
 	 * pkey_alloc() sets PKRU, so we need to reflect it in
 	 * shadow_pkru: