diff mbox series

[v2,3/5] arm64: signal: Improve POR_EL0 handling to avoid uaccess failures

Message ID 20241023150511.3923558-4-kevin.brodsky@arm.com (mailing list archive)
State New
Headers show
Series Improve arm64 pkeys handling in signal delivery | expand

Commit Message

Kevin Brodsky Oct. 23, 2024, 3:05 p.m. UTC
TL;DR: reset POR_EL0 to "allow all" before writing the signal frame,
preventing spurious uaccess failures. Also, make sure that POR_EL0
remains unchanged if delivering the signal fails.

When POE is supported, the POR_EL0 register constrains memory
accesses based on the target page's POIndex (pkey). This raises the
question: what constraints should apply to a signal handler? The
current answer is that POR_EL0 is reset to POR_EL0_INIT when
invoking the handler, giving it full access to POIndex 0. This is in
line with x86's MPK support and remains unchanged.

This is only part of the story, though. POR_EL0 constrains all
unprivileged memory accesses, meaning that uaccess routines such as
put_user() are also impacted. As a result POR_EL0 may prevent the
signal frame from being written to the signal stack (ultimately
causing a SIGSEGV). This is especially concerning when an alternate
signal stack is used, because userspace may want to prevent access
to it outside of signal handlers. There is currently no provision
for that: POR_EL0 is reset after writing to the stack, and
POR_EL0_INIT only enables access to POIndex 0.

This patch ensures that POR_EL0 is reset to its most permissive
state before the signal stack is accessed. Once the signal frame has
been fully written, POR_EL0 is still set to POR_EL0_INIT - it is up
to the signal handler to enable access to additional pkeys if
needed. As to sigreturn(), it expects having access to the stack
like any other syscall; we only need to ensure that POR_EL0 is
restored from the signal frame after all uaccess calls. This
approach is in line with the recent x86/pkeys series [1].

Resetting POR_EL0 early introduces some complications, in that we
can no longer read the register directly in preserve_poe_context().
This is addressed by introducing a struct (user_access_state)
and helpers to manage any such register impacting user accesses
(uaccess and accesses in userspace). Things look like this on signal
delivery:

1. Save original POR_EL0 into struct [save_reset_user_access_state()]
2. Set POR_EL0 to "allow all"  [save_reset_user_access_state()]
3. Create signal frame
4. Write saved POR_EL0 value to the signal frame [preserve_poe_context()]
5. Finalise signal frame
6. If all operations succeeded:
  a. Set POR_EL0 to POR_EL0_INIT [set_handler_user_access_state()]
  b. Else reset POR_EL0 to its original value [restore_user_access_state()]

If any step fails when setting up the signal frame, the process will
be sent a SIGSEGV, which it may be able to handle. Step 6.b ensures
that the original POR_EL0 is saved in the signal frame when
delivering that SIGSEGV (so that the original value is restored by
sigreturn).

The return path (sys_rt_sigreturn) doesn't strictly require any change
since restore_poe_context() is already called last. However, to
avoid uaccess calls being accidentally added after that point, we
use the same approach as in the delivery path, i.e. separating
uaccess from writing to the register:

1. Read saved POR_EL0 value from the signal frame [restore_poe_context()]
2. Set POR_EL0 to the saved value [restore_user_access_state()]

[1] https://lore.kernel.org/lkml/20240802061318.2140081-1-aruna.ramakrishna@oracle.com/

Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---

Note: the addtional check on err at the end of setup_rt_frame() is
made on purpose, in expectation of [2] which will allow setup_return()
to fail too.

[2] https://lore.kernel.org/all/20241001-arm64-gcs-v13-25-222b78d87eee@kernel.org/
---

 arch/arm64/kernel/signal.c | 92 ++++++++++++++++++++++++++++++++------
 1 file changed, 78 insertions(+), 14 deletions(-)

Comments

Catalin Marinas Oct. 24, 2024, 10:59 a.m. UTC | #1
On Wed, Oct 23, 2024 at 04:05:09PM +0100, Kevin Brodsky wrote:
> diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> index f5fb48dabebe..d2e4e50977ae 100644
> --- a/arch/arm64/kernel/signal.c
> +++ b/arch/arm64/kernel/signal.c
> @@ -66,9 +66,63 @@ struct rt_sigframe_user_layout {
>  	unsigned long end_offset;
>  };
>  
> +/*
> + * Holds any EL0-controlled state that influences unprivileged memory accesses.
> + * This includes both accesses done in userspace and uaccess done in the kernel.
> + *
> + * This state needs to be carefully managed to ensure that it doesn't cause
> + * uaccess to fail when setting up the signal frame, and the signal handler
> + * itself also expects a well-defined state when entered.
> + */
> +struct user_access_state {
> +	u64 por_el0;
> +};
> +
>  #define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16)
>  #define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16)
>  
> +/*
> + * Save the unpriv access state into ua_state and reset it to disable any
> + * restrictions.
> + */
> +static void save_reset_user_access_state(struct user_access_state *ua_state)
> +{
> +	if (system_supports_poe()) {
> +		/*
> +		 * Enable all permissions in all 8 keys
> +		 * (inspired by REPEAT_BYTE())
> +		 */
> +		u64 por_enable_all = (~0u / POE_MASK) * POE_RXW;

I think this should be ~0ul.

> @@ -907,6 +964,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
>  {
>  	struct pt_regs *regs = current_pt_regs();
>  	struct rt_sigframe __user *frame;
> +	struct user_access_state ua_state;
>  
>  	/* Always make any pending restarted system calls return -EINTR */
>  	current->restart_block.fn = do_no_restart_syscall;
> @@ -923,12 +981,14 @@ SYSCALL_DEFINE0(rt_sigreturn)
>  	if (!access_ok(frame, sizeof (*frame)))
>  		goto badframe;
>  
> -	if (restore_sigframe(regs, frame))
> +	if (restore_sigframe(regs, frame, &ua_state))
>  		goto badframe;
>  
>  	if (restore_altstack(&frame->uc.uc_stack))
>  		goto badframe;
>  
> +	restore_user_access_state(&ua_state);
> +
>  	return regs->regs[0];
>  
>  badframe:

The saving part I'm fine with. For restoring, I was wondering whether we
can get a more privileged POR_EL0 if reading the frame somehow failed.
This is largely theoretical, there are other ways to attack like
writing POR_EL0 directly than unmapping/remapping the signal stack.

What I'd change here is always restore_user_access_state() to
POR_EL0_INIT. Maybe just initialise ua_state above and add the function
call after the badframe label.

Either way:

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Kevin Brodsky Oct. 24, 2024, 2:55 p.m. UTC | #2
On 24/10/2024 12:59, Catalin Marinas wrote:
> On Wed, Oct 23, 2024 at 04:05:09PM +0100, Kevin Brodsky wrote:
>> +/*
>> + * Save the unpriv access state into ua_state and reset it to disable any
>> + * restrictions.
>> + */
>> +static void save_reset_user_access_state(struct user_access_state *ua_state)
>> +{
>> +	if (system_supports_poe()) {
>> +		/*
>> +		 * Enable all permissions in all 8 keys
>> +		 * (inspired by REPEAT_BYTE())
>> +		 */
>> +		u64 por_enable_all = (~0u / POE_MASK) * POE_RXW;
> I think this should be ~0ul.

It is ~0u on purpose, because unlike in REPEAT_BYTE(), I only wanted the
lower 32 bits to be filled with POE_RXW (we only have 8 keys, the top 32
bits are RES0). That said, given that D128 has 4-bit pkeys, we could
anticipate and fill the top 32 bits too (should make no difference on D64).

>> @@ -907,6 +964,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
>>  {
>>  	struct pt_regs *regs = current_pt_regs();
>>  	struct rt_sigframe __user *frame;
>> +	struct user_access_state ua_state;
>>  
>>  	/* Always make any pending restarted system calls return -EINTR */
>>  	current->restart_block.fn = do_no_restart_syscall;
>> @@ -923,12 +981,14 @@ SYSCALL_DEFINE0(rt_sigreturn)
>>  	if (!access_ok(frame, sizeof (*frame)))
>>  		goto badframe;
>>  
>> -	if (restore_sigframe(regs, frame))
>> +	if (restore_sigframe(regs, frame, &ua_state))
>>  		goto badframe;
>>  
>>  	if (restore_altstack(&frame->uc.uc_stack))
>>  		goto badframe;
>>  
>> +	restore_user_access_state(&ua_state);
>> +
>>  	return regs->regs[0];
>>  
>>  badframe:
> The saving part I'm fine with. For restoring, I was wondering whether we
> can get a more privileged POR_EL0 if reading the frame somehow failed.
> This is largely theoretical, there are other ways to attack like
> writing POR_EL0 directly than unmapping/remapping the signal stack.
>
> What I'd change here is always restore_user_access_state() to
> POR_EL0_INIT. Maybe just initialise ua_state above and add the function
> call after the badframe label.

I'm not sure I understand. When we enter this function, POR_EL0 is set
to whatever the signal handler set it to (POR_EL0_INIT by default).
There are then two cases:
1) Everything succeeds, including reading the saved POR_EL0 from the
frame. We then call restore_user_access_state(), setting POR_EL0 to the
value we've read, and return to userspace.
2) Any uaccess fails (for instance reading POR_EL0). In that case we
leave POR_EL0 unchanged and deliver SIGSEGV.

In case 2 POR_EL0 is most likely already set to POR_EL0_INIT, or
whatever the signal handler set it to. It's not clear to me that forcing
it to POR_EL0_INIT helps much. Either way it's doubtful that the SIGSEGV
handler will be able to recover, since the new signal frame we will
create for it may be a mix of interrupted state and signal handler state
(depending on exactly where we fail).

Kevin
Catalin Marinas Oct. 24, 2024, 3:42 p.m. UTC | #3
On Thu, Oct 24, 2024 at 04:55:48PM +0200, Kevin Brodsky wrote:
> On 24/10/2024 12:59, Catalin Marinas wrote:
> > On Wed, Oct 23, 2024 at 04:05:09PM +0100, Kevin Brodsky wrote:
> >> +/*
> >> + * Save the unpriv access state into ua_state and reset it to disable any
> >> + * restrictions.
> >> + */
> >> +static void save_reset_user_access_state(struct user_access_state *ua_state)
> >> +{
> >> +	if (system_supports_poe()) {
> >> +		/*
> >> +		 * Enable all permissions in all 8 keys
> >> +		 * (inspired by REPEAT_BYTE())
> >> +		 */
> >> +		u64 por_enable_all = (~0u / POE_MASK) * POE_RXW;
> > I think this should be ~0ul.
> 
> It is ~0u on purpose, because unlike in REPEAT_BYTE(), I only wanted the
> lower 32 bits to be filled with POE_RXW (we only have 8 keys, the top 32
> bits are RES0). That said, given that D128 has 4-bit pkeys, we could
> anticipate and fill the top 32 bits too (should make no difference on D64).

I guess we could leave it as 32-bit for now and remember to update it
when we enable more keys with D128. Setting the top RES0 bits doesn't
hurt either since they are already documented in the Arm ARM. Up to you,
it's fine like above as well.

> >> @@ -907,6 +964,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
> >>  {
> >>  	struct pt_regs *regs = current_pt_regs();
> >>  	struct rt_sigframe __user *frame;
> >> +	struct user_access_state ua_state;
> >>  
> >>  	/* Always make any pending restarted system calls return -EINTR */
> >>  	current->restart_block.fn = do_no_restart_syscall;
> >> @@ -923,12 +981,14 @@ SYSCALL_DEFINE0(rt_sigreturn)
> >>  	if (!access_ok(frame, sizeof (*frame)))
> >>  		goto badframe;
> >>  
> >> -	if (restore_sigframe(regs, frame))
> >> +	if (restore_sigframe(regs, frame, &ua_state))
> >>  		goto badframe;
> >>  
> >>  	if (restore_altstack(&frame->uc.uc_stack))
> >>  		goto badframe;
> >>  
> >> +	restore_user_access_state(&ua_state);
> >> +
> >>  	return regs->regs[0];
> >>  
> >>  badframe:
> > The saving part I'm fine with. For restoring, I was wondering whether we
> > can get a more privileged POR_EL0 if reading the frame somehow failed.
> > This is largely theoretical, there are other ways to attack like
> > writing POR_EL0 directly than unmapping/remapping the signal stack.
> >
> > What I'd change here is always restore_user_access_state() to
> > POR_EL0_INIT. Maybe just initialise ua_state above and add the function
> > call after the badframe label.
> 
> I'm not sure I understand. When we enter this function, POR_EL0 is set
> to whatever the signal handler set it to (POR_EL0_INIT by default).
> There are then two cases:
> 1) Everything succeeds, including reading the saved POR_EL0 from the
> frame. We then call restore_user_access_state(), setting POR_EL0 to the
> value we've read, and return to userspace.
> 2) Any uaccess fails (for instance reading POR_EL0). In that case we
> leave POR_EL0 unchanged and deliver SIGSEGV.
> 
> In case 2 POR_EL0 is most likely already set to POR_EL0_INIT, or
> whatever the signal handler set it to. It's not clear to me that forcing
> it to POR_EL0_INIT helps much. Either way it's doubtful that the SIGSEGV
> handler will be able to recover, since the new signal frame we will
> create for it may be a mix of interrupted state and signal handler state
> (depending on exactly where we fail).

If the SIGSEGV delivery succeeds, returning would restore the POR_EL0
set up by the previous signal handler, potentially more privileged. Does
it matter? Can it return all the way to the original context?
Dave Martin Oct. 24, 2024, 4:19 p.m. UTC | #4
On Thu, Oct 24, 2024 at 04:42:10PM +0100, Catalin Marinas wrote:
> On Thu, Oct 24, 2024 at 04:55:48PM +0200, Kevin Brodsky wrote:
> > On 24/10/2024 12:59, Catalin Marinas wrote:
> > > On Wed, Oct 23, 2024 at 04:05:09PM +0100, Kevin Brodsky wrote:
> > >> +/*
> > >> + * Save the unpriv access state into ua_state and reset it to disable any
> > >> + * restrictions.
> > >> + */
> > >> +static void save_reset_user_access_state(struct user_access_state *ua_state)
> > >> +{
> > >> +	if (system_supports_poe()) {
> > >> +		/*
> > >> +		 * Enable all permissions in all 8 keys
> > >> +		 * (inspired by REPEAT_BYTE())
> > >> +		 */
> > >> +		u64 por_enable_all = (~0u / POE_MASK) * POE_RXW;
> > > I think this should be ~0ul.
> > 
> > It is ~0u on purpose, because unlike in REPEAT_BYTE(), I only wanted the
> > lower 32 bits to be filled with POE_RXW (we only have 8 keys, the top 32
> > bits are RES0). That said, given that D128 has 4-bit pkeys, we could
> > anticipate and fill the top 32 bits too (should make no difference on D64).
> 
> I guess we could leave it as 32-bit for now and remember to update it
> when we enable more keys with D128. Setting the top RES0 bits doesn't
> hurt either since they are already documented in the Arm ARM. Up to you,
> it's fine like above as well.

Can we maybe just have a brute-force loop that constructs the value
using the appropriate #define macros?

The compiler will const-fold it; I'd be prepared to bet that the
generated code would be identical...


> > >> @@ -907,6 +964,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
> > >>  {
> > >>  	struct pt_regs *regs = current_pt_regs();
> > >>  	struct rt_sigframe __user *frame;
> > >> +	struct user_access_state ua_state;
> > >>  
> > >>  	/* Always make any pending restarted system calls return -EINTR */
> > >>  	current->restart_block.fn = do_no_restart_syscall;
> > >> @@ -923,12 +981,14 @@ SYSCALL_DEFINE0(rt_sigreturn)
> > >>  	if (!access_ok(frame, sizeof (*frame)))
> > >>  		goto badframe;
> > >>  
> > >> -	if (restore_sigframe(regs, frame))
> > >> +	if (restore_sigframe(regs, frame, &ua_state))
> > >>  		goto badframe;
> > >>  
> > >>  	if (restore_altstack(&frame->uc.uc_stack))
> > >>  		goto badframe;
> > >>  
> > >> +	restore_user_access_state(&ua_state);
> > >> +
> > >>  	return regs->regs[0];
> > >>  
> > >>  badframe:
> > > The saving part I'm fine with. For restoring, I was wondering whether we
> > > can get a more privileged POR_EL0 if reading the frame somehow failed.
> > > This is largely theoretical, there are other ways to attack like
> > > writing POR_EL0 directly than unmapping/remapping the signal stack.
> > >
> > > What I'd change here is always restore_user_access_state() to
> > > POR_EL0_INIT. Maybe just initialise ua_state above and add the function
> > > call after the badframe label.
> > 
> > I'm not sure I understand. When we enter this function, POR_EL0 is set
> > to whatever the signal handler set it to (POR_EL0_INIT by default).
> > There are then two cases:
> > 1) Everything succeeds, including reading the saved POR_EL0 from the
> > frame. We then call restore_user_access_state(), setting POR_EL0 to the
> > value we've read, and return to userspace.
> > 2) Any uaccess fails (for instance reading POR_EL0). In that case we
> > leave POR_EL0 unchanged and deliver SIGSEGV.
> > 
> > In case 2 POR_EL0 is most likely already set to POR_EL0_INIT, or
> > whatever the signal handler set it to. It's not clear to me that forcing
> > it to POR_EL0_INIT helps much. Either way it's doubtful that the SIGSEGV
> > handler will be able to recover, since the new signal frame we will
> > create for it may be a mix of interrupted state and signal handler state
> > (depending on exactly where we fail).
> 
> If the SIGSEGV delivery succeeds, returning would restore the POR_EL0
> set up by the previous signal handler, potentially more privileged. Does
> it matter? Can it return all the way to the original context?

That seems a valid concern.

It looks a bit like we don't back out the temporary change to POR_EL0
if writing the sigframe fails, so the temporary "allow all" perms might
get saved out into the SIGSEGV sigframe on the alternate signal
stack, and will then be restored as the user thread's POR_EL0 when the
SIGSEGV returns.

(This is all assuming that the force_sig(SIGSEGV) logic works properly
at all...  I'm still trying to puzzle it out!)

Cheers
---Dave
Kevin Brodsky Oct. 25, 2024, 8:24 a.m. UTC | #5
On 24/10/2024 18:19, Dave Martin wrote:
> On Thu, Oct 24, 2024 at 04:42:10PM +0100, Catalin Marinas wrote:
>> On Thu, Oct 24, 2024 at 04:55:48PM +0200, Kevin Brodsky wrote:
>>> On 24/10/2024 12:59, Catalin Marinas wrote:
>>>> On Wed, Oct 23, 2024 at 04:05:09PM +0100, Kevin Brodsky wrote:
>>>>> +/*
>>>>> + * Save the unpriv access state into ua_state and reset it to disable any
>>>>> + * restrictions.
>>>>> + */
>>>>> +static void save_reset_user_access_state(struct user_access_state *ua_state)
>>>>> +{
>>>>> +	if (system_supports_poe()) {
>>>>> +		/*
>>>>> +		 * Enable all permissions in all 8 keys
>>>>> +		 * (inspired by REPEAT_BYTE())
>>>>> +		 */
>>>>> +		u64 por_enable_all = (~0u / POE_MASK) * POE_RXW;
>>>> I think this should be ~0ul.
>>> It is ~0u on purpose, because unlike in REPEAT_BYTE(), I only wanted the
>>> lower 32 bits to be filled with POE_RXW (we only have 8 keys, the top 32
>>> bits are RES0). That said, given that D128 has 4-bit pkeys, we could
>>> anticipate and fill the top 32 bits too (should make no difference on D64).
>> I guess we could leave it as 32-bit for now and remember to update it
>> when we enable more keys with D128. Setting the top RES0 bits doesn't
>> hurt either since they are already documented in the Arm ARM. Up to you,
>> it's fine like above as well.
> Can we maybe just have a brute-force loop that constructs the value
> using the appropriate #define macros?
>
> The compiler will const-fold it; I'd be prepared to bet that the
> generated code would be identical...

Fine by me, I suppose I was too eager to use the one-liner I had found
:) Building that value based on arch_max_pkey() is probably a better
idea in the long run. (And indeed the codegen is the same, it boils down
to a mov w0, #0x77777777 in both case.)

>>>>> @@ -907,6 +964,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
>>>>>  {
>>>>>  	struct pt_regs *regs = current_pt_regs();
>>>>>  	struct rt_sigframe __user *frame;
>>>>> +	struct user_access_state ua_state;
>>>>>  
>>>>>  	/* Always make any pending restarted system calls return -EINTR */
>>>>>  	current->restart_block.fn = do_no_restart_syscall;
>>>>> @@ -923,12 +981,14 @@ SYSCALL_DEFINE0(rt_sigreturn)
>>>>>  	if (!access_ok(frame, sizeof (*frame)))
>>>>>  		goto badframe;
>>>>>  
>>>>> -	if (restore_sigframe(regs, frame))
>>>>> +	if (restore_sigframe(regs, frame, &ua_state))
>>>>>  		goto badframe;
>>>>>  
>>>>>  	if (restore_altstack(&frame->uc.uc_stack))
>>>>>  		goto badframe;
>>>>>  
>>>>> +	restore_user_access_state(&ua_state);
>>>>> +
>>>>>  	return regs->regs[0];
>>>>>  
>>>>>  badframe:
>>>> The saving part I'm fine with. For restoring, I was wondering whether we
>>>> can get a more privileged POR_EL0 if reading the frame somehow failed.
>>>> This is largely theoretical, there are other ways to attack like
>>>> writing POR_EL0 directly than unmapping/remapping the signal stack.
>>>>
>>>> What I'd change here is always restore_user_access_state() to
>>>> POR_EL0_INIT. Maybe just initialise ua_state above and add the function
>>>> call after the badframe label.
>>> I'm not sure I understand. When we enter this function, POR_EL0 is set
>>> to whatever the signal handler set it to (POR_EL0_INIT by default).
>>> There are then two cases:
>>> 1) Everything succeeds, including reading the saved POR_EL0 from the
>>> frame. We then call restore_user_access_state(), setting POR_EL0 to the
>>> value we've read, and return to userspace.
>>> 2) Any uaccess fails (for instance reading POR_EL0). In that case we
>>> leave POR_EL0 unchanged and deliver SIGSEGV.
>>>
>>> In case 2 POR_EL0 is most likely already set to POR_EL0_INIT, or
>>> whatever the signal handler set it to. It's not clear to me that forcing
>>> it to POR_EL0_INIT helps much. Either way it's doubtful that the SIGSEGV
>>> handler will be able to recover, since the new signal frame we will
>>> create for it may be a mix of interrupted state and signal handler state
>>> (depending on exactly where we fail).
>> If the SIGSEGV delivery succeeds, returning would restore the POR_EL0
>> set up by the previous signal handler, potentially more privileged. Does
>> it matter? Can it return all the way to the original context?

What we store into the signal frame when delivering that SIGSEGV is a
mixture of the original state (up to the point of failure) and the
signal handler's state (what we couldn't restore). It's hard to reason
about how that SIGSEGV handler could possibly handle this, but in any
case it would have to massage its signal frame so that the next
sigreturn does the right thing. Restoring only part of the frame records
is bound to cause trouble and that's true for POR_EL0 as well - I doubt
there's much value in special-casing it.

>
> That seems a valid concern.
>
> It looks a bit like we don't back out the temporary change to POR_EL0
> if writing the sigframe fails, so the temporary "allow all" perms might
> get saved out into the SIGSEGV sigframe on the alternate signal
> stack, and will then be restored as the user thread's POR_EL0 when the
> SIGSEGV returns.

It sounds like you're referring to the delivery case, not the return
case. In the delivery case (setup_rt_frame()), the "allow all" value
will never be saved into the sigframe because we call
restore_user_access_state() if anything failed (this is new in v2,
exactly to prevent that scenario).

Kevin
Dave Martin Oct. 25, 2024, 11:04 a.m. UTC | #6
On Fri, Oct 25, 2024 at 10:24:41AM +0200, Kevin Brodsky wrote:
> On 24/10/2024 18:19, Dave Martin wrote:
> > On Thu, Oct 24, 2024 at 04:42:10PM +0100, Catalin Marinas wrote:
> >> On Thu, Oct 24, 2024 at 04:55:48PM +0200, Kevin Brodsky wrote:
> >>> On 24/10/2024 12:59, Catalin Marinas wrote:
> >>>> On Wed, Oct 23, 2024 at 04:05:09PM +0100, Kevin Brodsky wrote:
> >>>>> +/*
> >>>>> + * Save the unpriv access state into ua_state and reset it to disable any
> >>>>> + * restrictions.
> >>>>> + */
> >>>>> +static void save_reset_user_access_state(struct user_access_state *ua_state)
> >>>>> +{
> >>>>> +	if (system_supports_poe()) {
> >>>>> +		/*
> >>>>> +		 * Enable all permissions in all 8 keys
> >>>>> +		 * (inspired by REPEAT_BYTE())
> >>>>> +		 */
> >>>>> +		u64 por_enable_all = (~0u / POE_MASK) * POE_RXW;
> >>>> I think this should be ~0ul.
> >>> It is ~0u on purpose, because unlike in REPEAT_BYTE(), I only wanted the
> >>> lower 32 bits to be filled with POE_RXW (we only have 8 keys, the top 32
> >>> bits are RES0). That said, given that D128 has 4-bit pkeys, we could
> >>> anticipate and fill the top 32 bits too (should make no difference on D64).
> >> I guess we could leave it as 32-bit for now and remember to update it
> >> when we enable more keys with D128. Setting the top RES0 bits doesn't
> >> hurt either since they are already documented in the Arm ARM. Up to you,
> >> it's fine like above as well.
> > Can we maybe just have a brute-force loop that constructs the value
> > using the appropriate #define macros?
> >
> > The compiler will const-fold it; I'd be prepared to bet that the
> > generated code would be identical...
> 
> Fine by me, I suppose I was too eager to use the one-liner I had found
> :) Building that value based on arch_max_pkey() is probably a better
> idea in the long run. (And indeed the codegen is the same, it boils down
> to a mov w0, #0x77777777 in both case.)

The one-line was a neat trick (after the brief WTF moment) :)

I guess my uneasiness comes from baking the number of pkeys in via the
type of 0u and an implicit relationship that this happens to have with
the number bits per pkey in the POR.

[...]

Cheers
---Dave
Dave Martin Oct. 25, 2024, 11:33 a.m. UTC | #7
On Fri, Oct 25, 2024 at 10:24:41AM +0200, Kevin Brodsky wrote:
> On 24/10/2024 18:19, Dave Martin wrote:
> > On Thu, Oct 24, 2024 at 04:42:10PM +0100, Catalin Marinas wrote:
> >> On Thu, Oct 24, 2024 at 04:55:48PM +0200, Kevin Brodsky wrote:
> >>> On 24/10/2024 12:59, Catalin Marinas wrote:
> >>>> On Wed, Oct 23, 2024 at 04:05:09PM +0100, Kevin Brodsky wrote:
> >>>>> +/*
> >>>>> + * Save the unpriv access state into ua_state and reset it to disable any
> >>>>> + * restrictions.
> >>>>> + */
> >>>>> +static void save_reset_user_access_state(struct user_access_state *ua_state)
> >>>>> +{
> >>>>> +	if (system_supports_poe()) {
> >>>>> +		/*
> >>>>> +		 * Enable all permissions in all 8 keys
> >>>>> +		 * (inspired by REPEAT_BYTE())
> >>>>> +		 */
> >>>>> +		u64 por_enable_all = (~0u / POE_MASK) * POE_RXW;
> >>>> I think this should be ~0ul.
> >>> It is ~0u on purpose, because unlike in REPEAT_BYTE(), I only wanted the
> >>> lower 32 bits to be filled with POE_RXW (we only have 8 keys, the top 32
> >>> bits are RES0). That said, given that D128 has 4-bit pkeys, we could
> >>> anticipate and fill the top 32 bits too (should make no difference on D64).
> >> I guess we could leave it as 32-bit for now and remember to update it
> >> when we enable more keys with D128. Setting the top RES0 bits doesn't
> >> hurt either since they are already documented in the Arm ARM. Up to you,
> >> it's fine like above as well.
> > Can we maybe just have a brute-force loop that constructs the value
> > using the appropriate #define macros?
> >
> > The compiler will const-fold it; I'd be prepared to bet that the
> > generated code would be identical...
> 
> Fine by me, I suppose I was too eager to use the one-liner I had found
> :) Building that value based on arch_max_pkey() is probably a better
> idea in the long run. (And indeed the codegen is the same, it boils down
> to a mov w0, #0x77777777 in both case.)
> 
> >>>>> @@ -907,6 +964,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
> >>>>>  {
> >>>>>  	struct pt_regs *regs = current_pt_regs();
> >>>>>  	struct rt_sigframe __user *frame;
> >>>>> +	struct user_access_state ua_state;
> >>>>>  
> >>>>>  	/* Always make any pending restarted system calls return -EINTR */
> >>>>>  	current->restart_block.fn = do_no_restart_syscall;
> >>>>> @@ -923,12 +981,14 @@ SYSCALL_DEFINE0(rt_sigreturn)
> >>>>>  	if (!access_ok(frame, sizeof (*frame)))
> >>>>>  		goto badframe;
> >>>>>  
> >>>>> -	if (restore_sigframe(regs, frame))
> >>>>> +	if (restore_sigframe(regs, frame, &ua_state))
> >>>>>  		goto badframe;
> >>>>>  
> >>>>>  	if (restore_altstack(&frame->uc.uc_stack))
> >>>>>  		goto badframe;
> >>>>>  
> >>>>> +	restore_user_access_state(&ua_state);
> >>>>> +
> >>>>>  	return regs->regs[0];
> >>>>>  
> >>>>>  badframe:
> >>>> The saving part I'm fine with. For restoring, I was wondering whether we
> >>>> can get a more privileged POR_EL0 if reading the frame somehow failed.
> >>>> This is largely theoretical, there are other ways to attack like
> >>>> writing POR_EL0 directly than unmapping/remapping the signal stack.
> >>>>
> >>>> What I'd change here is always restore_user_access_state() to
> >>>> POR_EL0_INIT. Maybe just initialise ua_state above and add the function
> >>>> call after the badframe label.
> >>> I'm not sure I understand. When we enter this function, POR_EL0 is set
> >>> to whatever the signal handler set it to (POR_EL0_INIT by default).
> >>> There are then two cases:
> >>> 1) Everything succeeds, including reading the saved POR_EL0 from the
> >>> frame. We then call restore_user_access_state(), setting POR_EL0 to the
> >>> value we've read, and return to userspace.
> >>> 2) Any uaccess fails (for instance reading POR_EL0). In that case we
> >>> leave POR_EL0 unchanged and deliver SIGSEGV.
> >>>
> >>> In case 2 POR_EL0 is most likely already set to POR_EL0_INIT, or
> >>> whatever the signal handler set it to. It's not clear to me that forcing
> >>> it to POR_EL0_INIT helps much. Either way it's doubtful that the SIGSEGV
> >>> handler will be able to recover, since the new signal frame we will
> >>> create for it may be a mix of interrupted state and signal handler state
> >>> (depending on exactly where we fail).
> >> If the SIGSEGV delivery succeeds, returning would restore the POR_EL0
> >> set up by the previous signal handler, potentially more privileged. Does
> >> it matter? Can it return all the way to the original context?
> 
> What we store into the signal frame when delivering that SIGSEGV is a
> mixture of the original state (up to the point of failure) and the
> signal handler's state (what we couldn't restore). It's hard to reason
> about how that SIGSEGV handler could possibly handle this, but in any
> case it would have to massage its signal frame so that the next
> sigreturn does the right thing. Restoring only part of the frame records
> is bound to cause trouble and that's true for POR_EL0 as well - I doubt
> there's much value in special-casing it.

This feels like a simplification?

We can leave a mix of restored and unrestored state when generating the
SIGSEGV signal frame, providing that those changes will make no
difference when the rt_sigreturn is replayed.

POR_EL0 will make a difference, though.

The POR_EL0 image in the SIGSEGV signal frame needs be the same value
that caused the original rt_sigreturn to barf (if this is what caused
the barf).  It should be up to the SIGSEGV handler to decide what (if
anything) to do about that.  The kernel can't know what userspace
intended.

Note that for this to work, the SIGSEGV stack (whether main or
alternate) must be accessible with POR_EL0_INIT permissions, or the
SIGSEGV handler must start with a (gross) asm shim to establish a
usable POR_EL0.  But that's not really our problem here.

(I'm not saying that the kernel necessarily fails to do this -- I
haven't checked -- but just trying to understand the problem here.)


The actual problem here is that if the SIGSEGV handler wants to bail
out with a siglongjmp(), there is no way to determine the correct value
of POR_EL0 to restore.

I wonder whether POR_EL0 should be saved in sigjmp_buf (depending on
whether sigjmp_buf is horribly inextensible and also full up, or merely
horribly inextensible).

Does anyone know whether PKRU in sigjmp_buf on x86?

> 
> >
> > That seems a valid concern.
> >
> > It looks a bit like we don't back out the temporary change to POR_EL0
> > if writing the sigframe fails, so the temporary "allow all" perms might
> > get saved out into the SIGSEGV sigframe on the alternate signal
> > stack, and will then be restored as the user thread's POR_EL0 when the
> > SIGSEGV returns.
> 
> It sounds like you're referring to the delivery case, not the return
> case. In the delivery case (setup_rt_frame()), the "allow all" value
> will never be saved into the sigframe because we call
> restore_user_access_state() if anything failed (this is new in v2,
> exactly to prevent that scenario).

Ah, right -- I missed that detail.

Cheers
---Dave
Kevin Brodsky Oct. 25, 2024, 3:34 p.m. UTC | #8
On 25/10/2024 13:33, Dave Martin wrote:
> On Fri, Oct 25, 2024 at 10:24:41AM +0200, Kevin Brodsky wrote:
>>>>>>> @@ -907,6 +964,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
>>>>>>>  {
>>>>>>>  	struct pt_regs *regs = current_pt_regs();
>>>>>>>  	struct rt_sigframe __user *frame;
>>>>>>> +	struct user_access_state ua_state;
>>>>>>>  
>>>>>>>  	/* Always make any pending restarted system calls return -EINTR */
>>>>>>>  	current->restart_block.fn = do_no_restart_syscall;
>>>>>>> @@ -923,12 +981,14 @@ SYSCALL_DEFINE0(rt_sigreturn)
>>>>>>>  	if (!access_ok(frame, sizeof (*frame)))
>>>>>>>  		goto badframe;
>>>>>>>  
>>>>>>> -	if (restore_sigframe(regs, frame))
>>>>>>> +	if (restore_sigframe(regs, frame, &ua_state))
>>>>>>>  		goto badframe;
>>>>>>>  
>>>>>>>  	if (restore_altstack(&frame->uc.uc_stack))
>>>>>>>  		goto badframe;
>>>>>>>  
>>>>>>> +	restore_user_access_state(&ua_state);
>>>>>>> +
>>>>>>>  	return regs->regs[0];
>>>>>>>  
>>>>>>>  badframe:
>>>>>> The saving part I'm fine with. For restoring, I was wondering whether we
>>>>>> can get a more privileged POR_EL0 if reading the frame somehow failed.
>>>>>> This is largely theoretical, there are other ways to attack like
>>>>>> writing POR_EL0 directly than unmapping/remapping the signal stack.
>>>>>>
>>>>>> What I'd change here is always restore_user_access_state() to
>>>>>> POR_EL0_INIT. Maybe just initialise ua_state above and add the function
>>>>>> call after the badframe label.
>>>>> I'm not sure I understand. When we enter this function, POR_EL0 is set
>>>>> to whatever the signal handler set it to (POR_EL0_INIT by default).
>>>>> There are then two cases:
>>>>> 1) Everything succeeds, including reading the saved POR_EL0 from the
>>>>> frame. We then call restore_user_access_state(), setting POR_EL0 to the
>>>>> value we've read, and return to userspace.
>>>>> 2) Any uaccess fails (for instance reading POR_EL0). In that case we
>>>>> leave POR_EL0 unchanged and deliver SIGSEGV.
>>>>>
>>>>> In case 2 POR_EL0 is most likely already set to POR_EL0_INIT, or
>>>>> whatever the signal handler set it to. It's not clear to me that forcing
>>>>> it to POR_EL0_INIT helps much. Either way it's doubtful that the SIGSEGV
>>>>> handler will be able to recover, since the new signal frame we will
>>>>> create for it may be a mix of interrupted state and signal handler state
>>>>> (depending on exactly where we fail).
>>>> If the SIGSEGV delivery succeeds, returning would restore the POR_EL0
>>>> set up by the previous signal handler, potentially more privileged. Does
>>>> it matter? Can it return all the way to the original context?
>> What we store into the signal frame when delivering that SIGSEGV is a
>> mixture of the original state (up to the point of failure) and the
>> signal handler's state (what we couldn't restore). It's hard to reason
>> about how that SIGSEGV handler could possibly handle this, but in any
>> case it would have to massage its signal frame so that the next
>> sigreturn does the right thing. Restoring only part of the frame records
>> is bound to cause trouble and that's true for POR_EL0 as well - I doubt
>> there's much value in special-casing it.
> This feels like a simplification?
>
> We can leave a mix of restored and unrestored state when generating the
> SIGSEGV signal frame, providing that those changes will make no
> difference when the rt_sigreturn is replayed.

I'm not sure I understand what this means. If the SIGSEGV handler were
to sigreturn without touching its signal frame, things are likely to
explode: it may be returning to the point where the original handler
called sigreturn, for instance (if the first uaccess failed during that
sigreturn call).

> POR_EL0 will make a difference, though.
>
> The POR_EL0 image in the SIGSEGV signal frame needs be the same value
> that caused the original rt_sigreturn to barf (if this is what caused
> the barf).  It should be up to the SIGSEGV handler to decide what (if
> anything) to do about that.  The kernel can't know what userspace
> intended.

Unless I'm missing something this is exactly what happens now: what we
store in the SIGSEGV frame is the POR_EL0 value the original handler was
using.

> Note that for this to work, the SIGSEGV stack (whether main or
> alternate) must be accessible with POR_EL0_INIT permissions, or the
> SIGSEGV handler must start with a (gross) asm shim to establish a
> usable POR_EL0.  But that's not really our problem here.

This is indeed orthogonal - the SIGSEGV handler will be run with
POR_EL0_INIT, like any other handler. The value we store in the frame is
unrelated.

> (I'm not saying that the kernel necessarily fails to do this -- I
> haven't checked -- but just trying to understand the problem here.)
>
>
> The actual problem here is that if the SIGSEGV handler wants to bail
> out with a siglongjmp(), there is no way to determine the correct value
> of POR_EL0 to restore.

Correct, but again this is true of any other record - for instance TPIDR2.

> I wonder whether POR_EL0 should be saved in sigjmp_buf (depending on
> whether sigjmp_buf is horribly inextensible and also full up, or merely
> horribly inextensible).

It very much feels that this is the case - if a handler relies on
longjmp() or setcontext() to restore a known state, then POR_EL0 should
be part of that state.

>
> Does anyone know whether PKRU in sigjmp_buf on x86?

I can't say for sure but I don't see PKRU being handled in
setjmp/longjmp in glibc at least.

Kevin
diff mbox series

Patch

diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index f5fb48dabebe..d2e4e50977ae 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -66,9 +66,63 @@  struct rt_sigframe_user_layout {
 	unsigned long end_offset;
 };
 
+/*
+ * Holds any EL0-controlled state that influences unprivileged memory accesses.
+ * This includes both accesses done in userspace and uaccess done in the kernel.
+ *
+ * This state needs to be carefully managed to ensure that it doesn't cause
+ * uaccess to fail when setting up the signal frame, and the signal handler
+ * itself also expects a well-defined state when entered.
+ */
+struct user_access_state {
+	u64 por_el0;
+};
+
 #define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16)
 #define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16)
 
+/*
+ * Save the unpriv access state into ua_state and reset it to disable any
+ * restrictions.
+ */
+static void save_reset_user_access_state(struct user_access_state *ua_state)
+{
+	if (system_supports_poe()) {
+		/*
+		 * Enable all permissions in all 8 keys
+		 * (inspired by REPEAT_BYTE())
+		 */
+		u64 por_enable_all = (~0u / POE_MASK) * POE_RXW;
+
+		ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0);
+		write_sysreg_s(por_enable_all, SYS_POR_EL0);
+		/* Ensure that any subsequent uaccess observes the updated value */
+		isb();
+	}
+}
+
+/*
+ * Set the unpriv access state for invoking the signal handler.
+ *
+ * No uaccess should be done after that function is called.
+ */
+static void set_handler_user_access_state(void)
+{
+	if (system_supports_poe())
+		write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0);
+}
+
+/*
+ * Restore the unpriv access state to the values saved in ua_state.
+ *
+ * No uaccess should be done after that function is called.
+ */
+static void restore_user_access_state(const struct user_access_state *ua_state)
+{
+	if (system_supports_poe())
+		write_sysreg_s(ua_state->por_el0, SYS_POR_EL0);
+}
+
 static void init_user_layout(struct rt_sigframe_user_layout *user)
 {
 	const size_t reserved_size =
@@ -260,18 +314,20 @@  static int restore_fpmr_context(struct user_ctxs *user)
 	return err;
 }
 
-static int preserve_poe_context(struct poe_context __user *ctx)
+static int preserve_poe_context(struct poe_context __user *ctx,
+				const struct user_access_state *ua_state)
 {
 	int err = 0;
 
 	__put_user_error(POE_MAGIC, &ctx->head.magic, err);
 	__put_user_error(sizeof(*ctx), &ctx->head.size, err);
-	__put_user_error(read_sysreg_s(SYS_POR_EL0), &ctx->por_el0, err);
+	__put_user_error(ua_state->por_el0, &ctx->por_el0, err);
 
 	return err;
 }
 
-static int restore_poe_context(struct user_ctxs *user)
+static int restore_poe_context(struct user_ctxs *user,
+			       struct user_access_state *ua_state)
 {
 	u64 por_el0;
 	int err = 0;
@@ -281,7 +337,7 @@  static int restore_poe_context(struct user_ctxs *user)
 
 	__get_user_error(por_el0, &(user->poe->por_el0), err);
 	if (!err)
-		write_sysreg_s(por_el0, SYS_POR_EL0);
+		ua_state->por_el0 = por_el0;
 
 	return err;
 }
@@ -849,7 +905,8 @@  static int parse_user_sigframe(struct user_ctxs *user,
 }
 
 static int restore_sigframe(struct pt_regs *regs,
-			    struct rt_sigframe __user *sf)
+			    struct rt_sigframe __user *sf,
+			    struct user_access_state *ua_state)
 {
 	sigset_t set;
 	int i, err;
@@ -898,7 +955,7 @@  static int restore_sigframe(struct pt_regs *regs,
 		err = restore_zt_context(&user);
 
 	if (err == 0 && system_supports_poe() && user.poe)
-		err = restore_poe_context(&user);
+		err = restore_poe_context(&user, ua_state);
 
 	return err;
 }
@@ -907,6 +964,7 @@  SYSCALL_DEFINE0(rt_sigreturn)
 {
 	struct pt_regs *regs = current_pt_regs();
 	struct rt_sigframe __user *frame;
+	struct user_access_state ua_state;
 
 	/* Always make any pending restarted system calls return -EINTR */
 	current->restart_block.fn = do_no_restart_syscall;
@@ -923,12 +981,14 @@  SYSCALL_DEFINE0(rt_sigreturn)
 	if (!access_ok(frame, sizeof (*frame)))
 		goto badframe;
 
-	if (restore_sigframe(regs, frame))
+	if (restore_sigframe(regs, frame, &ua_state))
 		goto badframe;
 
 	if (restore_altstack(&frame->uc.uc_stack))
 		goto badframe;
 
+	restore_user_access_state(&ua_state);
+
 	return regs->regs[0];
 
 badframe:
@@ -1034,7 +1094,8 @@  static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
 }
 
 static int setup_sigframe(struct rt_sigframe_user_layout *user,
-			  struct pt_regs *regs, sigset_t *set)
+			  struct pt_regs *regs, sigset_t *set,
+			  const struct user_access_state *ua_state)
 {
 	int i, err = 0;
 	struct rt_sigframe __user *sf = user->sigframe;
@@ -1096,10 +1157,9 @@  static int setup_sigframe(struct rt_sigframe_user_layout *user,
 		struct poe_context __user *poe_ctx =
 			apply_user_offset(user, user->poe_offset);
 
-		err |= preserve_poe_context(poe_ctx);
+		err |= preserve_poe_context(poe_ctx, ua_state);
 	}
 
-
 	/* ZA state if present */
 	if (system_supports_sme() && err == 0 && user->za_offset) {
 		struct za_context __user *za_ctx =
@@ -1236,9 +1296,6 @@  static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
 		sme_smstop();
 	}
 
-	if (system_supports_poe())
-		write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0);
-
 	if (ka->sa.sa_flags & SA_RESTORER)
 		sigtramp = ka->sa.sa_restorer;
 	else
@@ -1252,6 +1309,7 @@  static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
 {
 	struct rt_sigframe_user_layout user;
 	struct rt_sigframe __user *frame;
+	struct user_access_state ua_state;
 	int err = 0;
 
 	fpsimd_signal_preserve_current_state();
@@ -1259,13 +1317,14 @@  static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
 	if (get_sigframe(&user, ksig, regs))
 		return 1;
 
+	save_reset_user_access_state(&ua_state);
 	frame = user.sigframe;
 
 	__put_user_error(0, &frame->uc.uc_flags, err);
 	__put_user_error(NULL, &frame->uc.uc_link, err);
 
 	err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
-	err |= setup_sigframe(&user, regs, set);
+	err |= setup_sigframe(&user, regs, set, &ua_state);
 	if (err == 0) {
 		setup_return(regs, &ksig->ka, &user, usig);
 		if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
@@ -1275,6 +1334,11 @@  static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
 		}
 	}
 
+	if (err == 0)
+		set_handler_user_access_state();
+	else
+		restore_user_access_state(&ua_state);
+
 	return err;
 }