diff mbox series

[v4,4/7] uaccess-buffer: add CONFIG_GENERIC_ENTRY support

Message ID 20211209221545.2333249-5-pcc@google.com (mailing list archive)
State New, archived
Headers show
Series kernel: introduce uaccess logging | expand

Commit Message

Peter Collingbourne Dec. 9, 2021, 10:15 p.m. UTC
Add uaccess logging support on architectures that use
CONFIG_GENERIC_ENTRY (currently only s390 and x86).

Link: https://linux-review.googlesource.com/id/I3c5eb19a7e4a1dbe6095f6971f7826c4b0663f7d
Signed-off-by: Peter Collingbourne <pcc@google.com>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
---
v4:
- move pre/post-exit-loop calls into if statement

 arch/Kconfig                 |  1 +
 include/linux/entry-common.h |  2 ++
 include/linux/thread_info.h  |  4 ++++
 kernel/entry/common.c        | 14 +++++++++++++-
 4 files changed, 20 insertions(+), 1 deletion(-)

Comments

Thomas Gleixner Dec. 11, 2021, 11:50 a.m. UTC | #1
Peter,

On Thu, Dec 09 2021 at 14:15, Peter Collingbourne wrote:
> @@ -197,14 +201,19 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>  static void exit_to_user_mode_prepare(struct pt_regs *regs)
>  {
>  	unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> +	bool uaccess_buffer_pending;
>  
>  	lockdep_assert_irqs_disabled();
>  
>  	/* Flush pending rcuog wakeup before the last need_resched() check */
>  	tick_nohz_user_enter_prepare();
>  
> -	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> +	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) {
> +		bool uaccess_buffer_pending = uaccess_buffer_pre_exit_loop();
> +
>  		ti_work = exit_to_user_mode_loop(regs, ti_work);
> +		uaccess_buffer_post_exit_loop(uaccess_buffer_pending);

What? Let me look at these two functions, which are so full of useful
comments:

> +bool __uaccess_buffer_pre_exit_loop(void)
> +{
> +	struct uaccess_buffer_info *buf = &current->uaccess_buffer;
> +	struct uaccess_descriptor __user *desc_ptr;
> +	sigset_t tmp_mask;
> +
> +	if (get_user(desc_ptr, buf->desc_ptr_ptr) || !desc_ptr)
> +		return false;
> +
> +	current->real_blocked = current->blocked;
> +	sigfillset(&tmp_mask);
> +	set_current_blocked(&tmp_mask);

This prevents signal delivery in exit_to_user_mode_loop(), right?

> +	return true;
> +}
> +
> +void __uaccess_buffer_post_exit_loop(void)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&current->sighand->siglock, flags);
> +	current->blocked = current->real_blocked;
> +	recalc_sigpending();

This restores the signal blocked mask _after_ exit_to_user_mode_loop()
has completed, recalculates pending signals and goes out to user space
with eventually pending signals.

How is this supposed to be even remotely correct?

But that aside, let me look at the whole picture as I understand it from
reverse engineering it. Yes, reverse engineering, because there are
neither comments in the code nor any useful information in the
changelogs of 2/7 and 4/7. Also the cover letter and the "documentation"
are not explaining any of this and just blurb about sanitizers and how
wonderful this all is.
 
> @@ -70,6 +71,9 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
>  			return ret;
>  	}
>  
> +	if (work & SYSCALL_WORK_UACCESS_BUFFER_ENTRY)
> +		uaccess_buffer_syscall_entry();

This conditionally sets SYSCALL_WORK_UACCESS_BUFFER_EXIT.

> @@ -247,6 +256,9 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
>  
>  	audit_syscall_exit(regs);
>  
> +	if (work & SYSCALL_WORK_UACCESS_BUFFER_EXIT)
> +		uaccess_buffer_syscall_exit();

When returning from the syscall and SYSCALL_WORK_UACCESS_BUFFER_EXIT is
set, then uaccess_buffer_syscall_exit() clears
SYSCALL_WORK_UACCESS_BUFFER_EXIT, right?

This is called _before_ exit_to_user_mode_prepare(). So why is this
__uaccess_buffer_pre/post_exit_loop() required at all?

It's not required at all. Why?

Simply because there are only two ways how exit_to_user_mode_prepare()
can be reached:

  1) When returning from a syscall

  2) When returning from an interrupt which hit user mode execution

#1 SYSCALL_WORK_UACCESS_BUFFER_EXIT is cleared _before_
   exit_to_user_mode_prepare() is reached as documented above.

#2 SYSCALL_WORK_UACCESS_BUFFER_EXIT cannot be set because the entry
   to the kernel does not go through syscall_trace_enter().

So what is this pre/post exit loop code about? Handle something which
cannot happen in the first place?

If at all this would warrant a:

	if (WARN_ON_ONCE(test_syscall_work(UACCESS_BUFFER_ENTRY)))
        	do_something_sensible();

instead of adding undocumented voodoo w/o providing any rationale. Well,
I can see why that was not provided because there is no rationale to
begin with.

Seriously, I'm all for better instrumentation and analysis, but if the
code provided for that is incomprehensible, uncommented and
undocumented, then the result is worse than what we have now.

If you think that this qualifies as documentation:

> +/*
> + * uaccess_buffer_syscall_entry - hook to be run before syscall entry
> + */

> +/*
> + * uaccess_buffer_syscall_exit - hook to be run after syscall exit
> + */

> +/*
> + * uaccess_buffer_pre_exit_loop - hook to be run immediately before the
> + * pre-kernel-exit loop that handles signals, tracing etc. Returns a bool to
> + * be passed to uaccess_buffer_post_exit_loop.
> + */

> +/*
> + * uaccess_buffer_post_exit_loop - hook to be run immediately after the
> + * pre-kernel-exit loop that handles signals, tracing etc.
> + * @pending: the bool returned from uaccess_buffer_pre_exit_loop.
> + */

then we have a very different understanding of what documentation
should provide.

Thanks,

        tglx
Peter Collingbourne Dec. 16, 2021, 1:25 a.m. UTC | #2
On Sat, Dec 11, 2021 at 3:50 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> Peter,
>
> On Thu, Dec 09 2021 at 14:15, Peter Collingbourne wrote:
> > @@ -197,14 +201,19 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> >  static void exit_to_user_mode_prepare(struct pt_regs *regs)
> >  {
> >       unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
> > +     bool uaccess_buffer_pending;
> >
> >       lockdep_assert_irqs_disabled();
> >
> >       /* Flush pending rcuog wakeup before the last need_resched() check */
> >       tick_nohz_user_enter_prepare();
> >
> > -     if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> > +     if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) {
> > +             bool uaccess_buffer_pending = uaccess_buffer_pre_exit_loop();
> > +
> >               ti_work = exit_to_user_mode_loop(regs, ti_work);
> > +             uaccess_buffer_post_exit_loop(uaccess_buffer_pending);
>
> What? Let me look at these two functions, which are so full of useful
> comments:
>
> > +bool __uaccess_buffer_pre_exit_loop(void)
> > +{
> > +     struct uaccess_buffer_info *buf = &current->uaccess_buffer;
> > +     struct uaccess_descriptor __user *desc_ptr;
> > +     sigset_t tmp_mask;
> > +
> > +     if (get_user(desc_ptr, buf->desc_ptr_ptr) || !desc_ptr)
> > +             return false;
> > +
> > +     current->real_blocked = current->blocked;
> > +     sigfillset(&tmp_mask);
> > +     set_current_blocked(&tmp_mask);
>
> This prevents signal delivery in exit_to_user_mode_loop(), right?

It prevents asynchronous signal delivery, same as with
sigprocmask(SIG_SETMASK, set, NULL) with a full set.

> > +     return true;
> > +}
> > +
> > +void __uaccess_buffer_post_exit_loop(void)
> > +{
> > +     unsigned long flags;
> > +
> > +     spin_lock_irqsave(&current->sighand->siglock, flags);
> > +     current->blocked = current->real_blocked;
> > +     recalc_sigpending();
>
> This restores the signal blocked mask _after_ exit_to_user_mode_loop()
> has completed, recalculates pending signals and goes out to user space
> with eventually pending signals.
>
> How is this supposed to be even remotely correct?

Please see this paragraph from the documentation:

When entering the kernel with a non-zero uaccess descriptor
address for a reason other than a syscall (for example, when
IPI'd due to an incoming asynchronous signal), any signals other
than ``SIGKILL`` and ``SIGSTOP`` are masked as if by calling
``sigprocmask(SIG_SETMASK, set, NULL)`` where ``set`` has been
initialized with ``sigfillset(set)``. This is to prevent incoming
signals from interfering with uaccess logging.

I believe that we will also go out to userspace with pending signals
when one of the signals that came in was a masked (via sigprocmask)
asynchronous signal, so this is an expected state.

> But that aside, let me look at the whole picture as I understand it from
> reverse engineering it. Yes, reverse engineering, because there are
> neither comments in the code nor any useful information in the
> changelogs of 2/7 and 4/7. Also the cover letter and the "documentation"
> are not explaining any of this and just blurb about sanitizers and how
> wonderful this all is.

The whole business with pre/post_exit_loop() is implementing the
paragraph mentioned above. I imagine that the kerneldoc comments could
be improved by referencing that paragraph.

> > @@ -70,6 +71,9 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
> >                       return ret;
> >       }
> >
> > +     if (work & SYSCALL_WORK_UACCESS_BUFFER_ENTRY)
> > +             uaccess_buffer_syscall_entry();
>
> This conditionally sets SYSCALL_WORK_UACCESS_BUFFER_EXIT.

Right.

> > @@ -247,6 +256,9 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
> >
> >       audit_syscall_exit(regs);
> >
> > +     if (work & SYSCALL_WORK_UACCESS_BUFFER_EXIT)
> > +             uaccess_buffer_syscall_exit();
>
> When returning from the syscall and SYSCALL_WORK_UACCESS_BUFFER_EXIT is
> set, then uaccess_buffer_syscall_exit() clears
> SYSCALL_WORK_UACCESS_BUFFER_EXIT, right?

Right.

> This is called _before_ exit_to_user_mode_prepare(). So why is this
> __uaccess_buffer_pre/post_exit_loop() required at all?
>
> It's not required at all. Why?
>
> Simply because there are only two ways how exit_to_user_mode_prepare()
> can be reached:
>
>   1) When returning from a syscall
>
>   2) When returning from an interrupt which hit user mode execution
>
> #1 SYSCALL_WORK_UACCESS_BUFFER_EXIT is cleared _before_
>    exit_to_user_mode_prepare() is reached as documented above.
>
> #2 SYSCALL_WORK_UACCESS_BUFFER_EXIT cannot be set because the entry
>    to the kernel does not go through syscall_trace_enter().
>
> So what is this pre/post exit loop code about? Handle something which
> cannot happen in the first place?

The pre/post_exit_loop() functions are checking UACCESS_BUFFER_ENTRY,
which is set when prctl(PR_SET_UACCESS_DESCRIPTOR_ADDR_ADDR) has been
used to set the uaccess descriptor address address to a non-zero
value. It is a different flag from UACCESS_BUFFER_EXIT. It is
certainly possible for the ENTRY flag to be set in your 2) above,
since that flag is not normally modified while inside the kernel.

> If at all this would warrant a:
>
>         if (WARN_ON_ONCE(test_syscall_work(UACCESS_BUFFER_ENTRY)))
>                 do_something_sensible();
>
> instead of adding undocumented voodoo w/o providing any rationale. Well,
> I can see why that was not provided because there is no rationale to
> begin with.
>
> Seriously, I'm all for better instrumentation and analysis, but if the
> code provided for that is incomprehensible, uncommented and
> undocumented, then the result is worse than what we have now.

Okay, as well as improving the kerneldoc I'll add some code comments
to make it clearer what's going on.

> If you think that this qualifies as documentation:
>
> > +/*
> > + * uaccess_buffer_syscall_entry - hook to be run before syscall entry
> > + */
>
> > +/*
> > + * uaccess_buffer_syscall_exit - hook to be run after syscall exit
> > + */
>
> > +/*
> > + * uaccess_buffer_pre_exit_loop - hook to be run immediately before the
> > + * pre-kernel-exit loop that handles signals, tracing etc. Returns a bool to
> > + * be passed to uaccess_buffer_post_exit_loop.
> > + */
>
> > +/*
> > + * uaccess_buffer_post_exit_loop - hook to be run immediately after the
> > + * pre-kernel-exit loop that handles signals, tracing etc.
> > + * @pending: the bool returned from uaccess_buffer_pre_exit_loop.
> > + */
>
> then we have a very different understanding of what documentation
> should provide.

This was intended as interface documentation, so it doesn't go into
too many details. It could certainly be improved though by referencing
the user documentation, as I mentioned above.

Peter
Thomas Gleixner Dec. 16, 2021, 1:05 p.m. UTC | #3
Peter,

On Wed, Dec 15 2021 at 17:25, Peter Collingbourne wrote:
> On Sat, Dec 11, 2021 at 3:50 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>> This restores the signal blocked mask _after_ exit_to_user_mode_loop()
>> has completed, recalculates pending signals and goes out to user space
>> with eventually pending signals.
>>
>> How is this supposed to be even remotely correct?
>
> Please see this paragraph from the documentation:
>
> When entering the kernel with a non-zero uaccess descriptor
> address for a reason other than a syscall (for example, when
> IPI'd due to an incoming asynchronous signal), any signals other
> than ``SIGKILL`` and ``SIGSTOP`` are masked as if by calling
> ``sigprocmask(SIG_SETMASK, set, NULL)`` where ``set`` has been
> initialized with ``sigfillset(set)``. This is to prevent incoming
> signals from interfering with uaccess logging.
>
> I believe that we will also go out to userspace with pending signals
> when one of the signals that came in was a masked (via sigprocmask)
> asynchronous signal, so this is an expected state.

Believe is not part of a technical analysis, believe belongs into the
realm of religion.

It's a fundamental difference whether the program masks signals itself
or the kernel decides to do that just because.

Pending signals, which are not masked by the process, have to be
delivered _before_ returning to user space.

    That's the expected behaviour. Period.

Instrumentation which changes the semantics of the observed code is
broken by definition.

>> So what is this pre/post exit loop code about? Handle something which
>> cannot happen in the first place?
>
> The pre/post_exit_loop() functions are checking UACCESS_BUFFER_ENTRY,
> which is set when prctl(PR_SET_UACCESS_DESCRIPTOR_ADDR_ADDR) has been
> used to set the uaccess descriptor address address to a non-zero
> value. It is a different flag from UACCESS_BUFFER_EXIT. It is
> certainly possible for the ENTRY flag to be set in your 2) above,
> since that flag is not normally modified while inside the kernel.

Let me try again. The logger is only active when:

    1) PR_SET_UACCESS_DESCRIPTOR_ADDR_ADDR has set an address, which
       sets UACCESS_BUFFER_ENTRY

    2) The task enters the kernel via syscall and the syscall entry
       observes UACCESS_BUFFER_ENTRY and sets UACCESS_BUFFER_EXIT

because the log functions only log when UACCESS_BUFFER_EXIT is set.

UACCESS_BUFFER_EXIT is cleared in the syscall exit path _before_ the
exit to usermode loop is reached, which means signal delivery is _NOT_
logged at all.

A non-syscall entry from user space - interrupt, exception, NMI - will
_NOT_ set UACCESS_BUFFER_EXIT because it takes a different entry
path. So when that non-syscall entry returns and delivers a signal then
there is no logging.

When the task has entered the kernel via a syscall and the kernel gets
interrupted and that interruption raises a signal, then there is no
signal delivery. The interrupt returns to kernel mode, which obviously
does not go through exit_to_user_mode(). The raised signal is delivered
when the task returns from the syscall to user mode, but that won't be
logged because UACCESS_BUFFER_EXIT is already cleared before the exit to
user mode loop is reached.

See?

>> then we have a very different understanding of what documentation
>> should provide.
>
> This was intended as interface documentation, so it doesn't go into
> too many details. It could certainly be improved though by referencing
> the user documentation, as I mentioned above.

Explanations which are required to make the code understandable have to
be in the code/kernel-doc comments and not in some disjunct place. This
disjunct documentation is guaranteed to be out of date within no time.

Thanks,

        tglx
Peter Collingbourne Dec. 17, 2021, 12:09 a.m. UTC | #4
On Thu, Dec 16, 2021 at 5:05 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> Peter,
>
> On Wed, Dec 15 2021 at 17:25, Peter Collingbourne wrote:
> > On Sat, Dec 11, 2021 at 3:50 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> >> This restores the signal blocked mask _after_ exit_to_user_mode_loop()
> >> has completed, recalculates pending signals and goes out to user space
> >> with eventually pending signals.
> >>
> >> How is this supposed to be even remotely correct?
> >
> > Please see this paragraph from the documentation:
> >
> > When entering the kernel with a non-zero uaccess descriptor
> > address for a reason other than a syscall (for example, when
> > IPI'd due to an incoming asynchronous signal), any signals other
> > than ``SIGKILL`` and ``SIGSTOP`` are masked as if by calling
> > ``sigprocmask(SIG_SETMASK, set, NULL)`` where ``set`` has been
> > initialized with ``sigfillset(set)``. This is to prevent incoming
> > signals from interfering with uaccess logging.
> >
> > I believe that we will also go out to userspace with pending signals
> > when one of the signals that came in was a masked (via sigprocmask)
> > asynchronous signal, so this is an expected state.
>
> Believe is not part of a technical analysis, believe belongs into the
> realm of religion.
>
> It's a fundamental difference whether the program masks signals itself
> or the kernel decides to do that just because.
>
> Pending signals, which are not masked by the process, have to be
> delivered _before_ returning to user space.
>
>     That's the expected behaviour. Period.
>
> Instrumentation which changes the semantics of the observed code is
> broken by definition.

The idea is that the uaccess descriptor address would be set to a
non-zero value inside the syscall wrapper, before performing the
syscall. Since the kernel will set the uaccess descriptor address to
zero before returning from a syscall, at no point should the caller of
the syscall wrapper be executing with a non-zero uaccess descriptor
address. At worst, a signal will be delivered to a task executing a
syscall wrapper a few instructions later than it would otherwise, but
that's not really important because the determination of the exact
delivery point of an asynchronous signal is fundamentally racy anyway.

Basically, it's as if the syscall wrapper did this:

// During task startup:
uint64_t addr;
prctl(PR_SET_UACCESS_DESCRIPTOR_ADDR_ADDR, &addr, 0, 0, 0);

// Wrapper for syscall "x"
int x(...) {
  sigset_t set, old_set;
  struct uaccess_descriptor desc = { ... };

  addr = &desc;
  // The following two statements implicitly occur atomically together
with setting addr:
  sigfillset(set);
  sigprocmask(SIG_SETMASK, set, old_set);

  syscall(__NR_x ...,);
  // The following two statements implicitly occur atomically together
with the syscall:
  sigprocmask(SIG_SETMASK, old_set, NULL);
  addr = 0;

  // Now the uaccesses for syscall "x" are logged to "desc".
}

Aside from the guarantees of atomicity, this really seems no different
from the kernel providing another API for setting the signal mask.

> >> So what is this pre/post exit loop code about? Handle something which
> >> cannot happen in the first place?
> >
> > The pre/post_exit_loop() functions are checking UACCESS_BUFFER_ENTRY,
> > which is set when prctl(PR_SET_UACCESS_DESCRIPTOR_ADDR_ADDR) has been
> > used to set the uaccess descriptor address address to a non-zero
> > value. It is a different flag from UACCESS_BUFFER_EXIT. It is
> > certainly possible for the ENTRY flag to be set in your 2) above,
> > since that flag is not normally modified while inside the kernel.
>
> Let me try again. The logger is only active when:
>
>     1) PR_SET_UACCESS_DESCRIPTOR_ADDR_ADDR has set an address, which
>        sets UACCESS_BUFFER_ENTRY
>
>     2) The task enters the kernel via syscall and the syscall entry
>        observes UACCESS_BUFFER_ENTRY and sets UACCESS_BUFFER_EXIT
>
> because the log functions only log when UACCESS_BUFFER_EXIT is set.
>
> UACCESS_BUFFER_EXIT is cleared in the syscall exit path _before_ the
> exit to usermode loop is reached, which means signal delivery is _NOT_
> logged at all.

Right. It is not the intent to log uaccesses associated with signal
delivery. Only uaccesses that occur while handling the syscall itself
are logged.

> A non-syscall entry from user space - interrupt, exception, NMI - will
> _NOT_ set UACCESS_BUFFER_EXIT because it takes a different entry
> path. So when that non-syscall entry returns and delivers a signal then
> there is no logging.

Again, that's fine, there's no intent to log that.

> When the task has entered the kernel via a syscall and the kernel gets
> interrupted and that interruption raises a signal, then there is no
> signal delivery. The interrupt returns to kernel mode, which obviously
> does not go through exit_to_user_mode(). The raised signal is delivered
> when the task returns from the syscall to user mode, but that won't be
> logged because UACCESS_BUFFER_EXIT is already cleared before the exit to
> user mode loop is reached.
>
> See?

Perhaps there is a misunderstanding of the purpose of the signal
blocking with non-zero uaccess descriptor address. It isn't there
because we want to log anything about these signals. It's there
because we don't want a signal handler to be invoked between when we
arrange for the kernel to log the next syscall and when we issue the
syscall that we want to log, because that could lead to the signal
handler's syscalls being logged instead of the syscall that we intend
to log.

Consider the syscall wrapper for syscall "x" above, and imagine that
we didn't have the sigprocmask statements, and imagine that a signal
came in after storing &desc to addr but before the call to syscall.
Also imagine that the handler for that signal is unaware of uaccess
logging, so it just issues syscalls directly without touching addr.
Now the first syscall performed by the signal handler will be logged,
instead of the intended syscall "x", because the kernel will read the
uaccess descriptor intended for logging syscall "x" from addr when
entering the kernel for the signal handler's first syscall.

The kernel setting addr to 0 during the syscall is also necessary in
order for the kernel to continue processing signals normally once the
logged syscall has returned. Effectively any incoming signals are
queued until we have finished processing the logged syscall. Because
the kernel has set addr to 0, it refrains from blocking signals when
returning to userspace from the logged syscall, and therefore any
pending signals are delivered. Any syscalls that occur in any signal
handlers invoked via this signal delivery will not interfere with the
previously collected log for syscall "x", precisely because addr was
set to 0 by the kernel. If we left it up to userspace to set addr back
to 0, we would have the same problem as if we didn't have the
sigprocmask statements, but now the critical section extends until the
userspace program sets addr to 0. Furthermore, a userspace program
setting addr to 0 would not automatically cause the pending signals to
be delivered (because simply storing a value to memory from userspace
will not necessarily trigger a kernel entry), and signals could
therefore be left blocked for longer than expected (at least until the
next kernel entry).

> >> then we have a very differrent understanding of what documentation
> >> should provide.
> >
> > This was intended as interface documentation, so it doesn't go into
> > too many details. It could certainly be improved though by referencing
> > the user documentation, as I mentioned above.
>
> Explanations which are required to make the code understandable have to
> be in the code/kernel-doc comments and not in some disjunct place. This
> disjunct documentation is guaranteed to be out of date within no time.

Got it. From our discussion it's clear that the justification for the
design of the uaccess logging interface (especially the signal
handling parts) needs to be documented in the code in order to avoid
confusion.

Peter
Thomas Gleixner Dec. 17, 2021, 6:42 p.m. UTC | #5
Peter,

On Thu, Dec 16 2021 at 16:09, Peter Collingbourne wrote:
> userspace program sets addr to 0. Furthermore, a userspace program
> setting addr to 0 would not automatically cause the pending signals to
> be delivered (because simply storing a value to memory from userspace
> will not necessarily trigger a kernel entry), and signals could
> therefore be left blocked for longer than expected (at least until the
> next kernel entry).

Groan, so what you are trying to prevent is:

   *ptr = addr;

--> interrupt
       signal raised

    signal delivery

    signal handler
      syscall()   <- Logs this syscall
      sigreturn;

   syscall() <- Is not logged

I must have missed that detail in these novel sized comments all over
the place.

Yes, I can see how that pre/post muck solves this, but TBH while it is
admittedly a smart hack it's also a horrible hack.

There are a two aspects which I really dislike:

  - It's yet another ad hoc 'solution' to scratch 'my particular itch'

  - It's adding a horrorshow in the syscall hotpath. We have already
    enough gunk there. No need to add more.

The problem you are trying to solve is to instrument user accesses of
the kernel, which is special purpose tracing, right?

May I ask why this is not solvable via tracepoints?

  DECLARE_EVENT_CLASS(uaccess_class,....);
  DECLARE_EVENT(uaccess_class, uaccess_read,...);
  DECLARE_EVENT(uaccess_class, uaccess_write,...);

    trace_uaccess_read(from, n);

    trace_uaccess_write(to, n);

Tracepoints have filters, tooling, libraries etc. Zero code except for
the tracepoints themself. They are disabled by default with a static
key, which means very close to 0 overhead.

Aside of that such tracepoints can be used for other purposes as well
and are therefore not bound to 'my particular itch'.

There are obviously some questions to solve versus filtering, but even
with a stupid tid based filter, it's easy enough to filter out the stuff
you're interested in. E.g. to filter out the signal scenario above all
you need is to enable two more tracepoints:

   signal:signal_deliver
   syscalls:sys_enter_rt_sigreturn

and when analyzing the event stream you can just skip the noise between
signal:signal_deliver and syscalls:sys_enter_rt_sigreturn trace entries.

There are other fancies like BPF which can be used for filtering and
filling a map with entries.

The only downside is that system configuration restricts access and
requires certain privileges. But is that a real problem for the purpose
at hand, sanitizers and validation tools?

I don't think it is, but you surely can provide more information here.

Thanks,

        tglx
Peter Collingbourne Jan. 10, 2022, 9:43 p.m. UTC | #6
On Fri, Dec 17, 2021 at 10:42 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> Peter,
>
> On Thu, Dec 16 2021 at 16:09, Peter Collingbourne wrote:
> > userspace program sets addr to 0. Furthermore, a userspace program
> > setting addr to 0 would not automatically cause the pending signals to
> > be delivered (because simply storing a value to memory from userspace
> > will not necessarily trigger a kernel entry), and signals could
> > therefore be left blocked for longer than expected (at least until the
> > next kernel entry).
>
> Groan, so what you are trying to prevent is:
>
>    *ptr = addr;
>
> --> interrupt
>        signal raised
>
>     signal delivery
>
>     signal handler
>       syscall()   <- Logs this syscall
>       sigreturn;
>
>    syscall() <- Is not logged
>
> I must have missed that detail in these novel sized comments all over
> the place.
>
> Yes, I can see how that pre/post muck solves this, but TBH while it is
> admittedly a smart hack it's also a horrible hack.
>
> There are a two aspects which I really dislike:
>
>   - It's yet another ad hoc 'solution' to scratch 'my particular itch'
>
>   - It's adding a horrorshow in the syscall hotpath. We have already
>     enough gunk there. No need to add more.
>
> The problem you are trying to solve is to instrument user accesses of
> the kernel, which is special purpose tracing, right?
>
> May I ask why this is not solvable via tracepoints?
>
>   DECLARE_EVENT_CLASS(uaccess_class,....);
>   DECLARE_EVENT(uaccess_class, uaccess_read,...);
>   DECLARE_EVENT(uaccess_class, uaccess_write,...);
>
>     trace_uaccess_read(from, n);
>
>     trace_uaccess_write(to, n);
>
> Tracepoints have filters, tooling, libraries etc. Zero code except for
> the tracepoints themself. They are disabled by default with a static
> key, which means very close to 0 overhead.
>
> Aside of that such tracepoints can be used for other purposes as well
> and are therefore not bound to 'my particular itch'.
>
> There are obviously some questions to solve versus filtering, but even
> with a stupid tid based filter, it's easy enough to filter out the stuff
> you're interested in. E.g. to filter out the signal scenario above all
> you need is to enable two more tracepoints:
>
>    signal:signal_deliver
>    syscalls:sys_enter_rt_sigreturn
>
> and when analyzing the event stream you can just skip the noise between
> signal:signal_deliver and syscalls:sys_enter_rt_sigreturn trace entries.

I'm afraid that it won't always work. This is because the control flow
can leave a signal handler without going through sigreturn (e.g. when
throwing an exception through the signal handler in C++, or via
longjmp).

But aside from that, there seem to be more fundamental problems with
using tracepoints for this. Let me start by saying that the uaccess
monitoring needs to be synchronous. This is because we need to read
the sanitizer metadata (e.g. memory tags for HWASan) for the memory
accessed by the syscall at the time that the syscall takes place,
before the syscall wrapper function returns. If we read it later, the
memory accessed by the syscall may have been deallocated or
reallocated, so we may end up producing a false positive
use-after-free (if deallocated) or buffer-overflow (if reallocated)
report.

As far as I can tell, there are four main ways that a userspace
program can access tracepoint events:

1) /sys/kernel/debug/tracing/trace_pipe
2) perf_event_open()
3) BPF
4) SIGTRAP on perf events

Let me go over each of them in turn. I don't think any of them will
work, but let me know if you think I made a mistake.

1) The event monitoring via trace_pipe appears to be a mechanism that
is global to the system, so it isn't suitable for an environment where
multiple processes may have sanitizers enabled.

2) This seems to be closest to being feasible, but it looks like there
are a few issues. From reading the perf_event_open man page, the way I
imagined it could work is that we sample the events uaccess_read,
uaccess_write, signal_deliver and sys_enter_rt_sigreturn to a ring
buffer. Then in the syscall wrapper we read the buffer and filter out
the events as you proposed. We do, however, need to be careful to
avoid memory corruption with this approach -- suppose that a signal
comes in while the syscall wrapper is part way through reading the
ring buffer. The syscalls performed by the signal handler may cause
the kernel to overwrite the part of the ring buffer that the syscall
wrapper is currently reading. I think something like this may work to
avoid the overwriting:

// thread local variables:
perf_event_mmap_page *page; // initialized at thread startup
bool in_syscall = false;

long syscall(...) {
  if (in_syscall) {
    // just do the syscall without logging to avoid reentrancy issues
    return raw_syscall(...);
  }
  in_syscall = true;
  old_data_head = page->data_head;
  page->data_tail = old_data_head;
  long ret = raw_syscall(...);
  new_data_head = page->data_head;
  // read events between old_data_head and new_data_head
  in_syscall = false;
  return ret;
}

With this scheme we end up dropping events from signal handlers in
rare cases. This may be acceptable because the whole scheme is
best-effort anyway. However, if we abnormally return from the syscall
wrapper (exception or longjmp from signal handler), we will be stuck
in the state where logging is disabled, which seems more problematic.

For this to work we will need access to the arguments to the
uaccess_read and uaccess_write events. I couldn't find anything
suitable in the man page, but it does offer this tantalizing snippet:

              PERF_SAMPLE_RAW
                     Records additional data, if applicable.  Usually
returned by tracepoint events.

Maybe the arguments can be accessed via this record? Reading through
the kernel code it looks like *something* is copied in response to
this flag being set, but it is not very clear what. A comment in the
include/uapi/linux/perf_event.h header seems to indicate, however,
that even if the arguments are accessible this way, that isn't part of
the ABI:

         *      #
         *      # The RAW record below is opaque data wrt the ABI
         *      #
         *      # That is, the ABI doesn't make any promises wrt to
         *      # the stability of its content, it may vary depending
         *      # on event, hardware, kernel version and phase of
         *      # the moon.
         *      #
         *      # In other words, PERF_SAMPLE_RAW contents are not an ABI.
         *      #
         *
         *      { u32                   size;
         *        char                  data[size];}&& PERF_SAMPLE_RAW

This would make PERF_SAMPLE_RAW unsuitable for use in sanitizers
because they are generally expected not to break with new kernel
versions.

Another issue is that we need to be able to read the id file from
tracefs in order to issue the perf_event_open() syscall, but there
isn't necessarily a guarantee that tracefs will always be available
given that we will want to use this from almost every process on the
system including init and sandboxed processes. It also looks like the
whole "events" tree of tracefs is restricted to root on at least some
Android devices. There may be other privilege issues that I haven't
run into yet.

3) Installing BPF programs requires additional privileges. As
explained below, this makes it unsuitable for use with sanitizers.

4) I explained why I don't think it will work in the cover letter of
my most recent patch series:

- Tracing would need to be synchronous in order to produce useful
  stack traces. For example this could be achieved using the new SIGTRAP
  on perf events mechanism. However, this would require logging each
  access to the stack (in the form of a sigcontext) and this is more
  likely to overflow the stack due to being much larger than a uaccess
  buffer entry as well as being unbounded, in contrast to the bounded
  buffer size passed to prctl(). An approach based on signal handlers is
  also likely to fall foul of the asynchronous signal issues mentioned
  previously, together with needing sigreturn to be handled specially
  (because it copies a sigcontext from userspace) otherwise we could
  never return from the signal handler. Furthermore, arguments to the
  trace events are not available to SIGTRAP. (This on its own wouldn't
  be insurmountable though -- we could add the arguments as fields
  to siginfo.)

> There are other fancies like BPF which can be used for filtering and
> filling a map with entries.
>
> The only downside is that system configuration restricts access and
> requires certain priviledges. But is that a real problem for the purpose
> at hand, sanitizers and validation tools?
>
> I don't think it is, but you surely can provide more information here.

I'm afraid that it is a problem. We frequently deploy sanitizers in
production-like environments, which means that processes may not only
be running as non-root but may also be sandboxed. For example, a
typical HWASan deployment on an Android device will have HWASan
enabled in almost every process, including untrusted application
processes and sandboxed processes used for parsing untrusted data (and
which are particularly security sensitive).

Furthermore, we plan to use uaccess logging with the ARM Memory
Tagging Extension, which is intended to be deployed to end-user
devices. While opening holes in the sandbox for HWASan may be possible
in some cases (though inadvisable because sanitizers are meant to be
transparent to the program as much as possible, and we don't control
every sandbox used by every Android app), we shouldn't open holes in
the sandbox on end-user devices even if we were able to get every app
with a sandbox to do so as this would increase the risk to end users
(whereas the entire goal of deploying this is to *reduce* the risk).

Peter
diff mbox series

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index 17819f53ea80..bc849a61b636 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -31,6 +31,7 @@  config HOTPLUG_SMT
 	bool
 
 config GENERIC_ENTRY
+       select HAVE_ARCH_UACCESS_BUFFER
        bool
 
 config KPROBES
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 2e2b8d6140ed..973fcd1d48a3 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -42,12 +42,14 @@ 
 				 SYSCALL_WORK_SYSCALL_EMU |		\
 				 SYSCALL_WORK_SYSCALL_AUDIT |		\
 				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
+				 SYSCALL_WORK_UACCESS_BUFFER_ENTRY |	\
 				 ARCH_SYSCALL_WORK_ENTER)
 #define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
 				 SYSCALL_WORK_SYSCALL_TRACE |		\
 				 SYSCALL_WORK_SYSCALL_AUDIT |		\
 				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
 				 SYSCALL_WORK_SYSCALL_EXIT_TRAP	|	\
+				 SYSCALL_WORK_UACCESS_BUFFER_EXIT |	\
 				 ARCH_SYSCALL_WORK_EXIT)
 
 /*
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index ad0c4e041030..b0f8ea86967f 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -46,6 +46,8 @@  enum syscall_work_bit {
 	SYSCALL_WORK_BIT_SYSCALL_AUDIT,
 	SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
 	SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
+	SYSCALL_WORK_BIT_UACCESS_BUFFER_ENTRY,
+	SYSCALL_WORK_BIT_UACCESS_BUFFER_EXIT,
 };
 
 #define SYSCALL_WORK_SECCOMP		BIT(SYSCALL_WORK_BIT_SECCOMP)
@@ -55,6 +57,8 @@  enum syscall_work_bit {
 #define SYSCALL_WORK_SYSCALL_AUDIT	BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
 #define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
 #define SYSCALL_WORK_SYSCALL_EXIT_TRAP	BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_UACCESS_BUFFER_ENTRY	BIT(SYSCALL_WORK_BIT_UACCESS_BUFFER_ENTRY)
+#define SYSCALL_WORK_UACCESS_BUFFER_EXIT	BIT(SYSCALL_WORK_BIT_UACCESS_BUFFER_EXIT)
 #endif
 
 #include <asm/thread_info.h>
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index d5a61d565ad5..59ec6e3f793b 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -6,6 +6,7 @@ 
 #include <linux/livepatch.h>
 #include <linux/audit.h>
 #include <linux/tick.h>
+#include <linux/uaccess-buffer.h>
 
 #include "common.h"
 
@@ -70,6 +71,9 @@  static long syscall_trace_enter(struct pt_regs *regs, long syscall,
 			return ret;
 	}
 
+	if (work & SYSCALL_WORK_UACCESS_BUFFER_ENTRY)
+		uaccess_buffer_syscall_entry();
+
 	/* Either of the above might have changed the syscall number */
 	syscall = syscall_get_nr(current, regs);
 
@@ -197,14 +201,19 @@  static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 static void exit_to_user_mode_prepare(struct pt_regs *regs)
 {
 	unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+	bool uaccess_buffer_pending;
 
 	lockdep_assert_irqs_disabled();
 
 	/* Flush pending rcuog wakeup before the last need_resched() check */
 	tick_nohz_user_enter_prepare();
 
-	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
+	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) {
+		bool uaccess_buffer_pending = uaccess_buffer_pre_exit_loop();
+
 		ti_work = exit_to_user_mode_loop(regs, ti_work);
+		uaccess_buffer_post_exit_loop(uaccess_buffer_pending);
+	}
 
 	arch_exit_to_user_mode_prepare(regs, ti_work);
 
@@ -247,6 +256,9 @@  static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
 
 	audit_syscall_exit(regs);
 
+	if (work & SYSCALL_WORK_UACCESS_BUFFER_EXIT)
+		uaccess_buffer_syscall_exit();
+
 	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
 		trace_sys_exit(regs, syscall_get_return_value(current, regs));