diff mbox series

audit,io_uring: io_uring openat triggers audit reference count underflow

Message ID 20231012215518.GA4048@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net (mailing list archive)
State Accepted
Delegated to: Paul Moore
Headers show
Series audit,io_uring: io_uring openat triggers audit reference count underflow | expand

Commit Message

Dan Clash Oct. 12, 2023, 9:55 p.m. UTC
An io_uring openat operation can update an audit reference count
from multiple threads resulting in the call trace below.

A call to io_uring_submit() with a single openat op with a flag of
IOSQE_ASYNC results in the following reference count updates.

These first part of the system call performs two increments that do not race.

do_syscall_64()
  __do_sys_io_uring_enter()
    io_submit_sqes()
      io_openat_prep()
        __io_openat_prep()
          getname()
            getname_flags()       /* update 1 (increment) */
              __audit_getname()   /* update 2 (increment) */

The openat op is queued to an io_uring worker thread which starts the
opportunity for a race.  The system call exit performs one decrement.

do_syscall_64()
  syscall_exit_to_user_mode()
    syscall_exit_to_user_mode_prepare()
      __audit_syscall_exit()
        audit_reset_context()
           putname()              /* update 3 (decrement) */

The io_uring worker thread performs one increment and two decrements.
These updates can race with the system call decrement.

io_wqe_worker()
  io_worker_handle_work()
    io_wq_submit_work()
      io_issue_sqe()
        io_openat()
          io_openat2()
            do_filp_open()
              path_openat()
                __audit_inode()   /* update 4 (increment) */
            putname()             /* update 5 (decrement) */
        __audit_uring_exit()
          audit_reset_context()
            putname()             /* update 6 (decrement) */

The fix is to change the refcnt member of struct audit_names
from int to atomic_t.

kernel BUG at fs/namei.c:262!
Call Trace:
...
 ? putname+0x68/0x70
 audit_reset_context.part.0.constprop.0+0xe1/0x300
 __audit_uring_exit+0xda/0x1c0
 io_issue_sqe+0x1f3/0x450
 ? lock_timer_base+0x3b/0xd0
 io_wq_submit_work+0x8d/0x2b0
 ? __try_to_del_timer_sync+0x67/0xa0
 io_worker_handle_work+0x17c/0x2b0
 io_wqe_worker+0x10a/0x350

Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/lkml/MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com/
Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
Signed-off-by: Dan Clash <daclash@linux.microsoft.com>
---
 fs/namei.c         | 9 +++++----
 include/linux/fs.h | 2 +-
 kernel/auditsc.c   | 8 ++++----
 3 files changed, 10 insertions(+), 9 deletions(-)


base-commit: 0bb80ecc33a8fb5a682236443c1e740d5c917d1d

Comments

Jens Axboe Oct. 12, 2023, 10:07 p.m. UTC | #1
On 10/12/23 3:55 PM, Dan Clash wrote:
> An io_uring openat operation can update an audit reference count
> from multiple threads resulting in the call trace below.
> 
> A call to io_uring_submit() with a single openat op with a flag of
> IOSQE_ASYNC results in the following reference count updates.
> 
> These first part of the system call performs two increments that do not race.
> 
> do_syscall_64()
>   __do_sys_io_uring_enter()
>     io_submit_sqes()
>       io_openat_prep()
>         __io_openat_prep()
>           getname()
>             getname_flags()       /* update 1 (increment) */
>               __audit_getname()   /* update 2 (increment) */
> 
> The openat op is queued to an io_uring worker thread which starts the
> opportunity for a race.  The system call exit performs one decrement.
> 
> do_syscall_64()
>   syscall_exit_to_user_mode()
>     syscall_exit_to_user_mode_prepare()
>       __audit_syscall_exit()
>         audit_reset_context()
>            putname()              /* update 3 (decrement) */
> 
> The io_uring worker thread performs one increment and two decrements.
> These updates can race with the system call decrement.
> 
> io_wqe_worker()
>   io_worker_handle_work()
>     io_wq_submit_work()
>       io_issue_sqe()
>         io_openat()
>           io_openat2()
>             do_filp_open()
>               path_openat()
>                 __audit_inode()   /* update 4 (increment) */
>             putname()             /* update 5 (decrement) */
>         __audit_uring_exit()
>           audit_reset_context()
>             putname()             /* update 6 (decrement) */
> 
> The fix is to change the refcnt member of struct audit_names
> from int to atomic_t.
> 
> kernel BUG at fs/namei.c:262!
> Call Trace:
> ...
>  ? putname+0x68/0x70
>  audit_reset_context.part.0.constprop.0+0xe1/0x300
>  __audit_uring_exit+0xda/0x1c0
>  io_issue_sqe+0x1f3/0x450
>  ? lock_timer_base+0x3b/0xd0
>  io_wq_submit_work+0x8d/0x2b0
>  ? __try_to_del_timer_sync+0x67/0xa0
>  io_worker_handle_work+0x17c/0x2b0
>  io_wqe_worker+0x10a/0x350
> 
> Cc: <stable@vger.kernel.org>
> Link: https://lore.kernel.org/lkml/MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com/
> Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
> Signed-off-by: Dan Clash <daclash@linux.microsoft.com>

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Christian Brauner Oct. 13, 2023, 8:24 a.m. UTC | #2
On Thu, Oct 12, 2023 at 02:55:18PM -0700, Dan Clash wrote:
> An io_uring openat operation can update an audit reference count
> from multiple threads resulting in the call trace below.
> 
> A call to io_uring_submit() with a single openat op with a flag of
> IOSQE_ASYNC results in the following reference count updates.
> 
> These first part of the system call performs two increments that do not race.
> 
> do_syscall_64()
>   __do_sys_io_uring_enter()
>     io_submit_sqes()
>       io_openat_prep()
>         __io_openat_prep()
>           getname()
>             getname_flags()       /* update 1 (increment) */
>               __audit_getname()   /* update 2 (increment) */
> 
> The openat op is queued to an io_uring worker thread which starts the
> opportunity for a race.  The system call exit performs one decrement.
> 
> do_syscall_64()
>   syscall_exit_to_user_mode()
>     syscall_exit_to_user_mode_prepare()
>       __audit_syscall_exit()
>         audit_reset_context()
>            putname()              /* update 3 (decrement) */
> 
> The io_uring worker thread performs one increment and two decrements.
> These updates can race with the system call decrement.
> 
> io_wqe_worker()
>   io_worker_handle_work()
>     io_wq_submit_work()
>       io_issue_sqe()
>         io_openat()
>           io_openat2()
>             do_filp_open()
>               path_openat()
>                 __audit_inode()   /* update 4 (increment) */
>             putname()             /* update 5 (decrement) */
>         __audit_uring_exit()
>           audit_reset_context()
>             putname()             /* update 6 (decrement) */
> 
> The fix is to change the refcnt member of struct audit_names
> from int to atomic_t.
> 
> kernel BUG at fs/namei.c:262!
> Call Trace:
> ...
>  ? putname+0x68/0x70
>  audit_reset_context.part.0.constprop.0+0xe1/0x300
>  __audit_uring_exit+0xda/0x1c0
>  io_issue_sqe+0x1f3/0x450
>  ? lock_timer_base+0x3b/0xd0
>  io_wq_submit_work+0x8d/0x2b0
>  ? __try_to_del_timer_sync+0x67/0xa0
>  io_worker_handle_work+0x17c/0x2b0
>  io_wqe_worker+0x10a/0x350
> 
> Cc: <stable@vger.kernel.org>
> Link: https://lore.kernel.org/lkml/MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com/
> Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
> Signed-off-by: Dan Clash <daclash@linux.microsoft.com>
> ---
>  fs/namei.c         | 9 +++++----
>  include/linux/fs.h | 2 +-
>  kernel/auditsc.c   | 8 ++++----
>  3 files changed, 10 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 567ee547492b..94565bd7e73f 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -188,7 +188,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
>  		}
>  	}
>  
> -	result->refcnt = 1;
> +	atomic_set(&result->refcnt, 1);
>  	/* The empty path is special. */
>  	if (unlikely(!len)) {
>  		if (empty)
> @@ -249,7 +249,7 @@ getname_kernel(const char * filename)
>  	memcpy((char *)result->name, filename, len);
>  	result->uptr = NULL;
>  	result->aname = NULL;
> -	result->refcnt = 1;
> +	atomic_set(&result->refcnt, 1);
>  	audit_getname(result);
>  
>  	return result;
> @@ -261,9 +261,10 @@ void putname(struct filename *name)
>  	if (IS_ERR(name))
>  		return;
>  
> -	BUG_ON(name->refcnt <= 0);
> +	if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
> +		return;
>  
> -	if (--name->refcnt > 0)
> +	if (!atomic_dec_and_test(&name->refcnt))
>  		return;

Fine by me. I'd write this as:

count = atomic_dec_if_positive(&name->refcnt);
if (WARN_ON_ONCE(unlikely(count < 0))
	return;
if (count > 0)
	return;
Jens Axboe Oct. 13, 2023, 2:21 p.m. UTC | #3
On 10/13/23 2:24 AM, Christian Brauner wrote:
> On Thu, Oct 12, 2023 at 02:55:18PM -0700, Dan Clash wrote:
>> An io_uring openat operation can update an audit reference count
>> from multiple threads resulting in the call trace below.
>>
>> A call to io_uring_submit() with a single openat op with a flag of
>> IOSQE_ASYNC results in the following reference count updates.
>>
>> These first part of the system call performs two increments that do not race.
>>
>> do_syscall_64()
>>   __do_sys_io_uring_enter()
>>     io_submit_sqes()
>>       io_openat_prep()
>>         __io_openat_prep()
>>           getname()
>>             getname_flags()       /* update 1 (increment) */
>>               __audit_getname()   /* update 2 (increment) */
>>
>> The openat op is queued to an io_uring worker thread which starts the
>> opportunity for a race.  The system call exit performs one decrement.
>>
>> do_syscall_64()
>>   syscall_exit_to_user_mode()
>>     syscall_exit_to_user_mode_prepare()
>>       __audit_syscall_exit()
>>         audit_reset_context()
>>            putname()              /* update 3 (decrement) */
>>
>> The io_uring worker thread performs one increment and two decrements.
>> These updates can race with the system call decrement.
>>
>> io_wqe_worker()
>>   io_worker_handle_work()
>>     io_wq_submit_work()
>>       io_issue_sqe()
>>         io_openat()
>>           io_openat2()
>>             do_filp_open()
>>               path_openat()
>>                 __audit_inode()   /* update 4 (increment) */
>>             putname()             /* update 5 (decrement) */
>>         __audit_uring_exit()
>>           audit_reset_context()
>>             putname()             /* update 6 (decrement) */
>>
>> The fix is to change the refcnt member of struct audit_names
>> from int to atomic_t.
>>
>> kernel BUG at fs/namei.c:262!
>> Call Trace:
>> ...
>>  ? putname+0x68/0x70
>>  audit_reset_context.part.0.constprop.0+0xe1/0x300
>>  __audit_uring_exit+0xda/0x1c0
>>  io_issue_sqe+0x1f3/0x450
>>  ? lock_timer_base+0x3b/0xd0
>>  io_wq_submit_work+0x8d/0x2b0
>>  ? __try_to_del_timer_sync+0x67/0xa0
>>  io_worker_handle_work+0x17c/0x2b0
>>  io_wqe_worker+0x10a/0x350
>>
>> Cc: <stable@vger.kernel.org>
>> Link: https://lore.kernel.org/lkml/MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com/
>> Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
>> Signed-off-by: Dan Clash <daclash@linux.microsoft.com>
>> ---
>>  fs/namei.c         | 9 +++++----
>>  include/linux/fs.h | 2 +-
>>  kernel/auditsc.c   | 8 ++++----
>>  3 files changed, 10 insertions(+), 9 deletions(-)
>>
>> diff --git a/fs/namei.c b/fs/namei.c
>> index 567ee547492b..94565bd7e73f 100644
>> --- a/fs/namei.c
>> +++ b/fs/namei.c
>> @@ -188,7 +188,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
>>  		}
>>  	}
>>  
>> -	result->refcnt = 1;
>> +	atomic_set(&result->refcnt, 1);
>>  	/* The empty path is special. */
>>  	if (unlikely(!len)) {
>>  		if (empty)
>> @@ -249,7 +249,7 @@ getname_kernel(const char * filename)
>>  	memcpy((char *)result->name, filename, len);
>>  	result->uptr = NULL;
>>  	result->aname = NULL;
>> -	result->refcnt = 1;
>> +	atomic_set(&result->refcnt, 1);
>>  	audit_getname(result);
>>  
>>  	return result;
>> @@ -261,9 +261,10 @@ void putname(struct filename *name)
>>  	if (IS_ERR(name))
>>  		return;
>>  
>> -	BUG_ON(name->refcnt <= 0);
>> +	if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
>> +		return;
>>  
>> -	if (--name->refcnt > 0)
>> +	if (!atomic_dec_and_test(&name->refcnt))
>>  		return;
> 
> Fine by me. I'd write this as:
> 
> count = atomic_dec_if_positive(&name->refcnt);
> if (WARN_ON_ONCE(unlikely(count < 0))
> 	return;
> if (count > 0)
> 	return;

Would be fine too, my suspicion was that most archs don't implement a
primitive for that, and hence it might be more expensive than
atomic_read()/atomic_dec_and_test() which do. But I haven't looked at
the code generation. The dec_if_positive degenerates to a atomic cmpxchg
for most cases.
Paul Moore Oct. 13, 2023, 3:43 p.m. UTC | #4
On Fri, Oct 13, 2023 at 10:21 AM Jens Axboe <axboe@kernel.dk> wrote:
> On 10/13/23 2:24 AM, Christian Brauner wrote:
> > On Thu, Oct 12, 2023 at 02:55:18PM -0700, Dan Clash wrote:
> >> An io_uring openat operation can update an audit reference count
> >> from multiple threads resulting in the call trace below.
> >>
> >> A call to io_uring_submit() with a single openat op with a flag of
> >> IOSQE_ASYNC results in the following reference count updates.
> >>
> >> These first part of the system call performs two increments that do not race.
> >>
> >> do_syscall_64()
> >>   __do_sys_io_uring_enter()
> >>     io_submit_sqes()
> >>       io_openat_prep()
> >>         __io_openat_prep()
> >>           getname()
> >>             getname_flags()       /* update 1 (increment) */
> >>               __audit_getname()   /* update 2 (increment) */
> >>
> >> The openat op is queued to an io_uring worker thread which starts the
> >> opportunity for a race.  The system call exit performs one decrement.
> >>
> >> do_syscall_64()
> >>   syscall_exit_to_user_mode()
> >>     syscall_exit_to_user_mode_prepare()
> >>       __audit_syscall_exit()
> >>         audit_reset_context()
> >>            putname()              /* update 3 (decrement) */
> >>
> >> The io_uring worker thread performs one increment and two decrements.
> >> These updates can race with the system call decrement.
> >>
> >> io_wqe_worker()
> >>   io_worker_handle_work()
> >>     io_wq_submit_work()
> >>       io_issue_sqe()
> >>         io_openat()
> >>           io_openat2()
> >>             do_filp_open()
> >>               path_openat()
> >>                 __audit_inode()   /* update 4 (increment) */
> >>             putname()             /* update 5 (decrement) */
> >>         __audit_uring_exit()
> >>           audit_reset_context()
> >>             putname()             /* update 6 (decrement) */
> >>
> >> The fix is to change the refcnt member of struct audit_names
> >> from int to atomic_t.
> >>
> >> kernel BUG at fs/namei.c:262!
> >> Call Trace:
> >> ...
> >>  ? putname+0x68/0x70
> >>  audit_reset_context.part.0.constprop.0+0xe1/0x300
> >>  __audit_uring_exit+0xda/0x1c0
> >>  io_issue_sqe+0x1f3/0x450
> >>  ? lock_timer_base+0x3b/0xd0
> >>  io_wq_submit_work+0x8d/0x2b0
> >>  ? __try_to_del_timer_sync+0x67/0xa0
> >>  io_worker_handle_work+0x17c/0x2b0
> >>  io_wqe_worker+0x10a/0x350
> >>
> >> Cc: <stable@vger.kernel.org>
> >> Link: https://lore.kernel.org/lkml/MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com/
> >> Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
> >> Signed-off-by: Dan Clash <daclash@linux.microsoft.com>
> >> ---
> >>  fs/namei.c         | 9 +++++----
> >>  include/linux/fs.h | 2 +-
> >>  kernel/auditsc.c   | 8 ++++----
> >>  3 files changed, 10 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/fs/namei.c b/fs/namei.c
> >> index 567ee547492b..94565bd7e73f 100644
> >> --- a/fs/namei.c
> >> +++ b/fs/namei.c
> >> @@ -188,7 +188,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
> >>              }
> >>      }
> >>
> >> -    result->refcnt = 1;
> >> +    atomic_set(&result->refcnt, 1);
> >>      /* The empty path is special. */
> >>      if (unlikely(!len)) {
> >>              if (empty)
> >> @@ -249,7 +249,7 @@ getname_kernel(const char * filename)
> >>      memcpy((char *)result->name, filename, len);
> >>      result->uptr = NULL;
> >>      result->aname = NULL;
> >> -    result->refcnt = 1;
> >> +    atomic_set(&result->refcnt, 1);
> >>      audit_getname(result);
> >>
> >>      return result;
> >> @@ -261,9 +261,10 @@ void putname(struct filename *name)
> >>      if (IS_ERR(name))
> >>              return;
> >>
> >> -    BUG_ON(name->refcnt <= 0);
> >> +    if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
> >> +            return;
> >>
> >> -    if (--name->refcnt > 0)
> >> +    if (!atomic_dec_and_test(&name->refcnt))
> >>              return;
> >
> > Fine by me. I'd write this as:
> >
> > count = atomic_dec_if_positive(&name->refcnt);
> > if (WARN_ON_ONCE(unlikely(count < 0))
> >       return;
> > if (count > 0)
> >       return;
>
> Would be fine too, my suspicion was that most archs don't implement a
> primitive for that, and hence it might be more expensive than
> atomic_read()/atomic_dec_and_test() which do. But I haven't looked at
> the code generation. The dec_if_positive degenerates to a atomic cmpxchg
> for most cases.

I'm not too concerned, either approach works for me, the important bit
is moving to an atomic_t/refcount_t so we can protect ourselves
against the race.  The patch looks good to me and I'd like to get this
fix merged.

Dan, barring any further back-and-forth on the putname() change, I
would say to go ahead and make the change Christian suggested and
repost the patch.  Based on Jens comment above it seems safe to
preserve his 'Reviewed-by:' tag on the next revision.  Assuming there
are no objections posted in the meantime, I'll plan to merge the next
revision into the audit/stable-6.6 branch and get that up to Linus
(likely next week since it's Friday).

Thanks everyone!
Christian Brauner Oct. 13, 2023, 3:44 p.m. UTC | #5
On Thu, 12 Oct 2023 14:55:18 -0700, Dan Clash wrote:
> An io_uring openat operation can update an audit reference count
> from multiple threads resulting in the call trace below.
> 
> A call to io_uring_submit() with a single openat op with a flag of
> IOSQE_ASYNC results in the following reference count updates.
> 
> These first part of the system call performs two increments that do not race.
> 
> [...]

Picking this up as is. Let me know if this needs another tree.

---

Applied to the vfs.misc branch of the vfs/vfs.git tree.
Patches in the vfs.misc branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs.misc

[1/1] audit,io_uring: io_uring openat triggers audit reference count underflow
      https://git.kernel.org/vfs/vfs/c/c6f4350ced79
Jens Axboe Oct. 13, 2023, 3:53 p.m. UTC | #6
On 10/13/23 9:44 AM, Christian Brauner wrote:
> On Thu, 12 Oct 2023 14:55:18 -0700, Dan Clash wrote:
>> An io_uring openat operation can update an audit reference count
>> from multiple threads resulting in the call trace below.
>>
>> A call to io_uring_submit() with a single openat op with a flag of
>> IOSQE_ASYNC results in the following reference count updates.
>>
>> These first part of the system call performs two increments that do not race.
>>
>> [...]
> 
> Picking this up as is. Let me know if this needs another tree.

Since it's really vfs related, your tree is fine.

> Applied to the vfs.misc branch of the vfs/vfs.git tree.
> Patches in the vfs.misc branch should appear in linux-next soon.

You'll send it in for 6.6, right?
Paul Moore Oct. 13, 2023, 3:56 p.m. UTC | #7
On Fri, Oct 13, 2023 at 11:44 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Thu, 12 Oct 2023 14:55:18 -0700, Dan Clash wrote:
> > An io_uring openat operation can update an audit reference count
> > from multiple threads resulting in the call trace below.
> >
> > A call to io_uring_submit() with a single openat op with a flag of
> > IOSQE_ASYNC results in the following reference count updates.
> >
> > These first part of the system call performs two increments that do not race.
> >
> > [...]
>
> Picking this up as is. Let me know if this needs another tree.

Whoa.  A couple of things:

* Please don't merge patches into an upstream tree if all of the
affected subsystems haven't ACK'd the patch.  I know you've got your
boilerplate below about ACKs *after* the merge, which is fine, but I
find it breaks decorum a bit to merge patches without an explicit ACK
or even just a "looks good to me" from all of the relevant subsystems.
Of course there are exceptions for important patches that are rotting
on the mailing lists, but I don't believe that to be the case here.

* You didn't mention if you've marked this for stable or if you're
going to send this up to Linus now or wait for the merge window.  At a
minimum this should be marked for stable, and I believe it should also
be sent up to Linus prior to the v6.6 release; I'm guessing that is
what you're planning to do, but you didn't mention it here.

Regardless, as I mentioned in my last email (I think our last emails
raced a bit), I'm okay with this change, please add my ACK.

Acked-by: Paul Moore <paul@paul-moore.com>

> Applied to the vfs.misc branch of the vfs/vfs.git tree.
> Patches in the vfs.misc branch should appear in linux-next soon.
>
> Please report any outstanding bugs that were missed during review in a
> new review to the original patch series allowing us to drop it.
>
> It's encouraged to provide Acked-bys and Reviewed-bys even though the
> patch has now been applied. If possible patch trailers will be updated.
>
> Note that commit hashes shown below are subject to change due to rebase,
> trailer updates or similar. If in doubt, please check the listed branch.
>
> tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
> branch: vfs.misc
>
> [1/1] audit,io_uring: io_uring openat triggers audit reference count underflow
>       https://git.kernel.org/vfs/vfs/c/c6f4350ced79
Jens Axboe Oct. 13, 2023, 4 p.m. UTC | #8
On 10/13/23 9:56 AM, Paul Moore wrote:
> * You didn't mention if you've marked this for stable or if you're
> going to send this up to Linus now or wait for the merge window.  At a
> minimum this should be marked for stable, and I believe it should also
> be sent up to Linus prior to the v6.6 release; I'm guessing that is
> what you're planning to do, but you didn't mention it here.

The patch already has a stable tag and the commit it fixes, can't
imagine anyone would strip those... But yes, as per my email, just
wanting to make sure this is going to 6.6 and not queued for 6.7.
Christian Brauner Oct. 13, 2023, 4:03 p.m. UTC | #9
> > Picking this up as is. Let me know if this needs another tree.
> 
> Since it's really vfs related, your tree is fine.
> 
> > Applied to the vfs.misc branch of the vfs/vfs.git tree.
> > Patches in the vfs.misc branch should appear in linux-next soon.
> 
> You'll send it in for 6.6, right?

Given that it fixes a bug, yeah. I'll let it sit until Monday so other's
have a moment to speak up though.
Paul Moore Oct. 13, 2023, 4:05 p.m. UTC | #10
On Fri, Oct 13, 2023 at 12:00 PM Jens Axboe <axboe@kernel.dk> wrote:
> On 10/13/23 9:56 AM, Paul Moore wrote:
> > * You didn't mention if you've marked this for stable or if you're
> > going to send this up to Linus now or wait for the merge window.  At a
> > minimum this should be marked for stable, and I believe it should also
> > be sent up to Linus prior to the v6.6 release; I'm guessing that is
> > what you're planning to do, but you didn't mention it here.
>
> The patch already has a stable tag and the commit it fixes, can't
> imagine anyone would strip those...

I've had that done in the past with patches, although admittedly not
with VFS related patches and not by Christian.  I just wanted to make
sure since it wasn't clear from the (automated?) merge response.

> But yes, as per my email, just
> wanting to make sure this is going to 6.6 and not queued for 6.7.

Agreed.
Christian Brauner Oct. 13, 2023, 4:22 p.m. UTC | #11
On Fri, Oct 13, 2023 at 11:56:08AM -0400, Paul Moore wrote:
> On Fri, Oct 13, 2023 at 11:44 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Thu, 12 Oct 2023 14:55:18 -0700, Dan Clash wrote:
> > > An io_uring openat operation can update an audit reference count
> > > from multiple threads resulting in the call trace below.
> > >
> > > A call to io_uring_submit() with a single openat op with a flag of
> > > IOSQE_ASYNC results in the following reference count updates.
> > >
> > > These first part of the system call performs two increments that do not race.
> > >
> > > [...]
> >
> > Picking this up as is. Let me know if this needs another tree.
> 
> Whoa.  A couple of things:
> 
> * Please don't merge patches into an upstream tree if all of the
> affected subsystems haven't ACK'd the patch.  I know you've got your
> boilerplate below about ACKs *after* the merge, which is fine, but I
> find it breaks decorum a bit to merge patches without an explicit ACK
> or even just a "looks good to me" from all of the relevant subsystems.

I simply read your mail:

X-Date: Fri, 13 Oct 2023 17:43:54 +0200
X-URI: https://lore.kernel.org/lkml/CAHC9VhQcSY9q=wVT7hOz9y=o3a67BVUnVGNotgAvE6vK7WAkBw@mail.gmail.com

"I'm not too concerned, either approach works for me, the important bit
 is moving to an atomic_t/refcount_t so we can protect ourselves
 against the race.  The patch looks good to me and I'd like to get this
 fix merged."

including that "The patch looks good to me [...]" part before I sent out
the application message:

X-Date: Fri, 13 Oct 2023 17:44:36 +0200
X-URI: https://lore.kernel.org/lkml/20231013-karierte-mehrzahl-6a938035609e@brauner

> Regardless, as I mentioned in my last email (I think our last emails
> raced a bit), I'm okay with this change, please add my ACK.

It's before the weekend and we're about to release -rc6. This thing
needs to be in -next, you said it looks good to you in a prior mail. I'm
not sure why I'm receiving this mail apart from the justified
clarification about -stable although that was made explicit in your
prior mail as well.

> 
> Acked-by: Paul Moore <paul@paul-moore.com>

Thanks for providing an explicit ACK.
Paul Moore Oct. 13, 2023, 4:38 p.m. UTC | #12
On Fri, Oct 13, 2023 at 12:22 PM Christian Brauner <brauner@kernel.org> wrote:
>
> On Fri, Oct 13, 2023 at 11:56:08AM -0400, Paul Moore wrote:
> > On Fri, Oct 13, 2023 at 11:44 AM Christian Brauner <brauner@kernel.org> wrote:
> > >
> > > On Thu, 12 Oct 2023 14:55:18 -0700, Dan Clash wrote:
> > > > An io_uring openat operation can update an audit reference count
> > > > from multiple threads resulting in the call trace below.
> > > >
> > > > A call to io_uring_submit() with a single openat op with a flag of
> > > > IOSQE_ASYNC results in the following reference count updates.
> > > >
> > > > These first part of the system call performs two increments that do not race.
> > > >
> > > > [...]
> > >
> > > Picking this up as is. Let me know if this needs another tree.
> >
> > Whoa.  A couple of things:
> >
> > * Please don't merge patches into an upstream tree if all of the
> > affected subsystems haven't ACK'd the patch.  I know you've got your
> > boilerplate below about ACKs *after* the merge, which is fine, but I
> > find it breaks decorum a bit to merge patches without an explicit ACK
> > or even just a "looks good to me" from all of the relevant subsystems.
>
> I simply read your mail:
>
> X-Date: Fri, 13 Oct 2023 17:43:54 +0200
> X-URI: https://lore.kernel.org/lkml/CAHC9VhQcSY9q=wVT7hOz9y=o3a67BVUnVGNotgAvE6vK7WAkBw@mail.gmail.com
>
> "I'm not too concerned, either approach works for me, the important bit
>  is moving to an atomic_t/refcount_t so we can protect ourselves
>  against the race.  The patch looks good to me and I'd like to get this
>  fix merged."
>
> including that "The patch looks good to me [...]" part before I sent out
> the application message:

Some of this is likely due to email races, or far faster than normal
responses.  When I was writing the email you reference above ("This
patch looks good to me...") the last email I had from you was asking
for changes to the patch; since you were suggesting a change I made
the assumption (which arguably one shouldn't assume things) that you
were not planning to merge the patch.

> X-Date: Fri, 13 Oct 2023 17:44:36 +0200
> X-URI: https://lore.kernel.org/lkml/20231013-karierte-mehrzahl-6a938035609e@brauner
>
> > Regardless, as I mentioned in my last email (I think our last emails
> > raced a bit), I'm okay with this change, please add my ACK.
>
> It's before the weekend and we're about to release -rc6. This thing
> needs to be in -next, you said it looks good to you in a prior mail. I'm
> not sure why I'm receiving this mail apart from the justified
> clarification about -stable although that was made explicit in your
> prior mail as well.

I hope I explained the intent in my last email a bit more clearly with
the explanation above.  Regardless, I think the lessons to be learned
is that I won't assume that your suggestion of changes and merging a
patch are mutually exclusive, and just to be on the safe side I would
ask that you not merge audit, LSM, or SELinux related patches without
an explicit ACK from those subsystems.  Hopefully that should prevent
things like this from happening again.
Dan Clash Oct. 13, 2023, 8:06 p.m. UTC | #13
On 2023-10-13 11:43, Paul Moore wrote:
> On Fri, Oct 13, 2023 at 10:21 AM Jens Axboe <axboe@kernel.dk> wrote:
>> On 10/13/23 2:24 AM, Christian Brauner wrote:
>>> On Thu, Oct 12, 2023 at 02:55:18PM -0700, Dan Clash wrote:
>>>> An io_uring openat operation can update an audit reference count
>>>> from multiple threads resulting in the call trace below.
>>>>
>>>> A call to io_uring_submit() with a single openat op with a flag of
>>>> IOSQE_ASYNC results in the following reference count updates.
>>>>
>>>> These first part of the system call performs two increments that do not race.
>>>>
>>>> do_syscall_64()
>>>>    __do_sys_io_uring_enter()
>>>>      io_submit_sqes()
>>>>        io_openat_prep()
>>>>          __io_openat_prep()
>>>>            getname()
>>>>              getname_flags()       /* update 1 (increment) */
>>>>                __audit_getname()   /* update 2 (increment) */
>>>>
>>>> The openat op is queued to an io_uring worker thread which starts the
>>>> opportunity for a race.  The system call exit performs one decrement.
>>>>
>>>> do_syscall_64()
>>>>    syscall_exit_to_user_mode()
>>>>      syscall_exit_to_user_mode_prepare()
>>>>        __audit_syscall_exit()
>>>>          audit_reset_context()
>>>>             putname()              /* update 3 (decrement) */
>>>>
>>>> The io_uring worker thread performs one increment and two decrements.
>>>> These updates can race with the system call decrement.
>>>>
>>>> io_wqe_worker()
>>>>    io_worker_handle_work()
>>>>      io_wq_submit_work()
>>>>        io_issue_sqe()
>>>>          io_openat()
>>>>            io_openat2()
>>>>              do_filp_open()
>>>>                path_openat()
>>>>                  __audit_inode()   /* update 4 (increment) */
>>>>              putname()             /* update 5 (decrement) */
>>>>          __audit_uring_exit()
>>>>            audit_reset_context()
>>>>              putname()             /* update 6 (decrement) */
>>>>
>>>> The fix is to change the refcnt member of struct audit_names
>>>> from int to atomic_t.
>>>>
>>>> kernel BUG at fs/namei.c:262!
>>>> Call Trace:
>>>> ...
>>>>   ? putname+0x68/0x70
>>>>   audit_reset_context.part.0.constprop.0+0xe1/0x300
>>>>   __audit_uring_exit+0xda/0x1c0
>>>>   io_issue_sqe+0x1f3/0x450
>>>>   ? lock_timer_base+0x3b/0xd0
>>>>   io_wq_submit_work+0x8d/0x2b0
>>>>   ? __try_to_del_timer_sync+0x67/0xa0
>>>>   io_worker_handle_work+0x17c/0x2b0
>>>>   io_wqe_worker+0x10a/0x350
>>>>
>>>> Cc: <stable@vger.kernel.org>
>>>> Link: https://lore.kernel.org/lkml/MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com/
>>>> Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
>>>> Signed-off-by: Dan Clash <daclash@linux.microsoft.com>
>>>> ---
>>>>   fs/namei.c         | 9 +++++----
>>>>   include/linux/fs.h | 2 +-
>>>>   kernel/auditsc.c   | 8 ++++----
>>>>   3 files changed, 10 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/fs/namei.c b/fs/namei.c
>>>> index 567ee547492b..94565bd7e73f 100644
>>>> --- a/fs/namei.c
>>>> +++ b/fs/namei.c
>>>> @@ -188,7 +188,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
>>>>               }
>>>>       }
>>>>
>>>> -    result->refcnt = 1;
>>>> +    atomic_set(&result->refcnt, 1);
>>>>       /* The empty path is special. */
>>>>       if (unlikely(!len)) {
>>>>               if (empty)
>>>> @@ -249,7 +249,7 @@ getname_kernel(const char * filename)
>>>>       memcpy((char *)result->name, filename, len);
>>>>       result->uptr = NULL;
>>>>       result->aname = NULL;
>>>> -    result->refcnt = 1;
>>>> +    atomic_set(&result->refcnt, 1);
>>>>       audit_getname(result);
>>>>
>>>>       return result;
>>>> @@ -261,9 +261,10 @@ void putname(struct filename *name)
>>>>       if (IS_ERR(name))
>>>>               return;
>>>>
>>>> -    BUG_ON(name->refcnt <= 0);
>>>> +    if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
>>>> +            return;
>>>>
>>>> -    if (--name->refcnt > 0)
>>>> +    if (!atomic_dec_and_test(&name->refcnt))
>>>>               return;
>>>
>>> Fine by me. I'd write this as:
>>>
>>> count = atomic_dec_if_positive(&name->refcnt);
>>> if (WARN_ON_ONCE(unlikely(count < 0))
>>>        return;
>>> if (count > 0)
>>>        return;
>>
>> Would be fine too, my suspicion was that most archs don't implement a
>> primitive for that, and hence it might be more expensive than
>> atomic_read()/atomic_dec_and_test() which do. But I haven't looked at
>> the code generation. The dec_if_positive degenerates to a atomic cmpxchg
>> for most cases.
> 
> I'm not too concerned, either approach works for me, the important bit
> is moving to an atomic_t/refcount_t so we can protect ourselves
> against the race.  The patch looks good to me and I'd like to get this
> fix merged.
> 
> Dan, barring any further back-and-forth on the putname() change, I
> would say to go ahead and make the change Christian suggested and
> repost the patch.  Based on Jens comment above it seems safe to
> preserve his 'Reviewed-by:' tag on the next revision.  Assuming there
> are no objections posted in the meantime, I'll plan to merge the next
> revision into the audit/stable-6.6 branch and get that up to Linus
> (likely next week since it's Friday).

I did not see many arch implementations of atomic_dec_if_positive.
The x86_64 generated code looks like arch_atomic_dec_unless_positive()
in atomic-arch-fallback.h with a loop around lock cmpxchg.

I did not want to compound the email race so I did not send patch v2 but 
I can if desired.


devvm2 ~/linux $ sysctl kernel.arch
kernel.arch = x86_64

devvm2 ~/linux $ cat -n ./fs/namei.c | grep -B 7 -A 4 atomic_dec_if_positive
    259  void putname(struct filename *name)
    260  {
    261          int count;
    262
    263          if (IS_ERR(name))
    264                  return;
    265
    266          count = atomic_dec_if_positive(&name->refcnt);
    267          if (WARN_ON_ONCE(unlikely(count < 0)))
    268                  return;
    269          if (count > 0)
    270                  return;

devvm2 ~/linux $ objdump --disassemble --line-numbers ./fs/namei.o | \
grep -B 8 -A 12 atomic_dec_if_positive
/home/daclash/linux/fs/namei.c:260
      22e:       55                      push   %rbp
      22f:       48 89 e5                mov    %rsp,%rbp
      232:       41 54                   push   %r12
arch_atomic_read():
/home/daclash/linux/./arch/x86/include/asm/atomic.h:23
      234:       8b 47 10                mov    0x10(%rdi),%eax
      237:       49 89 fc                mov    %rdi,%r12
raw_atomic_dec_if_positive():
/home/daclash/linux/./include/linux/atomic/atomic-arch-fallback.h:2535
      23a:       89 c2                   mov    %eax,%edx
      23c:       83 ea 01                sub    $0x1,%edx
      23f:       78 50                   js     291 <putname+0x71>
arch_atomic_try_cmpxchg():
/home/daclash/linux/./arch/x86/include/asm/atomic.h:115
      241:       f0 41 0f b1 54 24 10    lock cmpxchg %edx,0x10(%r12)
      248:       75 f0                   jne    23a <putname+0x1a>
putname():
/home/daclash/linux/fs/namei.c:269
      24a:       85 d2                   test   %edx,%edx
      24c:       75 22                   jne    270 <putname+0x50>
diff mbox series

Patch

diff --git a/fs/namei.c b/fs/namei.c
index 567ee547492b..94565bd7e73f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -188,7 +188,7 @@  getname_flags(const char __user *filename, int flags, int *empty)
 		}
 	}
 
-	result->refcnt = 1;
+	atomic_set(&result->refcnt, 1);
 	/* The empty path is special. */
 	if (unlikely(!len)) {
 		if (empty)
@@ -249,7 +249,7 @@  getname_kernel(const char * filename)
 	memcpy((char *)result->name, filename, len);
 	result->uptr = NULL;
 	result->aname = NULL;
-	result->refcnt = 1;
+	atomic_set(&result->refcnt, 1);
 	audit_getname(result);
 
 	return result;
@@ -261,9 +261,10 @@  void putname(struct filename *name)
 	if (IS_ERR(name))
 		return;
 
-	BUG_ON(name->refcnt <= 0);
+	if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
+		return;
 
-	if (--name->refcnt > 0)
+	if (!atomic_dec_and_test(&name->refcnt))
 		return;
 
 	if (name->name != name->iname) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4aeb3fa11927..85653ce30d2c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2444,7 +2444,7 @@  struct audit_names;
 struct filename {
 	const char		*name;	/* pointer to actual string */
 	const __user char	*uptr;	/* original userland pointer */
-	int			refcnt;
+	atomic_t		refcnt;
 	struct audit_names	*aname;
 	const char		iname[];
 };
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 21d2fa815e78..6f0d6fb6523f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2212,7 +2212,7 @@  __audit_reusename(const __user char *uptr)
 		if (!n->name)
 			continue;
 		if (n->name->uptr == uptr) {
-			n->name->refcnt++;
+			atomic_inc(&n->name->refcnt);
 			return n->name;
 		}
 	}
@@ -2241,7 +2241,7 @@  void __audit_getname(struct filename *name)
 	n->name = name;
 	n->name_len = AUDIT_NAME_FULL;
 	name->aname = n;
-	name->refcnt++;
+	atomic_inc(&name->refcnt);
 }
 
 static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2373,7 +2373,7 @@  void __audit_inode(struct filename *name, const struct dentry *dentry,
 		return;
 	if (name) {
 		n->name = name;
-		name->refcnt++;
+		atomic_inc(&name->refcnt);
 	}
 
 out:
@@ -2500,7 +2500,7 @@  void __audit_inode_child(struct inode *parent,
 		if (found_parent) {
 			found_child->name = found_parent->name;
 			found_child->name_len = AUDIT_NAME_FULL;
-			found_child->name->refcnt++;
+			atomic_inc(&found_child->name->refcnt);
 		}
 	}