diff mbox series

[net] net: implement lockless setsockopt(SO_PEEK_OFF)

Message ID 20240219141220.908047-1-edumazet@google.com (mailing list archive)
State Accepted
Commit 56667da7399eb19af857e30f41bea89aa6fa812c
Delegated to: Netdev Maintainers
Headers show
Series [net] net: implement lockless setsockopt(SO_PEEK_OFF) | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 972 this patch: 972
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 1 maintainers not CCed: dhowells@redhat.com
netdev/build_clang success Errors and warnings before: 974 this patch: 974
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 989 this patch: 989
netdev/checkpatch warning WARNING: Possible repeated word: 'Google'
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-02-21--00-00 (tests: 1452)

Commit Message

Eric Dumazet Feb. 19, 2024, 2:12 p.m. UTC
syzbot reported a lockdep violation [1] involving af_unix
support of SO_PEEK_OFF.

Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket
sk_peek_off field), there is really no point to enforce a pointless
thread safety in the kernel.

After this patch :

- setsockopt(SO_PEEK_OFF) no longer acquires the socket lock.

- skb_consume_udp() no longer has to acquire the socket lock.

- af_unix no longer needs a special version of sk_set_peek_off(),
  because it does not lock u->iolock anymore.

As a followup, we could replace prot->set_peek_off to be a boolean
and avoid an indirect call, since we always use sk_set_peek_off().

[1]

WARNING: possible circular locking dependency detected
6.8.0-rc4-syzkaller-00267-g0f1dd5e91e2b #0 Not tainted

syz-executor.2/30025 is trying to acquire lock:
 ffff8880765e7d80 (&u->iolock){+.+.}-{3:3}, at: unix_set_peek_off+0x26/0xa0 net/unix/af_unix.c:789

but task is already holding lock:
 ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1691 [inline]
 ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: sockopt_lock_sock net/core/sock.c:1060 [inline]
 ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: sk_setsockopt+0xe52/0x3360 net/core/sock.c:1193

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (sk_lock-AF_UNIX){+.+.}-{0:0}:
        lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754
        lock_sock_nested+0x48/0x100 net/core/sock.c:3524
        lock_sock include/net/sock.h:1691 [inline]
        __unix_dgram_recvmsg+0x1275/0x12c0 net/unix/af_unix.c:2415
        sock_recvmsg_nosec+0x18e/0x1d0 net/socket.c:1046
        ____sys_recvmsg+0x3c0/0x470 net/socket.c:2801
        ___sys_recvmsg net/socket.c:2845 [inline]
        do_recvmmsg+0x474/0xae0 net/socket.c:2939
        __sys_recvmmsg net/socket.c:3018 [inline]
        __do_sys_recvmmsg net/socket.c:3041 [inline]
        __se_sys_recvmmsg net/socket.c:3034 [inline]
        __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3034
       do_syscall_64+0xf9/0x240
       entry_SYSCALL_64_after_hwframe+0x6f/0x77

-> #0 (&u->iolock){+.+.}-{3:3}:
        check_prev_add kernel/locking/lockdep.c:3134 [inline]
        check_prevs_add kernel/locking/lockdep.c:3253 [inline]
        validate_chain+0x18ca/0x58e0 kernel/locking/lockdep.c:3869
        __lock_acquire+0x1345/0x1fd0 kernel/locking/lockdep.c:5137
        lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754
        __mutex_lock_common kernel/locking/mutex.c:608 [inline]
        __mutex_lock+0x136/0xd70 kernel/locking/mutex.c:752
        unix_set_peek_off+0x26/0xa0 net/unix/af_unix.c:789
       sk_setsockopt+0x207e/0x3360
        do_sock_setsockopt+0x2fb/0x720 net/socket.c:2307
        __sys_setsockopt+0x1ad/0x250 net/socket.c:2334
        __do_sys_setsockopt net/socket.c:2343 [inline]
        __se_sys_setsockopt net/socket.c:2340 [inline]
        __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340
       do_syscall_64+0xf9/0x240
       entry_SYSCALL_64_after_hwframe+0x6f/0x77

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(sk_lock-AF_UNIX);
                               lock(&u->iolock);
                               lock(sk_lock-AF_UNIX);
  lock(&u->iolock);

 *** DEADLOCK ***

1 lock held by syz-executor.2/30025:
  #0: ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1691 [inline]
  #0: ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: sockopt_lock_sock net/core/sock.c:1060 [inline]
  #0: ffff8880765e7930 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: sk_setsockopt+0xe52/0x3360 net/core/sock.c:1193

stack backtrace:
CPU: 0 PID: 30025 Comm: syz-executor.2 Not tainted 6.8.0-rc4-syzkaller-00267-g0f1dd5e91e2b #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024
Call Trace:
 <TASK>
  __dump_stack lib/dump_stack.c:88 [inline]
  dump_stack_lvl+0x1e7/0x2e0 lib/dump_stack.c:106
  check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2187
  check_prev_add kernel/locking/lockdep.c:3134 [inline]
  check_prevs_add kernel/locking/lockdep.c:3253 [inline]
  validate_chain+0x18ca/0x58e0 kernel/locking/lockdep.c:3869
  __lock_acquire+0x1345/0x1fd0 kernel/locking/lockdep.c:5137
  lock_acquire+0x1e3/0x530 kernel/locking/lockdep.c:5754
  __mutex_lock_common kernel/locking/mutex.c:608 [inline]
  __mutex_lock+0x136/0xd70 kernel/locking/mutex.c:752
  unix_set_peek_off+0x26/0xa0 net/unix/af_unix.c:789
 sk_setsockopt+0x207e/0x3360
  do_sock_setsockopt+0x2fb/0x720 net/socket.c:2307
  __sys_setsockopt+0x1ad/0x250 net/socket.c:2334
  __do_sys_setsockopt net/socket.c:2343 [inline]
  __se_sys_setsockopt net/socket.c:2340 [inline]
  __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340
 do_syscall_64+0xf9/0x240
 entry_SYSCALL_64_after_hwframe+0x6f/0x77
RIP: 0033:0x7f78a1c7dda9
Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f78a0fde0c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 00007f78a1dac050 RCX: 00007f78a1c7dda9
RDX: 000000000000002a RSI: 0000000000000001 RDI: 0000000000000006
RBP: 00007f78a1cca47a R08: 0000000000000004 R09: 0000000000000000
R10: 0000000020000180 R11: 0000000000000246 R12: 0000000000000000
R13: 000000000000006e R14: 00007f78a1dac050 R15: 00007ffe5cd81ae8

Fixes: 859051dd165e ("bpf: Implement cgroup sockaddr hooks for unix sockets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Cc: Daan De Meyer <daan.j.demeyer@gmail.com>
Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
Cc: Martin KaFai Lau <martin.lau@kernel.org>
Cc: David Ahern <dsahern@kernel.org>
---
 net/core/sock.c    | 23 +++++++++++------------
 net/ipv4/udp.c     |  7 +------
 net/unix/af_unix.c | 19 +++----------------
 3 files changed, 15 insertions(+), 34 deletions(-)

Comments

Willem de Bruijn Feb. 19, 2024, 4:07 p.m. UTC | #1
Eric Dumazet wrote:
> syzbot reported a lockdep violation [1] involving af_unix
> support of SO_PEEK_OFF.
> 
> Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket
> sk_peek_off field), there is really no point to enforce a pointless
> thread safety in the kernel.

Would it be sufficient to just move the setsockopt, so that the
socket lock is not taken, but iolock still is?

Agreed with the general principle that this whole interface is not
thread safe. So agreed on simplifying. Doubly so for the (lockless)
UDP path.

sk_peek_offset(), sk_peek_offset_fwd() and sk_peek_offset_bwd() calls
currently all take place inside a single iolock critical section. If
not taking iolock, perhaps it would be better if sk_peek_off is at
least only read once in that critical section, rather than reread
in sk_peek_offset_fwd() and sk_peek_offset_bwd()?

> 
> After this patch :
> 
> - setsockopt(SO_PEEK_OFF) no longer acquires the socket lock.
> 
> - skb_consume_udp() no longer has to acquire the socket lock.
> 
> - af_unix no longer needs a special version of sk_set_peek_off(),
>   because it does not lock u->iolock anymore.
> 
> As a followup, we could replace prot->set_peek_off to be a boolean
> and avoid an indirect call, since we always use sk_set_peek_off().
Eric Dumazet Feb. 19, 2024, 5 p.m. UTC | #2
On Mon, Feb 19, 2024 at 5:07 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Eric Dumazet wrote:
> > syzbot reported a lockdep violation [1] involving af_unix
> > support of SO_PEEK_OFF.
> >
> > Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket
> > sk_peek_off field), there is really no point to enforce a pointless
> > thread safety in the kernel.
>
> Would it be sufficient to just move the setsockopt, so that the
> socket lock is not taken, but iolock still is?

Probably, if we focus on the lockdep issue rather than the general
SO_PEEK_OFF mechanism.

We could remove unix_set_peek_off() in net-next,
unless someone explains why keeping a locking on iolock is needed.


>
> Agreed with the general principle that this whole interface is not
> thread safe. So agreed on simplifying. Doubly so for the (lockless)
> UDP path.
>
> sk_peek_offset(), sk_peek_offset_fwd() and sk_peek_offset_bwd() calls
> currently all take place inside a single iolock critical section. If
> not taking iolock, perhaps it would be better if sk_peek_off is at
> least only read once in that critical section, rather than reread
> in sk_peek_offset_fwd() and sk_peek_offset_bwd()?

Note that for sk_peek_offset_bwd() operations, there is no prior read
of sk_peek_off,
since the caller does not use MSG_PEEK (only UDP does a read in an
attempt to avoid the lock...)

static inline int sk_peek_offset(const struct sock *sk, int flags)
{
    if (unlikely(flags & MSG_PEEK))
        return READ_ONCE(sk->sk_peek_off);

    return 0;
}




>
> >
> > After this patch :
> >
> > - setsockopt(SO_PEEK_OFF) no longer acquires the socket lock.
> >
> > - skb_consume_udp() no longer has to acquire the socket lock.
> >
> > - af_unix no longer needs a special version of sk_set_peek_off(),
> >   because it does not lock u->iolock anymore.
> >
> > As a followup, we could replace prot->set_peek_off to be a boolean
> > and avoid an indirect call, since we always use sk_set_peek_off().
>
Willem de Bruijn Feb. 19, 2024, 5:20 p.m. UTC | #3
Eric Dumazet wrote:
> On Mon, Feb 19, 2024 at 5:07 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Eric Dumazet wrote:
> > > syzbot reported a lockdep violation [1] involving af_unix
> > > support of SO_PEEK_OFF.
> > >
> > > Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket
> > > sk_peek_off field), there is really no point to enforce a pointless
> > > thread safety in the kernel.
> >
> > Would it be sufficient to just move the setsockopt, so that the
> > socket lock is not taken, but iolock still is?
> 
> Probably, if we focus on the lockdep issue rather than the general
> SO_PEEK_OFF mechanism.
> 
> We could remove unix_set_peek_off() in net-next,
> unless someone explains why keeping a locking on iolock is needed.

Since calling SO_PEEK_OFF and recvmsg concurrently is inherently not
thread-safe, fine to remove it all.

All unix_set_peek_off does is an unconditional WRITE_ONCE.

It's just not the smallest change to address this specific report.
+1 on cleaning up more thoroughly in net-next.

> >
> > Agreed with the general principle that this whole interface is not
> > thread safe. So agreed on simplifying. Doubly so for the (lockless)
> > UDP path.
> >
> > sk_peek_offset(), sk_peek_offset_fwd() and sk_peek_offset_bwd() calls
> > currently all take place inside a single iolock critical section. If
> > not taking iolock, perhaps it would be better if sk_peek_off is at
> > least only read once in that critical section, rather than reread
> > in sk_peek_offset_fwd() and sk_peek_offset_bwd()?
> 
> Note that for sk_peek_offset_bwd() operations, there is no prior read
> of sk_peek_off,
> since the caller does not use MSG_PEEK (only UDP does a read in an
> attempt to avoid the lock...)
> 
> static inline int sk_peek_offset(const struct sock *sk, int flags)
> {
>     if (unlikely(flags & MSG_PEEK))
>         return READ_ONCE(sk->sk_peek_off);
> 
>     return 0;
> }

Good point, thanks.

Reviewed-by: Willem de Bruijn <willemb@google.com>
> 
> 
> 
> 
> >
> > >
> > > After this patch :
> > >
> > > - setsockopt(SO_PEEK_OFF) no longer acquires the socket lock.
> > >
> > > - skb_consume_udp() no longer has to acquire the socket lock.
> > >
> > > - af_unix no longer needs a special version of sk_set_peek_off(),
> > >   because it does not lock u->iolock anymore.
> > >
> > > As a followup, we could replace prot->set_peek_off to be a boolean
> > > and avoid an indirect call, since we always use sk_set_peek_off().
> >
Paolo Abeni Feb. 19, 2024, 6:07 p.m. UTC | #4
On Mon, 2024-02-19 at 14:12 +0000, Eric Dumazet wrote:
> syzbot reported a lockdep violation [1] involving af_unix
> support of SO_PEEK_OFF.
> 
> Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket
> sk_peek_off field), there is really no point to enforce a pointless
> thread safety in the kernel.
> 
> After this patch :
> 
> - setsockopt(SO_PEEK_OFF) no longer acquires the socket lock.
> 
> - skb_consume_udp() no longer has to acquire the socket lock.
> 
> - af_unix no longer needs a special version of sk_set_peek_off(),
>   because it does not lock u->iolock anymore.
> 
> As a followup, we could replace prot->set_peek_off to be a boolean
> and avoid an indirect call, since we always use sk_set_peek_off().

Only related to that mentioned possible follow-up: I'm trying to
benchmarking the UDP change mentioned here:

https://lore.kernel.org/netdev/725a92b4813242549f2316e6682d3312b5e658d8.camel@redhat.com/

and that it will require an udp specific set_peek_off() variant.

The indirect call in the control path should not be too bad, right?

Thanks,

Paolo
Eric Dumazet Feb. 19, 2024, 6:31 p.m. UTC | #5
On Mon, Feb 19, 2024 at 7:07 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Mon, 2024-02-19 at 14:12 +0000, Eric Dumazet wrote:
> > syzbot reported a lockdep violation [1] involving af_unix
> > support of SO_PEEK_OFF.
> >
> > Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket
> > sk_peek_off field), there is really no point to enforce a pointless
> > thread safety in the kernel.
> >
> > After this patch :
> >
> > - setsockopt(SO_PEEK_OFF) no longer acquires the socket lock.
> >
> > - skb_consume_udp() no longer has to acquire the socket lock.
> >
> > - af_unix no longer needs a special version of sk_set_peek_off(),
> >   because it does not lock u->iolock anymore.
> >
> > As a followup, we could replace prot->set_peek_off to be a boolean
> > and avoid an indirect call, since we always use sk_set_peek_off().
>
> Only related to that mentioned possible follow-up: I'm trying to
> benchmarking the UDP change mentioned here:
>
> https://lore.kernel.org/netdev/725a92b4813242549f2316e6682d3312b5e658d8.camel@redhat.com/
>
> and that it will require an udp specific set_peek_off() variant.
>
> The indirect call in the control path should not be too bad, right?

Not at all ;)
Kuniyuki Iwashima Feb. 19, 2024, 10:56 p.m. UTC | #6
From: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Date: Mon, 19 Feb 2024 12:20:39 -0500
> Eric Dumazet wrote:
> > On Mon, Feb 19, 2024 at 5:07 PM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Eric Dumazet wrote:
> > > > syzbot reported a lockdep violation [1] involving af_unix
> > > > support of SO_PEEK_OFF.
> > > >
> > > > Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket
> > > > sk_peek_off field), there is really no point to enforce a pointless
> > > > thread safety in the kernel.
> > >
> > > Would it be sufficient to just move the setsockopt, so that the
> > > socket lock is not taken, but iolock still is?
> > 
> > Probably, if we focus on the lockdep issue rather than the general
> > SO_PEEK_OFF mechanism.
> > 
> > We could remove unix_set_peek_off() in net-next,
> > unless someone explains why keeping a locking on iolock is needed.

Probably to avoid a small race where setsockopt() does not take effect.

                                    sk_peek_offset_bwd
  setsockopt                        |- off = READ_ONCE(sk_peek_off)
  `- WRITE_ONCE(sk_peek_off, val)   |
                                    `- WRITE_ONCE(sk_peek_off, off - val)

> 
> Since calling SO_PEEK_OFF and recvmsg concurrently is inherently not
> thread-safe, fine to remove it all.

Agreed, we can do that locklessly.

Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
patchwork-bot+netdevbpf@kernel.org Feb. 21, 2024, 11:30 a.m. UTC | #7
Hello:

This patch was applied to netdev/net.git (main)
by David S. Miller <davem@davemloft.net>:

On Mon, 19 Feb 2024 14:12:20 +0000 you wrote:
> syzbot reported a lockdep violation [1] involving af_unix
> support of SO_PEEK_OFF.
> 
> Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket
> sk_peek_off field), there is really no point to enforce a pointless
> thread safety in the kernel.
> 
> [...]

Here is the summary with links:
  - [net] net: implement lockless setsockopt(SO_PEEK_OFF)
    https://git.kernel.org/netdev/net/c/56667da7399e

You are awesome, thank you!
Jeongjun Park April 8, 2024, 2:30 p.m. UTC | #8
Eric Dumazet wrote:
> syzbot reported a lockdep violation [1] involving af_unix
> support of SO_PEEK_OFF.
>
> Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket
> sk_peek_off field), there is really no point to enforce a pointless
> thread safety in the kernel.
>
> After this patch :
>
> - setsockopt(SO_PEEK_OFF) no longer acquires the socket lock.
>
> - skb_consume_udp() no longer has to acquire the socket lock.
>
> - af_unix no longer needs a special version of sk_set_peek_off(),
>   because it does not lock u->iolock anymore.

The method employed in this patch, which avoids locking u->iolock in
SO_PEEK_OFF, appears to have effectively remedied the immediate vulnerability,
and the patch itself seems robust.

However, if a future scenario arises where mutex_lock(&u->iolock) is required
after sk_setsockopt(sk), this patch would become ineffective.

In practical testing within my environment, I observed that reintroducing
mutex_lock(&u->iolock) within sk_setsockopt() triggered the vulnerability once again.

Therefore, I believe it's crucial to address the fundamental cause triggering this vulnerability
alongside the current patch.

[   30.537400] ======================================================
[   30.537765] WARNING: possible circular locking dependency detected
[   30.538237] 6.9.0-rc1-00058-g4076fa161217-dirty #8 Not tainted
[   30.538541] ------------------------------------------------------
[   30.538791] poc/209 is trying to acquire lock:
[   30.539008] ffff888007a8cd58 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: __unix_dgram_recvmsg+0x37e/0x550
[   30.540060] 
[   30.540060] but task is already holding lock:
[   30.540482] ffff888007a8d070 (&u->iolock){+.+.}-{3:3}, at: __unix_dgram_recvmsg+0xec/0x550
[   30.540871] 
[   30.540871] which lock already depends on the new lock.
[   30.540871] 
[   30.541341] 
[   30.541341] the existing dependency chain (in reverse order) is:
[   30.541816] 
[   30.541816] -> #1 (&u->iolock){+.+.}-{3:3}:
[   30.542411]        lock_acquire+0xc0/0x2e0
[   30.542650]        __mutex_lock+0x91/0x4b0
[   30.542830]        sk_setsockopt+0xae2/0x1510
[   30.543009]        do_sock_setsockopt+0x14e/0x180
[   30.543443]        __sys_setsockopt+0x73/0xc0
[   30.543635]        __x64_sys_setsockopt+0x1a/0x30
[   30.543859]        do_syscall_64+0xc9/0x1e0
[   30.544057]        entry_SYSCALL_64_after_hwframe+0x6d/0x75
[   30.544652] 
[   30.544652] -> #0 (sk_lock-AF_UNIX){+.+.}-{0:0}:
[   30.544987]        check_prev_add+0xeb/0xa20
[   30.545174]        __lock_acquire+0x12fb/0x1740
[   30.545516]        lock_acquire+0xc0/0x2e0
[   30.545692]        lock_sock_nested+0x2d/0x80
[   30.545871]        __unix_dgram_recvmsg+0x37e/0x550
[   30.546066]        sock_recvmsg+0xbf/0xd0
[   30.546419]        ____sys_recvmsg+0x85/0x1d0
[   30.546653]        ___sys_recvmsg+0x77/0xc0
[   30.546971]        __sys_recvmsg+0x55/0xa0
[   30.547149]        do_syscall_64+0xc9/0x1e0
[   30.547428]        entry_SYSCALL_64_after_hwframe+0x6d/0x75
[   30.547740] 
[   30.547740] other info that might help us debug this:
[   30.547740] 
[   30.548217]  Possible unsafe locking scenario:
[   30.548217] 
[   30.548502]        CPU0                    CPU1
[   30.548713]        ----                    ----
[   30.548926]   lock(&u->iolock);
[   30.549234]                                lock(sk_lock-AF_UNIX);
[   30.549535]                                lock(&u->iolock);
[   30.549798]   lock(sk_lock-AF_UNIX);
[   30.549970] 
[   30.549970]  *** DEADLOCK ***
[   30.549970] 
[   30.550504] 1 lock held by poc/209:
[   30.550681]  #0: ffff888007a8d070 (&u->iolock){+.+.}-{3:3}, at: __unix_dgram_recvmsg+0xec/0x550
[   30.551100] 
[   30.551100] stack backtrace:
[   30.551532] CPU: 1 PID: 209 Comm: poc Not tainted 6.9.0-rc1-00058-g4076fa161217-dirty #8
[   30.551910] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
[   30.552539] Call Trace:
[   30.552788]  <TASK>
[   30.552987]  dump_stack_lvl+0x68/0xa0
[   30.553429]  check_noncircular+0x135/0x150
[   30.553626]  check_prev_add+0xeb/0xa20
[   30.553811]  __lock_acquire+0x12fb/0x1740
[   30.553993]  lock_acquire+0xc0/0x2e0
[   30.554234]  ? __unix_dgram_recvmsg+0x37e/0x550
[   30.554543]  ? __skb_try_recv_datagram+0xb2/0x190
[   30.554752]  lock_sock_nested+0x2d/0x80
[   30.554912]  ? __unix_dgram_recvmsg+0x37e/0x550
[   30.555097]  __unix_dgram_recvmsg+0x37e/0x550
[   30.555498]  sock_recvmsg+0xbf/0xd0
[   30.555661]  ____sys_recvmsg+0x85/0x1d0
[   30.555826]  ? __import_iovec+0x177/0x1d0
[   30.555998]  ? import_iovec+0x1a/0x20
[   30.556401]  ? copy_msghdr_from_user+0x68/0xa0
[   30.556676]  ___sys_recvmsg+0x77/0xc0
[   30.556856]  ? __fget_files+0xc8/0x1a0
[   30.557612]  ? lock_release+0xbd/0x290
[   30.557799]  ? __fget_files+0xcd/0x1a0
[   30.557969]  __sys_recvmsg+0x55/0xa0
[   30.558284]  do_syscall_64+0xc9/0x1e0
[   30.558455]  entry_SYSCALL_64_after_hwframe+0x6d/0x75
[   30.558740] RIP: 0033:0x7f3c14632dad
[   30.559329] Code: 28 89 54 24 1c 48 89 74 24 10 89 7c 24 08 e8 6a ef ff ff 8b 54 24 1c 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 2f 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 44 24 08 e8 9e ef f8
[   30.560156] RSP: 002b:00007f3c12c43e60 EFLAGS: 00000293 ORIG_RAX: 000000000000002f
[   30.560582] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f3c14632dad
[   30.560933] RDX: 0000000000000000 RSI: 00007f3c12c44eb0 RDI: 0000000000000005
[   30.562935] RBP: 00007f3c12c44ef0 R08: 0000000000000000 R09: 00007f3c12c45700
[   30.565833] R10: fffffffffffff648 R11: 0000000000000293 R12: 00007ffe93a2bfde
[   30.566161] R13: 00007ffe93a2bfdf R14: 00007f3c12c44fc0 R15: 0000000000802000
[   30.569456]  </TASK>




What are your thoughts on this?
Eric Dumazet April 8, 2024, 2:34 p.m. UTC | #9
On Mon, Apr 8, 2024 at 4:30 PM Jeongjun Park <aha310510@gmail.com> wrote:
>
> Eric Dumazet wrote:
> > syzbot reported a lockdep violation [1] involving af_unix
> > support of SO_PEEK_OFF.
> >
> > Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket
> > sk_peek_off field), there is really no point to enforce a pointless
> > thread safety in the kernel.
> >
> > After this patch :
> >
> > - setsockopt(SO_PEEK_OFF) no longer acquires the socket lock.
> >
> > - skb_consume_udp() no longer has to acquire the socket lock.
> >
> > - af_unix no longer needs a special version of sk_set_peek_off(),
> >   because it does not lock u->iolock anymore.
>
> The method employed in this patch, which avoids locking u->iolock in
> SO_PEEK_OFF, appears to have effectively remedied the immediate vulnerability,
> and the patch itself seems robust.
>
> However, if a future scenario arises where mutex_lock(&u->iolock) is required
> after sk_setsockopt(sk), this patch would become ineffective.
>
> In practical testing within my environment, I observed that reintroducing
> mutex_lock(&u->iolock) within sk_setsockopt() triggered the vulnerability once again.
>
> Therefore, I believe it's crucial to address the fundamental cause triggering this vulnerability
> alongside the current patch.
>
> [   30.537400] ======================================================
> [   30.537765] WARNING: possible circular locking dependency detected
> [   30.538237] 6.9.0-rc1-00058-g4076fa161217-dirty #8 Not tainted
> [   30.538541] ------------------------------------------------------
> [   30.538791] poc/209 is trying to acquire lock:
> [   30.539008] ffff888007a8cd58 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: __unix_dgram_recvmsg+0x37e/0x550
> [   30.540060]
> [   30.540060] but task is already holding lock:
> [   30.540482] ffff888007a8d070 (&u->iolock){+.+.}-{3:3}, at: __unix_dgram_recvmsg+0xec/0x550
> [   30.540871]
> [   30.540871] which lock already depends on the new lock.
> [   30.540871]
> [   30.541341]
> [   30.541341] the existing dependency chain (in reverse order) is:
> [   30.541816]
> [   30.541816] -> #1 (&u->iolock){+.+.}-{3:3}:
> [   30.542411]        lock_acquire+0xc0/0x2e0
> [   30.542650]        __mutex_lock+0x91/0x4b0
> [   30.542830]        sk_setsockopt+0xae2/0x1510
> [   30.543009]        do_sock_setsockopt+0x14e/0x180
> [   30.543443]        __sys_setsockopt+0x73/0xc0
> [   30.543635]        __x64_sys_setsockopt+0x1a/0x30
> [   30.543859]        do_syscall_64+0xc9/0x1e0
> [   30.544057]        entry_SYSCALL_64_after_hwframe+0x6d/0x75
> [   30.544652]
> [   30.544652] -> #0 (sk_lock-AF_UNIX){+.+.}-{0:0}:
> [   30.544987]        check_prev_add+0xeb/0xa20
> [   30.545174]        __lock_acquire+0x12fb/0x1740
> [   30.545516]        lock_acquire+0xc0/0x2e0
> [   30.545692]        lock_sock_nested+0x2d/0x80
> [   30.545871]        __unix_dgram_recvmsg+0x37e/0x550
> [   30.546066]        sock_recvmsg+0xbf/0xd0
> [   30.546419]        ____sys_recvmsg+0x85/0x1d0
> [   30.546653]        ___sys_recvmsg+0x77/0xc0
> [   30.546971]        __sys_recvmsg+0x55/0xa0
> [   30.547149]        do_syscall_64+0xc9/0x1e0
> [   30.547428]        entry_SYSCALL_64_after_hwframe+0x6d/0x75
> [   30.547740]
> [   30.547740] other info that might help us debug this:
> [   30.547740]
> [   30.548217]  Possible unsafe locking scenario:
> [   30.548217]
> [   30.548502]        CPU0                    CPU1
> [   30.548713]        ----                    ----
> [   30.548926]   lock(&u->iolock);
> [   30.549234]                                lock(sk_lock-AF_UNIX);
> [   30.549535]                                lock(&u->iolock);
> [   30.549798]   lock(sk_lock-AF_UNIX);
> [   30.549970]
> [   30.549970]  *** DEADLOCK ***
> [   30.549970]
> [   30.550504] 1 lock held by poc/209:
> [   30.550681]  #0: ffff888007a8d070 (&u->iolock){+.+.}-{3:3}, at: __unix_dgram_recvmsg+0xec/0x550
> [   30.551100]
> [   30.551100] stack backtrace:
> [   30.551532] CPU: 1 PID: 209 Comm: poc Not tainted 6.9.0-rc1-00058-g4076fa161217-dirty #8
> [   30.551910] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
> [   30.552539] Call Trace:
> [   30.552788]  <TASK>
> [   30.552987]  dump_stack_lvl+0x68/0xa0
> [   30.553429]  check_noncircular+0x135/0x150
> [   30.553626]  check_prev_add+0xeb/0xa20
> [   30.553811]  __lock_acquire+0x12fb/0x1740
> [   30.553993]  lock_acquire+0xc0/0x2e0
> [   30.554234]  ? __unix_dgram_recvmsg+0x37e/0x550
> [   30.554543]  ? __skb_try_recv_datagram+0xb2/0x190
> [   30.554752]  lock_sock_nested+0x2d/0x80
> [   30.554912]  ? __unix_dgram_recvmsg+0x37e/0x550
> [   30.555097]  __unix_dgram_recvmsg+0x37e/0x550
> [   30.555498]  sock_recvmsg+0xbf/0xd0
> [   30.555661]  ____sys_recvmsg+0x85/0x1d0
> [   30.555826]  ? __import_iovec+0x177/0x1d0
> [   30.555998]  ? import_iovec+0x1a/0x20
> [   30.556401]  ? copy_msghdr_from_user+0x68/0xa0
> [   30.556676]  ___sys_recvmsg+0x77/0xc0
> [   30.556856]  ? __fget_files+0xc8/0x1a0
> [   30.557612]  ? lock_release+0xbd/0x290
> [   30.557799]  ? __fget_files+0xcd/0x1a0
> [   30.557969]  __sys_recvmsg+0x55/0xa0
> [   30.558284]  do_syscall_64+0xc9/0x1e0
> [   30.558455]  entry_SYSCALL_64_after_hwframe+0x6d/0x75
> [   30.558740] RIP: 0033:0x7f3c14632dad
> [   30.559329] Code: 28 89 54 24 1c 48 89 74 24 10 89 7c 24 08 e8 6a ef ff ff 8b 54 24 1c 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 2f 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 44 24 08 e8 9e ef f8
> [   30.560156] RSP: 002b:00007f3c12c43e60 EFLAGS: 00000293 ORIG_RAX: 000000000000002f
> [   30.560582] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f3c14632dad
> [   30.560933] RDX: 0000000000000000 RSI: 00007f3c12c44eb0 RDI: 0000000000000005
> [   30.562935] RBP: 00007f3c12c44ef0 R08: 0000000000000000 R09: 00007f3c12c45700
> [   30.565833] R10: fffffffffffff648 R11: 0000000000000293 R12: 00007ffe93a2bfde
> [   30.566161] R13: 00007ffe93a2bfdf R14: 00007f3c12c44fc0 R15: 0000000000802000
> [   30.569456]  </TASK>
>
>
>
>
> What are your thoughts on this?

You are talking about some unreleased code ?

I can not comment, obviously.
Jeongjun Park April 8, 2024, 3:43 p.m. UTC | #10
Eric Dumazet wrote:
> On Mon, Apr 8, 2024 at 4:30 PM Jeongjun Park <aha310510@gmail.com> wrote:
> >
> > Eric Dumazet wrote:
> > > syzbot reported a lockdep violation [1] involving af_unix
> > > support of SO_PEEK_OFF.
> > >
> > > Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket
> > > sk_peek_off field), there is really no point to enforce a pointless
> > > thread safety in the kernel.
> > >
> > > After this patch :
> > >
> > > - setsockopt(SO_PEEK_OFF) no longer acquires the socket lock.
> > >
> > > - skb_consume_udp() no longer has to acquire the socket lock.
> > >
> > > - af_unix no longer needs a special version of sk_set_peek_off(),
> > >   because it does not lock u->iolock anymore.
> >
> > The method employed in this patch, which avoids locking u->iolock in
> > SO_PEEK_OFF, appears to have effectively remedied the immediate vulnerability,
> > and the patch itself seems robust.
> >
> > However, if a future scenario arises where mutex_lock(&u->iolock) is required
> > after sk_setsockopt(sk), this patch would become ineffective.
> >
> > In practical testing within my environment, I observed that reintroducing
> > mutex_lock(&u->iolock) within sk_setsockopt() triggered the vulnerability once again.
> >
> > Therefore, I believe it's crucial to address the fundamental cause triggering this vulnerability
> > alongside the current patch.
> >
> > [   30.537400] ======================================================
> > [   30.537765] WARNING: possible circular locking dependency detected
> > [   30.538237] 6.9.0-rc1-00058-g4076fa161217-dirty #8 Not tainted
> > [   30.538541] ------------------------------------------------------
> > [   30.538791] poc/209 is trying to acquire lock:
> > [   30.539008] ffff888007a8cd58 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: __unix_dgram_recvmsg+0x37e/0x550
> > [   30.540060]
> > [   30.540060] but task is already holding lock:
> > [   30.540482] ffff888007a8d070 (&u->iolock){+.+.}-{3:3}, at: __unix_dgram_recvmsg+0xec/0x550
> > [   30.540871]
> > [   30.540871] which lock already depends on the new lock.
> > [   30.540871]
> > [   30.541341]
> > [   30.541341] the existing dependency chain (in reverse order) is:
> > [   30.541816]
> > [   30.541816] -> #1 (&u->iolock){+.+.}-{3:3}:
> > [   30.542411]        lock_acquire+0xc0/0x2e0
> > [   30.542650]        __mutex_lock+0x91/0x4b0
> > [   30.542830]        sk_setsockopt+0xae2/0x1510
> > [   30.543009]        do_sock_setsockopt+0x14e/0x180
> > [   30.543443]        __sys_setsockopt+0x73/0xc0
> > [   30.543635]        __x64_sys_setsockopt+0x1a/0x30
> > [   30.543859]        do_syscall_64+0xc9/0x1e0
> > [   30.544057]        entry_SYSCALL_64_after_hwframe+0x6d/0x75
> > [   30.544652]
> > [   30.544652] -> #0 (sk_lock-AF_UNIX){+.+.}-{0:0}:
> > [   30.544987]        check_prev_add+0xeb/0xa20
> > [   30.545174]        __lock_acquire+0x12fb/0x1740
> > [   30.545516]        lock_acquire+0xc0/0x2e0
> > [   30.545692]        lock_sock_nested+0x2d/0x80
> > [   30.545871]        __unix_dgram_recvmsg+0x37e/0x550
> > [   30.546066]        sock_recvmsg+0xbf/0xd0
> > [   30.546419]        ____sys_recvmsg+0x85/0x1d0
> > [   30.546653]        ___sys_recvmsg+0x77/0xc0
> > [   30.546971]        __sys_recvmsg+0x55/0xa0
> > [   30.547149]        do_syscall_64+0xc9/0x1e0
> > [   30.547428]        entry_SYSCALL_64_after_hwframe+0x6d/0x75
> > [   30.547740]
> > [   30.547740] other info that might help us debug this:
> > [   30.547740]
> > [   30.548217]  Possible unsafe locking scenario:
> > [   30.548217]
> > [   30.548502]        CPU0                    CPU1
> > [   30.548713]        ----                    ----
> > [   30.548926]   lock(&u->iolock);
> > [   30.549234]                                lock(sk_lock-AF_UNIX);
> > [   30.549535]                                lock(&u->iolock);
> > [   30.549798]   lock(sk_lock-AF_UNIX);
> > [   30.549970]
> > [   30.549970]  *** DEADLOCK ***
> > [   30.549970]
> > [   30.550504] 1 lock held by poc/209:
> > [   30.550681]  #0: ffff888007a8d070 (&u->iolock){+.+.}-{3:3}, at: __unix_dgram_recvmsg+0xec/0x550
> > [   30.551100]
> > [   30.551100] stack backtrace:
> > [   30.551532] CPU: 1 PID: 209 Comm: poc Not tainted 6.9.0-rc1-00058-g4076fa161217-dirty #8
> > [   30.551910] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
> > [   30.552539] Call Trace:
> > [   30.552788]  <TASK>
> > [   30.552987]  dump_stack_lvl+0x68/0xa0
> > [   30.553429]  check_noncircular+0x135/0x150
> > [   30.553626]  check_prev_add+0xeb/0xa20
> > [   30.553811]  __lock_acquire+0x12fb/0x1740
> > [   30.553993]  lock_acquire+0xc0/0x2e0
> > [   30.554234]  ? __unix_dgram_recvmsg+0x37e/0x550
> > [   30.554543]  ? __skb_try_recv_datagram+0xb2/0x190
> > [   30.554752]  lock_sock_nested+0x2d/0x80
> > [   30.554912]  ? __unix_dgram_recvmsg+0x37e/0x550
> > [   30.555097]  __unix_dgram_recvmsg+0x37e/0x550
> > [   30.555498]  sock_recvmsg+0xbf/0xd0
> > [   30.555661]  ____sys_recvmsg+0x85/0x1d0
> > [   30.555826]  ? __import_iovec+0x177/0x1d0
> > [   30.555998]  ? import_iovec+0x1a/0x20
> > [   30.556401]  ? copy_msghdr_from_user+0x68/0xa0
> > [   30.556676]  ___sys_recvmsg+0x77/0xc0
> > [   30.556856]  ? __fget_files+0xc8/0x1a0
> > [   30.557612]  ? lock_release+0xbd/0x290
> > [   30.557799]  ? __fget_files+0xcd/0x1a0
> > [   30.557969]  __sys_recvmsg+0x55/0xa0
> > [   30.558284]  do_syscall_64+0xc9/0x1e0
> > [   30.558455]  entry_SYSCALL_64_after_hwframe+0x6d/0x75
> > [   30.558740] RIP: 0033:0x7f3c14632dad
> > [   30.559329] Code: 28 89 54 24 1c 48 89 74 24 10 89 7c 24 08 e8 6a ef ff ff 8b 54 24 1c 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 2f 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 44 24 08 e8 9e ef f8
> > [   30.560156] RSP: 002b:00007f3c12c43e60 EFLAGS: 00000293 ORIG_RAX: 000000000000002f
> > [   30.560582] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f3c14632dad
> > [   30.560933] RDX: 0000000000000000 RSI: 00007f3c12c44eb0 RDI: 0000000000000005
> > [   30.562935] RBP: 00007f3c12c44ef0 R08: 0000000000000000 R09: 00007f3c12c45700
> > [   30.565833] R10: fffffffffffff648 R11: 0000000000000293 R12: 00007ffe93a2bfde
> > [   30.566161] R13: 00007ffe93a2bfdf R14: 00007f3c12c44fc0 R15: 0000000000802000
> > [   30.569456]  </TASK>
> >
> >
> >
> >
> > What are your thoughts on this?
> 
> You are talking about some unreleased code ?
> 
> I can not comment, obviously.



I think it would be prudent to patch __unix_dgram_recvmsg() as 
depicted below to ensure its proper functionality, even 
in the event of a later addition of mutex_lock in sk_setsockopt(). 

By implementing this patch, we mitigate the risk of potential deadlock scenarios 
in the future by eliminating the condition that could lead to them.



---
 net/unix/af_unix.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 5b41e2321209..f102f08f649f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2458,11 +2458,14 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
                                                EPOLLWRBAND);
 
        if (msg->msg_name) {
+               mutex_unlock(&u->iolock);
+
                unix_copy_addr(msg, skb->sk);
-
+
                BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
                                                      msg->msg_name,
                                                      &msg->msg_namelen);
+               mutex_lock(&u->iolock);
        }
 
        if (size > skb->len - skip)
@@ -2814,6 +2817,8 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 
                /* Copy address just once */
                if (state->msg && state->msg->msg_name) {
+                       mutex_unlock(&u->iolock);
+
                        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
                                         state->msg->msg_name);
                        unix_copy_addr(state->msg, skb->sk);
@@ -2823,6 +2828,8 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
                                                              &state->msg->msg_namelen);
 
                        sunaddr = NULL;
+
+                       mutex_lock(&u->iolock);
                }
 
                chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
Kuniyuki Iwashima April 8, 2024, 6:31 p.m. UTC | #11
From: Jeongjun Park <aha310510@gmail.com>
Date: Tue,  9 Apr 2024 00:43:54 +0900
> Eric Dumazet wrote:
> > On Mon, Apr 8, 2024 at 4:30 PM Jeongjun Park <aha310510@gmail.com> wrote:
> > >
> > > Eric Dumazet wrote:
> > > > syzbot reported a lockdep violation [1] involving af_unix
> > > > support of SO_PEEK_OFF.
> > > >
> > > > Since SO_PEEK_OFF is inherently not thread safe (it uses a per-socket
> > > > sk_peek_off field), there is really no point to enforce a pointless
> > > > thread safety in the kernel.
> > > >
> > > > After this patch :
> > > >
> > > > - setsockopt(SO_PEEK_OFF) no longer acquires the socket lock.
> > > >
> > > > - skb_consume_udp() no longer has to acquire the socket lock.
> > > >
> > > > - af_unix no longer needs a special version of sk_set_peek_off(),
> > > >   because it does not lock u->iolock anymore.
> > >
> > > The method employed in this patch, which avoids locking u->iolock in
> > > SO_PEEK_OFF, appears to have effectively remedied the immediate vulnerability,
> > > and the patch itself seems robust.
> > >
> > > However, if a future scenario arises where mutex_lock(&u->iolock) is required
> > > after sk_setsockopt(sk), this patch would become ineffective.
> > >
> > > In practical testing within my environment, I observed that reintroducing
> > > mutex_lock(&u->iolock) within sk_setsockopt() triggered the vulnerability once again.
> > >
> > > Therefore, I believe it's crucial to address the fundamental cause triggering this vulnerability
> > > alongside the current patch.
> > >
> > > [   30.537400] ======================================================
> > > [   30.537765] WARNING: possible circular locking dependency detected
> > > [   30.538237] 6.9.0-rc1-00058-g4076fa161217-dirty #8 Not tainted
> > > [   30.538541] ------------------------------------------------------
> > > [   30.538791] poc/209 is trying to acquire lock:
> > > [   30.539008] ffff888007a8cd58 (sk_lock-AF_UNIX){+.+.}-{0:0}, at: __unix_dgram_recvmsg+0x37e/0x550
> > > [   30.540060]
> > > [   30.540060] but task is already holding lock:
> > > [   30.540482] ffff888007a8d070 (&u->iolock){+.+.}-{3:3}, at: __unix_dgram_recvmsg+0xec/0x550
> > > [   30.540871]
> > > [   30.540871] which lock already depends on the new lock.
> > > [   30.540871]
> > > [   30.541341]
> > > [   30.541341] the existing dependency chain (in reverse order) is:
> > > [   30.541816]
> > > [   30.541816] -> #1 (&u->iolock){+.+.}-{3:3}:
> > > [   30.542411]        lock_acquire+0xc0/0x2e0
> > > [   30.542650]        __mutex_lock+0x91/0x4b0
> > > [   30.542830]        sk_setsockopt+0xae2/0x1510
> > > [   30.543009]        do_sock_setsockopt+0x14e/0x180
> > > [   30.543443]        __sys_setsockopt+0x73/0xc0
> > > [   30.543635]        __x64_sys_setsockopt+0x1a/0x30
> > > [   30.543859]        do_syscall_64+0xc9/0x1e0
> > > [   30.544057]        entry_SYSCALL_64_after_hwframe+0x6d/0x75
> > > [   30.544652]
> > > [   30.544652] -> #0 (sk_lock-AF_UNIX){+.+.}-{0:0}:
> > > [   30.544987]        check_prev_add+0xeb/0xa20
> > > [   30.545174]        __lock_acquire+0x12fb/0x1740
> > > [   30.545516]        lock_acquire+0xc0/0x2e0
> > > [   30.545692]        lock_sock_nested+0x2d/0x80
> > > [   30.545871]        __unix_dgram_recvmsg+0x37e/0x550
> > > [   30.546066]        sock_recvmsg+0xbf/0xd0
> > > [   30.546419]        ____sys_recvmsg+0x85/0x1d0
> > > [   30.546653]        ___sys_recvmsg+0x77/0xc0
> > > [   30.546971]        __sys_recvmsg+0x55/0xa0
> > > [   30.547149]        do_syscall_64+0xc9/0x1e0
> > > [   30.547428]        entry_SYSCALL_64_after_hwframe+0x6d/0x75
> > > [   30.547740]
> > > [   30.547740] other info that might help us debug this:
> > > [   30.547740]
> > > [   30.548217]  Possible unsafe locking scenario:
> > > [   30.548217]
> > > [   30.548502]        CPU0                    CPU1
> > > [   30.548713]        ----                    ----
> > > [   30.548926]   lock(&u->iolock);
> > > [   30.549234]                                lock(sk_lock-AF_UNIX);
> > > [   30.549535]                                lock(&u->iolock);
> > > [   30.549798]   lock(sk_lock-AF_UNIX);
> > > [   30.549970]
> > > [   30.549970]  *** DEADLOCK ***
> > > [   30.549970]
> > > [   30.550504] 1 lock held by poc/209:
> > > [   30.550681]  #0: ffff888007a8d070 (&u->iolock){+.+.}-{3:3}, at: __unix_dgram_recvmsg+0xec/0x550
> > > [   30.551100]
> > > [   30.551100] stack backtrace:
> > > [   30.551532] CPU: 1 PID: 209 Comm: poc Not tainted 6.9.0-rc1-00058-g4076fa161217-dirty #8
> > > [   30.551910] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
> > > [   30.552539] Call Trace:
> > > [   30.552788]  <TASK>
> > > [   30.552987]  dump_stack_lvl+0x68/0xa0
> > > [   30.553429]  check_noncircular+0x135/0x150
> > > [   30.553626]  check_prev_add+0xeb/0xa20
> > > [   30.553811]  __lock_acquire+0x12fb/0x1740
> > > [   30.553993]  lock_acquire+0xc0/0x2e0
> > > [   30.554234]  ? __unix_dgram_recvmsg+0x37e/0x550
> > > [   30.554543]  ? __skb_try_recv_datagram+0xb2/0x190
> > > [   30.554752]  lock_sock_nested+0x2d/0x80
> > > [   30.554912]  ? __unix_dgram_recvmsg+0x37e/0x550
> > > [   30.555097]  __unix_dgram_recvmsg+0x37e/0x550
> > > [   30.555498]  sock_recvmsg+0xbf/0xd0
> > > [   30.555661]  ____sys_recvmsg+0x85/0x1d0
> > > [   30.555826]  ? __import_iovec+0x177/0x1d0
> > > [   30.555998]  ? import_iovec+0x1a/0x20
> > > [   30.556401]  ? copy_msghdr_from_user+0x68/0xa0
> > > [   30.556676]  ___sys_recvmsg+0x77/0xc0
> > > [   30.556856]  ? __fget_files+0xc8/0x1a0
> > > [   30.557612]  ? lock_release+0xbd/0x290
> > > [   30.557799]  ? __fget_files+0xcd/0x1a0
> > > [   30.557969]  __sys_recvmsg+0x55/0xa0
> > > [   30.558284]  do_syscall_64+0xc9/0x1e0
> > > [   30.558455]  entry_SYSCALL_64_after_hwframe+0x6d/0x75
> > > [   30.558740] RIP: 0033:0x7f3c14632dad
> > > [   30.559329] Code: 28 89 54 24 1c 48 89 74 24 10 89 7c 24 08 e8 6a ef ff ff 8b 54 24 1c 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 2f 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 44 24 08 e8 9e ef f8
> > > [   30.560156] RSP: 002b:00007f3c12c43e60 EFLAGS: 00000293 ORIG_RAX: 000000000000002f
> > > [   30.560582] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f3c14632dad
> > > [   30.560933] RDX: 0000000000000000 RSI: 00007f3c12c44eb0 RDI: 0000000000000005
> > > [   30.562935] RBP: 00007f3c12c44ef0 R08: 0000000000000000 R09: 00007f3c12c45700
> > > [   30.565833] R10: fffffffffffff648 R11: 0000000000000293 R12: 00007ffe93a2bfde
> > > [   30.566161] R13: 00007ffe93a2bfdf R14: 00007f3c12c44fc0 R15: 0000000000802000
> > > [   30.569456]  </TASK>
> > >
> > >
> > >
> > >
> > > What are your thoughts on this?
> > 
> > You are talking about some unreleased code ?
> > 
> > I can not comment, obviously.
> 
> 
> 
> I think it would be prudent to patch __unix_dgram_recvmsg() as 
> depicted below to ensure its proper functionality, even 
> in the event of a later addition of mutex_lock in sk_setsockopt(). 
> 
> By implementing this patch, we mitigate the risk of potential deadlock scenarios 
> in the future by eliminating the condition that could lead to them.

It might mitigate your risk, but it does not exist upstream.

We don't accept such a patch that adds unnecessary locks just
for a future possible issue.  It should be fixed when such an
option requiring u->iolock is added.


> 
> ---
>  net/unix/af_unix.c | 9 ++++++++-
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 5b41e2321209..f102f08f649f 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -2458,11 +2458,14 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
>                                                 EPOLLWRBAND);
>  
>         if (msg->msg_name) {
> +               mutex_unlock(&u->iolock);
> +
>                 unix_copy_addr(msg, skb->sk);
> -
> +
>                 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
>                                                       msg->msg_name,
>                                                       &msg->msg_namelen);
> +               mutex_lock(&u->iolock);
>         }
>  
>         if (size > skb->len - skip)
> @@ -2814,6 +2817,8 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
>  
>                 /* Copy address just once */
>                 if (state->msg && state->msg->msg_name) {
> +                       mutex_unlock(&u->iolock);
> +
>                         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
>                                          state->msg->msg_name);
>                         unix_copy_addr(state->msg, skb->sk);
> @@ -2823,6 +2828,8 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
>                                                               &state->msg->msg_namelen);
>  
>                         sunaddr = NULL;
> +
> +                       mutex_lock(&u->iolock);
>                 }
>  
>                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
> -- 
> 2.34.1
Jeongjun Park April 9, 2024, 1:43 p.m. UTC | #12
Kuniyuki Iwashima wrote:
> It might mitigate your risk, but it does not exist upstream.

> We don't accept such a patch that adds unnecessary locks just
> for a future possible issue.  It should be fixed when such an
> option requiring u->iolock is added.


Ah i see.

I learned that you should not carelessly patch major code 
considering potential elements that do not immediately exist upstream.

Thanks.
diff mbox series

Patch

diff --git a/net/core/sock.c b/net/core/sock.c
index 0a7f46c37f0cfc169e11377107c8342c229da0de..5e78798456fd81dbd34e94021531340f7ba5ab0a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1188,6 +1188,17 @@  int sk_setsockopt(struct sock *sk, int level, int optname,
 		 */
 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
 		return 0;
+	case SO_PEEK_OFF:
+		{
+		int (*set_peek_off)(struct sock *sk, int val);
+
+		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
+		if (set_peek_off)
+			ret = set_peek_off(sk, val);
+		else
+			ret = -EOPNOTSUPP;
+		return ret;
+		}
 	}
 
 	sockopt_lock_sock(sk);
@@ -1430,18 +1441,6 @@  int sk_setsockopt(struct sock *sk, int level, int optname,
 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
 		break;
 
-	case SO_PEEK_OFF:
-		{
-		int (*set_peek_off)(struct sock *sk, int val);
-
-		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
-		if (set_peek_off)
-			ret = set_peek_off(sk, val);
-		else
-			ret = -EOPNOTSUPP;
-		break;
-		}
-
 	case SO_NOFCS:
 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
 		break;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f631b0a21af4c7a520212c94ed0580f86d269ed2..e474b201900f9317069a31e4b507964fe11b2297 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1589,12 +1589,7 @@  int udp_init_sock(struct sock *sk)
 
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
 {
-	if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) {
-		bool slow = lock_sock_fast(sk);
-
-		sk_peek_offset_bwd(sk, len);
-		unlock_sock_fast(sk, slow);
-	}
+	sk_peek_offset_bwd(sk, len);
 
 	if (!skb_unref(skb))
 		return;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 30b178ebba60aa810e8442a326a14edcee071061..0748e7ea5210e7d597acf87fc6caf1ea2156562e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -782,19 +782,6 @@  static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
 				  int);
 
-static int unix_set_peek_off(struct sock *sk, int val)
-{
-	struct unix_sock *u = unix_sk(sk);
-
-	if (mutex_lock_interruptible(&u->iolock))
-		return -EINTR;
-
-	WRITE_ONCE(sk->sk_peek_off, val);
-	mutex_unlock(&u->iolock);
-
-	return 0;
-}
-
 #ifdef CONFIG_PROC_FS
 static int unix_count_nr_fds(struct sock *sk)
 {
@@ -862,7 +849,7 @@  static const struct proto_ops unix_stream_ops = {
 	.read_skb =	unix_stream_read_skb,
 	.mmap =		sock_no_mmap,
 	.splice_read =	unix_stream_splice_read,
-	.set_peek_off =	unix_set_peek_off,
+	.set_peek_off =	sk_set_peek_off,
 	.show_fdinfo =	unix_show_fdinfo,
 };
 
@@ -886,7 +873,7 @@  static const struct proto_ops unix_dgram_ops = {
 	.read_skb =	unix_read_skb,
 	.recvmsg =	unix_dgram_recvmsg,
 	.mmap =		sock_no_mmap,
-	.set_peek_off =	unix_set_peek_off,
+	.set_peek_off =	sk_set_peek_off,
 	.show_fdinfo =	unix_show_fdinfo,
 };
 
@@ -909,7 +896,7 @@  static const struct proto_ops unix_seqpacket_ops = {
 	.sendmsg =	unix_seqpacket_sendmsg,
 	.recvmsg =	unix_seqpacket_recvmsg,
 	.mmap =		sock_no_mmap,
-	.set_peek_off =	unix_set_peek_off,
+	.set_peek_off =	sk_set_peek_off,
 	.show_fdinfo =	unix_show_fdinfo,
 };