[net] net/sched: act_mirred: use the backlog for mirred ingress

Message ID: 33dc43f587ec1388ba456b4915c75f02a8aae226.1663945716.git.dcaratti@redhat.com (mailing list archive)
State: Changes Requested
Delegated to: Netdev Maintainers
Series: [net] net/sched: act_mirred: use the backlog for mirred ingress

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net
netdev/fixes_present success Fixes tag present in non-next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers fail 2 blamed authors not CCed: shmulik.ladkani@gmail.com davem@davemloft.net; 8 maintainers not CCed: shmulik.ladkani@gmail.com kuba@kernel.org shuah@kernel.org idosch@nvidia.com davem@davemloft.net linux-kselftest@vger.kernel.org vladimir.oltean@nxp.com edumazet@google.com
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success net selftest script(s) already in Makefile
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 49 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Davide Caratti Sept. 23, 2022, 3:11 p.m. UTC
William reports kernel soft-lockups on some OVS topologies when the TC
mirred "egress-to-ingress" action is hit by local TCP traffic. Indeed,
using the mirred action in the egress-to-ingress direction can easily
produce a dmesg splat like:

 ============================================
 WARNING: possible recursive locking detected
 6.0.0-rc4+ #511 Not tainted
 --------------------------------------------
 nc/1037 is trying to acquire lock:
 ffff950687843cb0 (slock-AF_INET/1){+.-.}-{2:2}, at: tcp_v4_rcv+0x1023/0x1160

 but task is already holding lock:
 ffff950687846cb0 (slock-AF_INET/1){+.-.}-{2:2}, at: tcp_v4_rcv+0x1023/0x1160

 other info that might help us debug this:
  Possible unsafe locking scenario:

        CPU0
        ----
   lock(slock-AF_INET/1);
   lock(slock-AF_INET/1);

  *** DEADLOCK ***

  May be due to missing lock nesting notation

 12 locks held by nc/1037:
  #0: ffff950687843d40 (sk_lock-AF_INET){+.+.}-{0:0}, at: tcp_sendmsg+0x19/0x40
  #1: ffffffff9be07320 (rcu_read_lock){....}-{1:2}, at: __ip_queue_xmit+0x5/0x610
  #2: ffffffff9be072e0 (rcu_read_lock_bh){....}-{1:2}, at: ip_finish_output2+0xaa/0xa10
  #3: ffffffff9be072e0 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x72/0x11b0
  #4: ffffffff9be07320 (rcu_read_lock){....}-{1:2}, at: netif_receive_skb+0x181/0x400
  #5: ffffffff9be07320 (rcu_read_lock){....}-{1:2}, at: ip_local_deliver_finish+0x54/0x160
  #6: ffff950687846cb0 (slock-AF_INET/1){+.-.}-{2:2}, at: tcp_v4_rcv+0x1023/0x1160
  #7: ffffffff9be07320 (rcu_read_lock){....}-{1:2}, at: __ip_queue_xmit+0x5/0x610
  #8: ffffffff9be072e0 (rcu_read_lock_bh){....}-{1:2}, at: ip_finish_output2+0xaa/0xa10
  #9: ffffffff9be072e0 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x72/0x11b0
  #10: ffffffff9be07320 (rcu_read_lock){....}-{1:2}, at: netif_receive_skb+0x181/0x400
  #11: ffffffff9be07320 (rcu_read_lock){....}-{1:2}, at: ip_local_deliver_finish+0x54/0x160

 stack backtrace:
 CPU: 1 PID: 1037 Comm: nc Not tainted 6.0.0-rc4+ #511
 Hardware name: Red Hat KVM, BIOS 1.13.0-2.module+el8.3.0+7353+9de0a3cc 04/01/2014
 Call Trace:
  <TASK>
  dump_stack_lvl+0x44/0x5b
  __lock_acquire.cold.76+0x121/0x2a7
  lock_acquire+0xd5/0x310
  _raw_spin_lock_nested+0x39/0x70
  tcp_v4_rcv+0x1023/0x1160
  ip_protocol_deliver_rcu+0x4d/0x280
  ip_local_deliver_finish+0xac/0x160
  ip_local_deliver+0x71/0x220
  ip_rcv+0x5a/0x200
  __netif_receive_skb_one_core+0x89/0xa0
  netif_receive_skb+0x1c1/0x400
  tcf_mirred_act+0x2a5/0x610 [act_mirred]
  tcf_action_exec+0xb3/0x210
  fl_classify+0x1f7/0x240 [cls_flower]
  tcf_classify+0x7b/0x320
  __dev_queue_xmit+0x3a4/0x11b0
  ip_finish_output2+0x3b8/0xa10
  ip_output+0x7f/0x260
  __ip_queue_xmit+0x1ce/0x610
  __tcp_transmit_skb+0xabc/0xc80
  tcp_rcv_state_process+0x669/0x1290
  tcp_v4_do_rcv+0xd7/0x370
  tcp_v4_rcv+0x10bc/0x1160
  ip_protocol_deliver_rcu+0x4d/0x280
  ip_local_deliver_finish+0xac/0x160
  ip_local_deliver+0x71/0x220
  ip_rcv+0x5a/0x200
  __netif_receive_skb_one_core+0x89/0xa0
  netif_receive_skb+0x1c1/0x400
  tcf_mirred_act+0x2a5/0x610 [act_mirred]
  tcf_action_exec+0xb3/0x210
  fl_classify+0x1f7/0x240 [cls_flower]
  tcf_classify+0x7b/0x320
  __dev_queue_xmit+0x3a4/0x11b0
  ip_finish_output2+0x3b8/0xa10
  ip_output+0x7f/0x260
  __ip_queue_xmit+0x1ce/0x610
  __tcp_transmit_skb+0xabc/0xc80
  tcp_write_xmit+0x229/0x12c0
  __tcp_push_pending_frames+0x32/0xf0
  tcp_sendmsg_locked+0x297/0xe10
  tcp_sendmsg+0x27/0x40
  sock_sendmsg+0x58/0x70
  __sys_sendto+0xfd/0x170
  __x64_sys_sendto+0x24/0x30
  do_syscall_64+0x3a/0x90
  entry_SYSCALL_64_after_hwframe+0x63/0xcd
 RIP: 0033:0x7f11a06fd281
 Code: 00 00 00 00 0f 1f 44 00 00 f3 0f 1e fa 48 8d 05 e5 43 2c 00 41 89 ca 8b 00 85 c0 75 1c 45 31 c9 45 31 c0 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 67 c3 66 0f 1f 44 00 00 41 56 41 89 ce 41 55
 RSP: 002b:00007ffd17958358 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
 RAX: ffffffffffffffda RBX: 0000555c6e671610 RCX: 00007f11a06fd281
 RDX: 0000000000002000 RSI: 0000555c6e73a9f0 RDI: 0000000000000003
 RBP: 0000555c6e6433b0 R08: 0000000000000000 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000002000
 R13: 0000555c6e671410 R14: 0000555c6e671410 R15: 0000555c6e6433f8
  </TASK>

which is very similar to the ones observed by William in his setup.
By using netif_rx() for mirred ingress packets, packets are queued in the
backlog, as is done in the receive path of "loopback" and "veth", and the
deadlock is no longer visible. Also add a selftest that can be used to
reproduce the problem and verify the fix.

Fixes: 53592b364001 ("net/sched: act_mirred: Implement ingress actions")
Reported-by: William Zhao <wizhao@redhat.com>
Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 net/sched/act_mirred.c                        |  2 +-
 .../selftests/net/forwarding/tc_actions.sh    | 29 ++++++++++++++++++-
 2 files changed, 29 insertions(+), 2 deletions(-)

Comments

Cong Wang Sept. 25, 2022, 6:08 p.m. UTC | #1
On Fri, Sep 23, 2022 at 05:11:12PM +0200, Davide Caratti wrote:
> William reports kernel soft-lockups on some OVS topologies when the TC
> mirred "egress-to-ingress" action is hit by local TCP traffic. Indeed,
> using the mirred action in the egress-to-ingress direction can easily
> produce a dmesg splat like:
> 
> [...]
> 
> which is very similar to the ones observed by William in his setup.
> By using netif_rx() for mirred ingress packets, packets are queued in the
> backlog, as is done in the receive path of "loopback" and "veth", and the
> deadlock is no longer visible. Also add a selftest that can be used to
> reproduce the problem and verify the fix.

Which also means we can no longer know the RX path status, right? I
mean, if we have filters on ingress, we can't know whether they drop
this packet or not after this patch? To me, this at least breaks users'
expectations.

BTW, have you thought about solving the above lockdep warning in the
TCP layer?

Thanks.
Davide Caratti Oct. 4, 2022, 5:40 p.m. UTC | #2
hello Cong, thanks for looking at this!

On Sun, Sep 25, 2022 at 11:08:48AM -0700, Cong Wang wrote:
> On Fri, Sep 23, 2022 at 05:11:12PM +0200, Davide Caratti wrote:
> > William reports kernel soft-lockups on some OVS topologies when the TC
> > mirred "egress-to-ingress" action is hit by local TCP traffic. Indeed,
> > using the mirred action in the egress-to-ingress direction can easily
> > produce a dmesg splat like:
> > 
> >  ============================================
> >  WARNING: possible recursive locking detected

[...]

> >  6.0.0-rc4+ #511 Not tainted
> >  --------------------------------------------
> >  nc/1037 is trying to acquire lock:
> >  ffff950687843cb0 (slock-AF_INET/1){+.-.}-{2:2}, at: tcp_v4_rcv+0x1023/0x1160
> > 
> >  but task is already holding lock:
> >  ffff950687846cb0 (slock-AF_INET/1){+.-.}-{2:2}, at: tcp_v4_rcv+0x1023/0x1160

FTR, this is:

2091         sk_incoming_cpu_update(sk);
2092
2093         bh_lock_sock_nested(sk); <--- the lock reported in the splat
2094         tcp_segs_in(tcp_sk(sk), skb);
2095         ret = 0;
2096         if (!sock_owned_by_user(sk)) {

> BTW, have you thought about solving the above lockdep warning in the
> TCP layer?

yes, but that doesn't look like a trivial fix at all - and I doubt it's
worth doing just to make mirred and TCP "friends". Please note: on
current kernels this doesn't just result in a lockdep warning: using
iperf3 on unpatched kernels it's possible to hit a real deadlock, like:

WARNING: possible circular locking dependency detected
 6.0.0-rc4+ #511 Not tainted
 ------------------------------------------------------
 iperf3/1021 is trying to acquire lock:
 ffff976005c5a630 (slock-AF_INET6/1){+...}-{2:2}, at: tcp_v4_rcv+0x1023/0x1160

 but task is already holding lock:
 ffff97607b06e0b0 (slock-AF_INET/1){+.-.}-{2:2}, at: tcp_v4_rcv+0x1023/0x1160

 which lock already depends on the new lock.


 the existing dependency chain (in reverse order) is:

 -> #1 (slock-AF_INET/1){+.-.}-{2:2}:
        lock_acquire+0xd5/0x310
        _raw_spin_lock_nested+0x39/0x70
        tcp_v4_rcv+0x1023/0x1160
        ip_protocol_deliver_rcu+0x4d/0x280
        ip_local_deliver_finish+0xac/0x160
        ip_local_deliver+0x71/0x220
        ip_rcv+0x5a/0x200
        __netif_receive_skb_one_core+0x89/0xa0
        netif_receive_skb+0x1c1/0x400
        tcf_mirred_act+0x2a5/0x610 [act_mirred]
        tcf_action_exec+0xb3/0x210
        fl_classify+0x1f7/0x240 [cls_flower]
        tcf_classify+0x7b/0x320
        __dev_queue_xmit+0x3a4/0x11b0
        ip_finish_output2+0x3b8/0xa10
        ip_output+0x7f/0x260
        __ip_queue_xmit+0x1ce/0x610
        __tcp_transmit_skb+0xabc/0xc80
        tcp_rcv_established+0x284/0x810
        tcp_v4_do_rcv+0x1f3/0x370
        tcp_v4_rcv+0x10bc/0x1160
        ip_protocol_deliver_rcu+0x4d/0x280
        ip_local_deliver_finish+0xac/0x160
        ip_local_deliver+0x71/0x220
        ip_rcv+0x5a/0x200
        __netif_receive_skb_one_core+0x89/0xa0
        netif_receive_skb+0x1c1/0x400
        tcf_mirred_act+0x2a5/0x610 [act_mirred]
        tcf_action_exec+0xb3/0x210
        fl_classify+0x1f7/0x240 [cls_flower]
        tcf_classify+0x7b/0x320
        __dev_queue_xmit+0x3a4/0x11b0
        ip_finish_output2+0x3b8/0xa10
        ip_output+0x7f/0x260
        __ip_queue_xmit+0x1ce/0x610
        __tcp_transmit_skb+0xabc/0xc80
        tcp_write_xmit+0x229/0x12c0
        __tcp_push_pending_frames+0x32/0xf0
        tcp_sendmsg_locked+0x297/0xe10
        tcp_sendmsg+0x27/0x40
        sock_sendmsg+0x58/0x70
        sock_write_iter+0x9a/0x100
        vfs_write+0x481/0x4f0
        ksys_write+0xc2/0xe0
        do_syscall_64+0x3a/0x90
        entry_SYSCALL_64_after_hwframe+0x63/0xcd

 -> #0 (slock-AF_INET6/1){+...}-{2:2}:
        check_prevs_add+0x185/0xf50
        __lock_acquire+0x11eb/0x1620
        lock_acquire+0xd5/0x310
        _raw_spin_lock_nested+0x39/0x70
        tcp_v4_rcv+0x1023/0x1160
        ip_protocol_deliver_rcu+0x4d/0x280
        ip_local_deliver_finish+0xac/0x160
        ip_local_deliver+0x71/0x220
        ip_rcv+0x5a/0x200
        __netif_receive_skb_one_core+0x89/0xa0
        netif_receive_skb+0x1c1/0x400
        tcf_mirred_act+0x2a5/0x610 [act_mirred]
        tcf_action_exec+0xb3/0x210
        fl_classify+0x1f7/0x240 [cls_flower]
        tcf_classify+0x7b/0x320
        __dev_queue_xmit+0x3a4/0x11b0
        ip_finish_output2+0x3b8/0xa10
        ip_output+0x7f/0x260
        __ip_queue_xmit+0x1ce/0x610
        __tcp_transmit_skb+0xabc/0xc80
        tcp_rcv_established+0x284/0x810
        tcp_v4_do_rcv+0x1f3/0x370
        tcp_v4_rcv+0x10bc/0x1160
        ip_protocol_deliver_rcu+0x4d/0x280
        ip_local_deliver_finish+0xac/0x160
        ip_local_deliver+0x71/0x220
        ip_rcv+0x5a/0x200
        __netif_receive_skb_one_core+0x89/0xa0
        netif_receive_skb+0x1c1/0x400
        tcf_mirred_act+0x2a5/0x610 [act_mirred]
        tcf_action_exec+0xb3/0x210
        fl_classify+0x1f7/0x240 [cls_flower]
        tcf_classify+0x7b/0x320
        __dev_queue_xmit+0x3a4/0x11b0
        ip_finish_output2+0x3b8/0xa10
        ip_output+0x7f/0x260
        __ip_queue_xmit+0x1ce/0x610
        __tcp_transmit_skb+0xabc/0xc80
        tcp_write_xmit+0x229/0x12c0
        __tcp_push_pending_frames+0x32/0xf0
        tcp_sendmsg_locked+0x297/0xe10
        tcp_sendmsg+0x27/0x40
        sock_sendmsg+0x42/0x70
        sock_write_iter+0x9a/0x100
        vfs_write+0x481/0x4f0
        ksys_write+0xc2/0xe0
        do_syscall_64+0x3a/0x90
        entry_SYSCALL_64_after_hwframe+0x63/0xcd

 other info that might help us debug this:

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(slock-AF_INET/1);
                                lock(slock-AF_INET6/1);
                                lock(slock-AF_INET/1);
   lock(slock-AF_INET6/1);

  *** DEADLOCK ***

 12 locks held by iperf3/1021:
  #0: ffff976005c5a6c0 (sk_lock-AF_INET6){+.+.}-{0:0}, at: tcp_sendmsg+0x19/0x40
  #1: ffffffffbca07320 (rcu_read_lock){....}-{1:2}, at: __ip_queue_xmit+0x5/0x610
  #2: ffffffffbca072e0 (rcu_read_lock_bh){....}-{1:2}, at: ip_finish_output2+0xaa/0xa10
  #3: ffffffffbca072e0 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x72/0x11b0
  #4: ffffffffbca07320 (rcu_read_lock){....}-{1:2}, at: netif_receive_skb+0x181/0x400
  #5: ffffffffbca07320 (rcu_read_lock){....}-{1:2}, at: ip_local_deliver_finish+0x54/0x160
  #6: ffff97607b06e0b0 (slock-AF_INET/1){+.-.}-{2:2}, at: tcp_v4_rcv+0x1023/0x1160
  #7: ffffffffbca07320 (rcu_read_lock){....}-{1:2}, at: __ip_queue_xmit+0x5/0x610
  #8: ffffffffbca072e0 (rcu_read_lock_bh){....}-{1:2}, at: ip_finish_output2+0xaa/0xa10
  #9: ffffffffbca072e0 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x72/0x11b0
  #10: ffffffffbca07320 (rcu_read_lock){....}-{1:2}, at: netif_receive_skb+0x181/0x400
  #11: ffffffffbca07320 (rcu_read_lock){....}-{1:2}, at: ip_local_deliver_finish+0x54/0x160

 [...]

 kernel:watchdog: BUG: soft lockup - CPU#1 stuck for 26s! [swapper/1:0]

Moreover, even if we improve TCP locking to avoid lockups for this
simple topology, I suspect that TCP will experience some packet losses:
when mirred detects 4 nested calls of tcf_mirred_act(), the kernel
protects against excessive stack growth and drops the skb (which can
also be a full TSO packet). The protocol can probably recover, but
performance will certainly be sub-optimal.
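
For reference, the guard in question in net/sched/act_mirred.c looks
roughly like this (a simplified excerpt; exact details can differ across
kernel versions):

  #define MIRRED_RECURSION_LIMIT	4
  static DEFINE_PER_CPU(unsigned int, mirred_rec_level);

  static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
  			    struct tcf_result *res)
  {
  	unsigned int rec_level = __this_cpu_inc_return(mirred_rec_level);

  	if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) {
  		/* too many nested tcf_mirred_act() calls on this CPU:
  		 * drop the skb rather than keep growing the stack
  		 */
  		net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
  				     netdev_name(skb->dev));
  		__this_cpu_dec(mirred_rec_level);
  		return TC_ACT_SHOT;
  	}
  	/* ... mirror/redirect logic ... */
  }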

> Which also means we can no longer know the RX path status, right? I
> mean, if we have filters on ingress, we can't know whether they drop
> this packet or not after this patch? To me, this at least breaks users'
> expectations.

Fair point! Then maybe we don't need to change the whole TC mirred
ingress path: since the problem only affects egress->ingress, we can
preserve the call to netif_receive_skb() for ingress->ingress, and just
use the backlog in the egress->ingress direction - which has been broken
since the very beginning and got similar fixes in the past [1].
Something like:

-- >8 --
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -205,12 +205,14 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
        return err;
 }

-static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb)
+static int tcf_mirred_forward(bool want_ingress, bool at_ingress, struct sk_buff *skb)
 {
        int err;

        if (!want_ingress)
                err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
+       else if (!at_ingress)
+               err = netif_rx(skb);
        else
                err = netif_receive_skb(skb);

@@ -306,7 +308,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
                /* let's the caller reinsert the packet, if possible */
                if (use_reinsert) {
                        res->ingress = want_ingress;
-                       err = tcf_mirred_forward(res->ingress, skb);
+                       err = tcf_mirred_forward(res->ingress, at_ingress, skb);
                        if (err)
                                tcf_action_inc_overlimit_qstats(&m->common);
                        __this_cpu_dec(mirred_rec_level);
@@ -314,7 +316,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
                }
        }

-       err = tcf_mirred_forward(want_ingress, skb2);
+       err = tcf_mirred_forward(want_ingress, at_ingress, skb2);
        if (err) {
 out:
                tcf_action_inc_overlimit_qstats(&m->common);
-- >8 --

WDYT? Any feedback appreciated, thanks!

[1] https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=f799ada6bf2397c351220088b9b0980125c77280
Cong Wang Oct. 16, 2022, 5:28 p.m. UTC | #3
On Tue, Oct 04, 2022 at 07:40:27PM +0200, Davide Caratti wrote:
> hello Cong, thanks for looking at this!
> 
> On Sun, Sep 25, 2022 at 11:08:48AM -0700, Cong Wang wrote:
> > On Fri, Sep 23, 2022 at 05:11:12PM +0200, Davide Caratti wrote:
> > > William reports kernel soft-lockups on some OVS topologies when the TC
> > > mirred "egress-to-ingress" action is hit by local TCP traffic. Indeed,
> > > using the mirred action in the egress-to-ingress direction can easily
> > > produce a dmesg splat like:
> > > 
> > >  ============================================
> > >  WARNING: possible recursive locking detected
> 
> [...]
> 
> > >  6.0.0-rc4+ #511 Not tainted
> > >  --------------------------------------------
> > >  nc/1037 is trying to acquire lock:
> > >  ffff950687843cb0 (slock-AF_INET/1){+.-.}-{2:2}, at: tcp_v4_rcv+0x1023/0x1160
> > > 
> > >  but task is already holding lock:
> > >  ffff950687846cb0 (slock-AF_INET/1){+.-.}-{2:2}, at: tcp_v4_rcv+0x1023/0x1160
> 
> FTR, this is:

Yeah, Peilin actually looked deeper into this issue. Let's copy him.

> 
> 2091         sk_incoming_cpu_update(sk);
> 2092
> 2093         bh_lock_sock_nested(sk); <--- the lock reported in the splat
> 2094         tcp_segs_in(tcp_sk(sk), skb);
> 2095         ret = 0;
> 2096         if (!sock_owned_by_user(sk)) {
> 
> > BTW, have you thought about solving the above lockdep warning in the
> > TCP layer?
> 
> yes, but that doesn't look like a trivial fix at all - and I doubt it's
> worth doing just to make mirred and TCP "friends". Please note: on
> current kernels this doesn't just result in a lockdep warning: using
> iperf3 on unpatched kernels it's possible to hit a real deadlock, like:

I'd say your test case is rare, because I don't think it is trivial for
a TCP socket to send packets to itself.

 
> > Which also means we can no longer know the RX path status, right? I
> > mean, if we have filters on ingress, we can't know whether they drop
> > this packet or not after this patch? To me, this at least breaks users'
> > expectations.
> 
> Fair point! Then maybe we don't need to change the whole TC mirred
> ingress path: since the problem only affects egress->ingress, we can
> preserve the call to netif_receive_skb() for ingress->ingress, and just
> use the backlog in the egress->ingress direction - which has been broken
> since the very beginning and got similar fixes in the past [1].
> Something like:

Regardless of ingress->ingress or egress->ingress, this patch breaks
users' expectations. And actually, egress->ingress is more common than
ingress->ingress, in my experience.

Thanks.
Peilin Ye Nov. 18, 2022, 11:07 p.m. UTC | #4
Hi all,

On Fri, Sep 23, 2022 at 05:11:12PM +0200, Davide Caratti wrote:
> +mirred_egress_to_ingress_tcp_test()
> +{
> +	local tmpfile=$(mktemp) tmpfile1=$(mktemp)
> +
> +	RET=0
> +	dd conv=sparse status=none if=/dev/zero bs=1M count=2 of=$tmpfile
> +	tc filter add dev $h1 protocol ip pref 100 handle 100 egress flower \
> +		ip_proto tcp src_ip 192.0.2.1 dst_ip 192.0.2.2 \
> +			action ct commit nat src addr 192.0.2.2 pipe \
> +			action ct clear pipe \
> +			action ct commit nat dst addr 192.0.2.1 pipe \
> +			action ct clear pipe \
> +			action skbedit ptype host pipe \
> +			action mirred ingress redirect dev $h1

FWIW, I couldn't reproduce the lockup using this test case (with
forwarding.config.sample), but I got the same lockup in tcp_v4_rcv()
using a different (but probably less realistic) TC filter:

	tc filter add dev $h1 protocol ip pref 100 handle 100 egress flower \
		ip_proto tcp src_ip 192.0.2.1 dst_ip 192.0.2.2 \
			action pedit ex munge ip src set 192.0.2.2 pipe \
			action pedit ex munge ip dst set 192.0.2.1 pipe \
			action skbedit ptype host pipe \
			action mirred ingress redirect dev $h1

Thanks,
Peilin Ye
Patch

diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index a1d70cf86843..ff965ed2dd9f 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -213,7 +213,7 @@  static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb)
 	if (!want_ingress)
 		err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
 	else
-		err = netif_receive_skb(skb);
+		err = netif_rx(skb);
 
 	return err;
 }
diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh
index 1e0a62f638fe..6e1b0cc68f7d 100755
--- a/tools/testing/selftests/net/forwarding/tc_actions.sh
+++ b/tools/testing/selftests/net/forwarding/tc_actions.sh
@@ -3,7 +3,8 @@ 
 
 ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \
 	mirred_egress_mirror_test matchall_mirred_egress_mirror_test \
-	gact_trap_test mirred_egress_to_ingress_test"
+	gact_trap_test mirred_egress_to_ingress_test \
+	mirred_egress_to_ingress_tcp_test"
 NUM_NETIFS=4
 source tc_common.sh
 source lib.sh
@@ -198,6 +199,32 @@  mirred_egress_to_ingress_test()
 	log_test "mirred_egress_to_ingress ($tcflags)"
 }
 
+mirred_egress_to_ingress_tcp_test()
+{
+	local tmpfile=$(mktemp) tmpfile1=$(mktemp)
+
+	RET=0
+	dd conv=sparse status=none if=/dev/zero bs=1M count=2 of=$tmpfile
+	tc filter add dev $h1 protocol ip pref 100 handle 100 egress flower \
+		ip_proto tcp src_ip 192.0.2.1 dst_ip 192.0.2.2 \
+			action ct commit nat src addr 192.0.2.2 pipe \
+			action ct clear pipe \
+			action ct commit nat dst addr 192.0.2.1 pipe \
+			action ct clear pipe \
+			action skbedit ptype host pipe \
+			action mirred ingress redirect dev $h1
+
+	ip vrf exec v$h1 nc --recv-only -w10 -l -p 12345 -o $tmpfile1 &
+	local rpid=$!
+	ip vrf exec v$h1 nc -w1 --send-only 192.0.2.2 12345 <$tmpfile
+	wait -n $rpid
+	cmp -s $tmpfile $tmpfile1
+	check_err $? "server output check failed"
+	tc filter del dev $h1 egress protocol ip pref 100 handle 100 flower
+	rm -f $tmpfile $tmpfile1
+	log_test "mirred_egress_to_ingress_tcp"
+}
+
 setup_prepare()
 {
 	h1=${NETIFS[p1]}