diff mbox series

[mptcp-net,v2,1/2] mptcp: prevent MPC handshake on port-based signal endpoints

Message ID 20241007-mpc-hs-port-v2-1-0c9e7827bd0f@kernel.org (mailing list archive)
State Superseded, archived
Headers show
Series mptcp: prevent MPC handshake on port-based signal endpoints | expand

Checks

Context Check Description
matttbe/build success Build and static analysis OK
matttbe/checkpatch warning total: 0 errors, 2 warnings, 0 checks, 57 lines checked
matttbe/shellcheck success MPTCP selftests files have not been modified
matttbe/KVM_Validation__normal success Success! ✅
matttbe/KVM_Validation__debug success Success! ✅
matttbe/KVM_Validation__btf__only_bpftest_all_ success Success! ✅

Commit Message

Matthieu Baerts Oct. 7, 2024, 6:22 p.m. UTC
From: Paolo Abeni <pabeni@redhat.com>

Syzkaller reported a lockdep splat:

  ============================================
  WARNING: possible recursive locking detected
  6.11.0-rc6-syzkaller-00019-g67784a74e258 #0 Not tainted
  --------------------------------------------
  syz-executor364/5113 is trying to acquire lock:
  ffff8880449f1958 (k-slock-AF_INET){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
  ffff8880449f1958 (k-slock-AF_INET){+.-.}-{2:2}, at: sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328

  but task is already holding lock:
  ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
  ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328

  other info that might help us debug this:
   Possible unsafe locking scenario:

         CPU0
         ----
    lock(k-slock-AF_INET);
    lock(k-slock-AF_INET);

   *** DEADLOCK ***

   May be due to missing lock nesting notation

  7 locks held by syz-executor364/5113:
   #0: ffff8880449f0e18 (sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1607 [inline]
   #0: ffff8880449f0e18 (sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_sendmsg+0x153/0x1b10 net/mptcp/protocol.c:1806
   #1: ffff88803fe39ad8 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1607 [inline]
   #1: ffff88803fe39ad8 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_sendmsg_fastopen+0x11f/0x530 net/mptcp/protocol.c:1727
   #2: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:326 [inline]
   #2: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline]
   #2: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: __ip_queue_xmit+0x5f/0x1b80 net/ipv4/ip_output.c:470
   #3: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:326 [inline]
   #3: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline]
   #3: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1390 net/ipv4/ip_output.c:228
   #4: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: local_lock_acquire include/linux/local_lock_internal.h:29 [inline]
   #4: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: process_backlog+0x33b/0x15b0 net/core/dev.c:6104
   #5: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:326 [inline]
   #5: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline]
   #5: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: ip_local_deliver_finish+0x230/0x5f0 net/ipv4/ip_input.c:232
   #6: ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline]
   #6: ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328

  stack backtrace:
  CPU: 0 UID: 0 PID: 5113 Comm: syz-executor364 Not tainted 6.11.0-rc6-syzkaller-00019-g67784a74e258 #0
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014
  Call Trace:
   <IRQ>
   __dump_stack lib/dump_stack.c:93 [inline]
   dump_stack_lvl+0x241/0x360 lib/dump_stack.c:119
   check_deadlock kernel/locking/lockdep.c:3061 [inline]
   validate_chain+0x15d3/0x5900 kernel/locking/lockdep.c:3855
   __lock_acquire+0x137a/0x2040 kernel/locking/lockdep.c:5142
   lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5759
   __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline]
   _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154
   spin_lock include/linux/spinlock.h:351 [inline]
   sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328
   mptcp_sk_clone_init+0x32/0x13c0 net/mptcp/protocol.c:3279
   subflow_syn_recv_sock+0x931/0x1920 net/mptcp/subflow.c:874
   tcp_check_req+0xfe4/0x1a20 net/ipv4/tcp_minisocks.c:853
   tcp_v4_rcv+0x1c3e/0x37f0 net/ipv4/tcp_ipv4.c:2267
   ip_protocol_deliver_rcu+0x22e/0x440 net/ipv4/ip_input.c:205
   ip_local_deliver_finish+0x341/0x5f0 net/ipv4/ip_input.c:233
   NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314
   NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314
   __netif_receive_skb_one_core net/core/dev.c:5661 [inline]
   __netif_receive_skb+0x2bf/0x650 net/core/dev.c:5775
   process_backlog+0x662/0x15b0 net/core/dev.c:6108
   __napi_poll+0xcb/0x490 net/core/dev.c:6772
   napi_poll net/core/dev.c:6841 [inline]
   net_rx_action+0x89b/0x1240 net/core/dev.c:6963
   handle_softirqs+0x2c4/0x970 kernel/softirq.c:554
   do_softirq+0x11b/0x1e0 kernel/softirq.c:455
   </IRQ>
   <TASK>
   __local_bh_enable_ip+0x1bb/0x200 kernel/softirq.c:382
   local_bh_enable include/linux/bottom_half.h:33 [inline]
   rcu_read_unlock_bh include/linux/rcupdate.h:908 [inline]
   __dev_queue_xmit+0x1763/0x3e90 net/core/dev.c:4450
   dev_queue_xmit include/linux/netdevice.h:3105 [inline]
   neigh_hh_output include/net/neighbour.h:526 [inline]
   neigh_output include/net/neighbour.h:540 [inline]
   ip_finish_output2+0xd41/0x1390 net/ipv4/ip_output.c:235
   ip_local_out net/ipv4/ip_output.c:129 [inline]
   __ip_queue_xmit+0x118c/0x1b80 net/ipv4/ip_output.c:535
   __tcp_transmit_skb+0x2544/0x3b30 net/ipv4/tcp_output.c:1466
   tcp_rcv_synsent_state_process net/ipv4/tcp_input.c:6542 [inline]
   tcp_rcv_state_process+0x2c32/0x4570 net/ipv4/tcp_input.c:6729
   tcp_v4_do_rcv+0x77d/0xc70 net/ipv4/tcp_ipv4.c:1934
   sk_backlog_rcv include/net/sock.h:1111 [inline]
   __release_sock+0x214/0x350 net/core/sock.c:3004
   release_sock+0x61/0x1f0 net/core/sock.c:3558
   mptcp_sendmsg_fastopen+0x1ad/0x530 net/mptcp/protocol.c:1733
   mptcp_sendmsg+0x1884/0x1b10 net/mptcp/protocol.c:1812
   sock_sendmsg_nosec net/socket.c:730 [inline]
   __sock_sendmsg+0x1a6/0x270 net/socket.c:745
   ____sys_sendmsg+0x525/0x7d0 net/socket.c:2597
   ___sys_sendmsg net/socket.c:2651 [inline]
   __sys_sendmmsg+0x3b2/0x740 net/socket.c:2737
   __do_sys_sendmmsg net/socket.c:2766 [inline]
   __se_sys_sendmmsg net/socket.c:2763 [inline]
   __x64_sys_sendmmsg+0xa0/0xb0 net/socket.c:2763
   do_syscall_x64 arch/x86/entry/common.c:52 [inline]
   do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83
   entry_SYSCALL_64_after_hwframe+0x77/0x7f
  RIP: 0033:0x7f04fb13a6b9
  Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 01 1a 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
  RSP: 002b:00007ffd651f42d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000133
  RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f04fb13a6b9
  RDX: 0000000000000001 RSI: 0000000020000d00 RDI: 0000000000000004
  RBP: 00007ffd651f4310 R08: 0000000000000001 R09: 0000000000000001
  R10: 0000000020000080 R11: 0000000000000246 R12: 00000000000f4240
  R13: 00007f04fb187449 R14: 00007ffd651f42f4 R15: 00007ffd651f4300
   </TASK>

As noted by Cong Wang, the splat is false positive, but the code
path leading to the report is an unexpected one: a client is
attempting an MPC handshake towards the in-kernel listener created
by the in-kernel PM for a port based signal endpoint.

Such connection will be never accepted; many of them can make the
listener queue full and preventing the creation of MPJ subflow via
such listener - its intended role.

Explicitly detect this scenario at initial-syn time and drop the
incoming MPC request.

Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port")
Reported-by: syzbot+f4aacdfef2c6a6529c3e@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=f4aacdfef2c6a6529c3e
Cc: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
---
 net/mptcp/mib.c        |  1 +
 net/mptcp/mib.h        |  1 +
 net/mptcp/pm_netlink.c |  1 +
 net/mptcp/protocol.h   |  1 +
 net/mptcp/subflow.c    | 11 +++++++++++
 5 files changed, 15 insertions(+)

Comments

Matthieu Baerts Oct. 8, 2024, 9:29 a.m. UTC | #1
Hi Paolo,

(I guess you dropped the MPTCP ML from Cc by mistake, re-adding it)

On 08/10/2024 08:40, Paolo Abeni wrote:
> On 10/7/24 20:22, Matthieu Baerts (NGI0) wrote:
>> From: Paolo Abeni <pabeni@redhat.com>
>>
>> Syzkaller reported a lockdep splat:

(...)

>> Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port")
>> Reported-by: syzbot+f4aacdfef2c6a6529c3e@syzkaller.appspotmail.com
>> Closes: https://syzkaller.appspot.com/bug?extid=f4aacdfef2c6a6529c3e
>> Cc: Cong Wang <cong.wang@bytedance.com>
>> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>> Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
> 
> Please add your co-dev tag instead here.

I only cleaned-up comments, etc. without changing the behaviour, I sent
the v2 mainly because I had the suggested changes on my side already
(when checking if comments could go on one line, checking shellcheck
with the other patch, etc.). I can add it on the other patch, because I
might have done more modifications changing a bit the behaviour, but I
can add my co-dev tag here as well if you prefer.

(...)

>> diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
>> index
>> a1e28e1d8b4e39e14bc8f98164013d302d62595c..5958b63f84469b980bb1af472117643baf740713 100644
>> --- a/net/mptcp/subflow.c
>> +++ b/net/mptcp/subflow.c
>> @@ -132,6 +132,13 @@ static void subflow_add_reset_reason(struct
>> sk_buff *skb, u8 reason)
>>       }
>>   }
>>   +static int subflow_reset_req_endp(struct request_sock *req, struct
>> sk_buff *skb)
> 
> I used a different name because I was wondering about to re-using this
> helper in future patches for other req reset - passing explicitly reason
> and mib. I guess we can rename it as needed later.

Good point. But yes, probably better to rename it later on if we want
something more generic, taking different parameters.

> Side note: Jakub still strongly prefers not exceeding the 80 chars per
> column. A couple of comments and the above line are now above such
> threshold.

Mmh, I think we mostly followed the default 100 chars max per line in
our code. We don't abuse it though: we use more than 80 to improve the
readability, e.g. when the opening parenthesis is "too close" to the end
of the line, when each line needs a comment next to, most multilines
comments are under 80 chars, etc. In other words, if we want to enforce
the max of 80 chars per line, we might uniform that, but I don't think
that's worth it. Or do you think we should?

In this patch, only the comments are above 80 chars, but they are
aligned with the ones just above and below. I can still reduce them I see:

  /* Prohibited MPC attempted towards a port-based signal endp */
  /* Prohibited MPC towards a port-based signal endp */

  /* subflow is a listener managed by the in-kernel PM */
  /* a listener managed by the kernel PM? */

No hurry, I guess we can send this patch later on.

Cheers,
Matt
Paolo Abeni Oct. 8, 2024, 2:24 p.m. UTC | #2
On 10/8/24 11:29, Matthieu Baerts wrote:
>> On 10/7/24 20:22, Matthieu Baerts (NGI0) wrote:
>> Side note: Jakub still strongly prefers not exceeding the 80 chars per
>> column. A couple of comments and the above line are now above such
>> threshold.
> 
> Mmh, I think we mostly followed the default 100 chars max per line in
> our code. We don't abuse it though: we use more than 80 to improve the
> readability, e.g. when the opening parenthesis is "too close" to the end
> of the line, when each line needs a comment next to, most multilines
> comments are under 80 chars, etc. In other words, if we want to enforce
> the max of 80 chars per line, we might uniform that, but I don't think
> that's worth it. Or do you think we should?

My take is that we should avoid adding more lines exceeding 80 chars, 
when possible.

> 
> In this patch, only the comments are above 80 chars, but they are
> aligned with the ones just above and below. I can still reduce them I see:
> 
>    /* Prohibited MPC attempted towards a port-based signal endp */
>    /* Prohibited MPC towards a port-based signal endp */
> 
>    /* subflow is a listener managed by the in-kernel PM */
>    /* a listener managed by the kernel PM? */
> 
> No hurry, I guess we can send this patch later on.

I would opt for the multi-line comment.

/P
Matthieu Baerts Oct. 8, 2024, 2:57 p.m. UTC | #3
On 08/10/2024 16:24, Paolo Abeni wrote:
> On 10/8/24 11:29, Matthieu Baerts wrote:
>>> On 10/7/24 20:22, Matthieu Baerts (NGI0) wrote:
>>> Side note: Jakub still strongly prefers not exceeding the 80 chars per
>>> column. A couple of comments and the above line are now above such
>>> threshold.
>>
>> Mmh, I think we mostly followed the default 100 chars max per line in
>> our code. We don't abuse it though: we use more than 80 to improve the
>> readability, e.g. when the opening parenthesis is "too close" to the end
>> of the line, when each line needs a comment next to, most multilines
>> comments are under 80 chars, etc. In other words, if we want to enforce
>> the max of 80 chars per line, we might uniform that, but I don't think
>> that's worth it. Or do you think we should?
> 
> My take is that we should avoid adding more lines exceeding 80 chars,
> when possible.

OK, I will try to find something to have the new lines under 80 chars,
and keep the consistency with the current code style.

>> In this patch, only the comments are above 80 chars, but they are
>> aligned with the ones just above and below. I can still reduce them I
>> see:
>>
>>    /* Prohibited MPC attempted towards a port-based signal endp */
>>    /* Prohibited MPC towards a port-based signal endp */
>>
>>    /* subflow is a listener managed by the in-kernel PM */
>>    /* a listener managed by the kernel PM? */
>>
>> No hurry, I guess we can send this patch later on.
> 
> I would opt for the multi-line comment.

OK!

Cheers,
Matt
diff mbox series

Patch

diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index ad88bd3c58dffed8335eedb43ca6290418e3c4f4..19eb9292bd6093a760b41f98c1774fd2490c48e3 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -17,6 +17,7 @@  static const struct snmp_mib mptcp_snmp_list[] = {
 	SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK),
 	SNMP_MIB_ITEM("MPCapableSYNTXDrop", MPTCP_MIB_MPCAPABLEACTIVEDROP),
 	SNMP_MIB_ITEM("MPCapableSYNTXDisabled", MPTCP_MIB_MPCAPABLEACTIVEDISABLED),
+	SNMP_MIB_ITEM("MPCapableEndpAttempt", MPTCP_MIB_MPCAPABLEENDPATTEMPT),
 	SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT),
 	SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
 	SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN),
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 3206cdda8bb1067f9a8354fd45deed86b67ac7da..42e21b23009462b93553473a7b02c5e09e561a66 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -12,6 +12,7 @@  enum linux_mptcp_mib_field {
 	MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
 	MPTCP_MIB_MPCAPABLEACTIVEDROP,	/* Client-side fallback due to a MPC drop */
 	MPTCP_MIB_MPCAPABLEACTIVEDISABLED, /* Client-side disabled due to past issues */
+	MPTCP_MIB_MPCAPABLEENDPATTEMPT,	/* Prohibited MPC attempted towards a port-based signal endp */
 	MPTCP_MIB_TOKENFALLBACKINIT,	/* Could not init/allocate token */
 	MPTCP_MIB_RETRANSSEGS,		/* Segments retransmitted at the MPTCP-level */
 	MPTCP_MIB_JOINNOTOKEN,		/* Received MP_JOIN but the token was not found */
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index c738330ba403a75dae0a47a34abc3b9b65b36655..d37fefbaf34f50c16f6c36d785e064797d5df7ad 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -1129,6 +1129,7 @@  static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
 	 */
 	inet_sk_state_store(newsk, TCP_LISTEN);
 	lock_sock(ssk);
+	WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true);
 	err = __inet_listen_sk(ssk, backlog);
 	if (!err)
 		mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index c3942416fa3ab46a0e72dd4aed851a6e716398fc..de5c3275df1757d387b275fe58e4d36f3a7de84c 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -532,6 +532,7 @@  struct mptcp_subflow_context {
 		close_event_done : 1,       /* has done the post-closed part */
 		mpc_drop : 1,	    /* the MPC option has been dropped in a rtx */
 		__unused : 9;
+	bool	pm_listener;	    /* subflow is a listener managed by the in-kernel PM */
 	bool	data_avail;
 	bool	scheduled;
 	bool	fully_established;  /* path validated */
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index a1e28e1d8b4e39e14bc8f98164013d302d62595c..5958b63f84469b980bb1af472117643baf740713 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -132,6 +132,13 @@  static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
 	}
 }
 
+static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb)
+{
+	SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT);
+	subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+	return -EPERM;
+}
+
 /* Init mptcp request socket.
  *
  * Returns an error code if a JOIN has failed and a TCP reset
@@ -165,6 +172,8 @@  static int subflow_check_req(struct request_sock *req,
 	if (opt_mp_capable) {
 		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
 
+		if (unlikely(listener->pm_listener))
+			return subflow_reset_req_endp(req, skb);
 		if (opt_mp_join)
 			return 0;
 	} else if (opt_mp_join) {
@@ -172,6 +181,8 @@  static int subflow_check_req(struct request_sock *req,
 
 		if (mp_opt.backup)
 			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX);
+	} else if (unlikely(listener->pm_listener)) {
+		return subflow_reset_req_endp(req, skb);
 	}
 
 	if (opt_mp_capable && listener->request_mptcp) {