diff mbox series

[v3,net,11/16] ipv6: input: convert to dev_net_rcu()

Message ID 20250204132357.102354-12-edumazet@google.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series net: first round to use dev_net_rcu() | expand

Checks

Context Check Description
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Clearly marked for net, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 1 maintainers not CCed: dsahern@kernel.org
netdev/build_clang success Errors and warnings before: 2 this patch: 2
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1 this patch: 1
netdev/checkpatch warning CHECK: Alignment should match open parenthesis WARNING: line length of 83 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest fail net-next-2025-02-04--18-00 (tests: 882)

Commit Message

Eric Dumazet Feb. 4, 2025, 1:23 p.m. UTC
dev_net() calls from net/ipv6/ip6_input.c seem to
happen under RCU protection.

Convert them to dev_net_rcu() to ensure LOCKDEP support.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
 net/ipv6/ip6_input.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

Comments

Jakub Kicinski Feb. 4, 2025, 8:09 p.m. UTC | #1
On Tue,  4 Feb 2025 13:23:52 +0000 Eric Dumazet wrote:
> @@ -488,7 +488,7 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
>  int ip6_input(struct sk_buff *skb)
>  {
>  	return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
> -		       dev_net(skb->dev), NULL, skb, skb->dev, NULL,
> +		       dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL,
>  		       ip6_input_finish);
>  }
>  EXPORT_SYMBOL_GPL(ip6_input);

One more here:

[ 4326.034939][   T50] =============================
[ 4326.035125][   T50] WARNING: suspicious RCU usage
[ 4326.035299][   T50] 6.13.0-virtme #1 Not tainted
[ 4326.035955][   T50] -----------------------------
[ 4326.036124][   T50] ./include/net/net_namespace.h:404 suspicious rcu_dereference_check() usage!
[ 4326.036398][   T50] 
[ 4326.036398][   T50] other info that might help us debug this:
[ 4326.036398][   T50] 
[ 4326.036684][   T50] 
[ 4326.036684][   T50] rcu_scheduler_active = 2, debug_locks = 1
[ 4326.036910][   T50] 2 locks held by kworker/2:1/50:
[ 4326.037111][   T50]  #0: ffff8880010a9548 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x7ec/0x16d0
[ 4326.037439][   T50]  #1: ffffc9000036fd40 ((work_completion)(&trans->work)){+.+.}-{0:0}, at: process_one_work+0xe0b/0x16d0
[ 4326.037741][   T50] 
[ 4326.037741][   T50] stack backtrace:
[ 4326.037930][   T50] CPU: 2 UID: 0 PID: 50 Comm: kworker/2:1 Not tainted 6.13.0-virtme #1
[ 4326.037935][   T50] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[ 4326.037937][   T50] Workqueue: events xfrm_trans_reinject
[ 4326.037947][   T50] Call Trace:
[ 4326.037949][   T50]  <TASK>
[ 4326.037952][   T50]  dump_stack_lvl+0xb0/0xd0
[ 4326.037963][   T50]  lockdep_rcu_suspicious+0x1ea/0x280
[ 4326.037975][   T50]  ip6_input+0x262/0x3e0
[ 4326.038009][   T50]  xfrm_trans_reinject+0x2a2/0x460
[ 4326.038055][   T50]  process_one_work+0xe55/0x16d0
[ 4326.038098][   T50]  worker_thread+0x58c/0xce0
[ 4326.038121][   T50]  kthread+0x359/0x5d0
[ 4326.038141][   T50]  ret_from_fork+0x31/0x70
[ 4326.038150][   T50]  ret_from_fork_asm+0x1a/0x30

Test output:
https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/61-l2tp-sh/
Decoded:
https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/vm-crash-thr2-0
Eric Dumazet Feb. 4, 2025, 8:10 p.m. UTC | #2
On Tue, Feb 4, 2025 at 9:09 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Tue,  4 Feb 2025 13:23:52 +0000 Eric Dumazet wrote:
> > @@ -488,7 +488,7 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
> >  int ip6_input(struct sk_buff *skb)
> >  {
> >       return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
> > -                    dev_net(skb->dev), NULL, skb, skb->dev, NULL,
> > +                    dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL,
> >                      ip6_input_finish);
> >  }
> >  EXPORT_SYMBOL_GPL(ip6_input);
>
> One more here:
>
> [ 4326.034939][   T50] =============================
> [ 4326.035125][   T50] WARNING: suspicious RCU usage
> [ 4326.035299][   T50] 6.13.0-virtme #1 Not tainted
> [ 4326.035955][   T50] -----------------------------
> [ 4326.036124][   T50] ./include/net/net_namespace.h:404 suspicious rcu_dereference_check() usage!
> [ 4326.036398][   T50]
> [ 4326.036398][   T50] other info that might help us debug this:
> [ 4326.036398][   T50]
> [ 4326.036684][   T50]
> [ 4326.036684][   T50] rcu_scheduler_active = 2, debug_locks = 1
> [ 4326.036910][   T50] 2 locks held by kworker/2:1/50:
> [ 4326.037111][   T50]  #0: ffff8880010a9548 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x7ec/0x16d0
> [ 4326.037439][   T50]  #1: ffffc9000036fd40 ((work_completion)(&trans->work)){+.+.}-{0:0}, at: process_one_work+0xe0b/0x16d0
> [ 4326.037741][   T50]
> [ 4326.037741][   T50] stack backtrace:
> [ 4326.037930][   T50] CPU: 2 UID: 0 PID: 50 Comm: kworker/2:1 Not tainted 6.13.0-virtme #1
> [ 4326.037935][   T50] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> [ 4326.037937][   T50] Workqueue: events xfrm_trans_reinject
> [ 4326.037947][   T50] Call Trace:
> [ 4326.037949][   T50]  <TASK>
> [ 4326.037952][   T50]  dump_stack_lvl+0xb0/0xd0
> [ 4326.037963][   T50]  lockdep_rcu_suspicious+0x1ea/0x280
> [ 4326.037975][   T50]  ip6_input+0x262/0x3e0
> [ 4326.038009][   T50]  xfrm_trans_reinject+0x2a2/0x460
> [ 4326.038055][   T50]  process_one_work+0xe55/0x16d0
> [ 4326.038098][   T50]  worker_thread+0x58c/0xce0
> [ 4326.038121][   T50]  kthread+0x359/0x5d0
> [ 4326.038141][   T50]  ret_from_fork+0x31/0x70
> [ 4326.038150][   T50]  ret_from_fork_asm+0x1a/0x30
>
> Test output:
> https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/61-l2tp-sh/
> Decoded:
> https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/vm-crash-thr2-0

Oh well. So many bugs.
Jakub Kicinski Feb. 4, 2025, 9 p.m. UTC | #3
On Tue, 4 Feb 2025 21:10:59 +0100 Eric Dumazet wrote:
> > Test output:
> > https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/61-l2tp-sh/
> > Decoded:
> > https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/vm-crash-thr2-0  
> 
> Oh well. So many bugs.

TBH I'm slightly confused by this, and the previous warnings.

The previous one was from a timer callback.

This one is with BH disabled.

I thought BH implies RCU protection. We certainly depend on that 
in NAPI for XDP. And threaded NAPI does the exact same thing as
xfrm_trans_reinject(), a bare local_bh_disable().

RCU folks, did something change or is just holes in my brain again?
Eric Dumazet Feb. 4, 2025, 9:06 p.m. UTC | #4
On Tue, Feb 4, 2025 at 10:00 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Tue, 4 Feb 2025 21:10:59 +0100 Eric Dumazet wrote:
> > > Test output:
> > > https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/61-l2tp-sh/
> > > Decoded:
> > > https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/vm-crash-thr2-0
> >
> > Oh well. So many bugs.
>
> TBH I'm slightly confused by this, and the previous warnings.
>
> The previous one was from a timer callback.
>
> This one is with BH disabled.
>
> I thought BH implies RCU protection. We certainly depend on that
> in NAPI for XDP. And threaded NAPI does the exact same thing as
> xfrm_trans_reinject(), a bare local_bh_disable().
>
> RCU folks, did something change or is just holes in my brain again?

Nope, BH does not imply rcu_read_lock()
Paul E. McKenney Feb. 4, 2025, 9:17 p.m. UTC | #5
On Tue, Feb 04, 2025 at 10:06:15PM +0100, Eric Dumazet wrote:
> On Tue, Feb 4, 2025 at 10:00 PM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Tue, 4 Feb 2025 21:10:59 +0100 Eric Dumazet wrote:
> > > > Test output:
> > > > https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/61-l2tp-sh/
> > > > Decoded:
> > > > https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/vm-crash-thr2-0
> > >
> > > Oh well. So many bugs.
> >
> > TBH I'm slightly confused by this, and the previous warnings.
> >
> > The previous one was from a timer callback.
> >
> > This one is with BH disabled.
> >
> > I thought BH implies RCU protection. We certainly depend on that
> > in NAPI for XDP. And threaded NAPI does the exact same thing as
> > xfrm_trans_reinject(), a bare local_bh_disable().
> >
> > RCU folks, did something change or is just holes in my brain again?
> 
> Nope, BH does not imply rcu_read_lock()

You are both right?  ;-)

The synchronize_rcu() function will wait for all types of RCU readers,
including BH-disabled regions of code.  However, lockdep can distinguish
between the various sorts of readers.  So for example

	lockdep_assert_in_rcu_read_lock_bh();

will complain unless you did rcu_read_lock_bh(), even if you did something
like disable_bh().  If you don't want to distinguish and are happy with
any type of RCU reader, you can use

	lockdep_assert_in_rcu_reader();

I have been expecting that CONFIG_PREEMPT_RT=y kernels will break this
any day now, but so far so good.  ;-)

							Thanx, Paul
Jakub Kicinski Feb. 4, 2025, 9:30 p.m. UTC | #6
On Tue, 4 Feb 2025 13:17:08 -0800 Paul E. McKenney wrote:
> > > TBH I'm slightly confused by this, and the previous warnings.
> > >
> > > The previous one was from a timer callback.
> > >
> > > This one is with BH disabled.
> > >
> > > I thought BH implies RCU protection. We certainly depend on that
> > > in NAPI for XDP. And threaded NAPI does the exact same thing as
> > > xfrm_trans_reinject(), a bare local_bh_disable().
> > >
> > > RCU folks, did something change or is just holes in my brain again?  
> > 
> > Nope, BH does not imply rcu_read_lock()  
> 
> You are both right?  ;-)
> 
> The synchronize_rcu() function will wait for all types of RCU readers,
> including BH-disabled regions of code.  However, lockdep can distinguish
> between the various sorts of readers.  So for example
> 
> 	lockdep_assert_in_rcu_read_lock_bh();
> 
> will complain unless you did rcu_read_lock_bh(), even if you did something
> like disable_bh().  If you don't want to distinguish and are happy with
> any type of RCU reader, you can use
> 
> 	lockdep_assert_in_rcu_reader();
> 
> I have been expecting that CONFIG_PREEMPT_RT=y kernels will break this
> any day now, but so far so good.  ;-)

Thanks Paul! So IIUC in this case we could:

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 0f5eb9db0c62..58ec1eb9ae6a 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -401,7 +401,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
 static inline struct net *read_pnet_rcu(possible_net_t *pnet)
 {
 #ifdef CONFIG_NET_NS
-	return rcu_dereference(pnet->net);
+	return rcu_dereference_check(pnet->net, rcu_read_lock_bh_held());
 #else
 	return &init_net;
 #endif

Sorry for the sideline, Eric, up to you how to proceed..
I'll try to remember the details better next time :)
Paul E. McKenney Feb. 4, 2025, 11:25 p.m. UTC | #7
On Tue, Feb 04, 2025 at 01:30:25PM -0800, Jakub Kicinski wrote:
> On Tue, 4 Feb 2025 13:17:08 -0800 Paul E. McKenney wrote:
> > > > TBH I'm slightly confused by this, and the previous warnings.
> > > >
> > > > The previous one was from a timer callback.
> > > >
> > > > This one is with BH disabled.
> > > >
> > > > I thought BH implies RCU protection. We certainly depend on that
> > > > in NAPI for XDP. And threaded NAPI does the exact same thing as
> > > > xfrm_trans_reinject(), a bare local_bh_disable().
> > > >
> > > > RCU folks, did something change or is just holes in my brain again?  
> > > 
> > > Nope, BH does not imply rcu_read_lock()  
> > 
> > You are both right?  ;-)
> > 
> > The synchronize_rcu() function will wait for all types of RCU readers,
> > including BH-disabled regions of code.  However, lockdep can distinguish
> > between the various sorts of readers.  So for example
> > 
> > 	lockdep_assert_in_rcu_read_lock_bh();
> > 
> > will complain unless you did rcu_read_lock_bh(), even if you did something
> > like disable_bh().  If you don't want to distinguish and are happy with
> > any type of RCU reader, you can use
> > 
> > 	lockdep_assert_in_rcu_reader();
> > 
> > I have been expecting that CONFIG_PREEMPT_RT=y kernels will break this
> > any day now, but so far so good.  ;-)
> 
> Thanks Paul! So IIUC in this case we could:
> 
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index 0f5eb9db0c62..58ec1eb9ae6a 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -401,7 +401,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
>  static inline struct net *read_pnet_rcu(possible_net_t *pnet)
>  {
>  #ifdef CONFIG_NET_NS
> -	return rcu_dereference(pnet->net);
> +	return rcu_dereference_check(pnet->net, rcu_read_lock_bh_held());

That should do it!

							Thanx, Paul

>  #else
>  	return &init_net;
>  #endif
> 
> Sorry for the sideline, Eric, up to you how to proceed..
> I'll try to remember the details better next time :)
Eric Dumazet Feb. 5, 2025, 7:57 a.m. UTC | #8
On Tue, Feb 4, 2025 at 10:30 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Tue, 4 Feb 2025 13:17:08 -0800 Paul E. McKenney wrote:
> > > > TBH I'm slightly confused by this, and the previous warnings.
> > > >
> > > > The previous one was from a timer callback.
> > > >
> > > > This one is with BH disabled.
> > > >
> > > > I thought BH implies RCU protection. We certainly depend on that
> > > > in NAPI for XDP. And threaded NAPI does the exact same thing as
> > > > xfrm_trans_reinject(), a bare local_bh_disable().
> > > >
> > > > RCU folks, did something change or is just holes in my brain again?
> > >
> > > Nope, BH does not imply rcu_read_lock()
> >
> > You are both right?  ;-)
> >
> > The synchronize_rcu() function will wait for all types of RCU readers,
> > including BH-disabled regions of code.  However, lockdep can distinguish
> > between the various sorts of readers.  So for example
> >
> >       lockdep_assert_in_rcu_read_lock_bh();
> >
> > will complain unless you did rcu_read_lock_bh(), even if you did something
> > like disable_bh().  If you don't want to distinguish and are happy with
> > any type of RCU reader, you can use
> >
> >       lockdep_assert_in_rcu_reader();
> >
> > I have been expecting that CONFIG_PREEMPT_RT=y kernels will break this
> > any day now, but so far so good.  ;-)
>
> Thanks Paul! So IIUC in this case we could:
>
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index 0f5eb9db0c62..58ec1eb9ae6a 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -401,7 +401,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
>  static inline struct net *read_pnet_rcu(possible_net_t *pnet)
>  {
>  #ifdef CONFIG_NET_NS
> -       return rcu_dereference(pnet->net);
> +       return rcu_dereference_check(pnet->net, rcu_read_lock_bh_held());
>  #else
>         return &init_net;
>  #endif
>
> Sorry for the sideline, Eric, up to you how to proceed..

I will squash this diff to the following iteration, and keep rcu_dereference()

Note that nf_hook() also grabs rcu_read_lock().

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 4030527ebe098e86764f37c9068d2f2f9af2d183..ee5a69fdc67a30e55c5a073455e1d7299f168f34
100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -477,9 +477,7 @@ void ip6_protocol_deliver_rcu(struct net *net,
struct sk_buff *skb, int nexthdr,
 static int ip6_input_finish(struct net *net, struct sock *sk, struct
sk_buff *skb)
 {
        skb_clear_delivery_time(skb);
-       rcu_read_lock();
        ip6_protocol_deliver_rcu(net, skb, 0, false);
-       rcu_read_unlock();

        return 0;
 }
@@ -487,9 +485,14 @@ static int ip6_input_finish(struct net *net,
struct sock *sk, struct sk_buff *sk

 int ip6_input(struct sk_buff *skb)
 {
-       return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
-                      dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL,
-                      ip6_input_finish);
+       int res;
+
+       rcu_read_lock();
+       res = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
+                     dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL,
+                     ip6_input_finish);
+       rcu_read_unlock();
+       return res;
 }
 EXPORT_SYMBOL_GPL(ip6_input);
Eric Dumazet Feb. 5, 2025, 8:05 a.m. UTC | #9
On Wed, Feb 5, 2025 at 8:57 AM Eric Dumazet <edumazet@google.com> wrote:
>

>
> I will squash this diff to the following iteration, and keep rcu_dereference()
>
>

I will shrink the series in V4 to only include known bug fixes, to
lower the risk of having 10 more iterations.
diff mbox series

Patch

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 70c0e16c0ae6837d1c64d0036829c8b61799578b..4030527ebe098e86764f37c9068d2f2f9af2d183 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -301,7 +301,7 @@  static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
 
 int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
-	struct net *net = dev_net(skb->dev);
+	struct net *net = dev_net_rcu(skb->dev);
 
 	skb = ip6_rcv_core(skb, dev, net);
 	if (skb == NULL)
@@ -330,7 +330,7 @@  void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
 
 	list_for_each_entry_safe(skb, next, head, list) {
 		struct net_device *dev = skb->dev;
-		struct net *net = dev_net(dev);
+		struct net *net = dev_net_rcu(dev);
 
 		skb_list_del_init(skb);
 		skb = ip6_rcv_core(skb, dev, net);
@@ -488,7 +488,7 @@  static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
 int ip6_input(struct sk_buff *skb)
 {
 	return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
-		       dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+		       dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL,
 		       ip6_input_finish);
 }
 EXPORT_SYMBOL_GPL(ip6_input);
@@ -500,14 +500,14 @@  int ip6_mc_input(struct sk_buff *skb)
 	struct net_device *dev;
 	bool deliver;
 
-	__IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
+	__IP6_UPD_PO_STATS(dev_net_rcu(skb_dst(skb)->dev),
 			 __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
 			 skb->len);
 
 	/* skb->dev passed may be master dev for vrfs. */
 	if (sdif) {
 		rcu_read_lock();
-		dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif);
+		dev = dev_get_by_index_rcu(dev_net_rcu(skb->dev), sdif);
 		if (!dev) {
 			rcu_read_unlock();
 			kfree_skb(skb);
@@ -526,7 +526,7 @@  int ip6_mc_input(struct sk_buff *skb)
 	/*
 	 *      IPv6 multicast router mode is now supported ;)
 	 */
-	if (atomic_read(&dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding) &&
+	if (atomic_read(&dev_net_rcu(skb->dev)->ipv6.devconf_all->mc_forwarding) &&
 	    !(ipv6_addr_type(&hdr->daddr) &
 	      (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) &&
 	    likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {