diff mbox series

neighbour: purge nf_bridged skb from foreign device neigh

Message ID 20240108085232.95437-1-ptikhomirov@virtuozzo.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series neighbour: purge nf_bridged skb from foreign device neigh | expand

Checks

Context Check Description
netdev/series_format warning Single patches do not need cover letters; Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1085 this patch: 1085
netdev/cc_maintainers success CCed 0 of 0 maintainers
netdev/build_clang success Errors and warnings before: 1109 this patch: 1109
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1112 this patch: 1112
netdev/checkpatch warning WARNING: line length of 86 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Pavel Tikhomirov Jan. 8, 2024, 8:50 a.m. UTC
An skb can be added to a neigh->arp_queue while waiting for an arp
reply. Where original skb's skb->dev can be different to neigh's
neigh->dev. For instance in case of bridging dnated skb from one veth to
another, the skb would be added to a neigh->arp_queue of the bridge.

There is no explicit mechanism that prevents the original skb->dev link
of such skb from being freed under us. For instance neigh_flush_dev does
not cleanup skbs from different device's neigh queue. But that original
link can be used and lead to crash on e.g. this stack:

arp_process
  neigh_update
    skb = __skb_dequeue(&neigh->arp_queue)
      neigh_resolve_output(..., skb)
        ...
          br_nf_dev_xmit
            br_nf_pre_routing_finish_bridge_slow
              skb->dev = nf_bridge->physindev
              br_handle_frame_finish

So let's improve neigh_flush_dev to also purge skbs when device
equal to their skb->nf_bridge->physindev gets destroyed.

Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
---
I'm not fully sure, but likely it is:
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
---
 net/core/neighbour.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

Comments

Eric Dumazet Jan. 8, 2024, 9:10 a.m. UTC | #1
On Mon, Jan 8, 2024 at 9:52 AM Pavel Tikhomirov
<ptikhomirov@virtuozzo.com> wrote:
>
> An skb can be added to a neigh->arp_queue while waiting for an arp
> reply. Where original skb's skb->dev can be different to neigh's
> neigh->dev. For instance in case of bridging dnated skb from one veth to
> another, the skb would be added to a neigh->arp_queue of the bridge.
>
> There is no explicit mechanism that prevents the original skb->dev link
> of such skb from being freed under us. For instance neigh_flush_dev does
> not cleanup skbs from different device's neigh queue. But that original
> link can be used and lead to crash on e.g. this stack:
>
> arp_process
>   neigh_update
>     skb = __skb_dequeue(&neigh->arp_queue)
>       neigh_resolve_output(..., skb)
>         ...
>           br_nf_dev_xmit
>             br_nf_pre_routing_finish_bridge_slow
>               skb->dev = nf_bridge->physindev
>               br_handle_frame_finish
>
> So let's improve neigh_flush_dev to also purge skbs when device
> equal to their skb->nf_bridge->physindev gets destroyed.
>
> Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> ---
> I'm not fully sure, but likely it is:
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> ---
>  net/core/neighbour.c | 26 ++++++++++++++++++++++++++
>  1 file changed, 26 insertions(+)
>
> diff --git a/net/core/neighbour.c b/net/core/neighbour.c
> index 552719c3bbc3d..47d2d52f17da3 100644
> --- a/net/core/neighbour.c
> +++ b/net/core/neighbour.c
> @@ -39,6 +39,9 @@
>  #include <linux/inetdevice.h>
>  #include <net/addrconf.h>
>
> +#include <linux/skbuff.h>
> +#include <linux/netfilter_bridge.h>
> +
>  #include <trace/events/neigh.h>
>
>  #define NEIGH_DEBUG 1
> @@ -377,6 +380,28 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net,
>         }
>  }
>
> +static void neigh_purge_nf_bridge_dev(struct neighbour *neigh, struct net_device *dev)
> +{
> +       struct sk_buff_head *list = &neigh->arp_queue;
> +       struct nf_bridge_info *nf_bridge;
> +       struct sk_buff *skb, *next;
> +
> +       write_lock(&neigh->lock);
> +       skb = skb_peek(list);
> +       while (skb) {
> +               nf_bridge = nf_bridge_info_get(skb);

This depends on CONFIG_BRIDGE_NETFILTER

Can we solve this issue without adding another layer violation ?

> +
> +               next = skb_peek_next(skb, list);
> +               if (nf_bridge && nf_bridge->physindev == dev) {
> +                       __skb_unlink(skb, list);
> +                       neigh->arp_queue_len_bytes -= skb->truesize;
> +                       kfree_skb(skb);
> +               }
> +               skb = next;
> +       }
> +       write_unlock(&neigh->lock);
> +}
> +
>  static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
>                             bool skip_perm)
>  {
> @@ -393,6 +418,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
>                 while ((n = rcu_dereference_protected(*np,
>                                         lockdep_is_held(&tbl->lock))) != NULL) {
>                         if (dev && n->dev != dev) {
> +                               neigh_purge_nf_bridge_dev(n, dev);
>                                 np = &n->next;
>                                 continue;
>                         }
> --
> 2.43.0
>
Florian Westphal Jan. 8, 2024, 11:15 a.m. UTC | #2
Pavel Tikhomirov <ptikhomirov@virtuozzo.com> wrote:
> An skb can be added to a neigh->arp_queue while waiting for an arp
> reply. Where original skb's skb->dev can be different to neigh's
> neigh->dev. For instance in case of bridging dnated skb from one veth to
> another, the skb would be added to a neigh->arp_queue of the bridge.
> 
> There is no explicit mechanism that prevents the original skb->dev link
> of such skb from being freed under us. For instance neigh_flush_dev does
> not cleanup skbs from different device's neigh queue. But that original
> link can be used and lead to crash on e.g. this stack:
> 
> arp_process
>   neigh_update
>     skb = __skb_dequeue(&neigh->arp_queue)
>       neigh_resolve_output(..., skb)
>         ...
>           br_nf_dev_xmit
>             br_nf_pre_routing_finish_bridge_slow
>               skb->dev = nf_bridge->physindev
>               br_handle_frame_finish
> 
> So let's improve neigh_flush_dev to also purge skbs when device
> equal to their skb->nf_bridge->physindev gets destroyed.

Can we fix this by replacing physindev pointer with plain
ifindex instead?  There are not too many places that need to
peek into the original net_device struct, so I don't think
the additional dev_get_by_index_rcu() would be an issue.
Pavel Tikhomirov Jan. 8, 2024, 11:26 a.m. UTC | #3
On 08/01/2024 19:15, Florian Westphal wrote:
> Pavel Tikhomirov <ptikhomirov@virtuozzo.com> wrote:
>> An skb can be added to a neigh->arp_queue while waiting for an arp
>> reply. Where original skb's skb->dev can be different to neigh's
>> neigh->dev. For instance in case of bridging dnated skb from one veth to
>> another, the skb would be added to a neigh->arp_queue of the bridge.
>>
>> There is no explicit mechanism that prevents the original skb->dev link
>> of such skb from being freed under us. For instance neigh_flush_dev does
>> not cleanup skbs from different device's neigh queue. But that original
>> link can be used and lead to crash on e.g. this stack:
>>
>> arp_process
>>    neigh_update
>>      skb = __skb_dequeue(&neigh->arp_queue)
>>        neigh_resolve_output(..., skb)
>>          ...
>>            br_nf_dev_xmit
>>              br_nf_pre_routing_finish_bridge_slow
>>                skb->dev = nf_bridge->physindev
>>                br_handle_frame_finish
>>
>> So let's improve neigh_flush_dev to also purge skbs when device
>> equal to their skb->nf_bridge->physindev gets destroyed.
> 
> Can we fix this by replacing physindev pointer with plain
> ifindex instead?  There are not too many places that need to
> peek into the original net_device struct, so I don't think
> the additional dev_get_by_index_rcu() would be an issue.

I will work on it, thanks for a good idea!
Pavel Tikhomirov Jan. 9, 2024, 4:57 a.m. UTC | #4
On 08/01/2024 19:26, Pavel Tikhomirov wrote:
> 
> 
> On 08/01/2024 19:15, Florian Westphal wrote:
>> Pavel Tikhomirov <ptikhomirov@virtuozzo.com> wrote:
>>> An skb can be added to a neigh->arp_queue while waiting for an arp
>>> reply. Where original skb's skb->dev can be different to neigh's
>>> neigh->dev. For instance in case of bridging dnated skb from one veth to
>>> another, the skb would be added to a neigh->arp_queue of the bridge.
>>>
>>> There is no explicit mechanism that prevents the original skb->dev link
>>> of such skb from being freed under us. For instance neigh_flush_dev does
>>> not cleanup skbs from different device's neigh queue. But that original
>>> link can be used and lead to crash on e.g. this stack:
>>>
>>> arp_process
>>>    neigh_update
>>>      skb = __skb_dequeue(&neigh->arp_queue)
>>>        neigh_resolve_output(..., skb)
>>>          ...
>>>            br_nf_dev_xmit
>>>              br_nf_pre_routing_finish_bridge_slow
>>>                skb->dev = nf_bridge->physindev
>>>                br_handle_frame_finish
>>>
>>> So let's improve neigh_flush_dev to also purge skbs when device
>>> equal to their skb->nf_bridge->physindev gets destroyed.
>>
>> Can we fix this by replacing physindev pointer with plain
>> ifindex instead?  There are not too many places that need to
>> peek into the original net_device struct, so I don't think
>> the additional dev_get_by_index_rcu() would be an issue.
> 
> I will work on it, thanks for a good idea!
> 

If we replace nf_bridge->physindev completely, we would need to do 
something like this in every place physindev was used:

diff --git a/include/linux/netfilter_bridge.h 
b/include/linux/netfilter_bridge.h
index f980edfdd2783..105fbdb029261 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -56,11 +56,15 @@ static inline int nf_bridge_get_physoutif(const 
struct sk_buff *skb)
  }

  static inline struct net_device *
-nf_bridge_get_physindev(const struct sk_buff *skb)
+nf_bridge_get_physindev_rcu(const struct sk_buff *skb)
  {
         const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+       struct net_device *dev;

-       return nf_bridge ? nf_bridge->physindev : NULL;
+       if (!nf_bridge || !skb->dev)
+               return 0;
+
+       return dev_get_by_index_rcu(skb->dev->net, nf_bridge->physindev_if);
  }

  static inline struct net_device *
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a5ae952454c89..51e7cdf9b51c9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -295,7 +295,7 @@ struct nf_bridge_info {
         u8                      bridged_dnat:1;
         u8                      sabotage_in_done:1;
         __u16                   frag_max_size;
-       struct net_device       *physindev;
+       int                     *physindev_if;

         /* always valid & non-NULL from FORWARD on, for physdev match */
         struct net_device       *physoutdev;
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c 
b/net/ipv4/netfilter/nf_reject_ipv4.c
index f01b038fc1cda..01b3eb169772e 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -289,7 +289,8 @@ void nf_send_reset(struct net *net, struct sock *sk, 
struct sk_buff *oldskb,
          * build the eth header using the original destination's MAC as the
          * source, and send the RST packet directly.
          */
-       br_indev = nf_bridge_get_physindev(oldskb);
+       rcu_read_lock_bh();
+       br_indev = nf_bridge_get_physindev_rcu(oldskb);
         if (br_indev) {
                 struct ethhdr *oeth = eth_hdr(oldskb);

@@ -297,12 +298,19 @@ void nf_send_reset(struct net *net, struct sock 
*sk, struct sk_buff *oldskb,
                 niph->tot_len = htons(nskb->len);
                 ip_send_check(niph);
                 if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
-                                   oeth->h_source, oeth->h_dest, 
nskb->len) < 0)
+                                   oeth->h_source, oeth->h_dest, 
nskb->len) < 0) {
+                       rcu_read_unlock_bh();
                         goto free_nskb;
+               }
                 dev_queue_xmit(nskb);
-       } else
+               rcu_read_unlock_bh();
+       } else {
+               rcu_read_unlock_bh();
  #endif
                 ip_local_out(net, nskb->sk, nskb);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+       }
+#endif

         return;

Does it sound good?

Or maybe instead we can have extra physindev_if field in addition to 
existing physindev to only do dev_get_by_index_rcu inside 
br_nf_pre_routing_finish_bridge_slow to doublecheck the ->physindev link?

Sorry in advance if I'm missing anything obvious.
kernel test robot Jan. 9, 2024, 5:38 a.m. UTC | #5
Hi Pavel,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]
[also build test WARNING on net/main linus/master horms-ipvs/master v6.7 next-20240108]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Pavel-Tikhomirov/neighbour-purge-nf_bridged-skb-from-foreign-device-neigh/20240108-165551
base:   net-next/main
patch link:    https://lore.kernel.org/r/20240108085232.95437-1-ptikhomirov%40virtuozzo.com
patch subject: [PATCH] neighbour: purge nf_bridged skb from foreign device neigh
config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20240109/202401091351.CqYRoau7-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240109/202401091351.CqYRoau7-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202401091351.CqYRoau7-lkp@intel.com/

All warnings (new ones prefixed by >>):

   net/core/neighbour.c: In function 'neigh_purge_nf_bridge_dev':
   net/core/neighbour.c:392:29: error: implicit declaration of function 'nf_bridge_info_get' [-Werror=implicit-function-declaration]
     392 |                 nf_bridge = nf_bridge_info_get(skb);
         |                             ^~~~~~~~~~~~~~~~~~
>> net/core/neighbour.c:392:27: warning: assignment to 'struct nf_bridge_info *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
     392 |                 nf_bridge = nf_bridge_info_get(skb);
         |                           ^
   net/core/neighbour.c:395:43: error: invalid use of undefined type 'struct nf_bridge_info'
     395 |                 if (nf_bridge && nf_bridge->physindev == dev) {
         |                                           ^~
   cc1: some warnings being treated as errors


vim +392 net/core/neighbour.c

   382	
   383	static void neigh_purge_nf_bridge_dev(struct neighbour *neigh, struct net_device *dev)
   384	{
   385		struct sk_buff_head *list = &neigh->arp_queue;
   386		struct nf_bridge_info *nf_bridge;
   387		struct sk_buff *skb, *next;
   388	
   389		write_lock(&neigh->lock);
   390		skb = skb_peek(list);
   391		while (skb) {
 > 392			nf_bridge = nf_bridge_info_get(skb);
   393	
   394			next = skb_peek_next(skb, list);
   395			if (nf_bridge && nf_bridge->physindev == dev) {
   396				__skb_unlink(skb, list);
   397				neigh->arp_queue_len_bytes -= skb->truesize;
   398				kfree_skb(skb);
   399			}
   400			skb = next;
   401		}
   402		write_unlock(&neigh->lock);
   403	}
   404
Pavel Tikhomirov Jan. 9, 2024, 6:05 a.m. UTC | #6
That problem happens because the patch is not ready to handle the lack 
of CONFIG_BRIDGE_NETFILTER (as Eric already mentioned earlier in this 
thread).

On 09/01/2024 13:38, kernel test robot wrote:
> Hi Pavel,
> 
> kernel test robot noticed the following build warnings:
> 
> [auto build test WARNING on net-next/main]
> [also build test WARNING on net/main linus/master horms-ipvs/master v6.7 next-20240108]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Pavel-Tikhomirov/neighbour-purge-nf_bridged-skb-from-foreign-device-neigh/20240108-165551
> base:   net-next/main
> patch link:    https://lore.kernel.org/r/20240108085232.95437-1-ptikhomirov%40virtuozzo.com
> patch subject: [PATCH] neighbour: purge nf_bridged skb from foreign device neigh
> config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20240109/202401091351.CqYRoau7-lkp@intel.com/config)
> compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240109/202401091351.CqYRoau7-lkp@intel.com/reproduce)
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202401091351.CqYRoau7-lkp@intel.com/
> 
> All warnings (new ones prefixed by >>):
> 
>     net/core/neighbour.c: In function 'neigh_purge_nf_bridge_dev':
>     net/core/neighbour.c:392:29: error: implicit declaration of function 'nf_bridge_info_get' [-Werror=implicit-function-declaration]
>       392 |                 nf_bridge = nf_bridge_info_get(skb);
>           |                             ^~~~~~~~~~~~~~~~~~
>>> net/core/neighbour.c:392:27: warning: assignment to 'struct nf_bridge_info *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
>       392 |                 nf_bridge = nf_bridge_info_get(skb);
>           |                           ^
>     net/core/neighbour.c:395:43: error: invalid use of undefined type 'struct nf_bridge_info'
>       395 |                 if (nf_bridge && nf_bridge->physindev == dev) {
>           |                                           ^~
>     cc1: some warnings being treated as errors
> 
> 
> vim +392 net/core/neighbour.c
> 
>     382	
>     383	static void neigh_purge_nf_bridge_dev(struct neighbour *neigh, struct net_device *dev)
>     384	{
>     385		struct sk_buff_head *list = &neigh->arp_queue;
>     386		struct nf_bridge_info *nf_bridge;
>     387		struct sk_buff *skb, *next;
>     388	
>     389		write_lock(&neigh->lock);
>     390		skb = skb_peek(list);
>     391		while (skb) {
>   > 392			nf_bridge = nf_bridge_info_get(skb);
>     393	
>     394			next = skb_peek_next(skb, list);
>     395			if (nf_bridge && nf_bridge->physindev == dev) {
>     396				__skb_unlink(skb, list);
>     397				neigh->arp_queue_len_bytes -= skb->truesize;
>     398				kfree_skb(skb);
>     399			}
>     400			skb = next;
>     401		}
>     402		write_unlock(&neigh->lock);
>     403	}
>     404	
>
kernel test robot Jan. 9, 2024, 9:01 a.m. UTC | #7
Hi Pavel,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]
[also build test WARNING on net/main linus/master horms-ipvs/master v6.7 next-20240109]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Pavel-Tikhomirov/neighbour-purge-nf_bridged-skb-from-foreign-device-neigh/20240108-165551
base:   net-next/main
patch link:    https://lore.kernel.org/r/20240108085232.95437-1-ptikhomirov%40virtuozzo.com
patch subject: [PATCH] neighbour: purge nf_bridged skb from foreign device neigh
config: i386-defconfig (https://download.01.org/0day-ci/archive/20240109/202401091607.P1JJMaxg-lkp@intel.com/config)
compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240109/202401091607.P1JJMaxg-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202401091607.P1JJMaxg-lkp@intel.com/

All warnings (new ones prefixed by >>):

   net/core/neighbour.c: In function 'neigh_purge_nf_bridge_dev':
   net/core/neighbour.c:392:15: error: implicit declaration of function 'nf_bridge_info_get'; did you mean 'nf_bridge_in_prerouting'? [-Werror=implicit-function-declaration]
      nf_bridge = nf_bridge_info_get(skb);
                  ^~~~~~~~~~~~~~~~~~
                  nf_bridge_in_prerouting
>> net/core/neighbour.c:392:13: warning: assignment makes pointer from integer without a cast [-Wint-conversion]
      nf_bridge = nf_bridge_info_get(skb);
                ^
   net/core/neighbour.c:395:29: error: dereferencing pointer to incomplete type 'struct nf_bridge_info'
      if (nf_bridge && nf_bridge->physindev == dev) {
                                ^~
   cc1: some warnings being treated as errors


vim +392 net/core/neighbour.c

   382	
   383	static void neigh_purge_nf_bridge_dev(struct neighbour *neigh, struct net_device *dev)
   384	{
   385		struct sk_buff_head *list = &neigh->arp_queue;
   386		struct nf_bridge_info *nf_bridge;
   387		struct sk_buff *skb, *next;
   388	
   389		write_lock(&neigh->lock);
   390		skb = skb_peek(list);
   391		while (skb) {
 > 392			nf_bridge = nf_bridge_info_get(skb);
   393	
   394			next = skb_peek_next(skb, list);
   395			if (nf_bridge && nf_bridge->physindev == dev) {
   396				__skb_unlink(skb, list);
   397				neigh->arp_queue_len_bytes -= skb->truesize;
   398				kfree_skb(skb);
   399			}
   400			skb = next;
   401		}
   402		write_unlock(&neigh->lock);
   403	}
   404
kernel test robot Jan. 9, 2024, 10:50 a.m. UTC | #8
Hi Pavel,

kernel test robot noticed the following build errors:

[auto build test ERROR on net-next/main]
[also build test ERROR on net/main linus/master horms-ipvs/master v6.7 next-20240109]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Pavel-Tikhomirov/neighbour-purge-nf_bridged-skb-from-foreign-device-neigh/20240108-165551
base:   net-next/main
patch link:    https://lore.kernel.org/r/20240108085232.95437-1-ptikhomirov%40virtuozzo.com
patch subject: [PATCH] neighbour: purge nf_bridged skb from foreign device neigh
config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20240109/202401091858.8boOfnEz-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240109/202401091858.8boOfnEz-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202401091858.8boOfnEz-lkp@intel.com/

All errors (new ones prefixed by >>):

   net/core/neighbour.c: In function 'neigh_purge_nf_bridge_dev':
>> net/core/neighbour.c:392:29: error: implicit declaration of function 'nf_bridge_info_get' [-Werror=implicit-function-declaration]
     392 |                 nf_bridge = nf_bridge_info_get(skb);
         |                             ^~~~~~~~~~~~~~~~~~
   net/core/neighbour.c:392:27: warning: assignment to 'struct nf_bridge_info *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
     392 |                 nf_bridge = nf_bridge_info_get(skb);
         |                           ^
>> net/core/neighbour.c:395:43: error: invalid use of undefined type 'struct nf_bridge_info'
     395 |                 if (nf_bridge && nf_bridge->physindev == dev) {
         |                                           ^~
   cc1: some warnings being treated as errors


vim +/nf_bridge_info_get +392 net/core/neighbour.c

   382	
   383	static void neigh_purge_nf_bridge_dev(struct neighbour *neigh, struct net_device *dev)
   384	{
   385		struct sk_buff_head *list = &neigh->arp_queue;
   386		struct nf_bridge_info *nf_bridge;
   387		struct sk_buff *skb, *next;
   388	
   389		write_lock(&neigh->lock);
   390		skb = skb_peek(list);
   391		while (skb) {
 > 392			nf_bridge = nf_bridge_info_get(skb);
   393	
   394			next = skb_peek_next(skb, list);
 > 395			if (nf_bridge && nf_bridge->physindev == dev) {
   396				__skb_unlink(skb, list);
   397				neigh->arp_queue_len_bytes -= skb->truesize;
   398				kfree_skb(skb);
   399			}
   400			skb = next;
   401		}
   402		write_unlock(&neigh->lock);
   403	}
   404
Florian Westphal Jan. 9, 2024, 11:12 a.m. UTC | #9
Pavel Tikhomirov <ptikhomirov@virtuozzo.com> wrote:
> index f980edfdd2783..105fbdb029261 100644
> --- a/include/linux/netfilter_bridge.h
> +++ b/include/linux/netfilter_bridge.h
> @@ -56,11 +56,15 @@ static inline int nf_bridge_get_physoutif(const struct
> sk_buff *skb)
>  }
> 
>  static inline struct net_device *
> -nf_bridge_get_physindev(const struct sk_buff *skb)
> +nf_bridge_get_physindev_rcu(const struct sk_buff *skb)
>  {
>         const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
> +       struct net_device *dev;
> 
> -       return nf_bridge ? nf_bridge->physindev : NULL;
> +       if (!nf_bridge || !skb->dev)
> +               return 0;
> +
> +       return dev_get_by_index_rcu(skb->dev->net, nf_bridge->physindev_if);

You could use dev_net(skb->dev), yes.

Or create a preparation patch that does:

-nf_bridge_get_physindev(const struct sk_buff *skb)
+nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net)

(all callers have a struct net available).

No need to rename the function, see below.

> -       br_indev = nf_bridge_get_physindev(oldskb);
> +       rcu_read_lock_bh();
> +       br_indev = nf_bridge_get_physindev_rcu(oldskb);

No need for rcu read lock, all netfilter hooks run inside
rcu_read_lock().

> Does it sound good?

Yes, seems ok to me.

> Or maybe instead we can have extra physindev_if field in addition to
> existing physindev to only do dev_get_by_index_rcu inside
> br_nf_pre_routing_finish_bridge_slow to doublecheck the ->physindev link?
> 
> Sorry in advance if I'm missing anything obvious.

Alternative would be to add a 'br_nf_unreg_serno' that gets incremented
from brnf_device_event(), then store that in nf_bridge_info struct and
compare to current value before net_device deref. If not equal, toss skb.

Problem is that we'd need some indirection to retrieve the current
value, otherwise places like nfnetlink_log() gain a module dependency on
br_netfilter :-(

We'd likely need
const atomic_t *br_nf_unreg_serno __read_mostly;
EXPORT_SYMBOL_GPL(br_nf_unreg_serno);

in net/netfilter/core.c for this, then set/clear the
pointer from br_netfilter_hooks.c.

I can't say/don't know which of the two options is better/worse.

s/struct net_device */int// has the benefit of shrinking nf_bridge_info,
so I'd try that first.
Pavel Tikhomirov Jan. 10, 2024, 11:16 a.m. UTC | #10
Here is the new version 
https://lore.kernel.org/netdev/20240110110451.5473-1-ptikhomirov@virtuozzo.com/

On 09/01/2024 19:12, Florian Westphal wrote:
> Pavel Tikhomirov <ptikhomirov@virtuozzo.com> wrote:
>> index f980edfdd2783..105fbdb029261 100644
>> --- a/include/linux/netfilter_bridge.h
>> +++ b/include/linux/netfilter_bridge.h
>> @@ -56,11 +56,15 @@ static inline int nf_bridge_get_physoutif(const struct
>> sk_buff *skb)
>>   }
>>
>>   static inline struct net_device *
>> -nf_bridge_get_physindev(const struct sk_buff *skb)
>> +nf_bridge_get_physindev_rcu(const struct sk_buff *skb)
>>   {
>>          const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
>> +       struct net_device *dev;
>>
>> -       return nf_bridge ? nf_bridge->physindev : NULL;
>> +       if (!nf_bridge || !skb->dev)
>> +               return 0;
>> +
>> +       return dev_get_by_index_rcu(skb->dev->net, nf_bridge->physindev_if);
> 
> You could use dev_net(skb->dev), yes.

In br_nf_pre_routing_finish_bridge_slow I had to use dev_net(skb->dev).

> 
> Or create a preparation patch that does:
> 
> -nf_bridge_get_physindev(const struct sk_buff *skb)
> +nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net)
> 
> (all callers have a struct net available).

For all other cases I did the prep-patch propagating net.

> 
> No need to rename the function, see below.
> 
>> -       br_indev = nf_bridge_get_physindev(oldskb);
>> +       rcu_read_lock_bh();
>> +       br_indev = nf_bridge_get_physindev_rcu(oldskb);
> 
> No need for rcu read lock, all netfilter hooks run inside
> rcu_read_lock().

Thanks for this hint! I have checked all those tons of cases and 
actually proved to myself that all cases have rcu_read_lock =)

> 
>> Does it sound good?
> 
> Yes, seems ok to me.
> 
>> Or maybe instead we can have extra physindev_if field in addition to
>> existing physindev to only do dev_get_by_index_rcu inside
>> br_nf_pre_routing_finish_bridge_slow to doublecheck the ->physindev link?
>>
>> Sorry in advance if I'm missing anything obvious.
> 
> Alternative would be to add a 'br_nf_unreg_serno' that gets incremented
> from brnf_device_event(), then store that in nf_bridge_info struct and
> compare to current value before net_device deref. If not equal, toss skb.
> 
> Problem is that we'd need some indirection to retrieve the current
> value, otherwise places like nfnetlink_log() gain a module dependency on
> br_netfilter :-(
> 
> We'd likely need
> const atomic_t *br_nf_unreg_serno __read_mostly;
> EXPORT_SYMBOL_GPL(br_nf_unreg_serno);
> 
> in net/netfilter/core.c for this, then set/clear the
> pointer from br_netfilter_hooks.c.
> 
> I can't say/don't know which of the two options is better/worse.
> 
> s/struct net_device */int// has the benefit of shrinking nf_bridge_info,
> so I'd try that first.

Ok, did s/struct net_device */int// variant.
diff mbox series

Patch

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 552719c3bbc3d..47d2d52f17da3 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -39,6 +39,9 @@ 
 #include <linux/inetdevice.h>
 #include <net/addrconf.h>
 
+#include <linux/skbuff.h>
+#include <linux/netfilter_bridge.h>
+
 #include <trace/events/neigh.h>
 
 #define NEIGH_DEBUG 1
@@ -377,6 +380,28 @@  static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net,
 	}
 }
 
+static void neigh_purge_nf_bridge_dev(struct neighbour *neigh, struct net_device *dev)
+{
+	struct sk_buff_head *list = &neigh->arp_queue;
+	struct nf_bridge_info *nf_bridge;
+	struct sk_buff *skb, *next;
+
+	write_lock(&neigh->lock);
+	skb = skb_peek(list);
+	while (skb) {
+		nf_bridge = nf_bridge_info_get(skb);
+
+		next = skb_peek_next(skb, list);
+		if (nf_bridge && nf_bridge->physindev == dev) {
+			__skb_unlink(skb, list);
+			neigh->arp_queue_len_bytes -= skb->truesize;
+			kfree_skb(skb);
+		}
+		skb = next;
+	}
+	write_unlock(&neigh->lock);
+}
+
 static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
 			    bool skip_perm)
 {
@@ -393,6 +418,7 @@  static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
 		while ((n = rcu_dereference_protected(*np,
 					lockdep_is_held(&tbl->lock))) != NULL) {
 			if (dev && n->dev != dev) {
+				neigh_purge_nf_bridge_dev(n, dev);
 				np = &n->next;
 				continue;
 			}