diff mbox series

[net] net: zero-initialize skb extensions on allocation

Message ID 20210524061959.2349342-1-vladbu@nvidia.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net] net: zero-initialize skb extensions on allocation | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net
netdev/subject_prefix success Link
netdev/cc_maintainers fail 3 blamed authors not CCed: wenxu@ucloud.cn cong.wang@bytedance.com marcelo.leitner@gmail.com; 18 maintainers not CCed: linmiaohe@huawei.com jonathan.lemon@gmail.com kadlec@netfilter.org marcelo.leitner@gmail.com coreteam@netfilter.org bpf@vger.kernel.org gnault@redhat.com hawk@kernel.org daniel@iogearbox.net wenxu@ucloud.cn cong.wang@bytedance.com ast@kernel.org mptcp@lists.linux.dev john.fastabend@gmail.com willemb@google.com linyunsheng@huawei.com alobakin@pm.me netfilter-devel@vger.kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 4 this patch: 4
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 57 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 4 this patch: 4
netdev/header_inline success Link

Commit Message

Vlad Buslov May 24, 2021, 6:19 a.m. UTC
Function skb_ext_add() doesn't initialize created skb extension with any
value and leaves it up to the user. However, since extension of type
TC_SKB_EXT originally contained only single value tc_skb_ext->chain its
users used to just assign the chain value without setting whole extension
memory to zero first. This assumption changed when TC_SKB_EXT extension was
extended with additional fields but not all users were updated to
initialize the new fields which leads to use of uninitialized memory
afterwards. UBSAN log:

[  778.299821] UBSAN: invalid-load in net/openvswitch/flow.c:899:28
[  778.301495] load of value 107 is not a valid value for type '_Bool'
[  778.303215] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.12.0-rc7+ #2
[  778.304933] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[  778.307901] Call Trace:
[  778.308680]  <IRQ>
[  778.309358]  dump_stack+0xbb/0x107
[  778.310307]  ubsan_epilogue+0x5/0x40
[  778.311167]  __ubsan_handle_load_invalid_value.cold+0x43/0x48
[  778.312454]  ? memset+0x20/0x40
[  778.313230]  ovs_flow_key_extract.cold+0xf/0x14 [openvswitch]
[  778.314532]  ovs_vport_receive+0x19e/0x2e0 [openvswitch]
[  778.315749]  ? ovs_vport_find_upcall_portid+0x330/0x330 [openvswitch]
[  778.317188]  ? create_prof_cpu_mask+0x20/0x20
[  778.318220]  ? arch_stack_walk+0x82/0xf0
[  778.319153]  ? secondary_startup_64_no_verify+0xb0/0xbb
[  778.320399]  ? stack_trace_save+0x91/0xc0
[  778.321362]  ? stack_trace_consume_entry+0x160/0x160
[  778.322517]  ? lock_release+0x52e/0x760
[  778.323444]  netdev_frame_hook+0x323/0x610 [openvswitch]
[  778.324668]  ? ovs_netdev_get_vport+0xe0/0xe0 [openvswitch]
[  778.325950]  __netif_receive_skb_core+0x771/0x2db0
[  778.327067]  ? lock_downgrade+0x6e0/0x6f0
[  778.328021]  ? lock_acquire+0x565/0x720
[  778.328940]  ? generic_xdp_tx+0x4f0/0x4f0
[  778.329902]  ? inet_gro_receive+0x2a7/0x10a0
[  778.330914]  ? lock_downgrade+0x6f0/0x6f0
[  778.331867]  ? udp4_gro_receive+0x4c4/0x13e0
[  778.332876]  ? lock_release+0x52e/0x760
[  778.333808]  ? dev_gro_receive+0xcc8/0x2380
[  778.334810]  ? lock_downgrade+0x6f0/0x6f0
[  778.335769]  __netif_receive_skb_list_core+0x295/0x820
[  778.336955]  ? process_backlog+0x780/0x780
[  778.337941]  ? mlx5e_rep_tc_netdevice_event_unregister+0x20/0x20 [mlx5_core]
[  778.339613]  ? seqcount_lockdep_reader_access.constprop.0+0xa7/0xc0
[  778.341033]  ? kvm_clock_get_cycles+0x14/0x20
[  778.342072]  netif_receive_skb_list_internal+0x5f5/0xcb0
[  778.343288]  ? __kasan_kmalloc+0x7a/0x90
[  778.344234]  ? mlx5e_handle_rx_cqe_mpwrq+0x9e0/0x9e0 [mlx5_core]
[  778.345676]  ? mlx5e_xmit_xdp_frame_mpwqe+0x14d0/0x14d0 [mlx5_core]
[  778.347140]  ? __netif_receive_skb_list_core+0x820/0x820
[  778.348351]  ? mlx5e_post_rx_mpwqes+0xa6/0x25d0 [mlx5_core]
[  778.349688]  ? napi_gro_flush+0x26c/0x3c0
[  778.350641]  napi_complete_done+0x188/0x6b0
[  778.351627]  mlx5e_napi_poll+0x373/0x1b80 [mlx5_core]
[  778.352853]  __napi_poll+0x9f/0x510
[  778.353704]  ? mlx5_flow_namespace_set_mode+0x260/0x260 [mlx5_core]
[  778.355158]  net_rx_action+0x34c/0xa40
[  778.356060]  ? napi_threaded_poll+0x3d0/0x3d0
[  778.357083]  ? sched_clock_cpu+0x18/0x190
[  778.358041]  ? __common_interrupt+0x8e/0x1a0
[  778.359045]  __do_softirq+0x1ce/0x984
[  778.359938]  __irq_exit_rcu+0x137/0x1d0
[  778.360865]  irq_exit_rcu+0xa/0x20
[  778.361708]  common_interrupt+0x80/0xa0
[  778.362640]  </IRQ>
[  778.363212]  asm_common_interrupt+0x1e/0x40
[  778.364204] RIP: 0010:native_safe_halt+0xe/0x10
[  778.365273] Code: 4f ff ff ff 4c 89 e7 e8 50 3f 40 fe e9 dc fe ff ff 48 89 df e8 43 3f 40 fe eb 90 cc e9 07 00 00 00 0f 00 2d 74 05 62 00 fb f4 <c3> 90 e9 07 00 00 00 0f 00 2d 64 05 62 00 f4 c3 cc cc 0f 1f 44 00
[  778.369355] RSP: 0018:ffffffff84407e48 EFLAGS: 00000246
[  778.370570] RAX: ffff88842de46a80 RBX: ffffffff84425840 RCX: ffffffff83418468
[  778.372143] RDX: 000000000026f1da RSI: 0000000000000004 RDI: ffffffff8343af5e
[  778.373722] RBP: fffffbfff0884b08 R08: 0000000000000000 R09: ffff88842de46bcb
[  778.375292] R10: ffffed1085bc8d79 R11: 0000000000000001 R12: 0000000000000000
[  778.376860] R13: ffffffff851124a0 R14: 0000000000000000 R15: dffffc0000000000
[  778.378491]  ? rcu_eqs_enter.constprop.0+0xb8/0xe0
[  778.379606]  ? default_idle_call+0x5e/0xe0
[  778.380578]  default_idle+0xa/0x10
[  778.381406]  default_idle_call+0x96/0xe0
[  778.382350]  do_idle+0x3d4/0x550
[  778.383153]  ? arch_cpu_idle_exit+0x40/0x40
[  778.384143]  cpu_startup_entry+0x19/0x20
[  778.385078]  start_kernel+0x3c7/0x3e5
[  778.385978]  secondary_startup_64_no_verify+0xb0/0xbb

Fix the issue by changing __skb_ext_alloc() function to request
zero-initialized memory from kmem cache. Note that skb extension allocation
in skb_ext_maybe_cow() is not changed because newly allocated memory is
immediately overwritten with content of old skb extension so there is no
need to pre-initialize it.

Multiple users of skb extension API have already been manually setting
newly allocated skb extension memory to zero. Remove such code and rely on
skb extension API instead.

Fixes: 038ebb1a713d ("net/sched: act_ct: fix miss set mru for ovs after defrag in act_ct")
Fixes: d29334c15d33 ("net/sched: act_api: fix miss set post_ct for ovs after do conntrack in act_ct")
Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
---
 include/net/netfilter/br_netfilter.h |  7 +------
 net/core/skbuff.c                    |  6 ++----
 net/mptcp/options.c                  |  2 --
 net/xfrm/xfrm_input.c                | 16 +---------------
 4 files changed, 4 insertions(+), 27 deletions(-)

Comments

Florian Westphal May 24, 2021, 10:01 a.m. UTC | #1
Vlad Buslov <vladbu@nvidia.com> wrote:
> Function skb_ext_add() doesn't initialize created skb extension with any
> value and leaves it up to the user.

That was intentional.

Its unlikely that all extensions are active at the same time on same skb.

This is also the reason why the extension struct uses offset addressing
to get the extension data rather than the simpler

skb_ext {
	struct sec_path sp;
	struct nf_bridge_info nfbr;
	...
}

So adding e.g. mptcp extension will only touch 1 cacheline instead of 3
(or more if more extensions get added in the future).

IOW, i would prefer if tc would add tc_skb_add_ext() or similar and
zero whats needed there.

> Fix the issue by changing __skb_ext_alloc() function to request
> zero-initialized memory from kmem cache. Note that skb extension allocation
> in skb_ext_maybe_cow() is not changed because newly allocated memory is
> immediately overwritten with content of old skb extension so there is no
> need to pre-initialize it.
>
> Multiple users of skb extension API have already been manually setting
> newly allocated skb extension memory to zero. Remove such code and rely on
> skb extension API instead.

Are you sure its safe?

>  static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
>  {
>  #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
> -	struct nf_bridge_info *b = skb_ext_add(skb, SKB_EXT_BRIDGE_NF);
> -
> -	if (b)
> -		memset(b, 0, sizeof(*b));
> -
> -	return b;
> +	return skb_ext_add(skb, SKB_EXT_BRIDGE_NF);

So in the (unlikely) case where skb_ext_add did not allocate a new
extension, the memory is no longer cleared.

If the skb had an nf_bridge_info extension previously that got
discarded earlier via skb_ext_del() this now leaks the old content.
Vlad Buslov May 24, 2021, 11:23 a.m. UTC | #2
On Mon 24 May 2021 at 13:01, Florian Westphal <fw@strlen.de> wrote:
> Vlad Buslov <vladbu@nvidia.com> wrote:
>> Function skb_ext_add() doesn't initialize created skb extension with any
>> value and leaves it up to the user.
>
> That was intentional.
>
> Its unlikely that all extensions are active at the same time on same skb.
>
> This is also the reason why the extension struct uses offset addressing
> to get the extension data rather than the simpler
>
> skb_ext {
> 	struct sec_path sp;
> 	struct nf_bridge_info nfbr;
> 	...
> }
>
> So adding e.g. mptcp extension will only touch 1 cacheline instead of 3
> (or more if more extensions get added in the future).
>
> IOW, i would prefer if tc would add tc_skb_add_ext() or similar and
> zero whats needed there.

Got it. Thanks for the explanation!

>
>> Fix the issue by changing __skb_ext_alloc() function to request
>> zero-initialized memory from kmem cache. Note that skb extension allocation
>> in skb_ext_maybe_cow() is not changed because newly allocated memory is
>> immediately overwritten with content of old skb extension so there is no
>> need to pre-initialize it.
>>
>> Multiple users of skb extension API have already been manually setting
>> newly allocated skb extension memory to zero. Remove such code and rely on
>> skb extension API instead.
>
> Are you sure its safe?

Apparently it is not, since you found a problem with nf_bridge_alloc :)

>
>>  static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
>>  {
>>  #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
>> -	struct nf_bridge_info *b = skb_ext_add(skb, SKB_EXT_BRIDGE_NF);
>> -
>> -	if (b)
>> -		memset(b, 0, sizeof(*b));
>> -
>> -	return b;
>> +	return skb_ext_add(skb, SKB_EXT_BRIDGE_NF);
>
> So in the (unlikely) case where skb_ext_add did not allocate a new
> extension, the memory is no longer cleared.
>
> If the skb had an nf_bridge_info extension previously that got
> discarded earlier via skb_ext_del() this now leaks the old content.

So what would you suggest: provide a dedicated wrapper for TC skb
extension that will memset resulting extension to zero or refactor my
patch to zero-initialize specific skb extension in skb_ext_add() (only
the extension requested and also when previously discarded extension is
reused)?
Florian Westphal May 24, 2021, 5:13 p.m. UTC | #3
Vlad Buslov <vladbu@nvidia.com> wrote:
> So what would you suggest: provide a dedicated wrapper for TC skb
> extension that will memset resulting extension to zero or refactor my
> patch to zero-initialize specific skb extension in skb_ext_add() (only
> the extension requested and also when previously discarded extension is
> reused)?

I would go for A), but if you prefer B) I would not mind.
diff mbox series

Patch

diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h
index 371696ec11b2..d4a6521ca8c3 100644
--- a/include/net/netfilter/br_netfilter.h
+++ b/include/net/netfilter/br_netfilter.h
@@ -9,12 +9,7 @@ 
 static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
 {
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	struct nf_bridge_info *b = skb_ext_add(skb, SKB_EXT_BRIDGE_NF);
-
-	if (b)
-		memset(b, 0, sizeof(*b));
-
-	return b;
+	return skb_ext_add(skb, SKB_EXT_BRIDGE_NF);
 #else
 	return NULL;
 #endif
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3ad22870298c..f94614ec2d41 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6316,12 +6316,10 @@  static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
  */
 struct skb_ext *__skb_ext_alloc(gfp_t flags)
 {
-	struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
+	struct skb_ext *new = kmem_cache_zalloc(skbuff_ext_cache, flags);
 
-	if (new) {
-		memset(new->offset, 0, sizeof(new->offset));
+	if (new)
 		refcount_set(&new->refcnt, 1);
-	}
 
 	return new;
 }
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 99fc21406168..aaa467b055c8 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1078,8 +1078,6 @@  void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 	if (!mpext)
 		return;
 
-	memset(mpext, 0, sizeof(*mpext));
-
 	if (mp_opt.use_map) {
 		if (mp_opt.mpc_map) {
 			/* this is an MP_CAPABLE carrying MPTCP data
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 1158cd0311d7..281d134e0f34 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -116,21 +116,7 @@  static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol,
 
 struct sec_path *secpath_set(struct sk_buff *skb)
 {
-	struct sec_path *sp, *tmp = skb_ext_find(skb, SKB_EXT_SEC_PATH);
-
-	sp = skb_ext_add(skb, SKB_EXT_SEC_PATH);
-	if (!sp)
-		return NULL;
-
-	if (tmp) /* reused existing one (was COW'd if needed) */
-		return sp;
-
-	/* allocated new secpath */
-	memset(sp->ovec, 0, sizeof(sp->ovec));
-	sp->olen = 0;
-	sp->len = 0;
-
-	return sp;
+	return skb_ext_add(skb, SKB_EXT_SEC_PATH);
 }
 EXPORT_SYMBOL(secpath_set);