diff mbox series

[bpf] xsk: fix possible crash when multiple sockets are created

Message ID 20220425153745.481322-1-maciej.fijalkowski@intel.com (mailing list archive)
State Accepted
Commit ba3beec2ec1d3b4fd8672ca6e781dac4b3267f6e
Delegated to: BPF
Headers show
Series [bpf] xsk: fix possible crash when multiple sockets are created | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf
netdev/fixes_present success Fixes tag present in non-next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 3 this patch: 3
netdev/cc_maintainers fail 1 blamed authors not CCed: alexandr.lobakin@intel.com; 13 maintainers not CCed: songliubraving@fb.com kuba@kernel.org davem@davemloft.net andrii@kernel.org alexandr.lobakin@intel.com pabeni@redhat.com kafai@fb.com jonathan.lemon@gmail.com yhs@fb.com john.fastabend@gmail.com bjorn@kernel.org hawk@kernel.org kpsingh@kernel.org
netdev/build_clang success Errors and warnings before: 9 this patch: 9
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 3 this patch: 3
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 55 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-PR fail PR summary
bpf/vmtest-bpf-VM_Test-2 fail Logs for Kernel LATEST on z15 + selftests
bpf/vmtest-bpf-VM_Test-1 fail Logs for Kernel LATEST on ubuntu-latest + selftests

Commit Message

Maciej Fijalkowski April 25, 2022, 3:37 p.m. UTC
Fix a crash that happens if an Rx only socket is created first, then a
second socket is created that is Tx only and bound to the same umem as
the first socket and also the same netdev and queue_id together with the
XDP_SHARED_UMEM flag. In this specific case, the buffer pool's tx_descs
array was not allocated by the first socket as it was an Rx only socket.
When the second socket is bound it needs the tx_descs array of this
shared buffer pool as it has a Tx component, but unfortunately it was
never allocated, leading to a crash. Note that this array is only used
for zero-copy drivers using the batched Tx APIs, currently only ice and
i40e.

[ 5511.150360] BUG: kernel NULL pointer dereference, address: 0000000000000008
[ 5511.158419] #PF: supervisor write access in kernel mode
[ 5511.164472] #PF: error_code(0x0002) - not-present page
[ 5511.170416] PGD 0 P4D 0
[ 5511.173347] Oops: 0002 [#1] PREEMPT SMP PTI
[ 5511.178186] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G            E     5.18.0-rc1+ #97
[ 5511.187245] Hardware name: Intel Corp. GRANTLEY/GRANTLEY, BIOS GRRFCRB1.86B.0276.D07.1605190235 05/19/2016
[ 5511.198418] RIP: 0010:xsk_tx_peek_release_desc_batch+0x198/0x310
[ 5511.205375] Code: c0 83 c6 01 84 c2 74 6d 8d 46 ff 23 07 44 89 e1 48 83 c0 14 48 c1 e1 04 48 c1 e0 04 48 03 47 10 4c 01 c1 48 8b 50 08 48 8b 00 <48> 89 51 08 48 89 01 41 80 bd d7 00 00 00 00 75 82 48 8b 19 49 8b
[ 5511.227091] RSP: 0018:ffffc90000003dd0 EFLAGS: 00010246
[ 5511.233135] RAX: 0000000000000000 RBX: ffff88810c8da600 RCX: 0000000000000000
[ 5511.241384] RDX: 000000000000003c RSI: 0000000000000001 RDI: ffff888115f555c0
[ 5511.249634] RBP: ffffc90000003e08 R08: 0000000000000000 R09: ffff889092296b48
[ 5511.257886] R10: 0000ffffffffffff R11: ffff889092296800 R12: 0000000000000000
[ 5511.266138] R13: ffff88810c8db500 R14: 0000000000000040 R15: 0000000000000100
[ 5511.274387] FS:  0000000000000000(0000) GS:ffff88903f800000(0000) knlGS:0000000000000000
[ 5511.283746] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 5511.290389] CR2: 0000000000000008 CR3: 00000001046e2001 CR4: 00000000003706f0
[ 5511.298640] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 5511.306892] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 5511.315142] Call Trace:
[ 5511.317972]  <IRQ>
[ 5511.320301]  ice_xmit_zc+0x68/0x2f0 [ice]
[ 5511.324977]  ? ktime_get+0x38/0xa0
[ 5511.328913]  ice_napi_poll+0x7a/0x6a0 [ice]
[ 5511.333784]  __napi_poll+0x2c/0x160
[ 5511.337821]  net_rx_action+0xdd/0x200
[ 5511.342058]  __do_softirq+0xe6/0x2dd
[ 5511.346198]  irq_exit_rcu+0xb5/0x100
[ 5511.350339]  common_interrupt+0xa4/0xc0
[ 5511.354777]  </IRQ>
[ 5511.357201]  <TASK>
[ 5511.359625]  asm_common_interrupt+0x1e/0x40
[ 5511.364466] RIP: 0010:cpuidle_enter_state+0xd2/0x360
[ 5511.370211] Code: 49 89 c5 0f 1f 44 00 00 31 ff e8 e9 00 7b ff 45 84 ff 74 12 9c 58 f6 c4 02 0f 85 72 02 00 00 31 ff e8 02 0c 80 ff fb 45 85 f6 <0f> 88 11 01 00 00 49 63 c6 4c 2b 2c 24 48 8d 14 40 48 8d 14 90 49
[ 5511.391921] RSP: 0018:ffffffff82a03e60 EFLAGS: 00000202
[ 5511.397962] RAX: ffff88903f800000 RBX: 0000000000000001 RCX: 000000000000001f
[ 5511.406214] RDX: 0000000000000000 RSI: ffffffff823400b9 RDI: ffffffff8234c046
[ 5511.424646] RBP: ffff88810a384800 R08: 000005032a28c046 R09: 0000000000000008
[ 5511.443233] R10: 000000000000000b R11: 0000000000000006 R12: ffffffff82bcf700
[ 5511.461922] R13: 000005032a28c046 R14: 0000000000000001 R15: 0000000000000000
[ 5511.480300]  cpuidle_enter+0x29/0x40
[ 5511.494329]  do_idle+0x1c7/0x250
[ 5511.507610]  cpu_startup_entry+0x19/0x20
[ 5511.521394]  start_kernel+0x649/0x66e
[ 5511.534626]  secondary_startup_64_no_verify+0xc3/0xcb
[ 5511.549230]  </TASK>

Detect such a case during bind() and allocate this memory region via the
newly introduced xp_alloc_tx_descs(). Also, use kvcalloc() instead of
kcalloc(), as is done for the other buffer pool allocations, so that it
matches the kvfree() from xp_destroy().

Fixes: d1bc532e99be ("i40e: xsk: Move tmp desc array from driver to pool")
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---
 include/net/xsk_buff_pool.h |  1 +
 net/xdp/xsk.c               | 13 +++++++++++++
 net/xdp/xsk_buff_pool.c     | 16 ++++++++++++----
 3 files changed, 26 insertions(+), 4 deletions(-)

Comments

Magnus Karlsson April 26, 2022, 9:01 a.m. UTC | #1
On Tue, Apr 26, 2022 at 12:28 AM Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> Fix a crash that happens if an Rx only socket is created first, then a
> second socket is created that is Tx only and bound to the same umem as
> the first socket and also the same netdev and queue_id together with the
> XDP_SHARED_UMEM flag. In this specific case, the tx_descs array page
> pool was not created by the first socket as it was an Rx only socket.
> When the second socket is bound it needs this tx_descs array of this
> shared page pool as it has a Tx component, but unfortunately it was
> never allocated, leading to a crash. Note that this array is only used
> for zero-copy drivers using the batched Tx APIs, currently only ice and
> i40e.
>
> [ 5511.150360] BUG: kernel NULL pointer dereference, address: 0000000000000008
> [ 5511.158419] #PF: supervisor write access in kernel mode
> [ 5511.164472] #PF: error_code(0x0002) - not-present page
> [ 5511.170416] PGD 0 P4D 0
> [ 5511.173347] Oops: 0002 [#1] PREEMPT SMP PTI
> [ 5511.178186] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G            E     5.18.0-rc1+ #97
> [ 5511.187245] Hardware name: Intel Corp. GRANTLEY/GRANTLEY, BIOS GRRFCRB1.86B.0276.D07.1605190235 05/19/2016
> [ 5511.198418] RIP: 0010:xsk_tx_peek_release_desc_batch+0x198/0x310
> [ 5511.205375] Code: c0 83 c6 01 84 c2 74 6d 8d 46 ff 23 07 44 89 e1 48 83 c0 14 48 c1 e1 04 48 c1 e0 04 48 03 47 10 4c 01 c1 48 8b 50 08 48 8b 00 <48> 89 51 08 48 89 01 41 80 bd d7 00 00 00 00 75 82 48 8b 19 49 8b
> [ 5511.227091] RSP: 0018:ffffc90000003dd0 EFLAGS: 00010246
> [ 5511.233135] RAX: 0000000000000000 RBX: ffff88810c8da600 RCX: 0000000000000000
> [ 5511.241384] RDX: 000000000000003c RSI: 0000000000000001 RDI: ffff888115f555c0
> [ 5511.249634] RBP: ffffc90000003e08 R08: 0000000000000000 R09: ffff889092296b48
> [ 5511.257886] R10: 0000ffffffffffff R11: ffff889092296800 R12: 0000000000000000
> [ 5511.266138] R13: ffff88810c8db500 R14: 0000000000000040 R15: 0000000000000100
> [ 5511.274387] FS:  0000000000000000(0000) GS:ffff88903f800000(0000) knlGS:0000000000000000
> [ 5511.283746] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 5511.290389] CR2: 0000000000000008 CR3: 00000001046e2001 CR4: 00000000003706f0
> [ 5511.298640] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [ 5511.306892] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [ 5511.315142] Call Trace:
> [ 5511.317972]  <IRQ>
> [ 5511.320301]  ice_xmit_zc+0x68/0x2f0 [ice]
> [ 5511.324977]  ? ktime_get+0x38/0xa0
> [ 5511.328913]  ice_napi_poll+0x7a/0x6a0 [ice]
> [ 5511.333784]  __napi_poll+0x2c/0x160
> [ 5511.337821]  net_rx_action+0xdd/0x200
> [ 5511.342058]  __do_softirq+0xe6/0x2dd
> [ 5511.346198]  irq_exit_rcu+0xb5/0x100
> [ 5511.350339]  common_interrupt+0xa4/0xc0
> [ 5511.354777]  </IRQ>
> [ 5511.357201]  <TASK>
> [ 5511.359625]  asm_common_interrupt+0x1e/0x40
> [ 5511.364466] RIP: 0010:cpuidle_enter_state+0xd2/0x360
> [ 5511.370211] Code: 49 89 c5 0f 1f 44 00 00 31 ff e8 e9 00 7b ff 45 84 ff 74 12 9c 58 f6 c4 02 0f 85 72 02 00 00 31 ff e8 02 0c 80 ff fb 45 85 f6 <0f> 88 11 01 00 00 49 63 c6 4c 2b 2c 24 48 8d 14 40 48 8d 14 90 49
> [ 5511.391921] RSP: 0018:ffffffff82a03e60 EFLAGS: 00000202
> [ 5511.397962] RAX: ffff88903f800000 RBX: 0000000000000001 RCX: 000000000000001f
> [ 5511.406214] RDX: 0000000000000000 RSI: ffffffff823400b9 RDI: ffffffff8234c046
> [ 5511.424646] RBP: ffff88810a384800 R08: 000005032a28c046 R09: 0000000000000008
> [ 5511.443233] R10: 000000000000000b R11: 0000000000000006 R12: ffffffff82bcf700
> [ 5511.461922] R13: 000005032a28c046 R14: 0000000000000001 R15: 0000000000000000
> [ 5511.480300]  cpuidle_enter+0x29/0x40
> [ 5511.494329]  do_idle+0x1c7/0x250
> [ 5511.507610]  cpu_startup_entry+0x19/0x20
> [ 5511.521394]  start_kernel+0x649/0x66e
> [ 5511.534626]  secondary_startup_64_no_verify+0xc3/0xcb
> [ 5511.549230]  </TASK>
>
> Detect such case during bind() and allocate this memory region via newly
> introduced xp_alloc_tx_descs(). Also, use kvcalloc instead of kcalloc as
> for other buffer pool allocations, so that it matches the kvfree() from
> xp_destroy().

Thank you for this fix Maciej.

Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>

> Fixes: d1bc532e99be ("i40e: xsk: Move tmp desc array from driver to pool")
> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> ---
>  include/net/xsk_buff_pool.h |  1 +
>  net/xdp/xsk.c               | 13 +++++++++++++
>  net/xdp/xsk_buff_pool.c     | 16 ++++++++++++----
>  3 files changed, 26 insertions(+), 4 deletions(-)
>
> diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
> index 5554ee75e7da..647722e847b4 100644
> --- a/include/net/xsk_buff_pool.h
> +++ b/include/net/xsk_buff_pool.h
> @@ -97,6 +97,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev,
>                   u16 queue_id, u16 flags);
>  int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
>                          struct net_device *dev, u16 queue_id);
> +int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs);
>  void xp_destroy(struct xsk_buff_pool *pool);
>  void xp_get_pool(struct xsk_buff_pool *pool);
>  bool xp_put_pool(struct xsk_buff_pool *pool);
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 040c73345b7c..57afb96c41e8 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -967,6 +967,19 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
>
>                         xp_get_pool(umem_xs->pool);
>                         xs->pool = umem_xs->pool;
> +
> +                       /* If underlying shared umem was created without Tx
> +                        * ring, allocate Tx descs array that Tx batching API
> +                        * utilizes
> +                        */
> +                       if (xs->tx && !xs->pool->tx_descs) {
> +                               err = xp_alloc_tx_descs(xs->pool, xs);
> +                               if (err) {
> +                                       xp_put_pool(xs->pool);
> +                                       sockfd_put(sock);
> +                                       goto out_unlock;
> +                               }
> +                       }
>                 }
>
>                 xdp_get_umem(umem_xs->umem);
> diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
> index af040ffa14ff..87bdd71c7bb6 100644
> --- a/net/xdp/xsk_buff_pool.c
> +++ b/net/xdp/xsk_buff_pool.c
> @@ -42,6 +42,16 @@ void xp_destroy(struct xsk_buff_pool *pool)
>         kvfree(pool);
>  }
>
> +int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs)
> +{
> +       pool->tx_descs = kvcalloc(xs->tx->nentries, sizeof(*pool->tx_descs),
> +                                 GFP_KERNEL);
> +       if (!pool->tx_descs)
> +               return -ENOMEM;
> +
> +       return 0;
> +}
> +
>  struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
>                                                 struct xdp_umem *umem)
>  {
> @@ -59,11 +69,9 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
>         if (!pool->heads)
>                 goto out;
>
> -       if (xs->tx) {
> -               pool->tx_descs = kcalloc(xs->tx->nentries, sizeof(*pool->tx_descs), GFP_KERNEL);
> -               if (!pool->tx_descs)
> +       if (xs->tx)
> +               if (xp_alloc_tx_descs(pool, xs))
>                         goto out;
> -       }
>
>         pool->chunk_mask = ~((u64)umem->chunk_size - 1);
>         pool->addrs_cnt = umem->size;
> --
> 2.27.0
>
patchwork-bot+netdevbpf@kernel.org April 26, 2022, 2:30 p.m. UTC | #2
Hello:

This patch was applied to bpf/bpf.git (master)
by Daniel Borkmann <daniel@iogearbox.net>:

On Mon, 25 Apr 2022 17:37:45 +0200 you wrote:
> Fix a crash that happens if an Rx only socket is created first, then a
> second socket is created that is Tx only and bound to the same umem as
> the first socket and also the same netdev and queue_id together with the
> XDP_SHARED_UMEM flag. In this specific case, the tx_descs array page
> pool was not created by the first socket as it was an Rx only socket.
> When the second socket is bound it needs this tx_descs array of this
> shared page pool as it has a Tx component, but unfortunately it was
> never allocated, leading to a crash. Note that this array is only used
> for zero-copy drivers using the batched Tx APIs, currently only ice and
> i40e.
> 
> [...]

Here is the summary with links:
  - [bpf] xsk: fix possible crash when multiple sockets are created
    https://git.kernel.org/bpf/bpf/c/ba3beec2ec1d

You are awesome, thank you!
diff mbox series

Patch

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 5554ee75e7da..647722e847b4 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -97,6 +97,7 @@  int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev,
 		  u16 queue_id, u16 flags);
 int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
 			 struct net_device *dev, u16 queue_id);
+int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs);
 void xp_destroy(struct xsk_buff_pool *pool);
 void xp_get_pool(struct xsk_buff_pool *pool);
 bool xp_put_pool(struct xsk_buff_pool *pool);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 040c73345b7c..57afb96c41e8 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -967,6 +967,19 @@  static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 
 			xp_get_pool(umem_xs->pool);
 			xs->pool = umem_xs->pool;
+
+			/* If underlying shared umem was created without Tx
+			 * ring, allocate Tx descs array that Tx batching API
+			 * utilizes
+			 */
+			if (xs->tx && !xs->pool->tx_descs) {
+				err = xp_alloc_tx_descs(xs->pool, xs);
+				if (err) {
+					xp_put_pool(xs->pool);
+					sockfd_put(sock);
+					goto out_unlock;
+				}
+			}
 		}
 
 		xdp_get_umem(umem_xs->umem);
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index af040ffa14ff..87bdd71c7bb6 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -42,6 +42,16 @@  void xp_destroy(struct xsk_buff_pool *pool)
 	kvfree(pool);
 }
 
+int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs)
+{
+	pool->tx_descs = kvcalloc(xs->tx->nentries, sizeof(*pool->tx_descs),
+				  GFP_KERNEL);
+	if (!pool->tx_descs)
+		return -ENOMEM;
+
+	return 0;
+}
+
 struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 						struct xdp_umem *umem)
 {
@@ -59,11 +69,9 @@  struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 	if (!pool->heads)
 		goto out;
 
-	if (xs->tx) {
-		pool->tx_descs = kcalloc(xs->tx->nentries, sizeof(*pool->tx_descs), GFP_KERNEL);
-		if (!pool->tx_descs)
+	if (xs->tx)
+		if (xp_alloc_tx_descs(pool, xs))
 			goto out;
-	}
 
 	pool->chunk_mask = ~((u64)umem->chunk_size - 1);
 	pool->addrs_cnt = umem->size;