diff mbox series

[net-next,v25,10/13] net: add SO_DEVMEM_DONTNEED setsockopt to release RX frags

Message ID 20240909054318.1809580-11-almasrymina@google.com (mailing list archive)
State New
Headers show
Series Device Memory TCP | expand

Commit Message

Mina Almasry Sept. 9, 2024, 5:43 a.m. UTC
Add an interface for the user to notify the kernel that it is done
reading the devmem dmabuf frags returned as cmsg. The kernel will
drop the reference on the frags to make them available for reuse.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>

---

v16:
- Use sk_is_tcp().
- Fix unnamed 128 DONTNEED limit (David).
- Fix kernel allocating for 128 tokens even if the user didn't ask for
  that much (Eric).
- Fix number assignement (Arnd).

v10:
- Fix leak of tokens (Nikolay).

v7:
- Updated SO_DEVMEM_* uapi to use the next available entry (Arnd).

v6:
- Squash in locking optimizations from edumazet@google.com. With his
  changes we lock the xarray once per sock_devmem_dontneed operation
  rather than once per frag.

Changes in v1:
- devmemtoken -> dmabuf_token (David).
- Use napi_pp_put_page() for refcounting (Yunsheng).
- Fix build error with missing socket options on other asms.

---
 arch/alpha/include/uapi/asm/socket.h  |  1 +
 arch/mips/include/uapi/asm/socket.h   |  1 +
 arch/parisc/include/uapi/asm/socket.h |  1 +
 arch/sparc/include/uapi/asm/socket.h  |  1 +
 include/uapi/asm-generic/socket.h     |  1 +
 include/uapi/linux/uio.h              |  4 ++
 net/core/sock.c                       | 68 +++++++++++++++++++++++++++
 7 files changed, 77 insertions(+)

Comments

Jakub Kicinski Sept. 10, 2024, 2:05 a.m. UTC | #1
On Mon,  9 Sep 2024 05:43:15 +0000 Mina Almasry wrote:
> --- a/include/uapi/linux/uio.h
> +++ b/include/uapi/linux/uio.h
> @@ -33,6 +33,10 @@ struct dmabuf_cmsg {
>  				 */
>  };
>  
> +struct dmabuf_token {
> +	__u32 token_start;
> +	__u32 token_count;
> +};
>  /*

nit: missing new line
Lai, Yi Oct. 10, 2024, 11:16 a.m. UTC | #2
Hi Mina Almasry,

Greetings!

I used Syzkaller and found that there is BUG: soft lockup inqt in linux-next tree next-20241008

After bisection and the first bad commit is:
"
678f6e28b5f6 net: add SO_DEVMEM_DONTNEED setsockopt to release RX frags
"

All detailed into can be found at:
https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt
Syzkaller repro code:
https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/repro.c
Syzkaller repro syscall steps:
https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/repro.prog
Syzkaller report:
https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/repro.report
Kconfig(make olddefconfig):
https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/kconfig_origin
Bisect info:
https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/bisect_info.log
bzImage:
https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/241009_103423_do_sock_setsockopt/bzImage_8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b
Issue dmesg:
https://github.com/laifryiee/syzkaller_logs/blob/main/241009_103423_do_sock_setsockopt/8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b_dmesg.log

"
[   48.825073]  ? __lock_acquire+0x1b0f/0x5c90
[   48.825419]  ? __pfx___lock_acquire+0x10/0x10
[   48.825774]  sock_setsockopt+0x68/0x90
[   48.826117]  do_sock_setsockopt+0x3fb/0x480
[   48.826455]  ? __pfx_do_sock_setsockopt+0x10/0x10
[   48.826829]  ? lock_release+0x441/0x870
[   48.827140]  ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
[   48.827558]  ? fdget+0x188/0x230
[   48.827846]  __sys_setsockopt+0x131/0x200
[   48.828184]  ? __pfx___sys_setsockopt+0x10/0x10
[   48.828551]  ? seqcount_lockdep_reader_access.constprop.0+0xc0/0xd0
[   48.829042]  ? __sanitizer_cov_trace_cmp4+0x1a/0x20
[   48.829425]  ? ktime_get_coarse_real_ts64+0xbf/0xf0
[   48.829817]  __x64_sys_setsockopt+0xc6/0x160
[   48.830160]  ? syscall_trace_enter+0x14a/0x230
[   48.830520]  x64_sys_call+0x6cf/0x20d0
[   48.830825]  do_syscall_64+0x6d/0x140
[   48.831124]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   48.831517] RIP: 0033:0x7f26cdc3ee5d
[   48.831804] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48
[   48.833180] RSP: 002b:00007fff33f36278 EFLAGS: 00000213 ORIG_RAX: 0000000000000036
[   48.833756] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f26cdc3ee5d
[   48.834294] RDX: 0000000000000050 RSI: 0000000000000001 RDI: 0000000000000003
[   48.834830] RBP: 00007fff33f36290 R08: 0000000000000010 R09: 00007fff33f36290
[   48.835368] R10: 0000000020000080 R11: 0000000000000213 R12: 00007fff33f363e8
[   48.835906] R13: 000000000040178f R14: 0000000000403e08 R15: 00007f26cde51000
[   48.836466]  </TASK>
[   48.836648] Kernel panic - not syncing: softlockup: hung tasks
[   48.837096] CPU: 1 UID: 0 PID: 729 Comm: repro Tainted: G             L     6.12.0-rc2-8cf0b93919e1 #1
[   48.837796] Tainted: [L]=SOFTLOCKUP
[   48.838071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[   48.838916] Call Trace:
[   48.839113]  <IRQ>
[   48.839282]  dump_stack_lvl+0x42/0x150
[   48.839584]  dump_stack+0x19/0x20
[   48.839846]  panic+0x703/0x790
[   48.840100]  ? __pfx_panic+0x10/0x10
[   48.840394]  ? watchdog_timer_fn+0x599/0x6b0
[   48.840727]  ? watchdog_timer_fn+0x58c/0x6b0
[   48.841065]  watchdog_timer_fn+0x5aa/0x6b0
[   48.841382]  ? __pfx_watchdog_timer_fn+0x10/0x10
[   48.841743]  __hrtimer_run_queues+0x5d6/0xc30
[   48.842091]  ? __pfx___hrtimer_run_queues+0x10/0x10
[   48.842473]  hrtimer_interrupt+0x324/0x7a0
[   48.842802]  __sysvec_apic_timer_interrupt+0x10b/0x410
[   48.843198]  ? debug_smp_processor_id+0x20/0x30
[   48.843551]  sysvec_apic_timer_interrupt+0xaf/0xd0
[   48.843922]  </IRQ>
[   48.844101]  <TASK>
[   48.844275]  asm_sysvec_apic_timer_interrupt+0x1f/0x30
[   48.844711] RIP: 0010:__sanitizer_cov_trace_pc+0x45/0x70
[   48.845130] Code: a9 00 01 ff 00 74 1d f6 c4 01 74 43 a9 00 00 0f 00 75 3c a9 00 00 f0 00 75 35 8b 82 04 1e 00 00 85 c0 74 2b 8b 82 e0 1d 00 00 <83> f8 02 75 20 48 8b 8a e8 1d 00 00 8b 92 e4 1d 00 00 48 8b 01 48
[   48.846480] RSP: 0018:ffff8880239cf790 EFLAGS: 00000246
[   48.846876] RAX: 0000000000000000 RBX: ffff8880239cf900 RCX: ffffffff8581c19f
[   48.847407] RDX: ffff88801a818000 RSI: ffffffff8581c1d5 RDI: 0000000000000007
[   48.847933] RBP: ffff8880239cf790 R08: 0000000000000001 R09: ffffed1004739f23
[   48.848472] R10: 0000000077cc006e R11: 0000000000000001 R12: 0000000000000000
[   48.849002] R13: 0000000077cc006e R14: ffff8880239cf918 R15: 0000000000000000
[   48.849536]  ? xas_start+0x11f/0x730
[   48.849818]  ? xas_start+0x155/0x730
[   48.850101]  xas_start+0x155/0x730
[   48.850372]  xas_load+0x2f/0x520
[   48.850629]  ? irqentry_exit+0x3e/0xa0
[   48.850922]  ? sysvec_apic_timer_interrupt+0x6a/0xd0
[   48.851304]  xas_store+0x1165/0x1ad0
[   48.851588]  ? __this_cpu_preempt_check+0x21/0x30
[   48.851950]  ? irqentry_exit+0x3e/0xa0
[   48.852254]  __xa_erase+0xc6/0x180
[   48.852524]  ? __pfx___xa_erase+0x10/0x10
[   48.852842]  ? __xa_erase+0xf1/0x180
[   48.853123]  ? sock_devmem_dontneed+0x42c/0x6d0
[   48.853480]  sock_devmem_dontneed+0x3a8/0x6d0
[   48.853829]  ? __pfx_sock_devmem_dontneed+0x10/0x10
[   48.854205]  ? trace_lock_acquire+0x139/0x1b0
[   48.854548]  ? lock_acquire+0x80/0xb0
[   48.854833]  ? __might_fault+0xf1/0x1b0
[   48.855133]  ? __might_fault+0xf1/0x1b0
[   48.855437]  ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30
[   48.855849]  sk_setsockopt+0x480/0x3c60
[   48.856158]  ? __pfx_sk_setsockopt+0x10/0x10
[   48.856491]  ? __kasan_check_read+0x15/0x20
[   48.856814]  ? __lock_acquire+0x1b0f/0x5c90
[   48.857144]  ? __pfx___lock_acquire+0x10/0x10
[   48.857488]  sock_setsockopt+0x68/0x90
[   48.857785]  do_sock_setsockopt+0x3fb/0x480
[   48.858110]  ? __pfx_do_sock_setsockopt+0x10/0x10
[   48.858474]  ? lock_release+0x441/0x870
[   48.858776]  ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
[   48.859184]  ? fdget+0x188/0x230
[   48.859448]  __sys_setsockopt+0x131/0x200
[   48.859764]  ? __pfx___sys_setsockopt+0x10/0x10
[   48.860123]  ? seqcount_lockdep_reader_access.constprop.0+0xc0/0xd0
[   48.860598]  ? __sanitizer_cov_trace_cmp4+0x1a/0x20
[   48.860982]  ? ktime_get_coarse_real_ts64+0xbf/0xf0
[   48.861370]  __x64_sys_setsockopt+0xc6/0x160
[   48.861710]  ? syscall_trace_enter+0x14a/0x230
[   48.862057]  x64_sys_call+0x6cf/0x20d0
[   48.862350]  do_syscall_64+0x6d/0x140
[   48.862639]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[   48.863023] RIP: 0033:0x7f26cdc3ee5d
[   48.863301] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48
[   48.864659] RSP: 002b:00007fff33f36278 EFLAGS: 00000213 ORIG_RAX: 0000000000000036
[   48.865223] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f26cdc3ee5d
"

I hope you find it useful.

Regards,
Yi Lai

---

If you don't need the following environment to reproduce the problem or if you
already have one reproduced environment, please ignore the following information.

How to reproduce:
git clone https://gitlab.com/xupengfe/repro_vm_env.git
cd repro_vm_env
tar -xvf repro_vm_env.tar.gz
cd repro_vm_env; ./start3.sh  // it needs qemu-system-x86_64 and I used v7.1.0
  // start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel
  // You could change the bzImage_xxx as you want
  // Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version
You could use below command to log in, there is no password for root.
ssh -p 10023 root@localhost

After login vm(virtual machine) successfully, you could transfer reproduced
binary to the vm by below way, and reproduce the problem in vm:
gcc -pthread -o repro repro.c
scp -P 10023 repro root@localhost:/root/

Get the bzImage for target kernel:
Please use target kconfig and copy it to kernel_src/.config
make olddefconfig
make -jx bzImage           //x should equal or less than cpu num your pc has

Fill the bzImage file into above start3.sh to load the target kernel in vm.


Tips:
If you already have qemu-system-x86_64, please ignore below info.
If you want to install qemu v7.1.0 version:
git clone https://github.com/qemu/qemu.git
cd qemu
git checkout -f v7.1.0
mkdir build
cd build
yum install -y ninja-build.x86_64
yum -y install libslirp-devel.x86_64
../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp
make
make install 

On Mon, Sep 09, 2024 at 05:43:15AM +0000, Mina Almasry wrote:
> Add an interface for the user to notify the kernel that it is done
> reading the devmem dmabuf frags returned as cmsg. The kernel will
> drop the reference on the frags to make them available for reuse.
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
> Signed-off-by: Mina Almasry <almasrymina@google.com>
> Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
> Reviewed-by: Eric Dumazet <edumazet@google.com>
> 
> ---
> 
> v16:
> - Use sk_is_tcp().
> - Fix unnamed 128 DONTNEED limit (David).
> - Fix kernel allocating for 128 tokens even if the user didn't ask for
>   that much (Eric).
> - Fix number assignement (Arnd).
> 
> v10:
> - Fix leak of tokens (Nikolay).
> 
> v7:
> - Updated SO_DEVMEM_* uapi to use the next available entry (Arnd).
> 
> v6:
> - Squash in locking optimizations from edumazet@google.com. With his
>   changes we lock the xarray once per sock_devmem_dontneed operation
>   rather than once per frag.
> 
> Changes in v1:
> - devmemtoken -> dmabuf_token (David).
> - Use napi_pp_put_page() for refcounting (Yunsheng).
> - Fix build error with missing socket options on other asms.
> 
> ---
>  arch/alpha/include/uapi/asm/socket.h  |  1 +
>  arch/mips/include/uapi/asm/socket.h   |  1 +
>  arch/parisc/include/uapi/asm/socket.h |  1 +
>  arch/sparc/include/uapi/asm/socket.h  |  1 +
>  include/uapi/asm-generic/socket.h     |  1 +
>  include/uapi/linux/uio.h              |  4 ++
>  net/core/sock.c                       | 68 +++++++++++++++++++++++++++
>  7 files changed, 77 insertions(+)
> 
> diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
> index ef4656a41058..251b73c5481e 100644
> --- a/arch/alpha/include/uapi/asm/socket.h
> +++ b/arch/alpha/include/uapi/asm/socket.h
> @@ -144,6 +144,7 @@
>  #define SCM_DEVMEM_LINEAR	SO_DEVMEM_LINEAR
>  #define SO_DEVMEM_DMABUF	79
>  #define SCM_DEVMEM_DMABUF	SO_DEVMEM_DMABUF
> +#define SO_DEVMEM_DONTNEED	80
>  
>  #if !defined(__KERNEL__)
>  
> diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
> index 414807d55e33..8ab7582291ab 100644
> --- a/arch/mips/include/uapi/asm/socket.h
> +++ b/arch/mips/include/uapi/asm/socket.h
> @@ -155,6 +155,7 @@
>  #define SCM_DEVMEM_LINEAR	SO_DEVMEM_LINEAR
>  #define SO_DEVMEM_DMABUF	79
>  #define SCM_DEVMEM_DMABUF	SO_DEVMEM_DMABUF
> +#define SO_DEVMEM_DONTNEED	80
>  
>  #if !defined(__KERNEL__)
>  
> diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
> index 2b817efd4544..38fc0b188e08 100644
> --- a/arch/parisc/include/uapi/asm/socket.h
> +++ b/arch/parisc/include/uapi/asm/socket.h
> @@ -136,6 +136,7 @@
>  #define SCM_DEVMEM_LINEAR	SO_DEVMEM_LINEAR
>  #define SO_DEVMEM_DMABUF	79
>  #define SCM_DEVMEM_DMABUF	SO_DEVMEM_DMABUF
> +#define SO_DEVMEM_DONTNEED	80
>  
>  #if !defined(__KERNEL__)
>  
> diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
> index 00248fc68977..57084ed2f3c4 100644
> --- a/arch/sparc/include/uapi/asm/socket.h
> +++ b/arch/sparc/include/uapi/asm/socket.h
> @@ -137,6 +137,7 @@
>  #define SCM_DEVMEM_LINEAR        SO_DEVMEM_LINEAR
>  #define SO_DEVMEM_DMABUF         0x0058
>  #define SCM_DEVMEM_DMABUF        SO_DEVMEM_DMABUF
> +#define SO_DEVMEM_DONTNEED       0x0059
>  
>  #if !defined(__KERNEL__)
>  
> diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
> index e993edc9c0ee..3b4e3e815602 100644
> --- a/include/uapi/asm-generic/socket.h
> +++ b/include/uapi/asm-generic/socket.h
> @@ -139,6 +139,7 @@
>  #define SCM_DEVMEM_LINEAR	SO_DEVMEM_LINEAR
>  #define SO_DEVMEM_DMABUF	79
>  #define SCM_DEVMEM_DMABUF	SO_DEVMEM_DMABUF
> +#define SO_DEVMEM_DONTNEED	80
>  
>  #if !defined(__KERNEL__)
>  
> diff --git a/include/uapi/linux/uio.h b/include/uapi/linux/uio.h
> index 3a22ddae376a..d17f8fcd93ec 100644
> --- a/include/uapi/linux/uio.h
> +++ b/include/uapi/linux/uio.h
> @@ -33,6 +33,10 @@ struct dmabuf_cmsg {
>  				 */
>  };
>  
> +struct dmabuf_token {
> +	__u32 token_start;
> +	__u32 token_count;
> +};
>  /*
>   *	UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1)
>   */
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 468b1239606c..bbb57b5af0b1 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -124,6 +124,7 @@
>  #include <linux/netdevice.h>
>  #include <net/protocol.h>
>  #include <linux/skbuff.h>
> +#include <linux/skbuff_ref.h>
>  #include <net/net_namespace.h>
>  #include <net/request_sock.h>
>  #include <net/sock.h>
> @@ -1049,6 +1050,69 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
>  	return 0;
>  }
>  
> +#ifdef CONFIG_PAGE_POOL
> +
> +/* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in
> + * 1 syscall. The limit exists to limit the amount of memory the kernel
> + * allocates to copy these tokens.
> + */
> +#define MAX_DONTNEED_TOKENS 128
> +
> +static noinline_for_stack int
> +sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
> +{
> +	unsigned int num_tokens, i, j, k, netmem_num = 0;
> +	struct dmabuf_token *tokens;
> +	netmem_ref netmems[16];
> +	int ret = 0;
> +
> +	if (!sk_is_tcp(sk))
> +		return -EBADF;
> +
> +	if (optlen % sizeof(struct dmabuf_token) ||
> +	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
> +		return -EINVAL;
> +
> +	tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL);
> +	if (!tokens)
> +		return -ENOMEM;
> +
> +	num_tokens = optlen / sizeof(struct dmabuf_token);
> +	if (copy_from_sockptr(tokens, optval, optlen)) {
> +		kvfree(tokens);
> +		return -EFAULT;
> +	}
> +
> +	xa_lock_bh(&sk->sk_user_frags);
> +	for (i = 0; i < num_tokens; i++) {
> +		for (j = 0; j < tokens[i].token_count; j++) {
> +			netmem_ref netmem = (__force netmem_ref)__xa_erase(
> +				&sk->sk_user_frags, tokens[i].token_start + j);
> +
> +			if (netmem &&
> +			    !WARN_ON_ONCE(!netmem_is_net_iov(netmem))) {
> +				netmems[netmem_num++] = netmem;
> +				if (netmem_num == ARRAY_SIZE(netmems)) {
> +					xa_unlock_bh(&sk->sk_user_frags);
> +					for (k = 0; k < netmem_num; k++)
> +						WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
> +					netmem_num = 0;
> +					xa_lock_bh(&sk->sk_user_frags);
> +				}
> +				ret++;
> +			}
> +		}
> +	}
> +
> +	xa_unlock_bh(&sk->sk_user_frags);
> +	for (k = 0; k < netmem_num; k++)
> +		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
> +
> +	kvfree(tokens);
> +	return ret;
> +}
> +#endif
> +
>  void sockopt_lock_sock(struct sock *sk)
>  {
>  	/* When current->bpf_ctx is set, the setsockopt is called from
> @@ -1211,6 +1275,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
>  			ret = -EOPNOTSUPP;
>  		return ret;
>  		}
> +#ifdef CONFIG_PAGE_POOL
> +	case SO_DEVMEM_DONTNEED:
> +		return sock_devmem_dontneed(sk, optval, optlen);
> +#endif
>  	}
>  
>  	sockopt_lock_sock(sk);
> -- 
> 2.46.0.469.g59c65b2a67-goog
>
Mina Almasry Oct. 10, 2024, 7:05 p.m. UTC | #3
On Thu, Oct 10, 2024 at 4:17 AM Lai, Yi <yi1.lai@linux.intel.com> wrote:
>
> Hi Mina Almasry,
>
> Greetings!
>
> I used Syzkaller and found that there is BUG: soft lockup inqt in linux-next tree next-20241008
>
> After bisection and the first bad commit is:
> "
> 678f6e28b5f6 net: add SO_DEVMEM_DONTNEED setsockopt to release RX frags
> "
>
> All detailed into can be found at:
> https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt
> Syzkaller repro code:
> https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/repro.c
> Syzkaller repro syscall steps:
> https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/repro.prog
> Syzkaller report:
> https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/repro.report
> Kconfig(make olddefconfig):
> https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/kconfig_origin
> Bisect info:
> https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/bisect_info.log
> bzImage:
> https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/241009_103423_do_sock_setsockopt/bzImage_8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b
> Issue dmesg:
> https://github.com/laifryiee/syzkaller_logs/blob/main/241009_103423_do_sock_setsockopt/8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b_dmesg.log
>
> "
> [   48.825073]  ? __lock_acquire+0x1b0f/0x5c90
> [   48.825419]  ? __pfx___lock_acquire+0x10/0x10
> [   48.825774]  sock_setsockopt+0x68/0x90
> [   48.826117]  do_sock_setsockopt+0x3fb/0x480
> [   48.826455]  ? __pfx_do_sock_setsockopt+0x10/0x10
> [   48.826829]  ? lock_release+0x441/0x870
> [   48.827140]  ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
> [   48.827558]  ? fdget+0x188/0x230
> [   48.827846]  __sys_setsockopt+0x131/0x200
> [   48.828184]  ? __pfx___sys_setsockopt+0x10/0x10
> [   48.828551]  ? seqcount_lockdep_reader_access.constprop.0+0xc0/0xd0
> [   48.829042]  ? __sanitizer_cov_trace_cmp4+0x1a/0x20
> [   48.829425]  ? ktime_get_coarse_real_ts64+0xbf/0xf0
> [   48.829817]  __x64_sys_setsockopt+0xc6/0x160
> [   48.830160]  ? syscall_trace_enter+0x14a/0x230
> [   48.830520]  x64_sys_call+0x6cf/0x20d0
> [   48.830825]  do_syscall_64+0x6d/0x140
> [   48.831124]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [   48.831517] RIP: 0033:0x7f26cdc3ee5d
> [   48.831804] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48
> [   48.833180] RSP: 002b:00007fff33f36278 EFLAGS: 00000213 ORIG_RAX: 0000000000000036
> [   48.833756] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f26cdc3ee5d
> [   48.834294] RDX: 0000000000000050 RSI: 0000000000000001 RDI: 0000000000000003
> [   48.834830] RBP: 00007fff33f36290 R08: 0000000000000010 R09: 00007fff33f36290
> [   48.835368] R10: 0000000020000080 R11: 0000000000000213 R12: 00007fff33f363e8
> [   48.835906] R13: 000000000040178f R14: 0000000000403e08 R15: 00007f26cde51000
> [   48.836466]  </TASK>
> [   48.836648] Kernel panic - not syncing: softlockup: hung tasks
> [   48.837096] CPU: 1 UID: 0 PID: 729 Comm: repro Tainted: G             L     6.12.0-rc2-8cf0b93919e1 #1
> [   48.837796] Tainted: [L]=SOFTLOCKUP
> [   48.838071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
> [   48.838916] Call Trace:
> [   48.839113]  <IRQ>
> [   48.839282]  dump_stack_lvl+0x42/0x150
> [   48.839584]  dump_stack+0x19/0x20
> [   48.839846]  panic+0x703/0x790
> [   48.840100]  ? __pfx_panic+0x10/0x10
> [   48.840394]  ? watchdog_timer_fn+0x599/0x6b0
> [   48.840727]  ? watchdog_timer_fn+0x58c/0x6b0
> [   48.841065]  watchdog_timer_fn+0x5aa/0x6b0
> [   48.841382]  ? __pfx_watchdog_timer_fn+0x10/0x10
> [   48.841743]  __hrtimer_run_queues+0x5d6/0xc30
> [   48.842091]  ? __pfx___hrtimer_run_queues+0x10/0x10
> [   48.842473]  hrtimer_interrupt+0x324/0x7a0
> [   48.842802]  __sysvec_apic_timer_interrupt+0x10b/0x410
> [   48.843198]  ? debug_smp_processor_id+0x20/0x30
> [   48.843551]  sysvec_apic_timer_interrupt+0xaf/0xd0
> [   48.843922]  </IRQ>
> [   48.844101]  <TASK>
> [   48.844275]  asm_sysvec_apic_timer_interrupt+0x1f/0x30
> [   48.844711] RIP: 0010:__sanitizer_cov_trace_pc+0x45/0x70
> [   48.845130] Code: a9 00 01 ff 00 74 1d f6 c4 01 74 43 a9 00 00 0f 00 75 3c a9 00 00 f0 00 75 35 8b 82 04 1e 00 00 85 c0 74 2b 8b 82 e0 1d 00 00 <83> f8 02 75 20 48 8b 8a e8 1d 00 00 8b 92 e4 1d 00 00 48 8b 01 48
> [   48.846480] RSP: 0018:ffff8880239cf790 EFLAGS: 00000246
> [   48.846876] RAX: 0000000000000000 RBX: ffff8880239cf900 RCX: ffffffff8581c19f
> [   48.847407] RDX: ffff88801a818000 RSI: ffffffff8581c1d5 RDI: 0000000000000007
> [   48.847933] RBP: ffff8880239cf790 R08: 0000000000000001 R09: ffffed1004739f23
> [   48.848472] R10: 0000000077cc006e R11: 0000000000000001 R12: 0000000000000000
> [   48.849002] R13: 0000000077cc006e R14: ffff8880239cf918 R15: 0000000000000000
> [   48.849536]  ? xas_start+0x11f/0x730
> [   48.849818]  ? xas_start+0x155/0x730
> [   48.850101]  xas_start+0x155/0x730
> [   48.850372]  xas_load+0x2f/0x520
> [   48.850629]  ? irqentry_exit+0x3e/0xa0
> [   48.850922]  ? sysvec_apic_timer_interrupt+0x6a/0xd0
> [   48.851304]  xas_store+0x1165/0x1ad0
> [   48.851588]  ? __this_cpu_preempt_check+0x21/0x30
> [   48.851950]  ? irqentry_exit+0x3e/0xa0
> [   48.852254]  __xa_erase+0xc6/0x180
> [   48.852524]  ? __pfx___xa_erase+0x10/0x10
> [   48.852842]  ? __xa_erase+0xf1/0x180
> [   48.853123]  ? sock_devmem_dontneed+0x42c/0x6d0
> [   48.853480]  sock_devmem_dontneed+0x3a8/0x6d0
> [   48.853829]  ? __pfx_sock_devmem_dontneed+0x10/0x10
> [   48.854205]  ? trace_lock_acquire+0x139/0x1b0
> [   48.854548]  ? lock_acquire+0x80/0xb0
> [   48.854833]  ? __might_fault+0xf1/0x1b0
> [   48.855133]  ? __might_fault+0xf1/0x1b0
> [   48.855437]  ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30
> [   48.855849]  sk_setsockopt+0x480/0x3c60
> [   48.856158]  ? __pfx_sk_setsockopt+0x10/0x10
> [   48.856491]  ? __kasan_check_read+0x15/0x20
> [   48.856814]  ? __lock_acquire+0x1b0f/0x5c90
> [   48.857144]  ? __pfx___lock_acquire+0x10/0x10
> [   48.857488]  sock_setsockopt+0x68/0x90
> [   48.857785]  do_sock_setsockopt+0x3fb/0x480
> [   48.858110]  ? __pfx_do_sock_setsockopt+0x10/0x10
> [   48.858474]  ? lock_release+0x441/0x870
> [   48.858776]  ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
> [   48.859184]  ? fdget+0x188/0x230
> [   48.859448]  __sys_setsockopt+0x131/0x200
> [   48.859764]  ? __pfx___sys_setsockopt+0x10/0x10
> [   48.860123]  ? seqcount_lockdep_reader_access.constprop.0+0xc0/0xd0
> [   48.860598]  ? __sanitizer_cov_trace_cmp4+0x1a/0x20
> [   48.860982]  ? ktime_get_coarse_real_ts64+0xbf/0xf0
> [   48.861370]  __x64_sys_setsockopt+0xc6/0x160
> [   48.861710]  ? syscall_trace_enter+0x14a/0x230
> [   48.862057]  x64_sys_call+0x6cf/0x20d0
> [   48.862350]  do_syscall_64+0x6d/0x140
> [   48.862639]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [   48.863023] RIP: 0033:0x7f26cdc3ee5d
> [   48.863301] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48
> [   48.864659] RSP: 002b:00007fff33f36278 EFLAGS: 00000213 ORIG_RAX: 0000000000000036
> [   48.865223] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f26cdc3ee5d
> "
>
> I hope you find it useful.

Thank you for the report. I think I see the issue and I commented on
the fix in the code below.

Only issue is that this is unlucky timing for me. I have a flight
tomorrow for a vacation where I think I may have internet access and
may not. I will try to follow up here, but in case I can't, what's the
urgency for this issue? Can this wait 2 weeks when I get back?

> > +     if (optlen % sizeof(struct dmabuf_token) ||
> > +         optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
> > +             return -EINVAL;
> > +
> > +     tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL);
> > +     if (!tokens)
> > +             return -ENOMEM;
> > +

There is an unrelated bug here. The first argument for kvmalloc_array
is the number of elements, I think, not the number of bytes. So this
should be:

num_tokens = optlen / sizeof(struct dmabuf_token);
tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
if (!tokens)
   return -ENOMEM;

> > +
> > +     if (copy_from_sockptr(tokens, optval, optlen)) {
> > +             kvfree(tokens);
> > +             return -EFAULT;
> > +     }
> > +
> > +     xa_lock_bh(&sk->sk_user_frags);
> > +     for (i = 0; i < num_tokens; i++) {
> > +             for (j = 0; j < tokens[i].token_count; j++) {

The bug should be here. tokens[i].token_count is a u32 provided by the
user. The user can specify U32_MAX here, which will make the loop
below spin for a very long time with the lock held, which should be
the cause of the soft lockup.

We should add a check that token_count is < MAX_DONTNEED_TOKENS or
something like that, above this line.

Please let me know of urgency. If this can't wait I'll try very hard
to repro the issue/fix while I'm out. Untested fix I'm going to try
out:

diff --git a/net/core/sock.c b/net/core/sock.c
index 083d438d8b6f..cb3d8b19de14 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1071,11 +1071,11 @@ sock_devmem_dontneed(struct sock *sk,
sockptr_t optval, unsigned int optlen)
            optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
                return -EINVAL;

-       tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL);
+       num_tokens = optlen / sizeof(struct dmabuf_token);
+       tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
        if (!tokens)
                return -ENOMEM;

-       num_tokens = optlen / sizeof(struct dmabuf_token);
        if (copy_from_sockptr(tokens, optval, optlen)) {
                kvfree(tokens);
                return -EFAULT;
@@ -1083,6 +1083,10 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t
optval, unsigned int optlen)

        xa_lock_bh(&sk->sk_user_frags);
        for (i = 0; i < num_tokens; i++) {
+
+               if (tokens[i].token_count > MAX_DONTNEED_TOKENS)
+                       continue;
+
                for (j = 0; j < tokens[i].token_count; j++) {
                        netmem_ref netmem = (__force netmem_ref)__xa_erase(
                                &sk->sk_user_frags, tokens[i].token_start + j);
Lai, Yi Oct. 11, 2024, 2:53 a.m. UTC | #4
On Thu, Oct 10, 2024 at 12:05:38PM -0700, Mina Almasry wrote:
> On Thu, Oct 10, 2024 at 4:17 AM Lai, Yi <yi1.lai@linux.intel.com> wrote:
> >
> > Hi Mina Almasry,
> >
> > Greetings!
> >
> > I used Syzkaller and found that there is BUG: soft lockup inqt in linux-next tree next-20241008
> >
> > After bisection and the first bad commit is:
> > "
> > 678f6e28b5f6 net: add SO_DEVMEM_DONTNEED setsockopt to release RX frags
> > "
> >
> > All detailed into can be found at:
> > https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt
> > Syzkaller repro code:
> > https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/repro.c
> > Syzkaller repro syscall steps:
> > https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/repro.prog
> > Syzkaller report:
> > https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/repro.report
> > Kconfig(make olddefconfig):
> > https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/kconfig_origin
> > Bisect info:
> > https://github.com/laifryiee/syzkaller_logs/tree/main/241009_103423_do_sock_setsockopt/bisect_info.log
> > bzImage:
> > https://github.com/laifryiee/syzkaller_logs/raw/refs/heads/main/241009_103423_do_sock_setsockopt/bzImage_8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b
> > Issue dmesg:
> > https://github.com/laifryiee/syzkaller_logs/blob/main/241009_103423_do_sock_setsockopt/8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b_dmesg.log
> >
> > "
> > [   48.825073]  ? __lock_acquire+0x1b0f/0x5c90
> > [   48.825419]  ? __pfx___lock_acquire+0x10/0x10
> > [   48.825774]  sock_setsockopt+0x68/0x90
> > [   48.826117]  do_sock_setsockopt+0x3fb/0x480
> > [   48.826455]  ? __pfx_do_sock_setsockopt+0x10/0x10
> > [   48.826829]  ? lock_release+0x441/0x870
> > [   48.827140]  ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
> > [   48.827558]  ? fdget+0x188/0x230
> > [   48.827846]  __sys_setsockopt+0x131/0x200
> > [   48.828184]  ? __pfx___sys_setsockopt+0x10/0x10
> > [   48.828551]  ? seqcount_lockdep_reader_access.constprop.0+0xc0/0xd0
> > [   48.829042]  ? __sanitizer_cov_trace_cmp4+0x1a/0x20
> > [   48.829425]  ? ktime_get_coarse_real_ts64+0xbf/0xf0
> > [   48.829817]  __x64_sys_setsockopt+0xc6/0x160
> > [   48.830160]  ? syscall_trace_enter+0x14a/0x230
> > [   48.830520]  x64_sys_call+0x6cf/0x20d0
> > [   48.830825]  do_syscall_64+0x6d/0x140
> > [   48.831124]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > [   48.831517] RIP: 0033:0x7f26cdc3ee5d
> > [   48.831804] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48
> > [   48.833180] RSP: 002b:00007fff33f36278 EFLAGS: 00000213 ORIG_RAX: 0000000000000036
> > [   48.833756] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f26cdc3ee5d
> > [   48.834294] RDX: 0000000000000050 RSI: 0000000000000001 RDI: 0000000000000003
> > [   48.834830] RBP: 00007fff33f36290 R08: 0000000000000010 R09: 00007fff33f36290
> > [   48.835368] R10: 0000000020000080 R11: 0000000000000213 R12: 00007fff33f363e8
> > [   48.835906] R13: 000000000040178f R14: 0000000000403e08 R15: 00007f26cde51000
> > [   48.836466]  </TASK>
> > [   48.836648] Kernel panic - not syncing: softlockup: hung tasks
> > [   48.837096] CPU: 1 UID: 0 PID: 729 Comm: repro Tainted: G             L     6.12.0-rc2-8cf0b93919e1 #1
> > [   48.837796] Tainted: [L]=SOFTLOCKUP
> > [   48.838071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
> > [   48.838916] Call Trace:
> > [   48.839113]  <IRQ>
> > [   48.839282]  dump_stack_lvl+0x42/0x150
> > [   48.839584]  dump_stack+0x19/0x20
> > [   48.839846]  panic+0x703/0x790
> > [   48.840100]  ? __pfx_panic+0x10/0x10
> > [   48.840394]  ? watchdog_timer_fn+0x599/0x6b0
> > [   48.840727]  ? watchdog_timer_fn+0x58c/0x6b0
> > [   48.841065]  watchdog_timer_fn+0x5aa/0x6b0
> > [   48.841382]  ? __pfx_watchdog_timer_fn+0x10/0x10
> > [   48.841743]  __hrtimer_run_queues+0x5d6/0xc30
> > [   48.842091]  ? __pfx___hrtimer_run_queues+0x10/0x10
> > [   48.842473]  hrtimer_interrupt+0x324/0x7a0
> > [   48.842802]  __sysvec_apic_timer_interrupt+0x10b/0x410
> > [   48.843198]  ? debug_smp_processor_id+0x20/0x30
> > [   48.843551]  sysvec_apic_timer_interrupt+0xaf/0xd0
> > [   48.843922]  </IRQ>
> > [   48.844101]  <TASK>
> > [   48.844275]  asm_sysvec_apic_timer_interrupt+0x1f/0x30
> > [   48.844711] RIP: 0010:__sanitizer_cov_trace_pc+0x45/0x70
> > [   48.845130] Code: a9 00 01 ff 00 74 1d f6 c4 01 74 43 a9 00 00 0f 00 75 3c a9 00 00 f0 00 75 35 8b 82 04 1e 00 00 85 c0 74 2b 8b 82 e0 1d 00 00 <83> f8 02 75 20 48 8b 8a e8 1d 00 00 8b 92 e4 1d 00 00 48 8b 01 48
> > [   48.846480] RSP: 0018:ffff8880239cf790 EFLAGS: 00000246
> > [   48.846876] RAX: 0000000000000000 RBX: ffff8880239cf900 RCX: ffffffff8581c19f
> > [   48.847407] RDX: ffff88801a818000 RSI: ffffffff8581c1d5 RDI: 0000000000000007
> > [   48.847933] RBP: ffff8880239cf790 R08: 0000000000000001 R09: ffffed1004739f23
> > [   48.848472] R10: 0000000077cc006e R11: 0000000000000001 R12: 0000000000000000
> > [   48.849002] R13: 0000000077cc006e R14: ffff8880239cf918 R15: 0000000000000000
> > [   48.849536]  ? xas_start+0x11f/0x730
> > [   48.849818]  ? xas_start+0x155/0x730
> > [   48.850101]  xas_start+0x155/0x730
> > [   48.850372]  xas_load+0x2f/0x520
> > [   48.850629]  ? irqentry_exit+0x3e/0xa0
> > [   48.850922]  ? sysvec_apic_timer_interrupt+0x6a/0xd0
> > [   48.851304]  xas_store+0x1165/0x1ad0
> > [   48.851588]  ? __this_cpu_preempt_check+0x21/0x30
> > [   48.851950]  ? irqentry_exit+0x3e/0xa0
> > [   48.852254]  __xa_erase+0xc6/0x180
> > [   48.852524]  ? __pfx___xa_erase+0x10/0x10
> > [   48.852842]  ? __xa_erase+0xf1/0x180
> > [   48.853123]  ? sock_devmem_dontneed+0x42c/0x6d0
> > [   48.853480]  sock_devmem_dontneed+0x3a8/0x6d0
> > [   48.853829]  ? __pfx_sock_devmem_dontneed+0x10/0x10
> > [   48.854205]  ? trace_lock_acquire+0x139/0x1b0
> > [   48.854548]  ? lock_acquire+0x80/0xb0
> > [   48.854833]  ? __might_fault+0xf1/0x1b0
> > [   48.855133]  ? __might_fault+0xf1/0x1b0
> > [   48.855437]  ? __sanitizer_cov_trace_const_cmp8+0x1c/0x30
> > [   48.855849]  sk_setsockopt+0x480/0x3c60
> > [   48.856158]  ? __pfx_sk_setsockopt+0x10/0x10
> > [   48.856491]  ? __kasan_check_read+0x15/0x20
> > [   48.856814]  ? __lock_acquire+0x1b0f/0x5c90
> > [   48.857144]  ? __pfx___lock_acquire+0x10/0x10
> > [   48.857488]  sock_setsockopt+0x68/0x90
> > [   48.857785]  do_sock_setsockopt+0x3fb/0x480
> > [   48.858110]  ? __pfx_do_sock_setsockopt+0x10/0x10
> > [   48.858474]  ? lock_release+0x441/0x870
> > [   48.858776]  ? __sanitizer_cov_trace_const_cmp4+0x1a/0x20
> > [   48.859184]  ? fdget+0x188/0x230
> > [   48.859448]  __sys_setsockopt+0x131/0x200
> > [   48.859764]  ? __pfx___sys_setsockopt+0x10/0x10
> > [   48.860123]  ? seqcount_lockdep_reader_access.constprop.0+0xc0/0xd0
> > [   48.860598]  ? __sanitizer_cov_trace_cmp4+0x1a/0x20
> > [   48.860982]  ? ktime_get_coarse_real_ts64+0xbf/0xf0
> > [   48.861370]  __x64_sys_setsockopt+0xc6/0x160
> > [   48.861710]  ? syscall_trace_enter+0x14a/0x230
> > [   48.862057]  x64_sys_call+0x6cf/0x20d0
> > [   48.862350]  do_syscall_64+0x6d/0x140
> > [   48.862639]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > [   48.863023] RIP: 0033:0x7f26cdc3ee5d
> > [   48.863301] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48
> > [   48.864659] RSP: 002b:00007fff33f36278 EFLAGS: 00000213 ORIG_RAX: 0000000000000036
> > [   48.865223] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f26cdc3ee5d
> > "
> >
> > I hope you find it useful.
> 
> Thank you for the report. I think I see the issue and I commented on
> the fix in the code below.
> 
> Only issue is that this is unlucky timing for me. I have a flight
> tomorrow for a vacation where I think I may have internet access and
> may not. I will try to follow up here, but in case I can't, what's the
> urgency for this issue? Can this wait 2 weeks when I get back?
> 
> > > +     if (optlen % sizeof(struct dmabuf_token) ||
> > > +         optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
> > > +             return -EINVAL;
> > > +
> > > +     tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL);
> > > +     if (!tokens)
> > > +             return -ENOMEM;
> > > +
> 
> There is an unrelated bug here. The first argument for kvmalloc_array
> is the number of elements, I think, not the number of bytes. So this
> should be:
> 
> num_tokens = optlen / sizeof(struct dmabuf_token);
> tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
> if (!tokens)
>    return -ENOMEM;
> 
> > > +
> > > +     if (copy_from_sockptr(tokens, optval, optlen)) {
> > > +             kvfree(tokens);
> > > +             return -EFAULT;
> > > +     }
> > > +
> > > +     xa_lock_bh(&sk->sk_user_frags);
> > > +     for (i = 0; i < num_tokens; i++) {
> > > +             for (j = 0; j < tokens[i].token_count; j++) {
> 
> The bug should be here. tokens[i].token_count is a u32 provided by the
> user. The user can specify U32_MAX here, which will make the loop
> below spin for a very long time with the lock held, which should be
> the cause of the soft lockup.
> 
> We should add a check that token_count is < MAX_DONTNEED_TOKENS or
> something like that, above this line.
> 
> Please let me know of urgency. If this can't wait I'll try very hard
> to repro the issue/fix while I'm out. Untested fix I'm going to try
> out:
>

Hi Mina

This bug can wait as you mentioned two weeks. Just let you know,
after applied your proposed fix, the issue cannot be reproduced using
the same repro binary.

If you have a formal fix patch later, I would be happy to test it again.
Just let me know.

Regards,
Yi Lai

> diff --git a/net/core/sock.c b/net/core/sock.c
> index 083d438d8b6f..cb3d8b19de14 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1071,11 +1071,11 @@ sock_devmem_dontneed(struct sock *sk,
> sockptr_t optval, unsigned int optlen)
>             optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
>                 return -EINVAL;
> 
> -       tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL);
> +       num_tokens = optlen / sizeof(struct dmabuf_token);
> +       tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
>         if (!tokens)
>                 return -ENOMEM;
> 
> -       num_tokens = optlen / sizeof(struct dmabuf_token);
>         if (copy_from_sockptr(tokens, optval, optlen)) {
>                 kvfree(tokens);
>                 return -EFAULT;
> @@ -1083,6 +1083,10 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t
> optval, unsigned int optlen)
> 
>         xa_lock_bh(&sk->sk_user_frags);
>         for (i = 0; i < num_tokens; i++) {
> +
> +               if (tokens[i].token_count > MAX_DONTNEED_TOKENS)
> +                       continue;
> +
>                 for (j = 0; j < tokens[i].token_count; j++) {
>                         netmem_ref netmem = (__force netmem_ref)__xa_erase(
>                                 &sk->sk_user_frags, tokens[i].token_start + j);
> 
> -- 
> Thanks,
> Mina
Jakub Kicinski Oct. 11, 2024, 3:27 p.m. UTC | #5
On Thu, 10 Oct 2024 12:05:38 -0700 Mina Almasry wrote:
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 083d438d8b6f..cb3d8b19de14 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1071,11 +1071,11 @@ sock_devmem_dontneed(struct sock *sk,
> sockptr_t optval, unsigned int optlen)
>             optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
>                 return -EINVAL;
> 
> -       tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL);
> +       num_tokens = optlen / sizeof(struct dmabuf_token);
> +       tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
>         if (!tokens)
>                 return -ENOMEM;
> 
> -       num_tokens = optlen / sizeof(struct dmabuf_token);
>         if (copy_from_sockptr(tokens, optval, optlen)) {
>                 kvfree(tokens);
>                 return -EFAULT;
> @@ -1083,6 +1083,10 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t
> optval, unsigned int optlen)
> 
>         xa_lock_bh(&sk->sk_user_frags);
>         for (i = 0; i < num_tokens; i++) {
> +
> +               if (tokens[i].token_count > MAX_DONTNEED_TOKENS)
> +                       continue;

For the real fix let's scan the tokens before we take the xa lock
and return an error rather than silently skipping?

>                 for (j = 0; j < tokens[i].token_count; j++) {
Mina Almasry Oct. 11, 2024, 7:38 p.m. UTC | #6
On Fri, Oct 11, 2024 at 8:27 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Thu, 10 Oct 2024 12:05:38 -0700 Mina Almasry wrote:
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index 083d438d8b6f..cb3d8b19de14 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -1071,11 +1071,11 @@ sock_devmem_dontneed(struct sock *sk,
> > sockptr_t optval, unsigned int optlen)
> >             optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
> >                 return -EINVAL;
> >
> > -       tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL);
> > +       num_tokens = optlen / sizeof(struct dmabuf_token);
> > +       tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
> >         if (!tokens)
> >                 return -ENOMEM;
> >
> > -       num_tokens = optlen / sizeof(struct dmabuf_token);
> >         if (copy_from_sockptr(tokens, optval, optlen)) {
> >                 kvfree(tokens);
> >                 return -EFAULT;
> > @@ -1083,6 +1083,10 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t
> > optval, unsigned int optlen)
> >
> >         xa_lock_bh(&sk->sk_user_frags);
> >         for (i = 0; i < num_tokens; i++) {
> > +
> > +               if (tokens[i].token_count > MAX_DONTNEED_TOKENS)
> > +                       continue;
>
> For the real fix let's scan the tokens before we take the xa lock
> and return an error rather than silently skipping?
>
> >                 for (j = 0; j < tokens[i].token_count; j++) {
>

Yes, sorry, I called the diff above an 'untested fix' but it was more
of a hack to see if I got the root cause right. For a proper fix, we
should do exactly that. Scan and see how many tokens the user is
asking us to free ahead of time, then exit early if it's too much
before we acquire locks and loop. Will do!
diff mbox series

Patch

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index ef4656a41058..251b73c5481e 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -144,6 +144,7 @@ 
 #define SCM_DEVMEM_LINEAR	SO_DEVMEM_LINEAR
 #define SO_DEVMEM_DMABUF	79
 #define SCM_DEVMEM_DMABUF	SO_DEVMEM_DMABUF
+#define SO_DEVMEM_DONTNEED	80
 
 #if !defined(__KERNEL__)
 
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 414807d55e33..8ab7582291ab 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -155,6 +155,7 @@ 
 #define SCM_DEVMEM_LINEAR	SO_DEVMEM_LINEAR
 #define SO_DEVMEM_DMABUF	79
 #define SCM_DEVMEM_DMABUF	SO_DEVMEM_DMABUF
+#define SO_DEVMEM_DONTNEED	80
 
 #if !defined(__KERNEL__)
 
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 2b817efd4544..38fc0b188e08 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -136,6 +136,7 @@ 
 #define SCM_DEVMEM_LINEAR	SO_DEVMEM_LINEAR
 #define SO_DEVMEM_DMABUF	79
 #define SCM_DEVMEM_DMABUF	SO_DEVMEM_DMABUF
+#define SO_DEVMEM_DONTNEED	80
 
 #if !defined(__KERNEL__)
 
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 00248fc68977..57084ed2f3c4 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -137,6 +137,7 @@ 
 #define SCM_DEVMEM_LINEAR        SO_DEVMEM_LINEAR
 #define SO_DEVMEM_DMABUF         0x0058
 #define SCM_DEVMEM_DMABUF        SO_DEVMEM_DMABUF
+#define SO_DEVMEM_DONTNEED       0x0059
 
 #if !defined(__KERNEL__)
 
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index e993edc9c0ee..3b4e3e815602 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -139,6 +139,7 @@ 
 #define SCM_DEVMEM_LINEAR	SO_DEVMEM_LINEAR
 #define SO_DEVMEM_DMABUF	79
 #define SCM_DEVMEM_DMABUF	SO_DEVMEM_DMABUF
+#define SO_DEVMEM_DONTNEED	80
 
 #if !defined(__KERNEL__)
 
diff --git a/include/uapi/linux/uio.h b/include/uapi/linux/uio.h
index 3a22ddae376a..d17f8fcd93ec 100644
--- a/include/uapi/linux/uio.h
+++ b/include/uapi/linux/uio.h
@@ -33,6 +33,10 @@  struct dmabuf_cmsg {
 				 */
 };
 
+struct dmabuf_token {
+	__u32 token_start;
+	__u32 token_count;
+};
 /*
  *	UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1)
  */
diff --git a/net/core/sock.c b/net/core/sock.c
index 468b1239606c..bbb57b5af0b1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -124,6 +124,7 @@ 
 #include <linux/netdevice.h>
 #include <net/protocol.h>
 #include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
 #include <net/net_namespace.h>
 #include <net/request_sock.h>
 #include <net/sock.h>
@@ -1049,6 +1050,69 @@  static int sock_reserve_memory(struct sock *sk, int bytes)
 	return 0;
 }
 
+#ifdef CONFIG_PAGE_POOL
+
+/* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in
+ * 1 syscall. The limit exists to limit the amount of memory the kernel
+ * allocates to copy these tokens.
+ */
+#define MAX_DONTNEED_TOKENS 128
+
+static noinline_for_stack int
+sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+	unsigned int num_tokens, i, j, k, netmem_num = 0;
+	struct dmabuf_token *tokens;
+	netmem_ref netmems[16];
+	int ret = 0;
+
+	if (!sk_is_tcp(sk))
+		return -EBADF;
+
+	if (optlen % sizeof(struct dmabuf_token) ||
+	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
+		return -EINVAL;
+
+	tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL);
+	if (!tokens)
+		return -ENOMEM;
+
+	num_tokens = optlen / sizeof(struct dmabuf_token);
+	if (copy_from_sockptr(tokens, optval, optlen)) {
+		kvfree(tokens);
+		return -EFAULT;
+	}
+
+	xa_lock_bh(&sk->sk_user_frags);
+	for (i = 0; i < num_tokens; i++) {
+		for (j = 0; j < tokens[i].token_count; j++) {
+			netmem_ref netmem = (__force netmem_ref)__xa_erase(
+				&sk->sk_user_frags, tokens[i].token_start + j);
+
+			if (netmem &&
+			    !WARN_ON_ONCE(!netmem_is_net_iov(netmem))) {
+				netmems[netmem_num++] = netmem;
+				if (netmem_num == ARRAY_SIZE(netmems)) {
+					xa_unlock_bh(&sk->sk_user_frags);
+					for (k = 0; k < netmem_num; k++)
+						WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
+					netmem_num = 0;
+					xa_lock_bh(&sk->sk_user_frags);
+				}
+				ret++;
+			}
+		}
+	}
+
+	xa_unlock_bh(&sk->sk_user_frags);
+	for (k = 0; k < netmem_num; k++)
+		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
+
+	kvfree(tokens);
+	return ret;
+}
+#endif
+
 void sockopt_lock_sock(struct sock *sk)
 {
 	/* When current->bpf_ctx is set, the setsockopt is called from
@@ -1211,6 +1275,10 @@  int sk_setsockopt(struct sock *sk, int level, int optname,
 			ret = -EOPNOTSUPP;
 		return ret;
 		}
+#ifdef CONFIG_PAGE_POOL
+	case SO_DEVMEM_DONTNEED:
+		return sock_devmem_dontneed(sk, optval, optlen);
+#endif
 	}
 
 	sockopt_lock_sock(sk);