
[bpf-next,v2,2/5] bpf: implement sleepable uprobes by chaining tasks_trace and normal rcu

Message ID 588dd77e9e7424e0abc0e0e624524ef8a2c7b847.1651532419.git.delyank@fb.com (mailing list archive)
State Superseded
Delegated to: BPF
Series sleepable uprobe support

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 3209 this patch: 3210
netdev/cc_maintainers warning 8 maintainers not CCed: rostedt@goodmis.org songliubraving@fb.com netdev@vger.kernel.org kafai@fb.com yhs@fb.com mingo@redhat.com john.fastabend@gmail.com kpsingh@kernel.org
netdev/build_clang success Errors and warnings before: 439 this patch: 439
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 3269 this patch: 3270
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 158 lines checked
netdev/kdoc fail Errors and warnings before: 0 this patch: 4
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for Kernel LATEST on ubuntu-latest + selftests
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Kernel LATEST on z15 + selftests

Commit Message

Delyan Kratunov May 2, 2022, 11:09 p.m. UTC
uprobes work by raising a trap, setting a task flag from within the
interrupt handler, and processing the actual work for the uprobe on the
way back to userspace. As a result, uprobe handlers already execute in a
user context. The primary obstacle to sleepable bpf uprobe programs is
therefore on the bpf side.

Namely, the bpf_prog_array attached to the uprobe is protected by normal
rcu and runs with preemption disabled. In order for uprobe bpf programs
to become actually sleepable, we need the array to be protected by the
tasks_trace rcu flavor instead (and kfree() called after a corresponding grace period).

Based on Alexei's proposal, we change the free path for bpf_prog_array to
chain a tasks_trace and normal grace periods one after the other.

Users who iterate under a tasks_trace read section are safe, as are users
who iterate under normal rcu read sections (from non-sleepable locations).
The downside is that we incur the tasks_trace grace-period latency for all
perf_event-attached bpf programs (not just uprobe ones), but this is deemed
acceptable given the possible attach rates for kprobe/uprobe/tp programs.

Separately, non-sleepable programs need access to dynamically sized
rcu-protected maps, so we conditionally disable preemption and take a normal
rcu read section around each non-sleepable program, in addition to the
overarching tasks_trace section.
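
To make this concrete, here is the resulting free path and run-time locking,
condensed from the diff below:

        /* Free path: wait for a tasks_trace rcu grace period, then (via
         * kfree_rcu) a normal rcu grace period, before the array is freed. */
        static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
        {
                struct bpf_prog_array *progs;

                progs = container_of(rcu, struct bpf_prog_array, rcu);
                kfree_rcu(progs, rcu);
        }

        void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
        {
                if (!progs || progs == &bpf_empty_prog_array.hdr)
                        return;
                call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
        }

        /* Run path: iterate under tasks_trace rcu; wrap each non-sleepable
         * program in a normal rcu read section with preemption disabled. */
        rcu_read_lock_trace();
        ...
        if (!prog->aux->sleepable) {
                preempt_disable();
                rcu_read_lock();
        }
        ret &= run_prog(prog, ctx);
        if (!prog->aux->sleepable) {
                rcu_read_unlock();
                preempt_enable();
        }
        ...
        rcu_read_unlock_trace();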

Signed-off-by: Delyan Kratunov <delyank@fb.com>
---
 include/linux/bpf.h          | 57 ++++++++++++++++++++++++++++++++++++
 include/linux/trace_events.h |  1 +
 kernel/bpf/core.c            | 15 ++++++++++
 kernel/trace/bpf_trace.c     | 27 +++++++++++++++--
 kernel/trace/trace_uprobe.c  |  4 +--
 5 files changed, 99 insertions(+), 5 deletions(-)

--
2.35.1

Comments

kernel test robot May 3, 2022, 6:30 a.m. UTC | #1
Hi Delyan,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Delyan-Kratunov/sleepable-uprobe-support/20220503-071247
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: i386-defconfig (https://download.01.org/0day-ci/archive/20220503/202205031441.1fhDuUQK-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.2.0-20) 11.2.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/cfa0f114829902b579da16d7520a39317905c502
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Delyan-Kratunov/sleepable-uprobe-support/20220503-071247
        git checkout cfa0f114829902b579da16d7520a39317905c502
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   kernel/trace/trace_uprobe.c: In function '__uprobe_perf_func':
>> kernel/trace/trace_uprobe.c:1349:23: error: implicit declaration of function 'uprobe_call_bpf'; did you mean 'trace_call_bpf'? [-Werror=implicit-function-declaration]
    1349 |                 ret = uprobe_call_bpf(call, regs);
         |                       ^~~~~~~~~~~~~~~
         |                       trace_call_bpf
   cc1: some warnings being treated as errors


vim +1349 kernel/trace/trace_uprobe.c

  1334	
  1335	static void __uprobe_perf_func(struct trace_uprobe *tu,
  1336				       unsigned long func, struct pt_regs *regs,
  1337				       struct uprobe_cpu_buffer *ucb, int dsize)
  1338	{
  1339		struct trace_event_call *call = trace_probe_event_call(&tu->tp);
  1340		struct uprobe_trace_entry_head *entry;
  1341		struct hlist_head *head;
  1342		void *data;
  1343		int size, esize;
  1344		int rctx;
  1345	
  1346		if (bpf_prog_array_valid(call)) {
  1347			u32 ret;
  1348	
> 1349			ret = uprobe_call_bpf(call, regs);
  1350			if (!ret)
  1351				return;
  1352		}
  1353	
  1354		esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
  1355	
  1356		size = esize + tu->tp.size + dsize;
  1357		size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
  1358		if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
  1359			return;
  1360	
  1361		preempt_disable();
  1362		head = this_cpu_ptr(call->perf_events);
  1363		if (hlist_empty(head))
  1364			goto out;
  1365	
  1366		entry = perf_trace_buf_alloc(size, NULL, &rctx);
  1367		if (!entry)
  1368			goto out;
  1369	
  1370		if (is_ret_probe(tu)) {
  1371			entry->vaddr[0] = func;
  1372			entry->vaddr[1] = instruction_pointer(regs);
  1373			data = DATAOF_TRACE_ENTRY(entry, true);
  1374		} else {
  1375			entry->vaddr[0] = instruction_pointer(regs);
  1376			data = DATAOF_TRACE_ENTRY(entry, false);
  1377		}
  1378	
  1379		memcpy(data, ucb->buf, tu->tp.size + dsize);
  1380	
  1381		if (size - esize > tu->tp.size + dsize) {
  1382			int len = tu->tp.size + dsize;
  1383	
  1384			memset(data + len, 0, size - esize - len);
  1385		}
  1386	
  1387		perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
  1388				      head, NULL);
  1389	 out:
  1390		preempt_enable();
  1391	}
  1392
kernel test robot May 3, 2022, 8:48 a.m. UTC | #2
Hi Delyan,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/Delyan-Kratunov/sleepable-uprobe-support/20220503-071247
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: s390-randconfig-r003-20220501 (https://download.01.org/0day-ci/archive/20220503/202205031643.UGYirnXb-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project 363b3a645a1e30011cc8da624f13dac5fd915628)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install s390 cross compiling tool for clang build
        # apt-get install binutils-s390x-linux-gnu
        # https://github.com/intel-lab-lkp/linux/commit/cfa0f114829902b579da16d7520a39317905c502
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Delyan-Kratunov/sleepable-uprobe-support/20220503-071247
        git checkout cfa0f114829902b579da16d7520a39317905c502
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash kernel/trace/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from kernel/trace/trace_uprobe.c:10:
   In file included from include/linux/bpf-cgroup.h:11:
   In file included from include/net/sock.h:46:
   In file included from include/linux/netdevice.h:38:
   In file included from include/net/net_namespace.h:40:
   In file included from include/linux/skbuff.h:31:
   In file included from include/linux/dma-mapping.h:10:
   In file included from include/linux/scatterlist.h:9:
   In file included from arch/s390/include/asm/io.h:75:
   include/asm-generic/io.h:464:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __raw_readb(PCI_IOBASE + addr);
                             ~~~~~~~~~~ ^
   include/asm-generic/io.h:477:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/big_endian.h:37:59: note: expanded from macro '__le16_to_cpu'
   #define __le16_to_cpu(x) __swab16((__force __u16)(__le16)(x))
                                                             ^
   include/uapi/linux/swab.h:102:54: note: expanded from macro '__swab16'
   #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x))
                                                        ^
   In file included from kernel/trace/trace_uprobe.c:10:
   In file included from include/linux/bpf-cgroup.h:11:
   In file included from include/net/sock.h:46:
   In file included from include/linux/netdevice.h:38:
   In file included from include/net/net_namespace.h:40:
   In file included from include/linux/skbuff.h:31:
   In file included from include/linux/dma-mapping.h:10:
   In file included from include/linux/scatterlist.h:9:
   In file included from arch/s390/include/asm/io.h:75:
   include/asm-generic/io.h:490:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
                                                           ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/big_endian.h:35:59: note: expanded from macro '__le32_to_cpu'
   #define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x))
                                                             ^
   include/uapi/linux/swab.h:115:54: note: expanded from macro '__swab32'
   #define __swab32(x) (__u32)__builtin_bswap32((__u32)(x))
                                                        ^
   In file included from kernel/trace/trace_uprobe.c:10:
   In file included from include/linux/bpf-cgroup.h:11:
   In file included from include/net/sock.h:46:
   In file included from include/linux/netdevice.h:38:
   In file included from include/net/net_namespace.h:40:
   In file included from include/linux/skbuff.h:31:
   In file included from include/linux/dma-mapping.h:10:
   In file included from include/linux/scatterlist.h:9:
   In file included from arch/s390/include/asm/io.h:75:
   include/asm-generic/io.h:501:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writeb(value, PCI_IOBASE + addr);
                               ~~~~~~~~~~ ^
   include/asm-generic/io.h:511:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
                                                         ~~~~~~~~~~ ^
   include/asm-generic/io.h:521:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
                                                         ~~~~~~~~~~ ^
   include/asm-generic/io.h:609:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           readsb(PCI_IOBASE + addr, buffer, count);
                  ~~~~~~~~~~ ^
   include/asm-generic/io.h:617:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           readsw(PCI_IOBASE + addr, buffer, count);
                  ~~~~~~~~~~ ^
   include/asm-generic/io.h:625:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           readsl(PCI_IOBASE + addr, buffer, count);
                  ~~~~~~~~~~ ^
   include/asm-generic/io.h:634:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           writesb(PCI_IOBASE + addr, buffer, count);
                   ~~~~~~~~~~ ^
   include/asm-generic/io.h:643:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           writesw(PCI_IOBASE + addr, buffer, count);
                   ~~~~~~~~~~ ^
   include/asm-generic/io.h:652:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
           writesl(PCI_IOBASE + addr, buffer, count);
                   ~~~~~~~~~~ ^
>> kernel/trace/trace_uprobe.c:1349:9: error: call to undeclared function 'uprobe_call_bpf'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
                   ret = uprobe_call_bpf(call, regs);
                         ^
   kernel/trace/trace_uprobe.c:1349:9: note: did you mean 'trace_call_bpf'?
   include/linux/trace_events.h:752:28: note: 'trace_call_bpf' declared here
   static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
                              ^
   12 warnings and 1 error generated.


vim +/uprobe_call_bpf +1349 kernel/trace/trace_uprobe.c

Delyan Kratunov May 3, 2022, 5:20 p.m. UTC | #3
On Tue, 2022-05-03 at 14:30 +0800, kernel test robot wrote:
> Hi Delyan,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on bpf-next/master]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Delyan-Kratunov/sleepable-uprobe-support/20220503-071247
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
> config: i386-defconfig (https://download.01.org/0day-ci/archive/20220503/202205031441.1fhDuUQK-lkp@intel.com/config )
> compiler: gcc-11 (Debian 11.2.0-20) 11.2.0
> reproduce (this is a W=1 build):
>         # https://github.com/intel-lab-lkp/linux/commit/cfa0f114829902b579da16d7520a39317905c502
>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>         git fetch --no-tags linux-review Delyan-Kratunov/sleepable-uprobe-support/20220503-071247
>         git checkout cfa0f114829902b579da16d7520a39317905c502
>         # save the config file
>         mkdir build_dir && cp config build_dir/.config
>         make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash
> 
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
> 
> All errors (new ones prefixed by >>):
> 
>    kernel/trace/trace_uprobe.c: In function '__uprobe_perf_func':
> > > kernel/trace/trace_uprobe.c:1349:23: error: implicit declaration of function 'uprobe_call_bpf'; did you mean 'trace_call_bpf'? [-Werror=implicit-function-declaration]
>     1349 |                 ret = uprobe_call_bpf(call, regs);
>          |                       ^~~~~~~~~~~~~~~
>          |                       trace_call_bpf
>    cc1: some warnings being treated as errors

Hm, CONFIG_BPF_EVENTS doesn't seem to guard the callsite in trace_uprobe.c;
it's only gated by CONFIG_PERF_EVENTS there. A PERF_EVENTS=y && BPF_EVENTS=n
config would lead to this error.

This is a pre-existing issue and I'll send a separate patch for it.
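
For reference, one possible shape of that fix (a hypothetical sketch, not the
actual patch: a !CONFIG_BPF_EVENTS stub next to the existing trace_call_bpf()
stub in include/linux/trace_events.h):

        #ifdef CONFIG_BPF_EVENTS
        unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx);
        #else
        /* no prog array to consult without BPF_EVENTS; let the event through */
        static inline unsigned int
        uprobe_call_bpf(struct trace_event_call *call, void *ctx)
        {
                return 1;
        }
        #endif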
Andrii Nakryiko May 9, 2022, 11:58 p.m. UTC | #4
On Tue, May 3, 2022 at 10:20 AM Delyan Kratunov <delyank@fb.com> wrote:
>
> On Tue, 2022-05-03 at 14:30 +0800, kernel test robot wrote:
> > Hi Delyan,
> >
> > Thank you for the patch! Yet something to improve:
> >
> > [auto build test ERROR on bpf-next/master]
> >
> > url:    https://github.com/intel-lab-lkp/linux/commits/Delyan-Kratunov/sleepable-uprobe-support/20220503-071247
> > base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
> > config: i386-defconfig (https://download.01.org/0day-ci/archive/20220503/202205031441.1fhDuUQK-lkp@intel.com/config )
> > compiler: gcc-11 (Debian 11.2.0-20) 11.2.0
> > reproduce (this is a W=1 build):
> >         # https://github.com/intel-lab-lkp/linux/commit/cfa0f114829902b579da16d7520a39317905c502
> >         git remote add linux-review https://github.com/intel-lab-lkp/linux
> >         git fetch --no-tags linux-review Delyan-Kratunov/sleepable-uprobe-support/20220503-071247
> >         git checkout cfa0f114829902b579da16d7520a39317905c502
> >         # save the config file
> >         mkdir build_dir && cp config build_dir/.config
> >         make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash
> >
> > If you fix the issue, kindly add following tag as appropriate
> > Reported-by: kernel test robot <lkp@intel.com>
> >
> > All errors (new ones prefixed by >>):
> >
> >    kernel/trace/trace_uprobe.c: In function '__uprobe_perf_func':
> > > > kernel/trace/trace_uprobe.c:1349:23: error: implicit declaration of function 'uprobe_call_bpf'; did you mean 'trace_call_bpf'? [-Werror=implicit-function-declaration]
> >     1349 |                 ret = uprobe_call_bpf(call, regs);
> >          |                       ^~~~~~~~~~~~~~~
> >          |                       trace_call_bpf
> >    cc1: some warnings being treated as errors
>
> Hm, CONFIG_BPF_EVENTS doesn't seem to guard the callsite from trace_uprobe.c, it's
> only gated by CONFIG_PERF_EVENTS there. A PERF_EVENTS=y && BPF_EVENTS=n config would
> lead to this error.
>
> This is  a pre-existing issue and I'll send a separate patch for it.

It would probably make sense to include it as a pre-patch for your
series, so that bpf-next doesn't trigger these warnings?
Alexei Starovoitov May 10, 2022, 2:54 a.m. UTC | #5
On Tue, May 03, 2022 at 05:20:26PM +0000, Delyan Kratunov wrote:
> On Tue, 2022-05-03 at 14:30 +0800, kernel test robot wrote:
> > Hi Delyan,
> > 
> > Thank you for the patch! Yet something to improve:
> > 
> > [auto build test ERROR on bpf-next/master]
> > 
> > url:    https://github.com/intel-lab-lkp/linux/commits/Delyan-Kratunov/sleepable-uprobe-support/20220503-071247
> > base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
> > config: i386-defconfig (https://download.01.org/0day-ci/archive/20220503/202205031441.1fhDuUQK-lkp@intel.com/config )
> > compiler: gcc-11 (Debian 11.2.0-20) 11.2.0
> > reproduce (this is a W=1 build):
> >         # https://github.com/intel-lab-lkp/linux/commit/cfa0f114829902b579da16d7520a39317905c502
> >         git remote add linux-review https://github.com/intel-lab-lkp/linux
> >         git fetch --no-tags linux-review Delyan-Kratunov/sleepable-uprobe-support/20220503-071247
> >         git checkout cfa0f114829902b579da16d7520a39317905c502
> >         # save the config file
> >         mkdir build_dir && cp config build_dir/.config
> >         make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash
> > 
> > If you fix the issue, kindly add following tag as appropriate
> > Reported-by: kernel test robot <lkp@intel.com>
> > 
> > All errors (new ones prefixed by >>):
> > 
> >    kernel/trace/trace_uprobe.c: In function '__uprobe_perf_func':
> > > > kernel/trace/trace_uprobe.c:1349:23: error: implicit declaration of function 'uprobe_call_bpf'; did you mean 'trace_call_bpf'? [-Werror=implicit-function-declaration]
> >     1349 |                 ret = uprobe_call_bpf(call, regs);
> >          |                       ^~~~~~~~~~~~~~~
> >          |                       trace_call_bpf
> >    cc1: some warnings being treated as errors
> 
> Hm, CONFIG_BPF_EVENTS doesn't seem to guard the callsite from trace_uprobe.c, it's
> only gated by CONFIG_PERF_EVENTS there. A PERF_EVENTS=y && BPF_EVENTS=n config would
> lead to this error. 
> 
> This is  a pre-existing issue and I'll send a separate patch for it.

Maybe move uprobe_call_bpf into trace_uprobe.c so it gets a chance of being
fully inlined, which would also avoid this issue.
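
Roughly (a hypothetical sketch of that move; the body is the same one this
patch adds to bpf_trace.c):

        /* static in trace_uprobe.c, so the compiler can fully inline it and
         * no declaration is needed in trace_events.h */
        static unsigned int uprobe_call_bpf(struct trace_event_call *call,
                                            void *ctx)
        {
                return bpf_prog_run_array_sleepable(call->prog_array, ctx,
                                                    bpf_prog_run);
        }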
Alexei Starovoitov May 10, 2022, 3:02 a.m. UTC | #6
On Mon, May 02, 2022 at 11:09:37PM +0000, Delyan Kratunov wrote:
> uprobes work by raising a trap, setting a task flag from within the
> interrupt handler, and processing the actual work for the uprobe on the
> way back to userspace. As a result, uprobe handlers already execute in a
> user context. The primary obstacle to sleepable bpf uprobe programs is
> therefore on the bpf side.
> 
> Namely, the bpf_prog_array attached to the uprobe is protected by normal
> rcu and runs with disabled preemption.

disabled preemption was an artifact of the past.
It became unnecessary when migrate_disable was introduced.
Looks like we forgot to update this spot.

> In order for uprobe bpf programs
> to become actually sleepable, we need it to be protected by the tasks_trace
> rcu flavor instead (and kfree() called after a corresponding grace period).
> 
> Based on Alexei's proposal, we change the free path for bpf_prog_array to
> chain a tasks_trace and normal grace periods one after the other.
> 
> Users who iterate under tasks_trace read section would
> be safe, as would users who iterate under normal read sections (from
> non-sleepable locations). The downside is that we take the tasks_trace latency
> for all perf_event-attached bpf programs (and not just uprobe ones)
> but this is deemed safe given the possible attach rates for
> kprobe/uprobe/tp programs.
> 
> Separately, non-sleepable programs need access to dynamically sized
> rcu-protected maps, so we conditionally disable preemption and take an rcu
> read section around them, in addition to the overarching tasks_trace section.
> 
> Signed-off-by: Delyan Kratunov <delyank@fb.com>
> ---
>  include/linux/bpf.h          | 57 ++++++++++++++++++++++++++++++++++++
>  include/linux/trace_events.h |  1 +
>  kernel/bpf/core.c            | 15 ++++++++++
>  kernel/trace/bpf_trace.c     | 27 +++++++++++++++--
>  kernel/trace/trace_uprobe.c  |  4 +--
>  5 files changed, 99 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 57ec619cf729..592886115011 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -26,6 +26,7 @@
>  #include <linux/stddef.h>
>  #include <linux/bpfptr.h>
>  #include <linux/btf.h>
> +#include <linux/rcupdate_trace.h>
> 
>  struct bpf_verifier_env;
>  struct bpf_verifier_log;
> @@ -1343,6 +1344,8 @@ extern struct bpf_empty_prog_array bpf_empty_prog_array;
> 
>  struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
>  void bpf_prog_array_free(struct bpf_prog_array *progs);
> +/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */
> +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs);
>  int bpf_prog_array_length(struct bpf_prog_array *progs);
>  bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
>  int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
> @@ -1428,6 +1431,60 @@ bpf_prog_run_array(const struct bpf_prog_array *array,
>  	return ret;
>  }
> 
> +/*
> + * Notes on RCU design for bpf_prog_arrays containing sleepable programs:
> + *
> + * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array
> + * overall. As a result, the array must be freed via
> + * bpf_prog_array_free_sleepable() so the tasks_trace rcu grace period is observed.
> + *
> + * When a non-sleepable program is inside the array, we take the rcu read
> + * section and disable preemption for that program alone, so it can access
> + * rcu-protected dynamically sized maps.
> + */
> +static __always_inline u32
> +bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu,
> +			     const void *ctx, bpf_prog_run_fn run_prog)
> +{
> +	const struct bpf_prog_array_item *item;
> +	const struct bpf_prog *prog;
> +	const struct bpf_prog_array *array;
> +	struct bpf_run_ctx *old_run_ctx;
> +	struct bpf_trace_run_ctx run_ctx;
> +	u32 ret = 1;
> +
> +	might_fault();
> +
> +	migrate_disable();
> +	rcu_read_lock_trace();

pls swap this order to make it the same as bpf trampoline.
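
That is, matching __bpf_prog_enter_sleepable() in the trampoline (sketch):

        rcu_read_lock_trace();
        migrate_disable();
        ...
        migrate_enable();
        rcu_read_unlock_trace();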

> +
> +	array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held());
> +	if (unlikely(!array))
> +		goto out;
> +	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
> +	item = &array->items[0];
> +	while ((prog = READ_ONCE(item->prog))) {
> +		if (!prog->aux->sleepable) {
> +			preempt_disable();

just remove this line.

> +			rcu_read_lock();

In config-s where rcu_read_lock/unlock is a nop
this whole 'if' and one below will be optimized out by the compiler.
Which is nice.
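
With the preempt_disable()/preempt_enable() pair dropped, the per-program
section reduces to (sketch):

        if (!prog->aux->sleepable)
                rcu_read_lock();

        run_ctx.bpf_cookie = item->bpf_cookie;
        ret &= run_prog(prog, ctx);
        item++;

        if (!prog->aux->sleepable)
                rcu_read_unlock();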

> +		}
> +
> +		run_ctx.bpf_cookie = item->bpf_cookie;
> +		ret &= run_prog(prog, ctx);
> +		item++;
> +
> +		if (!prog->aux->sleepable) {
> +			rcu_read_unlock();
> +			preempt_enable();
> +		}
> +	}
> +	bpf_reset_run_ctx(old_run_ctx);
> +out:
> +	rcu_read_unlock_trace();
> +	migrate_enable();
> +	return ret;
> +}
> +
>  #ifdef CONFIG_BPF_SYSCALL
>  DECLARE_PER_CPU(int, bpf_prog_active);
>  extern struct mutex bpf_stats_enabled_mutex;
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index e6e95a9f07a5..d45889f1210d 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -736,6 +736,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
> 
>  #ifdef CONFIG_BPF_EVENTS
>  unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
> +unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx);
>  int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
>  void perf_event_detach_bpf_prog(struct perf_event *event);
>  int perf_event_query_prog_array(struct perf_event *event, void __user *info);
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 13e9dbeeedf3..9271b708807a 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2268,6 +2268,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs)
>  	kfree_rcu(progs, rcu);
>  }
> 
> +static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
> +{
> +	struct bpf_prog_array *progs;
> +
> +	progs = container_of(rcu, struct bpf_prog_array, rcu);
> +	kfree_rcu(progs, rcu);
> +}
> +
> +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
> +{
> +	if (!progs || progs == &bpf_empty_prog_array.hdr)
> +		return;
> +	call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
> +}
> +
>  int bpf_prog_array_length(struct bpf_prog_array *array)
>  {
>  	struct bpf_prog_array_item *item;
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index f15b826f9899..582a6171e096 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -140,6 +140,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
>  	return ret;
>  }
> 
> +unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx)
> +{
> +	unsigned int ret;
> +
> +	/*
> +	 * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
> +	 * to all call sites, we do a bpf_prog_array_valid() check there
> +	 * to see whether call->prog_array is empty or not, which is
> +	 * a heuristic to speed up execution.
> +	 *
> +	 * If the prog_array that bpf_prog_array_valid() fetched was
> +	 * non-NULL, we go into uprobe_call_bpf() and do the actual
> +	 * proper rcu_dereference() under the RCU trace lock.
> +	 * If it turns out that prog_array is NULL, we bail out.
> +	 * Conversely, if the pointer bpf_prog_array_valid() fetched
> +	 * was NULL, we skip the prog_array and risk missing out on
> +	 * events attached between that check and the rcu_dereference(),
> +	 * which is an accepted risk.
> +	 */
> +	ret = bpf_prog_run_array_sleepable(call->prog_array, ctx, bpf_prog_run);
> +	return ret;
> +}
> +
>  #ifdef CONFIG_BPF_KPROBE_OVERRIDE
>  BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
>  {
> @@ -1915,7 +1938,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
>  	event->prog = prog;
>  	event->bpf_cookie = bpf_cookie;
>  	rcu_assign_pointer(event->tp_event->prog_array, new_array);
> -	bpf_prog_array_free(old_array);
> +	bpf_prog_array_free_sleepable(old_array);
> 
>  unlock:
>  	mutex_unlock(&bpf_event_mutex);
> @@ -1941,7 +1964,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
>  		bpf_prog_array_delete_safe(old_array, event->prog);
>  	} else {
>  		rcu_assign_pointer(event->tp_event->prog_array, new_array);
> -		bpf_prog_array_free(old_array);
> +		bpf_prog_array_free_sleepable(old_array);
>  	}
> 
>  	bpf_prog_put(event->prog);
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index 9711589273cd..3eb48897d15b 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1346,9 +1346,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
>  	if (bpf_prog_array_valid(call)) {
>  		u32 ret;
> 
> -		preempt_disable();

It should have been replaced with migrate_disable long ago.

> -		ret = trace_call_bpf(call, regs);
> -		preempt_enable();
> +		ret = uprobe_call_bpf(call, regs);
>  		if (!ret)
>  			return;
>  	}
> --
> 2.35.1

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 57ec619cf729..592886115011 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -26,6 +26,7 @@ 
 #include <linux/stddef.h>
 #include <linux/bpfptr.h>
 #include <linux/btf.h>
+#include <linux/rcupdate_trace.h>

 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -1343,6 +1344,8 @@  extern struct bpf_empty_prog_array bpf_empty_prog_array;

 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array *progs);
+/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs);
 int bpf_prog_array_length(struct bpf_prog_array *progs);
 bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
 int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
@@ -1428,6 +1431,60 @@  bpf_prog_run_array(const struct bpf_prog_array *array,
 	return ret;
 }

+/*
+ * Notes on RCU design for bpf_prog_arrays containing sleepable programs:
+ *
+ * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array
+ * overall. As a result, the array must be freed via
+ * bpf_prog_array_free_sleepable() so the tasks_trace rcu grace period is observed.
+ *
+ * When a non-sleepable program is inside the array, we take the rcu read
+ * section and disable preemption for that program alone, so it can access
+ * rcu-protected dynamically sized maps.
+ */
+static __always_inline u32
+bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu,
+			     const void *ctx, bpf_prog_run_fn run_prog)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	const struct bpf_prog_array *array;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_trace_run_ctx run_ctx;
+	u32 ret = 1;
+
+	might_fault();
+
+	migrate_disable();
+	rcu_read_lock_trace();
+
+	array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held());
+	if (unlikely(!array))
+		goto out;
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+	item = &array->items[0];
+	while ((prog = READ_ONCE(item->prog))) {
+		if (!prog->aux->sleepable) {
+			preempt_disable();
+			rcu_read_lock();
+		}
+
+		run_ctx.bpf_cookie = item->bpf_cookie;
+		ret &= run_prog(prog, ctx);
+		item++;
+
+		if (!prog->aux->sleepable) {
+			rcu_read_unlock();
+			preempt_enable();
+		}
+	}
+	bpf_reset_run_ctx(old_run_ctx);
+out:
+	rcu_read_unlock_trace();
+	migrate_enable();
+	return ret;
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 extern struct mutex bpf_stats_enabled_mutex;
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index e6e95a9f07a5..d45889f1210d 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -736,6 +736,7 @@  trace_trigger_soft_disabled(struct trace_event_file *file)

 #ifdef CONFIG_BPF_EVENTS
 unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
+unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx);
 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
 void perf_event_detach_bpf_prog(struct perf_event *event);
 int perf_event_query_prog_array(struct perf_event *event, void __user *info);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 13e9dbeeedf3..9271b708807a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2268,6 +2268,21 @@  void bpf_prog_array_free(struct bpf_prog_array *progs)
 	kfree_rcu(progs, rcu);
 }

+static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
+{
+	struct bpf_prog_array *progs;
+
+	progs = container_of(rcu, struct bpf_prog_array, rcu);
+	kfree_rcu(progs, rcu);
+}
+
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
+{
+	if (!progs || progs == &bpf_empty_prog_array.hdr)
+		return;
+	call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
+}
+
 int bpf_prog_array_length(struct bpf_prog_array *array)
 {
 	struct bpf_prog_array_item *item;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f15b826f9899..582a6171e096 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -140,6 +140,29 @@  unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 	return ret;
 }

+unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx)
+{
+	unsigned int ret;
+
+	/*
+	 * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
+	 * to all call sites, we do a bpf_prog_array_valid() check there
+	 * to see whether call->prog_array is empty or not, which is
+	 * a heuristic to speed up execution.
+	 *
+	 * If the prog_array that bpf_prog_array_valid() fetched was
+	 * non-NULL, we go into uprobe_call_bpf() and do the actual
+	 * proper rcu_dereference() under the RCU trace lock.
+	 * If it turns out that prog_array is NULL, we bail out.
+	 * Conversely, if the pointer bpf_prog_array_valid() fetched
+	 * was NULL, we skip the prog_array and risk missing out on
+	 * events attached between that check and the rcu_dereference(),
+	 * which is an accepted risk.
+	 */
+	ret = bpf_prog_run_array_sleepable(call->prog_array, ctx, bpf_prog_run);
+	return ret;
+}
+
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
 BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
 {
@@ -1915,7 +1938,7 @@  int perf_event_attach_bpf_prog(struct perf_event *event,
 	event->prog = prog;
 	event->bpf_cookie = bpf_cookie;
 	rcu_assign_pointer(event->tp_event->prog_array, new_array);
-	bpf_prog_array_free(old_array);
+	bpf_prog_array_free_sleepable(old_array);

 unlock:
 	mutex_unlock(&bpf_event_mutex);
@@ -1941,7 +1964,7 @@  void perf_event_detach_bpf_prog(struct perf_event *event)
 		bpf_prog_array_delete_safe(old_array, event->prog);
 	} else {
 		rcu_assign_pointer(event->tp_event->prog_array, new_array);
-		bpf_prog_array_free(old_array);
+		bpf_prog_array_free_sleepable(old_array);
 	}

 	bpf_prog_put(event->prog);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9711589273cd..3eb48897d15b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1346,9 +1346,7 @@  static void __uprobe_perf_func(struct trace_uprobe *tu,
 	if (bpf_prog_array_valid(call)) {
 		u32 ret;

-		preempt_disable();
-		ret = trace_call_bpf(call, regs);
-		preempt_enable();
+		ret = uprobe_call_bpf(call, regs);
 		if (!ret)
 			return;
 	}