diff mbox series

[bpf-next,v2,4/8] xsk: register XDP sockets at bind(), and add new AF_XDP BPF helper

Message ID 20210119155013.154808-5-bjorn.topel@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series Introduce bpf_redirect_xsk() helper | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 6 maintainers not CCed: quentin@isovalent.com songliubraving@fb.com andrii@kernel.org kpsingh@kernel.org kafai@fb.com yhs@fb.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 12220 this patch: 12220
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 84 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 12877 this patch: 12877
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Björn Töpel Jan. 19, 2021, 3:50 p.m. UTC
From: Björn Töpel <bjorn.topel@intel.com>

Extend bind() for XDP sockets, so that the bound socket is added to
the netdev_rx_queue _rx array in the netdevice. We call this to
register an XDP socket. To redirect packets to a registered socket, a
new BPF helper is used: bpf_redirect_xsk().

For shared XDP sockets, only the first bound socket is
registered. Users that require more advanced setups, continue to the
XSKMAP and bpf_redirect_map().

Now, why would one use bpf_redirect_xsk() over the regular
bpf_redirect_map() helper? First: Slightly better performance. Second:
Convenience. Most user use one socket per queue. This scenario is what
registered sockets support. There is no need to create an XSKMAP. This
can also reduce complexity from containerized setups, where users
might what to use XDP sockets without CAP_SYS_ADMIN capabilities.

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 include/linux/filter.h         |  1 +
 include/linux/netdevice.h      |  1 +
 include/net/xdp_sock.h         | 12 +++++
 include/net/xsk_buff_pool.h    |  2 +-
 include/uapi/linux/bpf.h       |  7 +++
 net/core/filter.c              | 49 ++++++++++++++++--
 net/xdp/xsk.c                  | 93 ++++++++++++++++++++++++++++------
 net/xdp/xsk_buff_pool.c        |  4 +-
 tools/include/uapi/linux/bpf.h |  7 +++
 9 files changed, 153 insertions(+), 23 deletions(-)

Comments

kernel test robot Jan. 20, 2021, 8:25 a.m. UTC | #1
Hi "Björn,

I love your patch! Yet something to improve:

[auto build test ERROR on 95204c9bfa48d2f4d3bab7df55c1cc823957ff81]

url:    https://github.com/0day-ci/linux/commits/Bj-rn-T-pel/Introduce-bpf_redirect_xsk-helper/20210120-150357
base:    95204c9bfa48d2f4d3bab7df55c1cc823957ff81
config: x86_64-randconfig-m031-20210120 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/419b1341d7980ee57fb10f4306719eef3c1a15f8
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Bj-rn-T-pel/Introduce-bpf_redirect_xsk-helper/20210120-150357
        git checkout 419b1341d7980ee57fb10f4306719eef3c1a15f8
        # save the attached .config to linux build tree
        make W=1 ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from <command-line>:
   net/core/filter.c: In function '____bpf_xdp_redirect_xsk':
>> net/core/filter.c:4165:35: error: 'struct netdev_rx_queue' has no member named 'xsk'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |                                   ^
   include/linux/compiler_types.h:300:9: note: in definition of macro '__compiletime_assert'
     300 |   if (!(condition))     \
         |         ^~~~~~~~~
   include/linux/compiler_types.h:320:2: note: in expansion of macro '_compiletime_assert'
     320 |  _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
         |  ^~~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:36:2: note: in expansion of macro 'compiletime_assert'
      36 |  compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
         |  ^~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:36:21: note: in expansion of macro '__native_word'
      36 |  compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
         |                     ^~~~~~~~~~~~~
   include/asm-generic/rwonce.h:49:2: note: in expansion of macro 'compiletime_assert_rwonce_type'
      49 |  compiletime_assert_rwonce_type(x);    \
         |  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   net/core/filter.c:4165:7: note: in expansion of macro 'READ_ONCE'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |       ^~~~~~~~~
>> net/core/filter.c:4165:35: error: 'struct netdev_rx_queue' has no member named 'xsk'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |                                   ^
   include/linux/compiler_types.h:300:9: note: in definition of macro '__compiletime_assert'
     300 |   if (!(condition))     \
         |         ^~~~~~~~~
   include/linux/compiler_types.h:320:2: note: in expansion of macro '_compiletime_assert'
     320 |  _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
         |  ^~~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:36:2: note: in expansion of macro 'compiletime_assert'
      36 |  compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
         |  ^~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:36:21: note: in expansion of macro '__native_word'
      36 |  compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
         |                     ^~~~~~~~~~~~~
   include/asm-generic/rwonce.h:49:2: note: in expansion of macro 'compiletime_assert_rwonce_type'
      49 |  compiletime_assert_rwonce_type(x);    \
         |  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   net/core/filter.c:4165:7: note: in expansion of macro 'READ_ONCE'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |       ^~~~~~~~~
>> net/core/filter.c:4165:35: error: 'struct netdev_rx_queue' has no member named 'xsk'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |                                   ^
   include/linux/compiler_types.h:300:9: note: in definition of macro '__compiletime_assert'
     300 |   if (!(condition))     \
         |         ^~~~~~~~~
   include/linux/compiler_types.h:320:2: note: in expansion of macro '_compiletime_assert'
     320 |  _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
         |  ^~~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:36:2: note: in expansion of macro 'compiletime_assert'
      36 |  compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
         |  ^~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:36:21: note: in expansion of macro '__native_word'
      36 |  compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
         |                     ^~~~~~~~~~~~~
   include/asm-generic/rwonce.h:49:2: note: in expansion of macro 'compiletime_assert_rwonce_type'
      49 |  compiletime_assert_rwonce_type(x);    \
         |  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   net/core/filter.c:4165:7: note: in expansion of macro 'READ_ONCE'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |       ^~~~~~~~~
>> net/core/filter.c:4165:35: error: 'struct netdev_rx_queue' has no member named 'xsk'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |                                   ^
   include/linux/compiler_types.h:300:9: note: in definition of macro '__compiletime_assert'
     300 |   if (!(condition))     \
         |         ^~~~~~~~~
   include/linux/compiler_types.h:320:2: note: in expansion of macro '_compiletime_assert'
     320 |  _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
         |  ^~~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:36:2: note: in expansion of macro 'compiletime_assert'
      36 |  compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
         |  ^~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:36:21: note: in expansion of macro '__native_word'
      36 |  compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
         |                     ^~~~~~~~~~~~~
   include/asm-generic/rwonce.h:49:2: note: in expansion of macro 'compiletime_assert_rwonce_type'
      49 |  compiletime_assert_rwonce_type(x);    \
         |  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   net/core/filter.c:4165:7: note: in expansion of macro 'READ_ONCE'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |       ^~~~~~~~~
>> net/core/filter.c:4165:35: error: 'struct netdev_rx_queue' has no member named 'xsk'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |                                   ^
   include/linux/compiler_types.h:300:9: note: in definition of macro '__compiletime_assert'
     300 |   if (!(condition))     \
         |         ^~~~~~~~~
   include/linux/compiler_types.h:320:2: note: in expansion of macro '_compiletime_assert'
     320 |  _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
         |  ^~~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:36:2: note: in expansion of macro 'compiletime_assert'
      36 |  compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
         |  ^~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:49:2: note: in expansion of macro 'compiletime_assert_rwonce_type'
      49 |  compiletime_assert_rwonce_type(x);    \
         |  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   net/core/filter.c:4165:7: note: in expansion of macro 'READ_ONCE'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |       ^~~~~~~~~
>> net/core/filter.c:4165:35: error: 'struct netdev_rx_queue' has no member named 'xsk'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |                                   ^
   include/linux/compiler_types.h:271:13: note: in definition of macro '__unqual_scalar_typeof'
     271 |   _Generic((x),      \
         |             ^
   include/asm-generic/rwonce.h:50:2: note: in expansion of macro '__READ_ONCE'
      50 |  __READ_ONCE(x);       \
         |  ^~~~~~~~~~~
   net/core/filter.c:4165:7: note: in expansion of macro 'READ_ONCE'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |       ^~~~~~~~~
   In file included from ./arch/x86/include/generated/asm/rwonce.h:1,
                    from include/linux/compiler.h:246,
                    from include/linux/export.h:43,
                    from include/linux/linkage.h:7,
                    from include/linux/kernel.h:7,
                    from include/linux/list.h:9,
                    from include/linux/module.h:12,
                    from net/core/filter.c:20:
>> net/core/filter.c:4165:35: error: 'struct netdev_rx_queue' has no member named 'xsk'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |                                   ^
   include/asm-generic/rwonce.h:44:72: note: in definition of macro '__READ_ONCE'
      44 | #define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x))
         |                                                                        ^
   net/core/filter.c:4165:7: note: in expansion of macro 'READ_ONCE'
    4165 |  xs = READ_ONCE(dev->_rx[queue_id].xsk);
         |       ^~~~~~~~~


vim +4165 net/core/filter.c

  4157	
  4158	BPF_CALL_2(bpf_xdp_redirect_xsk, struct xdp_buff *, xdp, u64, action)
  4159	{
  4160		struct net_device *dev = xdp->rxq->dev;
  4161		u32 queue_id = xdp->rxq->queue_index;
  4162		struct bpf_redirect_info *ri;
  4163		struct xdp_sock *xs;
  4164	
> 4165		xs = READ_ONCE(dev->_rx[queue_id].xsk);
  4166		if (!xs)
  4167			return action;
  4168	
  4169		ri = this_cpu_ptr(&bpf_redirect_info);
  4170		ri->tgt_type = XDP_REDIR_XSK;
  4171		ri->tgt_value = xs;
  4172	
  4173		return XDP_REDIRECT;
  4174	}
  4175	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Björn Töpel Jan. 20, 2021, 8:41 a.m. UTC | #2
On 2021-01-20 09:25, kernel test robot wrote:
> Hi "Björn,
> 
> I love your patch! Yet something to improve:
> 
> [auto build test ERROR on 95204c9bfa48d2f4d3bab7df55c1cc823957ff81]
> 
> url:    https://github.com/0day-ci/linux/commits/Bj-rn-T-pel/Introduce-bpf_redirect_xsk-helper/20210120-150357
> base:    95204c9bfa48d2f4d3bab7df55c1cc823957ff81
> config: x86_64-randconfig-m031-20210120 (attached as .config)
> compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
> reproduce (this is a W=1 build):
>          # https://github.com/0day-ci/linux/commit/419b1341d7980ee57fb10f4306719eef3c1a15f8
>          git remote add linux-review https://github.com/0day-ci/linux
>          git fetch --no-tags linux-review Bj-rn-T-pel/Introduce-bpf_redirect_xsk-helper/20210120-150357
>          git checkout 419b1341d7980ee57fb10f4306719eef3c1a15f8
>          # save the attached .config to linux build tree
>          make W=1 ARCH=x86_64
> 
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
> 
> All errors (new ones prefixed by >>):
> 
>     In file included from <command-line>:
>     net/core/filter.c: In function '____bpf_xdp_redirect_xsk':
>>> net/core/filter.c:4165:35: error: 'struct netdev_rx_queue' has no member named 'xsk'

This rev is missing proper CONFIG_XDP_SOCKET ifdefs. I'll fix this in
the next spin, but I'll wait a bit for more comments.


Björn

[...]
kernel test robot Jan. 20, 2021, 8:50 a.m. UTC | #3
Hi "Björn,

I love your patch! Yet something to improve:

[auto build test ERROR on 95204c9bfa48d2f4d3bab7df55c1cc823957ff81]

url:    https://github.com/0day-ci/linux/commits/Bj-rn-T-pel/Introduce-bpf_redirect_xsk-helper/20210120-150357
base:    95204c9bfa48d2f4d3bab7df55c1cc823957ff81
config: arm-randconfig-r013-20210120 (attached as .config)
compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project 22b68440e1647e16b5ee24b924986207173c02d1)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install arm cross compiling tool for clang build
        # apt-get install binutils-arm-linux-gnueabi
        # https://github.com/0day-ci/linux/commit/419b1341d7980ee57fb10f4306719eef3c1a15f8
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Bj-rn-T-pel/Introduce-bpf_redirect_xsk-helper/20210120-150357
        git checkout 419b1341d7980ee57fb10f4306719eef3c1a15f8
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=arm 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> net/core/filter.c:4165:36: error: no member named 'xsk' in 'struct netdev_rx_queue'
           xs = READ_ONCE(dev->_rx[queue_id].xsk);
                          ~~~~~~~~~~~~~~~~~~ ^
   include/asm-generic/rwonce.h:49:33: note: expanded from macro 'READ_ONCE'
           compiletime_assert_rwonce_type(x);                              \
                                          ^
   include/asm-generic/rwonce.h:36:35: note: expanded from macro 'compiletime_assert_rwonce_type'
           compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),  \
                                            ^
   include/linux/compiler_types.h:282:10: note: expanded from macro '__native_word'
           (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
                   ^
   include/linux/compiler_types.h:320:22: note: expanded from macro 'compiletime_assert'
           _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
                               ^~~~~~~~~
   include/linux/compiler_types.h:308:23: note: expanded from macro '_compiletime_assert'
           __compiletime_assert(condition, msg, prefix, suffix)
                                ^~~~~~~~~
   include/linux/compiler_types.h:300:9: note: expanded from macro '__compiletime_assert'
                   if (!(condition))                                       \
                         ^~~~~~~~~
>> net/core/filter.c:4165:36: error: no member named 'xsk' in 'struct netdev_rx_queue'
           xs = READ_ONCE(dev->_rx[queue_id].xsk);
                          ~~~~~~~~~~~~~~~~~~ ^
   include/asm-generic/rwonce.h:49:33: note: expanded from macro 'READ_ONCE'
           compiletime_assert_rwonce_type(x);                              \
                                          ^
   include/asm-generic/rwonce.h:36:35: note: expanded from macro 'compiletime_assert_rwonce_type'
           compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),  \
                                            ^
   include/linux/compiler_types.h:282:39: note: expanded from macro '__native_word'
           (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
                                                ^
   include/linux/compiler_types.h:320:22: note: expanded from macro 'compiletime_assert'
           _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
                               ^~~~~~~~~
   include/linux/compiler_types.h:308:23: note: expanded from macro '_compiletime_assert'
           __compiletime_assert(condition, msg, prefix, suffix)
                                ^~~~~~~~~
   include/linux/compiler_types.h:300:9: note: expanded from macro '__compiletime_assert'
                   if (!(condition))                                       \
                         ^~~~~~~~~
>> net/core/filter.c:4165:36: error: no member named 'xsk' in 'struct netdev_rx_queue'
           xs = READ_ONCE(dev->_rx[queue_id].xsk);
                          ~~~~~~~~~~~~~~~~~~ ^
   include/asm-generic/rwonce.h:49:33: note: expanded from macro 'READ_ONCE'
           compiletime_assert_rwonce_type(x);                              \
                                          ^
   include/asm-generic/rwonce.h:36:35: note: expanded from macro 'compiletime_assert_rwonce_type'
           compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),  \
                                            ^
   include/linux/compiler_types.h:283:10: note: expanded from macro '__native_word'
            sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
                   ^
   include/linux/compiler_types.h:320:22: note: expanded from macro 'compiletime_assert'
           _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
                               ^~~~~~~~~
   include/linux/compiler_types.h:308:23: note: expanded from macro '_compiletime_assert'
           __compiletime_assert(condition, msg, prefix, suffix)
                                ^~~~~~~~~
   include/linux/compiler_types.h:300:9: note: expanded from macro '__compiletime_assert'
                   if (!(condition))                                       \
                         ^~~~~~~~~
>> net/core/filter.c:4165:36: error: no member named 'xsk' in 'struct netdev_rx_queue'
           xs = READ_ONCE(dev->_rx[queue_id].xsk);
                          ~~~~~~~~~~~~~~~~~~ ^
   include/asm-generic/rwonce.h:49:33: note: expanded from macro 'READ_ONCE'
           compiletime_assert_rwonce_type(x);                              \
                                          ^
   include/asm-generic/rwonce.h:36:35: note: expanded from macro 'compiletime_assert_rwonce_type'
           compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),  \
                                            ^
   include/linux/compiler_types.h:283:38: note: expanded from macro '__native_word'
            sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
                                               ^
   include/linux/compiler_types.h:320:22: note: expanded from macro 'compiletime_assert'
           _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
                               ^~~~~~~~~
   include/linux/compiler_types.h:308:23: note: expanded from macro '_compiletime_assert'
           __compiletime_assert(condition, msg, prefix, suffix)
                                ^~~~~~~~~
   include/linux/compiler_types.h:300:9: note: expanded from macro '__compiletime_assert'
                   if (!(condition))                                       \
                         ^~~~~~~~~
>> net/core/filter.c:4165:36: error: no member named 'xsk' in 'struct netdev_rx_queue'
           xs = READ_ONCE(dev->_rx[queue_id].xsk);
                          ~~~~~~~~~~~~~~~~~~ ^
   include/asm-generic/rwonce.h:49:33: note: expanded from macro 'READ_ONCE'
           compiletime_assert_rwonce_type(x);                              \
                                          ^
   include/asm-generic/rwonce.h:36:48: note: expanded from macro 'compiletime_assert_rwonce_type'
           compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),  \
                                                         ^
   include/linux/compiler_types.h:320:22: note: expanded from macro 'compiletime_assert'
           _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
                               ^~~~~~~~~
   include/linux/compiler_types.h:308:23: note: expanded from macro '_compiletime_assert'
           __compiletime_assert(condition, msg, prefix, suffix)
                                ^~~~~~~~~
   include/linux/compiler_types.h:300:9: note: expanded from macro '__compiletime_assert'
                   if (!(condition))                                       \
                         ^~~~~~~~~
>> net/core/filter.c:4165:36: error: no member named 'xsk' in 'struct netdev_rx_queue'
           xs = READ_ONCE(dev->_rx[queue_id].xsk);
                          ~~~~~~~~~~~~~~~~~~ ^
   include/asm-generic/rwonce.h:50:14: note: expanded from macro 'READ_ONCE'
           __READ_ONCE(x);                                                 \
                       ^
   include/asm-generic/rwonce.h:44:65: note: expanded from macro '__READ_ONCE'
   #define __READ_ONCE(x)  (*(const volatile __unqual_scalar_typeof(x) *)&(x))
                                                                    ^
   include/linux/compiler_types.h:271:13: note: expanded from macro '__unqual_scalar_typeof'
                   _Generic((x),                                           \
                             ^
>> net/core/filter.c:4165:36: error: no member named 'xsk' in 'struct netdev_rx_queue'
           xs = READ_ONCE(dev->_rx[queue_id].xsk);
                          ~~~~~~~~~~~~~~~~~~ ^
   include/asm-generic/rwonce.h:50:14: note: expanded from macro 'READ_ONCE'
           __READ_ONCE(x);                                                 \
                       ^
   include/asm-generic/rwonce.h:44:65: note: expanded from macro '__READ_ONCE'
   #define __READ_ONCE(x)  (*(const volatile __unqual_scalar_typeof(x) *)&(x))
                                                                    ^
   include/linux/compiler_types.h:278:15: note: expanded from macro '__unqual_scalar_typeof'
                            default: (x)))
                                      ^
>> net/core/filter.c:4165:36: error: no member named 'xsk' in 'struct netdev_rx_queue'
           xs = READ_ONCE(dev->_rx[queue_id].xsk);
                          ~~~~~~~~~~~~~~~~~~ ^
   include/asm-generic/rwonce.h:50:14: note: expanded from macro 'READ_ONCE'
           __READ_ONCE(x);                                                 \
                       ^
   include/asm-generic/rwonce.h:44:72: note: expanded from macro '__READ_ONCE'
   #define __READ_ONCE(x)  (*(const volatile __unqual_scalar_typeof(x) *)&(x))
                                                                           ^
>> net/core/filter.c:4165:5: error: assigning to 'struct xdp_sock *' from incompatible type 'void'
           xs = READ_ONCE(dev->_rx[queue_id].xsk);
              ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   9 errors generated.


vim +4165 net/core/filter.c

  4157	
  4158	BPF_CALL_2(bpf_xdp_redirect_xsk, struct xdp_buff *, xdp, u64, action)
  4159	{
  4160		struct net_device *dev = xdp->rxq->dev;
  4161		u32 queue_id = xdp->rxq->queue_index;
  4162		struct bpf_redirect_info *ri;
  4163		struct xdp_sock *xs;
  4164	
> 4165		xs = READ_ONCE(dev->_rx[queue_id].xsk);
  4166		if (!xs)
  4167			return action;
  4168	
  4169		ri = this_cpu_ptr(&bpf_redirect_info);
  4170		ri->tgt_type = XDP_REDIR_XSK;
  4171		ri->tgt_value = xs;
  4172	
  4173		return XDP_REDIRECT;
  4174	}
  4175	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Toke Høiland-Jørgensen Jan. 20, 2021, 12:50 p.m. UTC | #4
Björn Töpel <bjorn.topel@gmail.com> writes:

> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index c001766adcbc..bbc7d9a57262 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3836,6 +3836,12 @@ union bpf_attr {
>   *	Return
>   *		A pointer to a struct socket on success or NULL if the file is
>   *		not a socket.
> + *
> + * long bpf_redirect_xsk(struct xdp_buff *xdp_md, u64 action)
> + *	Description
> + *		Redirect to the registered AF_XDP socket.
> + *	Return
> + *		**XDP_REDIRECT** on success, otherwise the action parameter is returned.
>   */

I think it would be better to make the second argument a 'flags'
argument and make values > XDP_TX invalid (like we do in
bpf_xdp_redirect_map() now). By allowing any value as return you lose
the ability to turn it into a flags argument later...

-Toke
Björn Töpel Jan. 20, 2021, 1:25 p.m. UTC | #5
On 2021-01-20 13:50, Toke Høiland-Jørgensen wrote:
> Björn Töpel <bjorn.topel@gmail.com> writes:
> 
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index c001766adcbc..bbc7d9a57262 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -3836,6 +3836,12 @@ union bpf_attr {
>>    *	Return
>>    *		A pointer to a struct socket on success or NULL if the file is
>>    *		not a socket.
>> + *
>> + * long bpf_redirect_xsk(struct xdp_buff *xdp_md, u64 action)
>> + *	Description
>> + *		Redirect to the registered AF_XDP socket.
>> + *	Return
>> + *		**XDP_REDIRECT** on success, otherwise the action parameter is returned.
>>    */
> 
> I think it would be better to make the second argument a 'flags'
> argument and make values > XDP_TX invalid (like we do in
> bpf_xdp_redirect_map() now). By allowing any value as return you lose
> the ability to turn it into a flags argument later...
>

Yes, but that adds a run-time check. I prefer this non-checked version,
even though it is a bit less futureproof.


Björn
Toke Høiland-Jørgensen Jan. 20, 2021, 2:54 p.m. UTC | #6
Björn Töpel <bjorn.topel@intel.com> writes:

> On 2021-01-20 13:50, Toke Høiland-Jørgensen wrote:
>> Björn Töpel <bjorn.topel@gmail.com> writes:
>> 
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index c001766adcbc..bbc7d9a57262 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -3836,6 +3836,12 @@ union bpf_attr {
>>>    *	Return
>>>    *		A pointer to a struct socket on success or NULL if the file is
>>>    *		not a socket.
>>> + *
>>> + * long bpf_redirect_xsk(struct xdp_buff *xdp_md, u64 action)
>>> + *	Description
>>> + *		Redirect to the registered AF_XDP socket.
>>> + *	Return
>>> + *		**XDP_REDIRECT** on success, otherwise the action parameter is returned.
>>>    */
>> 
>> I think it would be better to make the second argument a 'flags'
>> argument and make values > XDP_TX invalid (like we do in
>> bpf_xdp_redirect_map() now). By allowing any value as return you lose
>> the ability to turn it into a flags argument later...
>>
>
> Yes, but that adds a run-time check. I prefer this non-checked version,
> even though it is a bit less futureproof.

That...seems a bit short-sighted? :)
Can you actually see a difference in your performance numbers?

-Toke
Björn Töpel Jan. 20, 2021, 3:18 p.m. UTC | #7
On 2021-01-20 15:54, Toke Høiland-Jørgensen wrote:
> Björn Töpel <bjorn.topel@intel.com> writes:
> 
>> On 2021-01-20 13:50, Toke Høiland-Jørgensen wrote:
>>> Björn Töpel <bjorn.topel@gmail.com> writes:
>>>
>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>> index c001766adcbc..bbc7d9a57262 100644
>>>> --- a/include/uapi/linux/bpf.h
>>>> +++ b/include/uapi/linux/bpf.h
>>>> @@ -3836,6 +3836,12 @@ union bpf_attr {
>>>>     *	Return
>>>>     *		A pointer to a struct socket on success or NULL if the file is
>>>>     *		not a socket.
>>>> + *
>>>> + * long bpf_redirect_xsk(struct xdp_buff *xdp_md, u64 action)
>>>> + *	Description
>>>> + *		Redirect to the registered AF_XDP socket.
>>>> + *	Return
>>>> + *		**XDP_REDIRECT** on success, otherwise the action parameter is returned.
>>>>     */
>>>
>>> I think it would be better to make the second argument a 'flags'
>>> argument and make values > XDP_TX invalid (like we do in
>>> bpf_xdp_redirect_map() now). By allowing any value as return you lose
>>> the ability to turn it into a flags argument later...
>>>
>>
>> Yes, but that adds a run-time check. I prefer this non-checked version,
>> even though it is a bit less futureproof.
> 
> That...seems a bit short-sighted? :)
> Can you actually see a difference in your performance numbers?
>

I would rather add an additional helper *if* we see the need for flags,
instead of paying for that upfront. For me, BPF is about being able to
specialize, and not having one call with tons of checks.

(Related; Going forward, the growing switch() for redirect targets in
xdp_do_redirect() is a concern for me...)

And yes, even with all those fancy branch predictors, less instructions
is still less. :-) (It shows in my ubenchs.)


Björn
Toke Høiland-Jørgensen Jan. 20, 2021, 5:29 p.m. UTC | #8
Björn Töpel <bjorn.topel@intel.com> writes:

> On 2021-01-20 15:54, Toke Høiland-Jørgensen wrote:
>> Björn Töpel <bjorn.topel@intel.com> writes:
>> 
>>> On 2021-01-20 13:50, Toke Høiland-Jørgensen wrote:
>>>> Björn Töpel <bjorn.topel@gmail.com> writes:
>>>>
>>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>>> index c001766adcbc..bbc7d9a57262 100644
>>>>> --- a/include/uapi/linux/bpf.h
>>>>> +++ b/include/uapi/linux/bpf.h
>>>>> @@ -3836,6 +3836,12 @@ union bpf_attr {
>>>>>     *	Return
>>>>>     *		A pointer to a struct socket on success or NULL if the file is
>>>>>     *		not a socket.
>>>>> + *
>>>>> + * long bpf_redirect_xsk(struct xdp_buff *xdp_md, u64 action)
>>>>> + *	Description
>>>>> + *		Redirect to the registered AF_XDP socket.
>>>>> + *	Return
>>>>> + *		**XDP_REDIRECT** on success, otherwise the action parameter is returned.
>>>>>     */
>>>>
>>>> I think it would be better to make the second argument a 'flags'
>>>> argument and make values > XDP_TX invalid (like we do in
>>>> bpf_xdp_redirect_map() now). By allowing any value as return you lose
>>>> the ability to turn it into a flags argument later...
>>>>
>>>
>>> Yes, but that adds a run-time check. I prefer this non-checked version,
>>> even though it is a bit less futureproof.
>> 
>> That...seems a bit short-sighted? :)
>> Can you actually see a difference in your performance numbers?
>>
>
> I would rather add an additional helper *if* we see the need for flags,
> instead of paying for that upfront. For me, BPF is about being able to
> specialize, and not having one call with tons of checks.

I get that, I'm just pushing back because omitting a 'flags' argument is
literally among the most frequent reasons for having to replace a
syscall (see e.g., [0]) instead of extending it. And yeah, I do realise
that the performance implications are different for XDP than for
syscalls, but maintainability of the API is also important; it's all a
tradeoff. This will be the third redirect helper variant for XDP and I'd
hate for the fourth one to have to be bpf_redirect_xsk_flags() because
it did turn out to be needed...

(One potential concrete reason for this: I believe Magnus was talking
about an API that would allow a BPF program to redirect a packet into
more than one socket (cloning it in the process), or to redirect to a
socket+another target. How would you do that with this new helper?)

[0] https://lwn.net/Articles/585415/

> (Related; Going forward, the growing switch() for redirect targets in
> xdp_do_redirect() is a concern for me...)
>
> And yes, even with all those fancy branch predictors, less instructions
> is still less. :-) (It shows in my ubenchs.)

Right, I do agree that the run-time performance hit of checking the flag
sucks (along with being hard to check for, cf. our parallel discussion
about version checks). So ideally this would be fixed by having the
verifier enforce the argument ranges instead; but if we merge this
without the runtime check now we can't add that later without
potentially breaking programs... :(

-Toke
Björn Töpel Jan. 20, 2021, 6:22 p.m. UTC | #9
On 2021-01-20 18:29, Toke Høiland-Jørgensen wrote:
> Björn Töpel <bjorn.topel@intel.com> writes:
> 
>> On 2021-01-20 15:54, Toke Høiland-Jørgensen wrote:
>>> Björn Töpel <bjorn.topel@intel.com> writes:
>>>
>>>> On 2021-01-20 13:50, Toke Høiland-Jørgensen wrote:
>>>>> Björn Töpel <bjorn.topel@gmail.com> writes:
>>>>>
>>>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>>>> index c001766adcbc..bbc7d9a57262 100644
>>>>>> --- a/include/uapi/linux/bpf.h
>>>>>> +++ b/include/uapi/linux/bpf.h
>>>>>> @@ -3836,6 +3836,12 @@ union bpf_attr {
>>>>>>      *	Return
>>>>>>      *		A pointer to a struct socket on success or NULL if the file is
>>>>>>      *		not a socket.
>>>>>> + *
>>>>>> + * long bpf_redirect_xsk(struct xdp_buff *xdp_md, u64 action)
>>>>>> + *	Description
>>>>>> + *		Redirect to the registered AF_XDP socket.
>>>>>> + *	Return
>>>>>> + *		**XDP_REDIRECT** on success, otherwise the action parameter is returned.
>>>>>>      */
>>>>>
>>>>> I think it would be better to make the second argument a 'flags'
>>>>> argument and make values > XDP_TX invalid (like we do in
>>>>> bpf_xdp_redirect_map() now). By allowing any value as return you lose
>>>>> the ability to turn it into a flags argument later...
>>>>>
>>>>
>>>> Yes, but that adds a run-time check. I prefer this non-checked version,
>>>> even though it is a bit less futureproof.
>>>
>>> That...seems a bit short-sighted? :)
>>> Can you actually see a difference in your performance numbers?
>>>
>>
>> I would rather add an additional helper *if* we see the need for flags,
>> instead of paying for that upfront. For me, BPF is about being able to
>> specialize, and not having one call with tons of checks.
> 
> I get that, I'm just pushing back because omitting a 'flags' argument is
> literally among the most frequent reasons for having to replace a
> syscall (see e.g., [0]) instead of extending it. And yeah, I do realise
> that the performance implications are different for XDP than for
> syscalls, but maintainability of the API is also important; it's all a
> tradeoff. This will be the third redirect helper variant for XDP and I'd
> hate for the fourth one to have to be bpf_redirect_xsk_flags() because
> it did turn out to be needed...
> 
> (One potential concrete reason for this: I believe Magnus was talking
> about an API that would allow a BPF program to redirect a packet into
> more than one socket (cloning it in the process), or to redirect to a
> socket+another target. How would you do that with this new helper?)
> 
> [0] https://lwn.net/Articles/585415/
>

I have a bit of different view. One of the really nice parts about BPF
is exactly specialization. A user can tailor the kernel do a specific
thing. I *don't* see an issue with yet another helper, if that is needed
in the future. I think that is better than bloated helpers trying to
cope for all scenarios. I don't mean we should just add helpers all over
the place, but I do see more lightly on adding helpers, than adding
syscalls.

Elaborating a bit on this: many device drivers try to handle all the
things in the fast-path. I see BPF as one way forward to moving away
from that. Setup what you need, and only run what you currently need,
instead of the current "Is bleh on, then baz? Is this on, then that."

So, I would like to avoid "future proofing" the helpers, if that makes
sense. Use what you need. That's why BPF is so good (one of the things)!

As for bpf_redirect_xsk() it's a leaner version of bpf_redirect_map().
You want flags/shared sockets/...? Well go use bpf_redirect_map() and
XSKMAP. bpf_redirect_xsk() is not for you.

A lot of back-and-forth for *one* if-statement, but it's kind of a
design thing for me. ;-)


Björn


>> (Related; Going forward, the growing switch() for redirect targets in
>> xdp_do_redirect() is a concern for me...)
>>
>> And yes, even with all those fancy branch predictors, less instructions
>> is still less. :-) (It shows in my ubenchs.)
> 
> Right, I do agree that the run-time performance hit of checking the flag
> sucks (along with being hard to check for, cf. our parallel discussion
> about version checks). So ideally this would be fixed by having the
> verifier enforce the argument ranges instead; but if we merge this
> without the runtime check now we can't add that later without
> potentially breaking programs... :(
>
> -Toke
>
Toke Høiland-Jørgensen Jan. 20, 2021, 8:26 p.m. UTC | #10
Björn Töpel <bjorn.topel@intel.com> writes:

> On 2021-01-20 18:29, Toke Høiland-Jørgensen wrote:
>> Björn Töpel <bjorn.topel@intel.com> writes:
>> 
>>> On 2021-01-20 15:54, Toke Høiland-Jørgensen wrote:
>>>> Björn Töpel <bjorn.topel@intel.com> writes:
>>>>
>>>>> On 2021-01-20 13:50, Toke Høiland-Jørgensen wrote:
>>>>>> Björn Töpel <bjorn.topel@gmail.com> writes:
>>>>>>
>>>>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>>>>> index c001766adcbc..bbc7d9a57262 100644
>>>>>>> --- a/include/uapi/linux/bpf.h
>>>>>>> +++ b/include/uapi/linux/bpf.h
>>>>>>> @@ -3836,6 +3836,12 @@ union bpf_attr {
>>>>>>>      *	Return
>>>>>>>      *		A pointer to a struct socket on success or NULL if the file is
>>>>>>>      *		not a socket.
>>>>>>> + *
>>>>>>> + * long bpf_redirect_xsk(struct xdp_buff *xdp_md, u64 action)
>>>>>>> + *	Description
>>>>>>> + *		Redirect to the registered AF_XDP socket.
>>>>>>> + *	Return
>>>>>>> + *		**XDP_REDIRECT** on success, otherwise the action parameter is returned.
>>>>>>>      */
>>>>>>
>>>>>> I think it would be better to make the second argument a 'flags'
>>>>>> argument and make values > XDP_TX invalid (like we do in
>>>>>> bpf_xdp_redirect_map() now). By allowing any value as return you lose
>>>>>> the ability to turn it into a flags argument later...
>>>>>>
>>>>>
>>>>> Yes, but that adds a run-time check. I prefer this non-checked version,
>>>>> even though it is a bit less futureproof.
>>>>
>>>> That...seems a bit short-sighted? :)
>>>> Can you actually see a difference in your performance numbers?
>>>>
>>>
>>> I would rather add an additional helper *if* we see the need for flags,
>>> instead of paying for that upfront. For me, BPF is about being able to
>>> specialize, and not having one call with tons of checks.
>> 
>> I get that, I'm just pushing back because omitting a 'flags' argument is
>> literally among the most frequent reasons for having to replace a
>> syscall (see e.g., [0]) instead of extending it. And yeah, I do realise
>> that the performance implications are different for XDP than for
>> syscalls, but maintainability of the API is also important; it's all a
>> tradeoff. This will be the third redirect helper variant for XDP and I'd
>> hate for the fourth one to have to be bpf_redirect_xsk_flags() because
>> it did turn out to be needed...
>> 
>> (One potential concrete reason for this: I believe Magnus was talking
>> about an API that would allow a BPF program to redirect a packet into
>> more than one socket (cloning it in the process), or to redirect to a
>> socket+another target. How would you do that with this new helper?)
>> 
>> [0] https://lwn.net/Articles/585415/
>>
>
> I have a bit of different view. One of the really nice parts about BPF
> is exactly specialization. A user can tailor the kernel do a specific
> thing. I *don't* see an issue with yet another helper, if that is needed
> in the future. I think that is better than bloated helpers trying to
> cope for all scenarios. I don't mean we should just add helpers all over
> the place, but I do see more lightly on adding helpers, than adding
> syscalls.
>
> Elaborating a bit on this: many device drivers try to handle all the
> things in the fast-path. I see BPF as one way forward to moving away
> from that. Setup what you need, and only run what you currently need,
> instead of the current "Is bleh on, then baz? Is this on, then that."
>
> So, I would like to avoid "future proofing" the helpers, if that makes
> sense. Use what you need. That's why BPF is so good (one of the
> things)!

Well, it's a tradeoff. We're still defining an API that should not be
(too) confusing...

> As for bpf_redirect_xsk() it's a leaner version of bpf_redirect_map().
> You want flags/shared sockets/...? Well go use bpf_redirect_map() and
> XSKMAP. bpf_redirect_xsk() is not for you.

This argument, however, I buy: bpf_redirect() is the single-purpose
helper for redirecting to an ifindex, bpf_redirect_xsk() is the
single-purpose helper for redirecting to an XSK, and bpf_redirect_map()
is the generic one that does both of those and more. Fair enough,
consider me convinced :)

> A lot of back-and-forth for *one* if-statement, but it's kind of a
> design thing for me. ;-)

Surely you don't mean to imply that you have *better* things to do with
your time than have a 10-emails-long argument over a single if
statement? ;)

-Toke
Alexei Starovoitov Jan. 20, 2021, 9:15 p.m. UTC | #11
On Wed, Jan 20, 2021 at 12:26 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> This argument, however, I buy: bpf_redirect() is the single-purpose
> helper for redirecting to an ifindex, bpf_redirect_xsk() is the
> single-purpose helper for redirecting to an XSK, and bpf_redirect_map()
> is the generic one that does both of those and more. Fair enough,
> consider me convinced :)
>
> > A lot of back-and-forth for *one* if-statement, but it's kind of a
> > design thing for me. ;-)
>
> Surely you don't mean to imply that you have *better* things to do with
> your time than have a 10-emails-long argument over a single if
> statement? ;)

After reading this thread I think I have to pour cold water on the design.

The performance blip comes from hard coded assumptions:
+ queue_id = xdp->rxq->queue_index;
+ xs = READ_ONCE(dev->_rx[queue_id].xsk);

bpf can have specialized helpers, but imo this is beyond what's reasonable.
Please move such things into the program and try to make
bpf_redirect_map faster.

Making af_xdp non-root is orthogonal. If there is actual need for that
it has to be designed thoroughly and not presented as "this helper may
help to do that".
I don't think "may" will materialize unless people actually work
toward the goal of non-root.
Björn Töpel Jan. 21, 2021, 8:18 a.m. UTC | #12
On 2021-01-20 22:15, Alexei Starovoitov wrote:
> On Wed, Jan 20, 2021 at 12:26 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> This argument, however, I buy: bpf_redirect() is the single-purpose
>> helper for redirecting to an ifindex, bpf_redirect_xsk() is the
>> single-purpose helper for redirecting to an XSK, and bpf_redirect_map()
>> is the generic one that does both of those and more. Fair enough,
>> consider me convinced :)
>>
>>> A lot of back-and-forth for *one* if-statement, but it's kind of a
>>> design thing for me. ;-)
>>
>> Surely you don't mean to imply that you have *better* things to do with
>> your time than have a 10-emails-long argument over a single if
>> statement? ;)
> 
> After reading this thread I think I have to pour cold water on the design.
> 
> The performance blip comes from hard coded assumptions:
> + queue_id = xdp->rxq->queue_index;
> + xs = READ_ONCE(dev->_rx[queue_id].xsk);
>

Yes, one can see this as a constrained map:

* The map belongs to a certain netdev.
* Each entry corresponds to a certain queue id.

I.e if we do a successful (non-NULL) lookup, we *know* that all sockets
in that map belong to the netdev, and has the correct queue id.

By doing that we can get rid of two run-time checks: "Is the socket
bound to this netdev?" and "Is this the correct queue id?".

> bpf can have specialized helpers, but imo this is beyond what's reasonable.
> Please move such things into the program and try to make
> bpf_redirect_map faster.
>

I obviously prefer this path, and ideally combined with a way to even
more specialize xdp_do_redirect(). Then again, you are the maintainer! :-)

Maybe an alternative be adding a new type of XSKMAP constrained in the
similar way as above, and continue with bpf_redirect_map(), but with
this new map the index argument would be ignored. Unfortunately the BPF
context (xdp_buff in this case) is not passed to bpf_redirect_map(), so
getting the actual queue_id in the helper is hard. Adding the context as
an additional argument would be a new helper...

I'll need to think a bit more about it. Input/ideas are welcome!


> Making af_xdp non-root is orthogonal. If there is actual need for that
> it has to be designed thoroughly and not presented as "this helper may
> help to do that".
> I don't think "may" will materialize unless people actually work
> toward the goal of non-root.
> 

Fair enough! Same goal could be reached using the existing map approach.


Björn
diff mbox series

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5fc336a271c2..3f9efbd08cba 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -648,6 +648,7 @@  enum xdp_redirect_type {
 	XDP_REDIR_DEV_MAP,
 	XDP_REDIR_CPU_MAP,
 	XDP_REDIR_XSK_MAP,
+	XDP_REDIR_XSK,
 };
 
 DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5b949076ed23..cb0e215e981c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -749,6 +749,7 @@  struct netdev_rx_queue {
 	struct xdp_rxq_info		xdp_rxq;
 #ifdef CONFIG_XDP_SOCKETS
 	struct xsk_buff_pool            *pool;
+	struct xdp_sock			*xsk;
 #endif
 } ____cacheline_aligned_in_smp;
 
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index cc17bc957548..97b21c483baf 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -77,8 +77,10 @@  struct xdp_sock {
 #ifdef CONFIG_XDP_SOCKETS
 
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
+int xsk_generic_redirect(struct net_device *dev, struct xdp_buff *xdp);
 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
 void __xsk_map_flush(void);
+int xsk_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
 
 static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
 						     u32 key)
@@ -100,6 +102,11 @@  static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	return -ENOTSUPP;
 }
 
+static inline int xsk_generic_redirect(struct net_device *dev, struct xdp_buff *xdp)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
 	return -EOPNOTSUPP;
@@ -115,6 +122,11 @@  static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
 	return NULL;
 }
 
+static inline int xsk_redirect(struct net_device *dev, struct xdp_buff *xdp)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index eaa8386dbc63..bd531d561c60 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -84,7 +84,7 @@  struct xsk_buff_pool {
 /* AF_XDP core. */
 struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 						struct xdp_umem *umem);
-int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev,
+int xp_assign_dev(struct xdp_sock *xs, struct xsk_buff_pool *pool, struct net_device *dev,
 		  u16 queue_id, u16 flags);
 int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
 			 struct net_device *dev, u16 queue_id);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c001766adcbc..bbc7d9a57262 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3836,6 +3836,12 @@  union bpf_attr {
  *	Return
  *		A pointer to a struct socket on success or NULL if the file is
  *		not a socket.
+ *
+ * long bpf_redirect_xsk(struct xdp_buff *xdp_md, u64 action)
+ *	Description
+ *		Redirect to the registered AF_XDP socket.
+ *	Return
+ *		**XDP_REDIRECT** on success, otherwise the action parameter is returned.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4001,6 +4007,7 @@  union bpf_attr {
 	FN(ktime_get_coarse_ns),	\
 	FN(ima_inode_hash),		\
 	FN(sock_from_file),		\
+	FN(redirect_xsk),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/net/core/filter.c b/net/core/filter.c
index 5f31e21be531..b457c83fba70 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3977,6 +3977,9 @@  int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 	case XDP_REDIR_XSK_MAP:
 		err = __xsk_map_redirect(fwd, xdp);
 		break;
+	case XDP_REDIR_XSK:
+		err = xsk_redirect(fwd, xdp);
+		break;
 	default:
 		err = -EBADRQC;
 	}
@@ -4044,25 +4047,33 @@  int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 	ri->tgt_type = XDP_REDIR_UNSET;
 	ri->tgt_value = NULL;
 
-	if (type == XDP_REDIR_DEV_IFINDEX) {
+	switch (type) {
+	case XDP_REDIR_DEV_IFINDEX: {
 		fwd = dev_get_by_index_rcu(dev_net(dev), (u32)(long)fwd);
 		if (unlikely(!fwd)) {
 			err = -EINVAL;
-			goto err;
+			break;
 		}
 
 		err = xdp_ok_fwd_dev(fwd, skb->len);
 		if (unlikely(err))
-			goto err;
+			break;
 
 		skb->dev = fwd;
 		_trace_xdp_redirect(dev, xdp_prog, index);
 		generic_xdp_tx(skb, xdp_prog);
 		return 0;
 	}
+	case XDP_REDIR_XSK:
+		err = xsk_generic_redirect(dev, xdp);
+		if (err)
+			break;
+		consume_skb(skb);
+		break;
+	default:
+		return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, type);
+	}
 
-	return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, type);
-err:
 	_trace_xdp_redirect_err(dev, xdp_prog, index, err);
 	return err;
 }
@@ -4144,6 +4155,32 @@  static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
 	.arg3_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_xdp_redirect_xsk, struct xdp_buff *, xdp, u64, action)
+{
+	struct net_device *dev = xdp->rxq->dev;
+	u32 queue_id = xdp->rxq->queue_index;
+	struct bpf_redirect_info *ri;
+	struct xdp_sock *xs;
+
+	xs = READ_ONCE(dev->_rx[queue_id].xsk);
+	if (!xs)
+		return action;
+
+	ri = this_cpu_ptr(&bpf_redirect_info);
+	ri->tgt_type = XDP_REDIR_XSK;
+	ri->tgt_value = xs;
+
+	return XDP_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_xdp_redirect_xsk_proto = {
+	.func           = bpf_xdp_redirect_xsk,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
 				  unsigned long off, unsigned long len)
 {
@@ -7260,6 +7297,8 @@  xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_tcp_gen_syncookie:
 		return &bpf_tcp_gen_syncookie_proto;
 #endif
+	case BPF_FUNC_redirect_xsk:
+		return &bpf_xdp_redirect_xsk_proto;
 	default:
 		return bpf_sk_base_func_proto(func_id);
 	}
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 5820de65060b..79f1492e71e2 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -134,6 +134,28 @@  int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
 	return 0;
 }
 
+static struct xdp_sock *xsk_get_at_qid(struct net_device *dev, u16 queue_id)
+{
+	return READ_ONCE(dev->_rx[queue_id].xsk);
+}
+
+static void xsk_clear(struct xdp_sock *xs)
+{
+	struct net_device *dev = xs->dev;
+	u16 queue_id = xs->queue_id;
+
+	if (queue_id < dev->num_rx_queues)
+		WRITE_ONCE(dev->_rx[queue_id].xsk, NULL);
+}
+
+static void xsk_reg(struct xdp_sock *xs)
+{
+	struct net_device *dev = xs->dev;
+	u16 queue_id = xs->queue_id;
+
+	WRITE_ONCE(dev->_rx[queue_id].xsk, xs);
+}
+
 void xp_release(struct xdp_buff_xsk *xskb)
 {
 	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
@@ -184,7 +206,7 @@  static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
 	memcpy(to_buf, from_buf, len + metalen);
 }
 
-static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+static int ____xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
 	struct xdp_buff *xsk_xdp;
 	int err;
@@ -211,6 +233,22 @@  static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	return 0;
 }
 
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	int err;
+	u32 len;
+
+	if (likely(xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)) {
+		len = xdp->data_end - xdp->data;
+		return __xsk_rcv_zc(xs, xdp, len);
+	}
+
+	err = ____xsk_rcv(xs, xdp);
+	if (!err)
+		xdp_return_buff(xdp);
+	return err;
+}
+
 static bool xsk_tx_writeable(struct xdp_sock *xs)
 {
 	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
@@ -248,6 +286,39 @@  static void xsk_flush(struct xdp_sock *xs)
 	sock_def_readable(&xs->sk);
 }
 
+int xsk_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
+	int err;
+
+	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
+	err = __xsk_rcv(xs, xdp);
+	if (err)
+		return err;
+
+	if (!xs->flush_node.prev)
+		list_add(&xs->flush_node, flush_list);
+	return 0;
+}
+
+int xsk_generic_redirect(struct net_device *dev, struct xdp_buff *xdp)
+{
+	struct xdp_sock *xs;
+	u32 queue_id;
+	int err;
+
+	queue_id = xdp->rxq->queue_index;
+	xs = xsk_get_at_qid(dev, queue_id);
+	if (!xs)
+		return -EINVAL;
+
+	spin_lock_bh(&xs->rx_lock);
+	err = ____xsk_rcv(xs, xdp);
+	xsk_flush(xs);
+	spin_unlock_bh(&xs->rx_lock);
+	return err;
+}
+
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
 	int err;
@@ -255,7 +326,7 @@  int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	spin_lock_bh(&xs->rx_lock);
 	err = xsk_rcv_check(xs, xdp);
 	if (!err) {
-		err = __xsk_rcv(xs, xdp);
+		err = ____xsk_rcv(xs, xdp);
 		xsk_flush(xs);
 	}
 	spin_unlock_bh(&xs->rx_lock);
@@ -264,22 +335,12 @@  int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 
 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
-	int err;
-	u32 len;
+	int err = xsk_rcv_check(xs, xdp);
 
-	err = xsk_rcv_check(xs, xdp);
 	if (err)
 		return err;
 
-	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
-		len = xdp->data_end - xdp->data;
-		return __xsk_rcv_zc(xs, xdp, len);
-	}
-
-	err = __xsk_rcv(xs, xdp);
-	if (!err)
-		xdp_return_buff(xdp);
-	return err;
+	return __xsk_rcv(xs, xdp);
 }
 
 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
@@ -661,6 +722,7 @@  static void xsk_unbind_dev(struct xdp_sock *xs)
 
 	if (xs->state != XSK_BOUND)
 		return;
+	xsk_clear(xs);
 	WRITE_ONCE(xs->state, XSK_UNBOUND);
 
 	/* Wait for driver to stop using the xdp socket. */
@@ -892,7 +954,7 @@  static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 			goto out_unlock;
 		}
 
-		err = xp_assign_dev(xs->pool, dev, qid, flags);
+		err = xp_assign_dev(xs, xs->pool, dev, qid, flags);
 		if (err) {
 			xp_destroy(xs->pool);
 			xs->pool = NULL;
@@ -918,6 +980,7 @@  static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		 */
 		smp_wmb();
 		WRITE_ONCE(xs->state, XSK_BOUND);
+		xsk_reg(xs);
 	}
 out_release:
 	mutex_unlock(&xs->mutex);
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 8de01aaac4a0..af02a69d0bf7 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -119,7 +119,7 @@  static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
 	}
 }
 
-int xp_assign_dev(struct xsk_buff_pool *pool,
+int xp_assign_dev(struct xdp_sock *xs, struct xsk_buff_pool *pool,
 		  struct net_device *netdev, u16 queue_id, u16 flags)
 {
 	bool force_zc, force_copy;
@@ -204,7 +204,7 @@  int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
 	if (pool->uses_need_wakeup)
 		flags |= XDP_USE_NEED_WAKEUP;
 
-	return xp_assign_dev(pool, dev, queue_id, flags);
+	return xp_assign_dev(NULL, pool, dev, queue_id, flags);
 }
 
 void xp_clear_dev(struct xsk_buff_pool *pool)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c001766adcbc..bbc7d9a57262 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3836,6 +3836,12 @@  union bpf_attr {
  *	Return
  *		A pointer to a struct socket on success or NULL if the file is
  *		not a socket.
+ *
+ * long bpf_redirect_xsk(struct xdp_buff *xdp_md, u64 action)
+ *	Description
+ *		Redirect to the registered AF_XDP socket.
+ *	Return
+ *		**XDP_REDIRECT** on success, otherwise the action parameter is returned.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4001,6 +4007,7 @@  union bpf_attr {
 	FN(ktime_get_coarse_ns),	\
 	FN(ima_inode_hash),		\
 	FN(sock_from_file),		\
+	FN(redirect_xsk),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper