diff mbox series

bpf: devmap: allow for repeated redirect

Message ID 20240909-devel-koalo-fix-redirect-v1-1-2dd90771146c@linutronix.de (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf: devmap: allow for repeated redirect | expand

Checks

Context Check Description
netdev/tree_selection success Guessing tree name failed - patch did not apply, async
bpf/vmtest-bpf-PR success PR summary
bpf/vmtest-bpf-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc

Commit Message

Florian Kauer Sept. 9, 2024, 9:06 p.m. UTC
Currently, returning XDP_REDIRECT from a xdp/devmap program
is considered as invalid action and an exception is traced.

While it might seem counterintuitive to redirect in a xdp/devmap
program (why not just redirect to the correct interface in the
first program?), we faced several use cases where supporting
this would be very useful.

Most importantly, they occur when the first redirect is used
with the BPF_F_BROADCAST flag. Using this together with xdp/devmap
programs, enables to perform different actions on clones of
the same incoming frame. In that case, it is often useful
to redirect either to a different CPU or device AFTER the cloning.

For example:
- Replicate the frame (for redundancy according to IEEE 802.1CB FRER)
  and then use the second redirect with a cpumap to select
  the path-specific egress queue.

- Also, one of the paths might need an encapsulation that
  exceeds the MTU. So a second redirect can be used back
  to the ingress interface to send an ICMP FRAG_NEEDED packet.

- For OAM purposes, you might want to send one frame with
  OAM information back, while the original frame in passed forward.

To enable these use cases, add the XDP_REDIRECT case to
dev_map_bpf_prog_run. Also, when this is called from inside
xdp_do_flush, the redirect might add further entries to the
flush lists that are currently processed. Therefore, loop inside
xdp_do_flush until no more additional redirects were added.

Signed-off-by: Florian Kauer <florian.kauer@linutronix.de>
---
 include/linux/bpf.h        |  4 ++--
 include/trace/events/xdp.h | 10 ++++++----
 kernel/bpf/devmap.c        | 37 +++++++++++++++++++++++++++--------
 net/core/filter.c          | 48 +++++++++++++++++++++++++++-------------------
 4 files changed, 65 insertions(+), 34 deletions(-)


---
base-commit: 8e69c96df771ab469cec278edb47009351de4da6
change-id: 20240909-devel-koalo-fix-redirect-684639694951
prerequisite-patch-id: 6928ae7741727e3b2ab4a8c4256b06a861040a01

Best regards,

Comments

Toke Høiland-Jørgensen Sept. 10, 2024, 8:34 a.m. UTC | #1
Florian Kauer <florian.kauer@linutronix.de> writes:

> Currently, returning XDP_REDIRECT from a xdp/devmap program
> is considered as invalid action and an exception is traced.
>
> While it might seem counterintuitive to redirect in a xdp/devmap
> program (why not just redirect to the correct interface in the
> first program?), we faced several use cases where supporting
> this would be very useful.
>
> Most importantly, they occur when the first redirect is used
> with the BPF_F_BROADCAST flag. Using this together with xdp/devmap
> programs, enables to perform different actions on clones of
> the same incoming frame. In that case, it is often useful
> to redirect either to a different CPU or device AFTER the cloning.
>
> For example:
> - Replicate the frame (for redundancy according to IEEE 802.1CB FRER)
>   and then use the second redirect with a cpumap to select
>   the path-specific egress queue.
>
> - Also, one of the paths might need an encapsulation that
>   exceeds the MTU. So a second redirect can be used back
>   to the ingress interface to send an ICMP FRAG_NEEDED packet.
>
> - For OAM purposes, you might want to send one frame with
>   OAM information back, while the original frame in passed forward.
>
> To enable these use cases, add the XDP_REDIRECT case to
> dev_map_bpf_prog_run. Also, when this is called from inside
> xdp_do_flush, the redirect might add further entries to the
> flush lists that are currently processed. Therefore, loop inside
> xdp_do_flush until no more additional redirects were added.
>
> Signed-off-by: Florian Kauer <florian.kauer@linutronix.de>

This is an interesting use case! However, your implementation makes it
way to easy to end up in a situation that loops forever, so I think we
should add some protection against that. Some details below:

> ---
>  include/linux/bpf.h        |  4 ++--
>  include/trace/events/xdp.h | 10 ++++++----
>  kernel/bpf/devmap.c        | 37 +++++++++++++++++++++++++++--------
>  net/core/filter.c          | 48 +++++++++++++++++++++++++++-------------------
>  4 files changed, 65 insertions(+), 34 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 3b94ec161e8c..1b57cbabf199 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2498,7 +2498,7 @@ struct sk_buff;
>  struct bpf_dtab_netdev;
>  struct bpf_cpu_map_entry;
>  
> -void __dev_flush(struct list_head *flush_list);
> +void __dev_flush(struct list_head *flush_list, int *redirects);
>  int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>  		    struct net_device *dev_rx);
>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
> @@ -2740,7 +2740,7 @@ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
>  	return ERR_PTR(-EOPNOTSUPP);
>  }
>  
> -static inline void __dev_flush(struct list_head *flush_list)
> +static inline void __dev_flush(struct list_head *flush_list, int *redirects)
>  {
>  }
>  
> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
> index a7e5452b5d21..fba2c457e727 100644
> --- a/include/trace/events/xdp.h
> +++ b/include/trace/events/xdp.h
> @@ -269,9 +269,9 @@ TRACE_EVENT(xdp_devmap_xmit,
>  
>  	TP_PROTO(const struct net_device *from_dev,
>  		 const struct net_device *to_dev,
> -		 int sent, int drops, int err),
> +		 int sent, int drops, int redirects, int err),
>  
> -	TP_ARGS(from_dev, to_dev, sent, drops, err),
> +	TP_ARGS(from_dev, to_dev, sent, drops, redirects, err),
>  
>  	TP_STRUCT__entry(
>  		__field(int, from_ifindex)
> @@ -279,6 +279,7 @@ TRACE_EVENT(xdp_devmap_xmit,
>  		__field(int, to_ifindex)
>  		__field(int, drops)
>  		__field(int, sent)
> +		__field(int, redirects)
>  		__field(int, err)
>  	),
>  
> @@ -288,16 +289,17 @@ TRACE_EVENT(xdp_devmap_xmit,
>  		__entry->to_ifindex	= to_dev->ifindex;
>  		__entry->drops		= drops;
>  		__entry->sent		= sent;
> +		__entry->redirects	= redirects;
>  		__entry->err		= err;
>  	),
>  
>  	TP_printk("ndo_xdp_xmit"
>  		  " from_ifindex=%d to_ifindex=%d action=%s"
> -		  " sent=%d drops=%d"
> +		  " sent=%d drops=%d redirects=%d"
>  		  " err=%d",
>  		  __entry->from_ifindex, __entry->to_ifindex,
>  		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
> -		  __entry->sent, __entry->drops,
> +		  __entry->sent, __entry->drops, __entry->redirects,
>  		  __entry->err)
>  );
>  
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 7878be18e9d2..89bdec49ea40 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -334,7 +334,8 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
>  static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>  				struct xdp_frame **frames, int n,
>  				struct net_device *tx_dev,
> -				struct net_device *rx_dev)
> +				struct net_device *rx_dev,
> +				int *redirects)
>  {
>  	struct xdp_txq_info txq = { .dev = tx_dev };
>  	struct xdp_rxq_info rxq = { .dev = rx_dev };
> @@ -359,6 +360,13 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>  			else
>  				frames[nframes++] = xdpf;
>  			break;
> +		case XDP_REDIRECT:
> +			err = xdp_do_redirect(rx_dev, &xdp, xdp_prog);
> +			if (unlikely(err))
> +				xdp_return_frame_rx_napi(xdpf);
> +			else
> +				*redirects += 1;
> +			break;

It's a bit subtle, but dev_map_bpf_prog_run() also filters the list of
frames in the queue in-place (the frames[nframes++] = xdpf; line above),
which only works under the assumption that the array in bq->q is not
modified while this loop is being run. But now you're adding a call in
the middle that may result in the packet being put back on the same
queue in the middle, which means that this assumption no longer holds.

So you need to clear the bq->q queue first for this to work.
Specifically, at the start of bq_xmit_all(), you'll need to first copy
all the packet pointer onto an on-stack array, then run the rest of the
function on that array. There's already an initial loop that goes
through all the frames, so you can just do it there.

So the loop at the start of bq_xmit_all() goes from the current:

	for (i = 0; i < cnt; i++) {
		struct xdp_frame *xdpf = bq->q[i];

		prefetch(xdpf);
	}


to something like:

        struct xdp_frame *frames[DEV_MAP_BULK_SIZE];

	for (i = 0; i < cnt; i++) {
		struct xdp_frame *xdpf = bq->q[i];

		prefetch(xdpf);
                frames[i] = xdpf;
	}

        bq->count = 0; /* bq is now empty, use the 'frames' and 'cnt'
                          stack variables for the rest of the function */



>  		default:
>  			bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
>  			fallthrough;
> @@ -373,7 +381,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>  	return nframes; /* sent frames count */
>  }
>  
> -static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
> +static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags, int *redirects)
>  {
>  	struct net_device *dev = bq->dev;
>  	unsigned int cnt = bq->count;
> @@ -390,8 +398,10 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>  		prefetch(xdpf);
>  	}
>  
> +	int new_redirects = 0;
>  	if (bq->xdp_prog) {
> -		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx);
> +		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx,
> +				&new_redirects);
>  		if (!to_send)
>  			goto out;
>  	}
> @@ -413,19 +423,21 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>  
>  out:
>  	bq->count = 0;
> -	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
> +	*redirects += new_redirects;
> +	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent - new_redirects,
> +			new_redirects, err);
>  }
>  
>  /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
>   * driver before returning from its napi->poll() routine. See the comment above
>   * xdp_do_flush() in filter.c.
>   */
> -void __dev_flush(struct list_head *flush_list)
> +void __dev_flush(struct list_head *flush_list, int *redirects)
>  {
>  	struct xdp_dev_bulk_queue *bq, *tmp;
>  
>  	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
> -		bq_xmit_all(bq, XDP_XMIT_FLUSH);
> +		bq_xmit_all(bq, XDP_XMIT_FLUSH, redirects);
>  		bq->dev_rx = NULL;
>  		bq->xdp_prog = NULL;
>  		__list_del_clearprev(&bq->flush_node);
> @@ -458,8 +470,17 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>  {
>  	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
>  
> -	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
> -		bq_xmit_all(bq, 0);
> +	if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) {
> +		int redirects = 0;
> +
> +		bq_xmit_all(bq, 0, &redirects);
> +
> +		/* according to comment above xdp_do_flush() in
> +		 * filter.c, xdp_do_flush is always called at
> +		 * the end of the NAPI anyway, so no need to act
> +		 * on the redirects here
> +		 */

While it's true that it will be called again in NAPI, the purpose of
calling bq_xmit_all() here is to make room space for the packet on the
bulk queue that we're about to enqueue, and if bq_xmit_all() can just
put the packet back on the queue, there is no guarantee that this will
succeed. So you will have to handle that case here.

Since there's also a potential infinite recursion issue in the
do_flush() functions below, I think it may be better to handle this
looping issue inside bq_xmit_all().

I.e., structure the code so that bq_xmit_all() guarantees that when it
returns it has actually done its job; that is, that bq->q is empty.

Given the above "move all frames out of bq->q at the start" change, this
is not all that hard. Simply add a check after the out: label (in
bq_xmit_all()) to check if bq->count is actually 0, and if it isn't,
start over from the beginning of that function. This also makes it
straight forward to add a recursion limit; after looping a set number of
times (say, XMIT_RECURSION_LIMIT), simply turn XDP_REDIRECT into drops.

There will need to be some additional protection against looping forever
in __dev_flush(), to handle the case where a packet is looped between
two interfaces. This one is a bit trickier, but a similar recursion
counter could be used, I think.

In any case, this needs extensive selftests, including ones with devmap
programs that loop packets (both redirect from a->a, and from
a->b->a->b) to make sure the limits work correctly.

> +	}
>  
>  	/* Ingress dev_rx will be the same for all xdp_frame's in
>  	 * bulk_queue, because bq stored per-CPU and must be flushed
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 8569cd2482ee..b33fc0b1444a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4287,14 +4287,18 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
>  void xdp_do_flush(void)
>  {
>  	struct list_head *lh_map, *lh_dev, *lh_xsk;
> +	int redirect;
>  
> -	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
> -	if (lh_dev)
> -		__dev_flush(lh_dev);
> -	if (lh_map)
> -		__cpu_map_flush(lh_map);
> -	if (lh_xsk)
> -		__xsk_map_flush(lh_xsk);
> +	do {
> +		redirect = 0;
> +		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
> +		if (lh_dev)
> +			__dev_flush(lh_dev, &redirect);
> +		if (lh_map)
> +			__cpu_map_flush(lh_map);
> +		if (lh_xsk)
> +			__xsk_map_flush(lh_xsk);
> +	} while (redirect > 0);
>  }
>  EXPORT_SYMBOL_GPL(xdp_do_flush);
>  
> @@ -4303,20 +4307,24 @@ void xdp_do_check_flushed(struct napi_struct *napi)
>  {
>  	struct list_head *lh_map, *lh_dev, *lh_xsk;
>  	bool missed = false;
> +	int redirect;
>  
> -	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
> -	if (lh_dev) {
> -		__dev_flush(lh_dev);
> -		missed = true;
> -	}
> -	if (lh_map) {
> -		__cpu_map_flush(lh_map);
> -		missed = true;
> -	}
> -	if (lh_xsk) {
> -		__xsk_map_flush(lh_xsk);
> -		missed = true;
> -	}
> +	do {
> +		redirect = 0;
> +		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
> +		if (lh_dev) {
> +			__dev_flush(lh_dev, &redirect);
> +			missed = true;
> +		}
> +		if (lh_map) {
> +			__cpu_map_flush(lh_map);
> +			missed = true;
> +		}
> +		if (lh_xsk) {
> +			__xsk_map_flush(lh_xsk);
> +			missed = true;
> +		}
> +	} while (redirect > 0);

With the change suggested above (so that bq_xmit_all() guarantees the
flush is completely done), this looping is not needed anymore. However,
it becomes important in which *order* the flushing is done
(__dev_flush() should always happen first), so adding a comment to note
this would be good.

-Toke
Florian Kauer Sept. 11, 2024, 8:26 a.m. UTC | #2
Hi Toke,

On 9/10/24 10:34, Toke Høiland-Jørgensen wrote:
> Florian Kauer <florian.kauer@linutronix.de> writes:
> 
>> Currently, returning XDP_REDIRECT from a xdp/devmap program
>> is considered as invalid action and an exception is traced.
>>
>> While it might seem counterintuitive to redirect in a xdp/devmap
>> program (why not just redirect to the correct interface in the
>> first program?), we faced several use cases where supporting
>> this would be very useful.
>>
>> Most importantly, they occur when the first redirect is used
>> with the BPF_F_BROADCAST flag. Using this together with xdp/devmap
>> programs, enables to perform different actions on clones of
>> the same incoming frame. In that case, it is often useful
>> to redirect either to a different CPU or device AFTER the cloning.
>>
>> For example:
>> - Replicate the frame (for redundancy according to IEEE 802.1CB FRER)
>>   and then use the second redirect with a cpumap to select
>>   the path-specific egress queue.
>>
>> - Also, one of the paths might need an encapsulation that
>>   exceeds the MTU. So a second redirect can be used back
>>   to the ingress interface to send an ICMP FRAG_NEEDED packet.
>>
>> - For OAM purposes, you might want to send one frame with
>>   OAM information back, while the original frame in passed forward.
>>
>> To enable these use cases, add the XDP_REDIRECT case to
>> dev_map_bpf_prog_run. Also, when this is called from inside
>> xdp_do_flush, the redirect might add further entries to the
>> flush lists that are currently processed. Therefore, loop inside
>> xdp_do_flush until no more additional redirects were added.
>>
>> Signed-off-by: Florian Kauer <florian.kauer@linutronix.de>
> 
> This is an interesting use case! However, your implementation makes it
> way to easy to end up in a situation that loops forever, so I think we
> should add some protection against that. Some details below:
> 
>> ---
>>  include/linux/bpf.h        |  4 ++--
>>  include/trace/events/xdp.h | 10 ++++++----
>>  kernel/bpf/devmap.c        | 37 +++++++++++++++++++++++++++--------
>>  net/core/filter.c          | 48 +++++++++++++++++++++++++++-------------------
>>  4 files changed, 65 insertions(+), 34 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 3b94ec161e8c..1b57cbabf199 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -2498,7 +2498,7 @@ struct sk_buff;
>>  struct bpf_dtab_netdev;
>>  struct bpf_cpu_map_entry;
>>  
>> -void __dev_flush(struct list_head *flush_list);
>> +void __dev_flush(struct list_head *flush_list, int *redirects);
>>  int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>>  		    struct net_device *dev_rx);
>>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
>> @@ -2740,7 +2740,7 @@ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
>>  	return ERR_PTR(-EOPNOTSUPP);
>>  }
>>  
>> -static inline void __dev_flush(struct list_head *flush_list)
>> +static inline void __dev_flush(struct list_head *flush_list, int *redirects)
>>  {
>>  }
>>  
>> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
>> index a7e5452b5d21..fba2c457e727 100644
>> --- a/include/trace/events/xdp.h
>> +++ b/include/trace/events/xdp.h
>> @@ -269,9 +269,9 @@ TRACE_EVENT(xdp_devmap_xmit,
>>  
>>  	TP_PROTO(const struct net_device *from_dev,
>>  		 const struct net_device *to_dev,
>> -		 int sent, int drops, int err),
>> +		 int sent, int drops, int redirects, int err),
>>  
>> -	TP_ARGS(from_dev, to_dev, sent, drops, err),
>> +	TP_ARGS(from_dev, to_dev, sent, drops, redirects, err),
>>  
>>  	TP_STRUCT__entry(
>>  		__field(int, from_ifindex)
>> @@ -279,6 +279,7 @@ TRACE_EVENT(xdp_devmap_xmit,
>>  		__field(int, to_ifindex)
>>  		__field(int, drops)
>>  		__field(int, sent)
>> +		__field(int, redirects)
>>  		__field(int, err)
>>  	),
>>  
>> @@ -288,16 +289,17 @@ TRACE_EVENT(xdp_devmap_xmit,
>>  		__entry->to_ifindex	= to_dev->ifindex;
>>  		__entry->drops		= drops;
>>  		__entry->sent		= sent;
>> +		__entry->redirects	= redirects;
>>  		__entry->err		= err;
>>  	),
>>  
>>  	TP_printk("ndo_xdp_xmit"
>>  		  " from_ifindex=%d to_ifindex=%d action=%s"
>> -		  " sent=%d drops=%d"
>> +		  " sent=%d drops=%d redirects=%d"
>>  		  " err=%d",
>>  		  __entry->from_ifindex, __entry->to_ifindex,
>>  		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
>> -		  __entry->sent, __entry->drops,
>> +		  __entry->sent, __entry->drops, __entry->redirects,
>>  		  __entry->err)
>>  );
>>  
>> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
>> index 7878be18e9d2..89bdec49ea40 100644
>> --- a/kernel/bpf/devmap.c
>> +++ b/kernel/bpf/devmap.c
>> @@ -334,7 +334,8 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
>>  static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>>  				struct xdp_frame **frames, int n,
>>  				struct net_device *tx_dev,
>> -				struct net_device *rx_dev)
>> +				struct net_device *rx_dev,
>> +				int *redirects)
>>  {
>>  	struct xdp_txq_info txq = { .dev = tx_dev };
>>  	struct xdp_rxq_info rxq = { .dev = rx_dev };
>> @@ -359,6 +360,13 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>>  			else
>>  				frames[nframes++] = xdpf;
>>  			break;
>> +		case XDP_REDIRECT:
>> +			err = xdp_do_redirect(rx_dev, &xdp, xdp_prog);
>> +			if (unlikely(err))
>> +				xdp_return_frame_rx_napi(xdpf);
>> +			else
>> +				*redirects += 1;
>> +			break;
> 
> It's a bit subtle, but dev_map_bpf_prog_run() also filters the list of
> frames in the queue in-place (the frames[nframes++] = xdpf; line above),
> which only works under the assumption that the array in bq->q is not
> modified while this loop is being run. But now you're adding a call in
> the middle that may result in the packet being put back on the same
> queue in the middle, which means that this assumption no longer holds.
> 
> So you need to clear the bq->q queue first for this to work.
> Specifically, at the start of bq_xmit_all(), you'll need to first copy
> all the packet pointer onto an on-stack array, then run the rest of the
> function on that array. There's already an initial loop that goes
> through all the frames, so you can just do it there.
> 
> So the loop at the start of bq_xmit_all() goes from the current:
> 
> 	for (i = 0; i < cnt; i++) {
> 		struct xdp_frame *xdpf = bq->q[i];
> 
> 		prefetch(xdpf);
> 	}
> 
> 
> to something like:
> 
>         struct xdp_frame *frames[DEV_MAP_BULK_SIZE];
> 
> 	for (i = 0; i < cnt; i++) {
> 		struct xdp_frame *xdpf = bq->q[i];
> 
> 		prefetch(xdpf);
>                 frames[i] = xdpf;
> 	}
> 
>         bq->count = 0; /* bq is now empty, use the 'frames' and 'cnt'
>                           stack variables for the rest of the function */
> 
> 
> 
>>  		default:
>>  			bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
>>  			fallthrough;
>> @@ -373,7 +381,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>>  	return nframes; /* sent frames count */
>>  }
>>  
>> -static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>> +static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags, int *redirects)
>>  {
>>  	struct net_device *dev = bq->dev;
>>  	unsigned int cnt = bq->count;
>> @@ -390,8 +398,10 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>>  		prefetch(xdpf);
>>  	}
>>  
>> +	int new_redirects = 0;
>>  	if (bq->xdp_prog) {
>> -		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx);
>> +		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx,
>> +				&new_redirects);
>>  		if (!to_send)
>>  			goto out;
>>  	}
>> @@ -413,19 +423,21 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>>  
>>  out:
>>  	bq->count = 0;
>> -	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
>> +	*redirects += new_redirects;
>> +	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent - new_redirects,
>> +			new_redirects, err);
>>  }
>>  
>>  /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
>>   * driver before returning from its napi->poll() routine. See the comment above
>>   * xdp_do_flush() in filter.c.
>>   */
>> -void __dev_flush(struct list_head *flush_list)
>> +void __dev_flush(struct list_head *flush_list, int *redirects)
>>  {
>>  	struct xdp_dev_bulk_queue *bq, *tmp;
>>  
>>  	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
>> -		bq_xmit_all(bq, XDP_XMIT_FLUSH);
>> +		bq_xmit_all(bq, XDP_XMIT_FLUSH, redirects);
>>  		bq->dev_rx = NULL;
>>  		bq->xdp_prog = NULL;
>>  		__list_del_clearprev(&bq->flush_node);
>> @@ -458,8 +470,17 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>>  {
>>  	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
>>  
>> -	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
>> -		bq_xmit_all(bq, 0);
>> +	if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) {
>> +		int redirects = 0;
>> +
>> +		bq_xmit_all(bq, 0, &redirects);
>> +
>> +		/* according to comment above xdp_do_flush() in
>> +		 * filter.c, xdp_do_flush is always called at
>> +		 * the end of the NAPI anyway, so no need to act
>> +		 * on the redirects here
>> +		 */
> 
> While it's true that it will be called again in NAPI, the purpose of
> calling bq_xmit_all() here is to make room space for the packet on the
> bulk queue that we're about to enqueue, and if bq_xmit_all() can just
> put the packet back on the queue, there is no guarantee that this will
> succeed. So you will have to handle that case here.
> 
> Since there's also a potential infinite recursion issue in the
> do_flush() functions below, I think it may be better to handle this
> looping issue inside bq_xmit_all().
> 
> I.e., structure the code so that bq_xmit_all() guarantees that when it
> returns it has actually done its job; that is, that bq->q is empty.
> 
> Given the above "move all frames out of bq->q at the start" change, this
> is not all that hard. Simply add a check after the out: label (in
> bq_xmit_all()) to check if bq->count is actually 0, and if it isn't,
> start over from the beginning of that function. This also makes it
> straight forward to add a recursion limit; after looping a set number of
> times (say, XMIT_RECURSION_LIMIT), simply turn XDP_REDIRECT into drops.
> 
> There will need to be some additional protection against looping forever
> in __dev_flush(), to handle the case where a packet is looped between
> two interfaces. This one is a bit trickier, but a similar recursion
> counter could be used, I think.


Thanks a lot for the extensive support!
Regarding __dev_flush(), could the following already be sufficient?

void __dev_flush(struct list_head *flush_list)
{
        struct xdp_dev_bulk_queue *bq, *tmp;
        int i,j;

        for (i = 0; !list_empty(flush_list) && i < XMIT_RECURSION_LIMIT; i++) {
                /* go through list in reverse so that new items
                 * added to the flush_list will only be handled
                 * in the next iteration of the outer loop
                 */
                list_for_each_entry_safe_reverse(bq, tmp, flush_list, flush_node) {
                        bq_xmit_all(bq, XDP_XMIT_FLUSH);
                        bq->dev_rx = NULL;
                        bq->xdp_prog = NULL;
                        __list_del_clearprev(&bq->flush_node);
                }
        }

        if (i == XMIT_RECURSION_LIMIT) {
                pr_warn("XDP recursion limit hit, expect packet loss!\n");

                list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
                        for (j = 0; j < bq->count; j++)
                                xdp_return_frame_rx_napi(bq->q[i]);

                        bq->count = 0;
                        bq->dev_rx = NULL;
                        bq->xdp_prog = NULL;
                        __list_del_clearprev(&bq->flush_node);
                }
        }
}


> 
> In any case, this needs extensive selftests, including ones with devmap
> programs that loop packets (both redirect from a->a, and from
> a->b->a->b) to make sure the limits work correctly.


Good point! I am going to prepare some.


> 
>> +	}
>>  
>>  	/* Ingress dev_rx will be the same for all xdp_frame's in
>>  	 * bulk_queue, because bq stored per-CPU and must be flushed
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 8569cd2482ee..b33fc0b1444a 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -4287,14 +4287,18 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
>>  void xdp_do_flush(void)
>>  {
>>  	struct list_head *lh_map, *lh_dev, *lh_xsk;
>> +	int redirect;
>>  
>> -	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>> -	if (lh_dev)
>> -		__dev_flush(lh_dev);
>> -	if (lh_map)
>> -		__cpu_map_flush(lh_map);
>> -	if (lh_xsk)
>> -		__xsk_map_flush(lh_xsk);
>> +	do {
>> +		redirect = 0;
>> +		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>> +		if (lh_dev)
>> +			__dev_flush(lh_dev, &redirect);
>> +		if (lh_map)
>> +			__cpu_map_flush(lh_map);
>> +		if (lh_xsk)
>> +			__xsk_map_flush(lh_xsk);
>> +	} while (redirect > 0);
>>  }
>>  EXPORT_SYMBOL_GPL(xdp_do_flush);
>>  
>> @@ -4303,20 +4307,24 @@ void xdp_do_check_flushed(struct napi_struct *napi)
>>  {
>>  	struct list_head *lh_map, *lh_dev, *lh_xsk;
>>  	bool missed = false;
>> +	int redirect;
>>  
>> -	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>> -	if (lh_dev) {
>> -		__dev_flush(lh_dev);
>> -		missed = true;
>> -	}
>> -	if (lh_map) {
>> -		__cpu_map_flush(lh_map);
>> -		missed = true;
>> -	}
>> -	if (lh_xsk) {
>> -		__xsk_map_flush(lh_xsk);
>> -		missed = true;
>> -	}
>> +	do {
>> +		redirect = 0;
>> +		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>> +		if (lh_dev) {
>> +			__dev_flush(lh_dev, &redirect);
>> +			missed = true;
>> +		}
>> +		if (lh_map) {
>> +			__cpu_map_flush(lh_map);
>> +			missed = true;
>> +		}
>> +		if (lh_xsk) {
>> +			__xsk_map_flush(lh_xsk);
>> +			missed = true;
>> +		}
>> +	} while (redirect > 0);
> 
> With the change suggested above (so that bq_xmit_all() guarantees the
> flush is completely done), this looping is not needed anymore. However,
> it becomes important in which *order* the flushing is done
> (__dev_flush() should always happen first), so adding a comment to note
> this would be good.


I guess, if we remove the loop here and we still want to cover the case of
redirecting from devmap program via cpumap, we need to fetch the lh_map again
after calling __dev_flush, right?
So I think we should no longer use bpf_net_ctx_get_all_used_flush_lists then:

        lh_dev = bpf_net_ctx_get_dev_flush_list();
        if (lh_dev)
                __dev_flush(lh_dev);
        lh_map = bpf_net_ctx_get_cpu_map_flush_list();
        if (lh_map)
                __cpu_map_flush(lh_map);
        lh_xsk = bpf_net_ctx_get_xskmap_flush_list();
        if (lh_xsk)
                __xsk_map_flush(lh_xsk);

But bpf_net_ctx_get_all_used_flush_lists also includes additional checks
for IS_ENABLED(CONFIG_BPF_SYSCALL) and IS_ENABLED(CONFIG_XDP_SOCKETS),
so I guess they should be directly included in the xdp_do_flush and
xdp_do_check_flushed?
Then we could remove the bpf_net_ctx_get_all_used_flush_lists.
Or do you have an idea for a more elegant solution?

Thanks,
Florian
Toke Høiland-Jørgensen Sept. 11, 2024, 9:25 a.m. UTC | #3
Florian Kauer <florian.kauer@linutronix.de> writes:

> Hi Toke,
>
> On 9/10/24 10:34, Toke Høiland-Jørgensen wrote:
>> Florian Kauer <florian.kauer@linutronix.de> writes:
>> 
>>> Currently, returning XDP_REDIRECT from a xdp/devmap program
>>> is considered as invalid action and an exception is traced.
>>>
>>> While it might seem counterintuitive to redirect in a xdp/devmap
>>> program (why not just redirect to the correct interface in the
>>> first program?), we faced several use cases where supporting
>>> this would be very useful.
>>>
>>> Most importantly, they occur when the first redirect is used
>>> with the BPF_F_BROADCAST flag. Using this together with xdp/devmap
>>> programs, enables to perform different actions on clones of
>>> the same incoming frame. In that case, it is often useful
>>> to redirect either to a different CPU or device AFTER the cloning.
>>>
>>> For example:
>>> - Replicate the frame (for redundancy according to IEEE 802.1CB FRER)
>>>   and then use the second redirect with a cpumap to select
>>>   the path-specific egress queue.
>>>
>>> - Also, one of the paths might need an encapsulation that
>>>   exceeds the MTU. So a second redirect can be used back
>>>   to the ingress interface to send an ICMP FRAG_NEEDED packet.
>>>
>>> - For OAM purposes, you might want to send one frame with
>>>   OAM information back, while the original frame in passed forward.
>>>
>>> To enable these use cases, add the XDP_REDIRECT case to
>>> dev_map_bpf_prog_run. Also, when this is called from inside
>>> xdp_do_flush, the redirect might add further entries to the
>>> flush lists that are currently processed. Therefore, loop inside
>>> xdp_do_flush until no more additional redirects were added.
>>>
>>> Signed-off-by: Florian Kauer <florian.kauer@linutronix.de>
>> 
>> This is an interesting use case! However, your implementation makes it
>> way to easy to end up in a situation that loops forever, so I think we
>> should add some protection against that. Some details below:
>> 
>>> ---
>>>  include/linux/bpf.h        |  4 ++--
>>>  include/trace/events/xdp.h | 10 ++++++----
>>>  kernel/bpf/devmap.c        | 37 +++++++++++++++++++++++++++--------
>>>  net/core/filter.c          | 48 +++++++++++++++++++++++++++-------------------
>>>  4 files changed, 65 insertions(+), 34 deletions(-)
>>>
>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>> index 3b94ec161e8c..1b57cbabf199 100644
>>> --- a/include/linux/bpf.h
>>> +++ b/include/linux/bpf.h
>>> @@ -2498,7 +2498,7 @@ struct sk_buff;
>>>  struct bpf_dtab_netdev;
>>>  struct bpf_cpu_map_entry;
>>>  
>>> -void __dev_flush(struct list_head *flush_list);
>>> +void __dev_flush(struct list_head *flush_list, int *redirects);
>>>  int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>>>  		    struct net_device *dev_rx);
>>>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
>>> @@ -2740,7 +2740,7 @@ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
>>>  	return ERR_PTR(-EOPNOTSUPP);
>>>  }
>>>  
>>> -static inline void __dev_flush(struct list_head *flush_list)
>>> +static inline void __dev_flush(struct list_head *flush_list, int *redirects)
>>>  {
>>>  }
>>>  
>>> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
>>> index a7e5452b5d21..fba2c457e727 100644
>>> --- a/include/trace/events/xdp.h
>>> +++ b/include/trace/events/xdp.h
>>> @@ -269,9 +269,9 @@ TRACE_EVENT(xdp_devmap_xmit,
>>>  
>>>  	TP_PROTO(const struct net_device *from_dev,
>>>  		 const struct net_device *to_dev,
>>> -		 int sent, int drops, int err),
>>> +		 int sent, int drops, int redirects, int err),
>>>  
>>> -	TP_ARGS(from_dev, to_dev, sent, drops, err),
>>> +	TP_ARGS(from_dev, to_dev, sent, drops, redirects, err),
>>>  
>>>  	TP_STRUCT__entry(
>>>  		__field(int, from_ifindex)
>>> @@ -279,6 +279,7 @@ TRACE_EVENT(xdp_devmap_xmit,
>>>  		__field(int, to_ifindex)
>>>  		__field(int, drops)
>>>  		__field(int, sent)
>>> +		__field(int, redirects)
>>>  		__field(int, err)
>>>  	),
>>>  
>>> @@ -288,16 +289,17 @@ TRACE_EVENT(xdp_devmap_xmit,
>>>  		__entry->to_ifindex	= to_dev->ifindex;
>>>  		__entry->drops		= drops;
>>>  		__entry->sent		= sent;
>>> +		__entry->redirects	= redirects;
>>>  		__entry->err		= err;
>>>  	),
>>>  
>>>  	TP_printk("ndo_xdp_xmit"
>>>  		  " from_ifindex=%d to_ifindex=%d action=%s"
>>> -		  " sent=%d drops=%d"
>>> +		  " sent=%d drops=%d redirects=%d"
>>>  		  " err=%d",
>>>  		  __entry->from_ifindex, __entry->to_ifindex,
>>>  		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
>>> -		  __entry->sent, __entry->drops,
>>> +		  __entry->sent, __entry->drops, __entry->redirects,
>>>  		  __entry->err)
>>>  );
>>>  
>>> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
>>> index 7878be18e9d2..89bdec49ea40 100644
>>> --- a/kernel/bpf/devmap.c
>>> +++ b/kernel/bpf/devmap.c
>>> @@ -334,7 +334,8 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
>>>  static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>>>  				struct xdp_frame **frames, int n,
>>>  				struct net_device *tx_dev,
>>> -				struct net_device *rx_dev)
>>> +				struct net_device *rx_dev,
>>> +				int *redirects)
>>>  {
>>>  	struct xdp_txq_info txq = { .dev = tx_dev };
>>>  	struct xdp_rxq_info rxq = { .dev = rx_dev };
>>> @@ -359,6 +360,13 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>>>  			else
>>>  				frames[nframes++] = xdpf;
>>>  			break;
>>> +		case XDP_REDIRECT:
>>> +			err = xdp_do_redirect(rx_dev, &xdp, xdp_prog);
>>> +			if (unlikely(err))
>>> +				xdp_return_frame_rx_napi(xdpf);
>>> +			else
>>> +				*redirects += 1;
>>> +			break;
>> 
>> It's a bit subtle, but dev_map_bpf_prog_run() also filters the list of
>> frames in the queue in-place (the frames[nframes++] = xdpf; line above),
>> which only works under the assumption that the array in bq->q is not
>> modified while this loop is being run. But now you're adding a call in
>> the middle that may result in the packet being put back on the same
>> queue in the middle, which means that this assumption no longer holds.
>> 
>> So you need to clear the bq->q queue first for this to work.
>> Specifically, at the start of bq_xmit_all(), you'll need to first copy
>> all the packet pointer onto an on-stack array, then run the rest of the
>> function on that array. There's already an initial loop that goes
>> through all the frames, so you can just do it there.
>> 
>> So the loop at the start of bq_xmit_all() goes from the current:
>> 
>> 	for (i = 0; i < cnt; i++) {
>> 		struct xdp_frame *xdpf = bq->q[i];
>> 
>> 		prefetch(xdpf);
>> 	}
>> 
>> 
>> to something like:
>> 
>>         struct xdp_frame *frames[DEV_MAP_BULK_SIZE];
>> 
>> 	for (i = 0; i < cnt; i++) {
>> 		struct xdp_frame *xdpf = bq->q[i];
>> 
>> 		prefetch(xdpf);
>>                 frames[i] = xdpf;
>> 	}
>> 
>>         bq->count = 0; /* bq is now empty, use the 'frames' and 'cnt'
>>                           stack variables for the rest of the function */
>> 
>> 
>> 
>>>  		default:
>>>  			bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
>>>  			fallthrough;
>>> @@ -373,7 +381,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>>>  	return nframes; /* sent frames count */
>>>  }
>>>  
>>> -static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>>> +static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags, int *redirects)
>>>  {
>>>  	struct net_device *dev = bq->dev;
>>>  	unsigned int cnt = bq->count;
>>> @@ -390,8 +398,10 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>>>  		prefetch(xdpf);
>>>  	}
>>>  
>>> +	int new_redirects = 0;
>>>  	if (bq->xdp_prog) {
>>> -		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx);
>>> +		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx,
>>> +				&new_redirects);
>>>  		if (!to_send)
>>>  			goto out;
>>>  	}
>>> @@ -413,19 +423,21 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>>>  
>>>  out:
>>>  	bq->count = 0;
>>> -	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
>>> +	*redirects += new_redirects;
>>> +	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent - new_redirects,
>>> +			new_redirects, err);
>>>  }
>>>  
>>>  /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
>>>   * driver before returning from its napi->poll() routine. See the comment above
>>>   * xdp_do_flush() in filter.c.
>>>   */
>>> -void __dev_flush(struct list_head *flush_list)
>>> +void __dev_flush(struct list_head *flush_list, int *redirects)
>>>  {
>>>  	struct xdp_dev_bulk_queue *bq, *tmp;
>>>  
>>>  	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
>>> -		bq_xmit_all(bq, XDP_XMIT_FLUSH);
>>> +		bq_xmit_all(bq, XDP_XMIT_FLUSH, redirects);
>>>  		bq->dev_rx = NULL;
>>>  		bq->xdp_prog = NULL;
>>>  		__list_del_clearprev(&bq->flush_node);
>>> @@ -458,8 +470,17 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>>>  {
>>>  	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
>>>  
>>> -	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
>>> -		bq_xmit_all(bq, 0);
>>> +	if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) {
>>> +		int redirects = 0;
>>> +
>>> +		bq_xmit_all(bq, 0, &redirects);
>>> +
>>> +		/* according to comment above xdp_do_flush() in
>>> +		 * filter.c, xdp_do_flush is always called at
>>> +		 * the end of the NAPI anyway, so no need to act
>>> +		 * on the redirects here
>>> +		 */
>> 
>> While it's true that it will be called again in NAPI, the purpose of
>> calling bq_xmit_all() here is to make room space for the packet on the
>> bulk queue that we're about to enqueue, and if bq_xmit_all() can just
>> put the packet back on the queue, there is no guarantee that this will
>> succeed. So you will have to handle that case here.
>> 
>> Since there's also a potential infinite recursion issue in the
>> do_flush() functions below, I think it may be better to handle this
>> looping issue inside bq_xmit_all().
>> 
>> I.e., structure the code so that bq_xmit_all() guarantees that when it
>> returns it has actually done its job; that is, that bq->q is empty.
>> 
>> Given the above "move all frames out of bq->q at the start" change, this
>> is not all that hard. Simply add a check after the out: label (in
>> bq_xmit_all()) to check if bq->count is actually 0, and if it isn't,
>> start over from the beginning of that function. This also makes it
>> straight forward to add a recursion limit; after looping a set number of
>> times (say, XMIT_RECURSION_LIMIT), simply turn XDP_REDIRECT into drops.
>> 
>> There will need to be some additional protection against looping forever
>> in __dev_flush(), to handle the case where a packet is looped between
>> two interfaces. This one is a bit trickier, but a similar recursion
>> counter could be used, I think.
>
>
> Thanks a lot for the extensive support!
> Regarding __dev_flush(), could the following already be sufficient?
>
> void __dev_flush(struct list_head *flush_list)
> {
>         struct xdp_dev_bulk_queue *bq, *tmp;
>         int i,j;
>
>         for (i = 0; !list_empty(flush_list) && i < XMIT_RECURSION_LIMIT; i++) {
>                 /* go through list in reverse so that new items
>                  * added to the flush_list will only be handled
>                  * in the next iteration of the outer loop
>                  */
>                 list_for_each_entry_safe_reverse(bq, tmp, flush_list, flush_node) {
>                         bq_xmit_all(bq, XDP_XMIT_FLUSH);
>                         bq->dev_rx = NULL;
>                         bq->xdp_prog = NULL;
>                         __list_del_clearprev(&bq->flush_node);
>                 }
>         }
>
>         if (i == XMIT_RECURSION_LIMIT) {
>                 pr_warn("XDP recursion limit hit, expect packet loss!\n");
>
>                 list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
>                         for (j = 0; j < bq->count; j++)
>                                 xdp_return_frame_rx_napi(bq->q[i]);
>
>                         bq->count = 0;
>                         bq->dev_rx = NULL;
>                         bq->xdp_prog = NULL;
>                         __list_del_clearprev(&bq->flush_node);
>                 }
>         }
> }

Yeah, this would work, I think (neat trick with iterating the list in
reverse!), but instead of the extra loop in the end, I would suggest
passing in an extra 'allow_redirect' parameter to bq_xmit_all(). Since
you'll already have to handle the recursion limit inside that function,
you can just reuse the same functionality by passing in the parameter in
the last iteration of the first loop.

Also, definitely don't put an unconditional warn_on() in the fast path -
that brings down the system really quickly if it's misconfigured :)

A warn_on_once() could work, but really I would suggest just reusing the
_trace_xdp_redirect_map_err() tracepoint with a new error code (just has
to be one we're not currently using and that vaguely resembles what this
is about; ELOOP, EOVERFLOW or EDEADLOCK, maybe?).

>> In any case, this needs extensive selftests, including ones with devmap
>> programs that loop packets (both redirect from a->a, and from
>> a->b->a->b) to make sure the limits work correctly.
>
>
> Good point! I am going to prepare some.
>
>
>> 
>>> +	}
>>>  
>>>  	/* Ingress dev_rx will be the same for all xdp_frame's in
>>>  	 * bulk_queue, because bq stored per-CPU and must be flushed
>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>> index 8569cd2482ee..b33fc0b1444a 100644
>>> --- a/net/core/filter.c
>>> +++ b/net/core/filter.c
>>> @@ -4287,14 +4287,18 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
>>>  void xdp_do_flush(void)
>>>  {
>>>  	struct list_head *lh_map, *lh_dev, *lh_xsk;
>>> +	int redirect;
>>>  
>>> -	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>>> -	if (lh_dev)
>>> -		__dev_flush(lh_dev);
>>> -	if (lh_map)
>>> -		__cpu_map_flush(lh_map);
>>> -	if (lh_xsk)
>>> -		__xsk_map_flush(lh_xsk);
>>> +	do {
>>> +		redirect = 0;
>>> +		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>>> +		if (lh_dev)
>>> +			__dev_flush(lh_dev, &redirect);
>>> +		if (lh_map)
>>> +			__cpu_map_flush(lh_map);
>>> +		if (lh_xsk)
>>> +			__xsk_map_flush(lh_xsk);
>>> +	} while (redirect > 0);
>>>  }
>>>  EXPORT_SYMBOL_GPL(xdp_do_flush);
>>>  
>>> @@ -4303,20 +4307,24 @@ void xdp_do_check_flushed(struct napi_struct *napi)
>>>  {
>>>  	struct list_head *lh_map, *lh_dev, *lh_xsk;
>>>  	bool missed = false;
>>> +	int redirect;
>>>  
>>> -	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>>> -	if (lh_dev) {
>>> -		__dev_flush(lh_dev);
>>> -		missed = true;
>>> -	}
>>> -	if (lh_map) {
>>> -		__cpu_map_flush(lh_map);
>>> -		missed = true;
>>> -	}
>>> -	if (lh_xsk) {
>>> -		__xsk_map_flush(lh_xsk);
>>> -		missed = true;
>>> -	}
>>> +	do {
>>> +		redirect = 0;
>>> +		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>>> +		if (lh_dev) {
>>> +			__dev_flush(lh_dev, &redirect);
>>> +			missed = true;
>>> +		}
>>> +		if (lh_map) {
>>> +			__cpu_map_flush(lh_map);
>>> +			missed = true;
>>> +		}
>>> +		if (lh_xsk) {
>>> +			__xsk_map_flush(lh_xsk);
>>> +			missed = true;
>>> +		}
>>> +	} while (redirect > 0);
>> 
>> With the change suggested above (so that bq_xmit_all() guarantees the
>> flush is completely done), this looping is not needed anymore. However,
>> it becomes important in which *order* the flushing is done
>> (__dev_flush() should always happen first), so adding a comment to note
>> this would be good.
>
>
> I guess, if we remove the loop here and we still want to cover the case of
> redirecting from devmap program via cpumap, we need to fetch the lh_map again
> after calling __dev_flush, right?
> So I think we should no longer use bpf_net_ctx_get_all_used_flush_lists then:
>
>         lh_dev = bpf_net_ctx_get_dev_flush_list();
>         if (lh_dev)
>                 __dev_flush(lh_dev);
>         lh_map = bpf_net_ctx_get_cpu_map_flush_list();
>         if (lh_map)
>                 __cpu_map_flush(lh_map);
>         lh_xsk = bpf_net_ctx_get_xskmap_flush_list();
>         if (lh_xsk)
>                 __xsk_map_flush(lh_xsk);
>
> But bpf_net_ctx_get_all_used_flush_lists also includes additional checks
> for IS_ENABLED(CONFIG_BPF_SYSCALL) and IS_ENABLED(CONFIG_XDP_SOCKETS),
> so I guess they should be directly included in the xdp_do_flush and
> xdp_do_check_flushed?
> Then we could remove the bpf_net_ctx_get_all_used_flush_lists.
> Or do you have an idea for a more elegant solution?

I think cpumap is fine because that doesn't immediately redirect back to
the bulk queue; instead __cpu_map_flush() will put the frames on a
per-CPU ring buffer and wake the kworker on that CPU. Which will in turn
do another xdp_do_flush() if the cpumap program does a redirect. So in
other words, we only need to handle recursion of devmap redirects :)

-Toke
Florian Kauer Sept. 11, 2024, 1:43 p.m. UTC | #4
On 9/11/24 11:25, Toke Høiland-Jørgensen wrote:
> Florian Kauer <florian.kauer@linutronix.de> writes:
> 
>> Hi Toke,
>>
>> On 9/10/24 10:34, Toke Høiland-Jørgensen wrote:
>>> Florian Kauer <florian.kauer@linutronix.de> writes:
>>>
>>>> Currently, returning XDP_REDIRECT from a xdp/devmap program
>>>> is considered as invalid action and an exception is traced.
>>>>
>>>> While it might seem counterintuitive to redirect in a xdp/devmap
>>>> program (why not just redirect to the correct interface in the
>>>> first program?), we faced several use cases where supporting
>>>> this would be very useful.
>>>>
>>>> Most importantly, they occur when the first redirect is used
>>>> with the BPF_F_BROADCAST flag. Using this together with xdp/devmap
>>>> programs, enables to perform different actions on clones of
>>>> the same incoming frame. In that case, it is often useful
>>>> to redirect either to a different CPU or device AFTER the cloning.
>>>>
>>>> For example:
>>>> - Replicate the frame (for redundancy according to IEEE 802.1CB FRER)
>>>>   and then use the second redirect with a cpumap to select
>>>>   the path-specific egress queue.
>>>>
>>>> - Also, one of the paths might need an encapsulation that
>>>>   exceeds the MTU. So a second redirect can be used back
>>>>   to the ingress interface to send an ICMP FRAG_NEEDED packet.
>>>>
>>>> - For OAM purposes, you might want to send one frame with
>>>>   OAM information back, while the original frame in passed forward.
>>>>
>>>> To enable these use cases, add the XDP_REDIRECT case to
>>>> dev_map_bpf_prog_run. Also, when this is called from inside
>>>> xdp_do_flush, the redirect might add further entries to the
>>>> flush lists that are currently processed. Therefore, loop inside
>>>> xdp_do_flush until no more additional redirects were added.
>>>>
>>>> Signed-off-by: Florian Kauer <florian.kauer@linutronix.de>
>>>
>>> This is an interesting use case! However, your implementation makes it
>>> way to easy to end up in a situation that loops forever, so I think we
>>> should add some protection against that. Some details below:
>>>
>>>> ---
>>>>  include/linux/bpf.h        |  4 ++--
>>>>  include/trace/events/xdp.h | 10 ++++++----
>>>>  kernel/bpf/devmap.c        | 37 +++++++++++++++++++++++++++--------
>>>>  net/core/filter.c          | 48 +++++++++++++++++++++++++++-------------------
>>>>  4 files changed, 65 insertions(+), 34 deletions(-)
>>>>
>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>> index 3b94ec161e8c..1b57cbabf199 100644
>>>> --- a/include/linux/bpf.h
>>>> +++ b/include/linux/bpf.h
>>>> @@ -2498,7 +2498,7 @@ struct sk_buff;
>>>>  struct bpf_dtab_netdev;
>>>>  struct bpf_cpu_map_entry;
>>>>  
>>>> -void __dev_flush(struct list_head *flush_list);
>>>> +void __dev_flush(struct list_head *flush_list, int *redirects);
>>>>  int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>>>>  		    struct net_device *dev_rx);
>>>>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
>>>> @@ -2740,7 +2740,7 @@ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
>>>>  	return ERR_PTR(-EOPNOTSUPP);
>>>>  }
>>>>  
>>>> -static inline void __dev_flush(struct list_head *flush_list)
>>>> +static inline void __dev_flush(struct list_head *flush_list, int *redirects)
>>>>  {
>>>>  }
>>>>  
>>>> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
>>>> index a7e5452b5d21..fba2c457e727 100644
>>>> --- a/include/trace/events/xdp.h
>>>> +++ b/include/trace/events/xdp.h
>>>> @@ -269,9 +269,9 @@ TRACE_EVENT(xdp_devmap_xmit,
>>>>  
>>>>  	TP_PROTO(const struct net_device *from_dev,
>>>>  		 const struct net_device *to_dev,
>>>> -		 int sent, int drops, int err),
>>>> +		 int sent, int drops, int redirects, int err),
>>>>  
>>>> -	TP_ARGS(from_dev, to_dev, sent, drops, err),
>>>> +	TP_ARGS(from_dev, to_dev, sent, drops, redirects, err),
>>>>  
>>>>  	TP_STRUCT__entry(
>>>>  		__field(int, from_ifindex)
>>>> @@ -279,6 +279,7 @@ TRACE_EVENT(xdp_devmap_xmit,
>>>>  		__field(int, to_ifindex)
>>>>  		__field(int, drops)
>>>>  		__field(int, sent)
>>>> +		__field(int, redirects)
>>>>  		__field(int, err)
>>>>  	),
>>>>  
>>>> @@ -288,16 +289,17 @@ TRACE_EVENT(xdp_devmap_xmit,
>>>>  		__entry->to_ifindex	= to_dev->ifindex;
>>>>  		__entry->drops		= drops;
>>>>  		__entry->sent		= sent;
>>>> +		__entry->redirects	= redirects;
>>>>  		__entry->err		= err;
>>>>  	),
>>>>  
>>>>  	TP_printk("ndo_xdp_xmit"
>>>>  		  " from_ifindex=%d to_ifindex=%d action=%s"
>>>> -		  " sent=%d drops=%d"
>>>> +		  " sent=%d drops=%d redirects=%d"
>>>>  		  " err=%d",
>>>>  		  __entry->from_ifindex, __entry->to_ifindex,
>>>>  		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
>>>> -		  __entry->sent, __entry->drops,
>>>> +		  __entry->sent, __entry->drops, __entry->redirects,
>>>>  		  __entry->err)
>>>>  );
>>>>  
>>>> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
>>>> index 7878be18e9d2..89bdec49ea40 100644
>>>> --- a/kernel/bpf/devmap.c
>>>> +++ b/kernel/bpf/devmap.c
>>>> @@ -334,7 +334,8 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
>>>>  static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>>>>  				struct xdp_frame **frames, int n,
>>>>  				struct net_device *tx_dev,
>>>> -				struct net_device *rx_dev)
>>>> +				struct net_device *rx_dev,
>>>> +				int *redirects)
>>>>  {
>>>>  	struct xdp_txq_info txq = { .dev = tx_dev };
>>>>  	struct xdp_rxq_info rxq = { .dev = rx_dev };
>>>> @@ -359,6 +360,13 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>>>>  			else
>>>>  				frames[nframes++] = xdpf;
>>>>  			break;
>>>> +		case XDP_REDIRECT:
>>>> +			err = xdp_do_redirect(rx_dev, &xdp, xdp_prog);
>>>> +			if (unlikely(err))
>>>> +				xdp_return_frame_rx_napi(xdpf);
>>>> +			else
>>>> +				*redirects += 1;
>>>> +			break;
>>>
>>> It's a bit subtle, but dev_map_bpf_prog_run() also filters the list of
>>> frames in the queue in-place (the frames[nframes++] = xdpf; line above),
>>> which only works under the assumption that the array in bq->q is not
>>> modified while this loop is being run. But now you're adding a call in
>>> the middle that may result in the packet being put back on the same
>>> queue in the middle, which means that this assumption no longer holds.
>>>
>>> So you need to clear the bq->q queue first for this to work.
>>> Specifically, at the start of bq_xmit_all(), you'll need to first copy
>>> all the packet pointer onto an on-stack array, then run the rest of the
>>> function on that array. There's already an initial loop that goes
>>> through all the frames, so you can just do it there.
>>>
>>> So the loop at the start of bq_xmit_all() goes from the current:
>>>
>>> 	for (i = 0; i < cnt; i++) {
>>> 		struct xdp_frame *xdpf = bq->q[i];
>>>
>>> 		prefetch(xdpf);
>>> 	}
>>>
>>>
>>> to something like:
>>>
>>>         struct xdp_frame *frames[DEV_MAP_BULK_SIZE];
>>>
>>> 	for (i = 0; i < cnt; i++) {
>>> 		struct xdp_frame *xdpf = bq->q[i];
>>>
>>> 		prefetch(xdpf);
>>>                 frames[i] = xdpf;
>>> 	}
>>>
>>>         bq->count = 0; /* bq is now empty, use the 'frames' and 'cnt'
>>>                           stack variables for the rest of the function */
>>>
>>>
>>>
>>>>  		default:
>>>>  			bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
>>>>  			fallthrough;
>>>> @@ -373,7 +381,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>>>>  	return nframes; /* sent frames count */
>>>>  }
>>>>  
>>>> -static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>>>> +static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags, int *redirects)
>>>>  {
>>>>  	struct net_device *dev = bq->dev;
>>>>  	unsigned int cnt = bq->count;
>>>> @@ -390,8 +398,10 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>>>>  		prefetch(xdpf);
>>>>  	}
>>>>  
>>>> +	int new_redirects = 0;
>>>>  	if (bq->xdp_prog) {
>>>> -		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx);
>>>> +		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx,
>>>> +				&new_redirects);
>>>>  		if (!to_send)
>>>>  			goto out;
>>>>  	}
>>>> @@ -413,19 +423,21 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>>>>  
>>>>  out:
>>>>  	bq->count = 0;
>>>> -	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
>>>> +	*redirects += new_redirects;
>>>> +	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent - new_redirects,
>>>> +			new_redirects, err);
>>>>  }
>>>>  
>>>>  /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
>>>>   * driver before returning from its napi->poll() routine. See the comment above
>>>>   * xdp_do_flush() in filter.c.
>>>>   */
>>>> -void __dev_flush(struct list_head *flush_list)
>>>> +void __dev_flush(struct list_head *flush_list, int *redirects)
>>>>  {
>>>>  	struct xdp_dev_bulk_queue *bq, *tmp;
>>>>  
>>>>  	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
>>>> -		bq_xmit_all(bq, XDP_XMIT_FLUSH);
>>>> +		bq_xmit_all(bq, XDP_XMIT_FLUSH, redirects);
>>>>  		bq->dev_rx = NULL;
>>>>  		bq->xdp_prog = NULL;
>>>>  		__list_del_clearprev(&bq->flush_node);
>>>> @@ -458,8 +470,17 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>>>>  {
>>>>  	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
>>>>  
>>>> -	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
>>>> -		bq_xmit_all(bq, 0);
>>>> +	if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) {
>>>> +		int redirects = 0;
>>>> +
>>>> +		bq_xmit_all(bq, 0, &redirects);
>>>> +
>>>> +		/* according to comment above xdp_do_flush() in
>>>> +		 * filter.c, xdp_do_flush is always called at
>>>> +		 * the end of the NAPI anyway, so no need to act
>>>> +		 * on the redirects here
>>>> +		 */
>>>
>>> While it's true that it will be called again in NAPI, the purpose of
>>> calling bq_xmit_all() here is to make room space for the packet on the
>>> bulk queue that we're about to enqueue, and if bq_xmit_all() can just
>>> put the packet back on the queue, there is no guarantee that this will
>>> succeed. So you will have to handle that case here.
>>>
>>> Since there's also a potential infinite recursion issue in the
>>> do_flush() functions below, I think it may be better to handle this
>>> looping issue inside bq_xmit_all().
>>>
>>> I.e., structure the code so that bq_xmit_all() guarantees that when it
>>> returns it has actually done its job; that is, that bq->q is empty.
>>>
>>> Given the above "move all frames out of bq->q at the start" change, this
>>> is not all that hard. Simply add a check after the out: label (in
>>> bq_xmit_all()) to check if bq->count is actually 0, and if it isn't,
>>> start over from the beginning of that function. This also makes it
>>> straight forward to add a recursion limit; after looping a set number of
>>> times (say, XMIT_RECURSION_LIMIT), simply turn XDP_REDIRECT into drops.
>>>
>>> There will need to be some additional protection against looping forever
>>> in __dev_flush(), to handle the case where a packet is looped between
>>> two interfaces. This one is a bit trickier, but a similar recursion
>>> counter could be used, I think.
>>
>>
>> Thanks a lot for the extensive support!
>> Regarding __dev_flush(), could the following already be sufficient?
>>
>> void __dev_flush(struct list_head *flush_list)
>> {
>>         struct xdp_dev_bulk_queue *bq, *tmp;
>>         int i,j;
>>
>>         for (i = 0; !list_empty(flush_list) && i < XMIT_RECURSION_LIMIT; i++) {
>>                 /* go through list in reverse so that new items
>>                  * added to the flush_list will only be handled
>>                  * in the next iteration of the outer loop
>>                  */
>>                 list_for_each_entry_safe_reverse(bq, tmp, flush_list, flush_node) {
>>                         bq_xmit_all(bq, XDP_XMIT_FLUSH);
>>                         bq->dev_rx = NULL;
>>                         bq->xdp_prog = NULL;
>>                         __list_del_clearprev(&bq->flush_node);
>>                 }
>>         }
>>
>>         if (i == XMIT_RECURSION_LIMIT) {
>>                 pr_warn("XDP recursion limit hit, expect packet loss!\n");
>>
>>                 list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
>>                         for (j = 0; j < bq->count; j++)
>>                                 xdp_return_frame_rx_napi(bq->q[i]);
>>
>>                         bq->count = 0;
>>                         bq->dev_rx = NULL;
>>                         bq->xdp_prog = NULL;
>>                         __list_del_clearprev(&bq->flush_node);
>>                 }
>>         }
>> }
> 
> Yeah, this would work, I think (neat trick with iterating the list in
> reverse!), but instead of the extra loop in the end, I would suggest
> passing in an extra 'allow_redirect' parameter to bq_xmit_all(). Since
> you'll already have to handle the recursion limit inside that function,
> you can just reuse the same functionality by passing in the parameter in
> the last iteration of the first loop.
> 
> Also, definitely don't put an unconditional warn_on() in the fast path -
> that brings down the system really quickly if it's misconfigured :)
> 
> A warn_on_once() could work, but really I would suggest just reusing the
> _trace_xdp_redirect_map_err() tracepoint with a new error code (just has
> to be one we're not currently using and that vaguely resembles what this
> is about; ELOOP, EOVERFLOW or EDEADLOCK, maybe?).


The 'allow_redirect' parameter is a very good idea! If I also forward that
to dev_map_bpf_prog_run and do something like the following, I can just
reuse the existing error handling. Or is that too implict/too ugly?

switch (act) {
case XDP_PASS:
        err = xdp_update_frame_from_buff(&xdp, xdpf);
        if (unlikely(err < 0))
                xdp_return_frame_rx_napi(xdpf);
        else
                frames[nframes++] = xdpf;
        break;
case XDP_REDIRECT:
        if (allow_redirect) {
                err = xdp_do_redirect(rx_dev, &xdp, xdp_prog);
                if (unlikely(err))
                        xdp_return_frame_rx_napi(xdpf);
                else
                        *redirects += 1;
                break;
        } else
                fallthrough;
default:
        bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
        fallthrough;
case XDP_ABORTED:
        trace_xdp_exception(tx_dev, xdp_prog, act);
        fallthrough;
case XDP_DROP:
        xdp_return_frame_rx_napi(xdpf);
        break;
}


> 
>>> In any case, this needs extensive selftests, including ones with devmap
>>> programs that loop packets (both redirect from a->a, and from
>>> a->b->a->b) to make sure the limits work correctly.
>>
>>
>> Good point! I am going to prepare some.
>>
>>
>>>
>>>> +	}
>>>>  
>>>>  	/* Ingress dev_rx will be the same for all xdp_frame's in
>>>>  	 * bulk_queue, because bq stored per-CPU and must be flushed
>>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>>> index 8569cd2482ee..b33fc0b1444a 100644
>>>> --- a/net/core/filter.c
>>>> +++ b/net/core/filter.c
>>>> @@ -4287,14 +4287,18 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
>>>>  void xdp_do_flush(void)
>>>>  {
>>>>  	struct list_head *lh_map, *lh_dev, *lh_xsk;
>>>> +	int redirect;
>>>>  
>>>> -	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>>>> -	if (lh_dev)
>>>> -		__dev_flush(lh_dev);
>>>> -	if (lh_map)
>>>> -		__cpu_map_flush(lh_map);
>>>> -	if (lh_xsk)
>>>> -		__xsk_map_flush(lh_xsk);
>>>> +	do {
>>>> +		redirect = 0;
>>>> +		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>>>> +		if (lh_dev)
>>>> +			__dev_flush(lh_dev, &redirect);
>>>> +		if (lh_map)
>>>> +			__cpu_map_flush(lh_map);
>>>> +		if (lh_xsk)
>>>> +			__xsk_map_flush(lh_xsk);
>>>> +	} while (redirect > 0);
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(xdp_do_flush);
>>>>  
>>>> @@ -4303,20 +4307,24 @@ void xdp_do_check_flushed(struct napi_struct *napi)
>>>>  {
>>>>  	struct list_head *lh_map, *lh_dev, *lh_xsk;
>>>>  	bool missed = false;
>>>> +	int redirect;
>>>>  
>>>> -	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>>>> -	if (lh_dev) {
>>>> -		__dev_flush(lh_dev);
>>>> -		missed = true;
>>>> -	}
>>>> -	if (lh_map) {
>>>> -		__cpu_map_flush(lh_map);
>>>> -		missed = true;
>>>> -	}
>>>> -	if (lh_xsk) {
>>>> -		__xsk_map_flush(lh_xsk);
>>>> -		missed = true;
>>>> -	}
>>>> +	do {
>>>> +		redirect = 0;
>>>> +		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>>>> +		if (lh_dev) {
>>>> +			__dev_flush(lh_dev, &redirect);
>>>> +			missed = true;
>>>> +		}
>>>> +		if (lh_map) {
>>>> +			__cpu_map_flush(lh_map);
>>>> +			missed = true;
>>>> +		}
>>>> +		if (lh_xsk) {
>>>> +			__xsk_map_flush(lh_xsk);
>>>> +			missed = true;
>>>> +		}
>>>> +	} while (redirect > 0);
>>>
>>> With the change suggested above (so that bq_xmit_all() guarantees the
>>> flush is completely done), this looping is not needed anymore. However,
>>> it becomes important in which *order* the flushing is done
>>> (__dev_flush() should always happen first), so adding a comment to note
>>> this would be good.
>>
>>
>> I guess, if we remove the loop here and we still want to cover the case of
>> redirecting from devmap program via cpumap, we need to fetch the lh_map again
>> after calling __dev_flush, right?
>> So I think we should no longer use bpf_net_ctx_get_all_used_flush_lists then:
>>
>>         lh_dev = bpf_net_ctx_get_dev_flush_list();
>>         if (lh_dev)
>>                 __dev_flush(lh_dev);
>>         lh_map = bpf_net_ctx_get_cpu_map_flush_list();
>>         if (lh_map)
>>                 __cpu_map_flush(lh_map);
>>         lh_xsk = bpf_net_ctx_get_xskmap_flush_list();
>>         if (lh_xsk)
>>                 __xsk_map_flush(lh_xsk);
>>
>> But bpf_net_ctx_get_all_used_flush_lists also includes additional checks
>> for IS_ENABLED(CONFIG_BPF_SYSCALL) and IS_ENABLED(CONFIG_XDP_SOCKETS),
>> so I guess they should be directly included in the xdp_do_flush and
>> xdp_do_check_flushed?
>> Then we could remove the bpf_net_ctx_get_all_used_flush_lists.
>> Or do you have an idea for a more elegant solution?
> 
> I think cpumap is fine because that doesn't immediately redirect back to
> the bulk queue; instead __cpu_map_flush() will put the frames on a
> per-CPU ring buffer and wake the kworker on that CPU. Which will in turn
> do another xdp_do_flush() if the cpumap program does a redirect. So in
> other words, we only need to handle recursion of devmap redirects :)

I likely miss something here, but the scenario I am thinking about is the following:

1. cpu_map and xsk_map flush list are empty, while the dev flush map has
   a single frame pending, i.e. in the current implementation after
   executing bpf_net_ctx_get_all_used_flush_lists:
   lh_dev = something
   lh_map = lh_xsk = NULL

2. __dev_flush gets called and executes a bpf program on the frame
   that returns with "return bpf_redirect_map(&cpu_map, 0, 0);"
   and that adds an item to the cpumap flush list.

3. Since __dev_flush is only able to handle devmap redirects itself,
   the item is still on the cpumap flush list after __dev_flush
   has returned.

4. lh_map, however, is still NULL, so __cpu_map_flush does not
   get called and thus the kworker is never woken up.

That is at least what I see on the running system that the kworker
is never woken up, but I might have done something else wrong...

Thanks,
Florian
Toke Høiland-Jørgensen Sept. 11, 2024, 2:14 p.m. UTC | #5
Florian Kauer <florian.kauer@linutronix.de> writes:

> On 9/11/24 11:25, Toke Høiland-Jørgensen wrote:
>> Florian Kauer <florian.kauer@linutronix.de> writes:
>> 
>>> Hi Toke,
>>>
>>> On 9/10/24 10:34, Toke Høiland-Jørgensen wrote:
>>>> Florian Kauer <florian.kauer@linutronix.de> writes:
>>>>
>>>>> Currently, returning XDP_REDIRECT from a xdp/devmap program
>>>>> is considered as invalid action and an exception is traced.
>>>>>
>>>>> While it might seem counterintuitive to redirect in a xdp/devmap
>>>>> program (why not just redirect to the correct interface in the
>>>>> first program?), we faced several use cases where supporting
>>>>> this would be very useful.
>>>>>
>>>>> Most importantly, they occur when the first redirect is used
>>>>> with the BPF_F_BROADCAST flag. Using this together with xdp/devmap
>>>>> programs, enables to perform different actions on clones of
>>>>> the same incoming frame. In that case, it is often useful
>>>>> to redirect either to a different CPU or device AFTER the cloning.
>>>>>
>>>>> For example:
>>>>> - Replicate the frame (for redundancy according to IEEE 802.1CB FRER)
>>>>>   and then use the second redirect with a cpumap to select
>>>>>   the path-specific egress queue.
>>>>>
>>>>> - Also, one of the paths might need an encapsulation that
>>>>>   exceeds the MTU. So a second redirect can be used back
>>>>>   to the ingress interface to send an ICMP FRAG_NEEDED packet.
>>>>>
>>>>> - For OAM purposes, you might want to send one frame with
>>>>>   OAM information back, while the original frame in passed forward.
>>>>>
>>>>> To enable these use cases, add the XDP_REDIRECT case to
>>>>> dev_map_bpf_prog_run. Also, when this is called from inside
>>>>> xdp_do_flush, the redirect might add further entries to the
>>>>> flush lists that are currently processed. Therefore, loop inside
>>>>> xdp_do_flush until no more additional redirects were added.
>>>>>
>>>>> Signed-off-by: Florian Kauer <florian.kauer@linutronix.de>
>>>>
>>>> This is an interesting use case! However, your implementation makes it
>>>> way to easy to end up in a situation that loops forever, so I think we
>>>> should add some protection against that. Some details below:
>>>>
>>>>> ---
>>>>>  include/linux/bpf.h        |  4 ++--
>>>>>  include/trace/events/xdp.h | 10 ++++++----
>>>>>  kernel/bpf/devmap.c        | 37 +++++++++++++++++++++++++++--------
>>>>>  net/core/filter.c          | 48 +++++++++++++++++++++++++++-------------------
>>>>>  4 files changed, 65 insertions(+), 34 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>>> index 3b94ec161e8c..1b57cbabf199 100644
>>>>> --- a/include/linux/bpf.h
>>>>> +++ b/include/linux/bpf.h
>>>>> @@ -2498,7 +2498,7 @@ struct sk_buff;
>>>>>  struct bpf_dtab_netdev;
>>>>>  struct bpf_cpu_map_entry;
>>>>>  
>>>>> -void __dev_flush(struct list_head *flush_list);
>>>>> +void __dev_flush(struct list_head *flush_list, int *redirects);
>>>>>  int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>>>>>  		    struct net_device *dev_rx);
>>>>>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
>>>>> @@ -2740,7 +2740,7 @@ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
>>>>>  	return ERR_PTR(-EOPNOTSUPP);
>>>>>  }
>>>>>  
>>>>> -static inline void __dev_flush(struct list_head *flush_list)
>>>>> +static inline void __dev_flush(struct list_head *flush_list, int *redirects)
>>>>>  {
>>>>>  }
>>>>>  
>>>>> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
>>>>> index a7e5452b5d21..fba2c457e727 100644
>>>>> --- a/include/trace/events/xdp.h
>>>>> +++ b/include/trace/events/xdp.h
>>>>> @@ -269,9 +269,9 @@ TRACE_EVENT(xdp_devmap_xmit,
>>>>>  
>>>>>  	TP_PROTO(const struct net_device *from_dev,
>>>>>  		 const struct net_device *to_dev,
>>>>> -		 int sent, int drops, int err),
>>>>> +		 int sent, int drops, int redirects, int err),
>>>>>  
>>>>> -	TP_ARGS(from_dev, to_dev, sent, drops, err),
>>>>> +	TP_ARGS(from_dev, to_dev, sent, drops, redirects, err),
>>>>>  
>>>>>  	TP_STRUCT__entry(
>>>>>  		__field(int, from_ifindex)
>>>>> @@ -279,6 +279,7 @@ TRACE_EVENT(xdp_devmap_xmit,
>>>>>  		__field(int, to_ifindex)
>>>>>  		__field(int, drops)
>>>>>  		__field(int, sent)
>>>>> +		__field(int, redirects)
>>>>>  		__field(int, err)
>>>>>  	),
>>>>>  
>>>>> @@ -288,16 +289,17 @@ TRACE_EVENT(xdp_devmap_xmit,
>>>>>  		__entry->to_ifindex	= to_dev->ifindex;
>>>>>  		__entry->drops		= drops;
>>>>>  		__entry->sent		= sent;
>>>>> +		__entry->redirects	= redirects;
>>>>>  		__entry->err		= err;
>>>>>  	),
>>>>>  
>>>>>  	TP_printk("ndo_xdp_xmit"
>>>>>  		  " from_ifindex=%d to_ifindex=%d action=%s"
>>>>> -		  " sent=%d drops=%d"
>>>>> +		  " sent=%d drops=%d redirects=%d"
>>>>>  		  " err=%d",
>>>>>  		  __entry->from_ifindex, __entry->to_ifindex,
>>>>>  		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
>>>>> -		  __entry->sent, __entry->drops,
>>>>> +		  __entry->sent, __entry->drops, __entry->redirects,
>>>>>  		  __entry->err)
>>>>>  );
>>>>>  
>>>>> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
>>>>> index 7878be18e9d2..89bdec49ea40 100644
>>>>> --- a/kernel/bpf/devmap.c
>>>>> +++ b/kernel/bpf/devmap.c
>>>>> @@ -334,7 +334,8 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
>>>>>  static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>>>>>  				struct xdp_frame **frames, int n,
>>>>>  				struct net_device *tx_dev,
>>>>> -				struct net_device *rx_dev)
>>>>> +				struct net_device *rx_dev,
>>>>> +				int *redirects)
>>>>>  {
>>>>>  	struct xdp_txq_info txq = { .dev = tx_dev };
>>>>>  	struct xdp_rxq_info rxq = { .dev = rx_dev };
>>>>> @@ -359,6 +360,13 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>>>>>  			else
>>>>>  				frames[nframes++] = xdpf;
>>>>>  			break;
>>>>> +		case XDP_REDIRECT:
>>>>> +			err = xdp_do_redirect(rx_dev, &xdp, xdp_prog);
>>>>> +			if (unlikely(err))
>>>>> +				xdp_return_frame_rx_napi(xdpf);
>>>>> +			else
>>>>> +				*redirects += 1;
>>>>> +			break;
>>>>
>>>> It's a bit subtle, but dev_map_bpf_prog_run() also filters the list of
>>>> frames in the queue in-place (the frames[nframes++] = xdpf; line above),
>>>> which only works under the assumption that the array in bq->q is not
>>>> modified while this loop is being run. But now you're adding a call in
>>>> the middle that may result in the packet being put back on the same
>>>> queue in the middle, which means that this assumption no longer holds.
>>>>
>>>> So you need to clear the bq->q queue first for this to work.
>>>> Specifically, at the start of bq_xmit_all(), you'll need to first copy
>>>> all the packet pointer onto an on-stack array, then run the rest of the
>>>> function on that array. There's already an initial loop that goes
>>>> through all the frames, so you can just do it there.
>>>>
>>>> So the loop at the start of bq_xmit_all() goes from the current:
>>>>
>>>> 	for (i = 0; i < cnt; i++) {
>>>> 		struct xdp_frame *xdpf = bq->q[i];
>>>>
>>>> 		prefetch(xdpf);
>>>> 	}
>>>>
>>>>
>>>> to something like:
>>>>
>>>>         struct xdp_frame *frames[DEV_MAP_BULK_SIZE];
>>>>
>>>> 	for (i = 0; i < cnt; i++) {
>>>> 		struct xdp_frame *xdpf = bq->q[i];
>>>>
>>>> 		prefetch(xdpf);
>>>>                 frames[i] = xdpf;
>>>> 	}
>>>>
>>>>         bq->count = 0; /* bq is now empty, use the 'frames' and 'cnt'
>>>>                           stack variables for the rest of the function */
>>>>
>>>>
>>>>
>>>>>  		default:
>>>>>  			bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
>>>>>  			fallthrough;
>>>>> @@ -373,7 +381,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>>>>>  	return nframes; /* sent frames count */
>>>>>  }
>>>>>  
>>>>> -static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>>>>> +static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags, int *redirects)
>>>>>  {
>>>>>  	struct net_device *dev = bq->dev;
>>>>>  	unsigned int cnt = bq->count;
>>>>> @@ -390,8 +398,10 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>>>>>  		prefetch(xdpf);
>>>>>  	}
>>>>>  
>>>>> +	int new_redirects = 0;
>>>>>  	if (bq->xdp_prog) {
>>>>> -		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx);
>>>>> +		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx,
>>>>> +				&new_redirects);
>>>>>  		if (!to_send)
>>>>>  			goto out;
>>>>>  	}
>>>>> @@ -413,19 +423,21 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>>>>>  
>>>>>  out:
>>>>>  	bq->count = 0;
>>>>> -	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
>>>>> +	*redirects += new_redirects;
>>>>> +	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent - new_redirects,
>>>>> +			new_redirects, err);
>>>>>  }
>>>>>  
>>>>>  /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
>>>>>   * driver before returning from its napi->poll() routine. See the comment above
>>>>>   * xdp_do_flush() in filter.c.
>>>>>   */
>>>>> -void __dev_flush(struct list_head *flush_list)
>>>>> +void __dev_flush(struct list_head *flush_list, int *redirects)
>>>>>  {
>>>>>  	struct xdp_dev_bulk_queue *bq, *tmp;
>>>>>  
>>>>>  	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
>>>>> -		bq_xmit_all(bq, XDP_XMIT_FLUSH);
>>>>> +		bq_xmit_all(bq, XDP_XMIT_FLUSH, redirects);
>>>>>  		bq->dev_rx = NULL;
>>>>>  		bq->xdp_prog = NULL;
>>>>>  		__list_del_clearprev(&bq->flush_node);
>>>>> @@ -458,8 +470,17 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>>>>>  {
>>>>>  	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
>>>>>  
>>>>> -	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
>>>>> -		bq_xmit_all(bq, 0);
>>>>> +	if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) {
>>>>> +		int redirects = 0;
>>>>> +
>>>>> +		bq_xmit_all(bq, 0, &redirects);
>>>>> +
>>>>> +		/* according to comment above xdp_do_flush() in
>>>>> +		 * filter.c, xdp_do_flush is always called at
>>>>> +		 * the end of the NAPI anyway, so no need to act
>>>>> +		 * on the redirects here
>>>>> +		 */
>>>>
>>>> While it's true that it will be called again in NAPI, the purpose of
>>>> calling bq_xmit_all() here is to make room space for the packet on the
>>>> bulk queue that we're about to enqueue, and if bq_xmit_all() can just
>>>> put the packet back on the queue, there is no guarantee that this will
>>>> succeed. So you will have to handle that case here.
>>>>
>>>> Since there's also a potential infinite recursion issue in the
>>>> do_flush() functions below, I think it may be better to handle this
>>>> looping issue inside bq_xmit_all().
>>>>
>>>> I.e., structure the code so that bq_xmit_all() guarantees that when it
>>>> returns it has actually done its job; that is, that bq->q is empty.
>>>>
>>>> Given the above "move all frames out of bq->q at the start" change, this
>>>> is not all that hard. Simply add a check after the out: label (in
>>>> bq_xmit_all()) to check if bq->count is actually 0, and if it isn't,
>>>> start over from the beginning of that function. This also makes it
>>>> straight forward to add a recursion limit; after looping a set number of
>>>> times (say, XMIT_RECURSION_LIMIT), simply turn XDP_REDIRECT into drops.
>>>>
>>>> There will need to be some additional protection against looping forever
>>>> in __dev_flush(), to handle the case where a packet is looped between
>>>> two interfaces. This one is a bit trickier, but a similar recursion
>>>> counter could be used, I think.
>>>
>>>
>>> Thanks a lot for the extensive support!
>>> Regarding __dev_flush(), could the following already be sufficient?
>>>
>>> void __dev_flush(struct list_head *flush_list)
>>> {
>>>         struct xdp_dev_bulk_queue *bq, *tmp;
>>>         int i,j;
>>>
>>>         for (i = 0; !list_empty(flush_list) && i < XMIT_RECURSION_LIMIT; i++) {
>>>                 /* go through list in reverse so that new items
>>>                  * added to the flush_list will only be handled
>>>                  * in the next iteration of the outer loop
>>>                  */
>>>                 list_for_each_entry_safe_reverse(bq, tmp, flush_list, flush_node) {
>>>                         bq_xmit_all(bq, XDP_XMIT_FLUSH);
>>>                         bq->dev_rx = NULL;
>>>                         bq->xdp_prog = NULL;
>>>                         __list_del_clearprev(&bq->flush_node);
>>>                 }
>>>         }
>>>
>>>         if (i == XMIT_RECURSION_LIMIT) {
>>>                 pr_warn("XDP recursion limit hit, expect packet loss!\n");
>>>
>>>                 list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
>>>                         for (j = 0; j < bq->count; j++)
>>>                                 xdp_return_frame_rx_napi(bq->q[i]);
>>>
>>>                         bq->count = 0;
>>>                         bq->dev_rx = NULL;
>>>                         bq->xdp_prog = NULL;
>>>                         __list_del_clearprev(&bq->flush_node);
>>>                 }
>>>         }
>>> }
>> 
>> Yeah, this would work, I think (neat trick with iterating the list in
>> reverse!), but instead of the extra loop in the end, I would suggest
>> passing in an extra 'allow_redirect' parameter to bq_xmit_all(). Since
>> you'll already have to handle the recursion limit inside that function,
>> you can just reuse the same functionality by passing in the parameter in
>> the last iteration of the first loop.
>> 
>> Also, definitely don't put an unconditional warn_on() in the fast path -
>> that brings down the system really quickly if it's misconfigured :)
>> 
>> A warn_on_once() could work, but really I would suggest just reusing the
>> _trace_xdp_redirect_map_err() tracepoint with a new error code (just has
>> to be one we're not currently using and that vaguely resembles what this
>> is about; ELOOP, EOVERFLOW or EDEADLOCK, maybe?).
>
>
> The 'allow_redirect' parameter is a very good idea! If I also forward that
> to dev_map_bpf_prog_run and do something like the following, I can just
> reuse the existing error handling. Or is that too implict/too ugly?
>
> switch (act) {
> case XDP_PASS:
>         err = xdp_update_frame_from_buff(&xdp, xdpf);
>         if (unlikely(err < 0))
>                 xdp_return_frame_rx_napi(xdpf);
>         else
>                 frames[nframes++] = xdpf;
>         break;
> case XDP_REDIRECT:
>         if (allow_redirect) {
>                 err = xdp_do_redirect(rx_dev, &xdp, xdp_prog);
>                 if (unlikely(err))
>                         xdp_return_frame_rx_napi(xdpf);
>                 else
>                         *redirects += 1;
>                 break;
>         } else
>                 fallthrough;
> default:
>         bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
>         fallthrough;

Yes, I was imagining something like this. Not sure if we want to turn it
into an "invalid action" this way (it should probably be a separate
tracepoint as I mentioned in the previous email). But otherwise, yeah!

> case XDP_ABORTED:
>         trace_xdp_exception(tx_dev, xdp_prog, act);
>         fallthrough;
> case XDP_DROP:
>         xdp_return_frame_rx_napi(xdpf);
>         break;
> }
>
>
>> 
>>>> In any case, this needs extensive selftests, including ones with devmap
>>>> programs that loop packets (both redirect from a->a, and from
>>>> a->b->a->b) to make sure the limits work correctly.
>>>
>>>
>>> Good point! I am going to prepare some.
>>>
>>>
>>>>
>>>>> +	}
>>>>>  
>>>>>  	/* Ingress dev_rx will be the same for all xdp_frame's in
>>>>>  	 * bulk_queue, because bq stored per-CPU and must be flushed
>>>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>>>> index 8569cd2482ee..b33fc0b1444a 100644
>>>>> --- a/net/core/filter.c
>>>>> +++ b/net/core/filter.c
>>>>> @@ -4287,14 +4287,18 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
>>>>>  void xdp_do_flush(void)
>>>>>  {
>>>>>  	struct list_head *lh_map, *lh_dev, *lh_xsk;
>>>>> +	int redirect;
>>>>>  
>>>>> -	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>>>>> -	if (lh_dev)
>>>>> -		__dev_flush(lh_dev);
>>>>> -	if (lh_map)
>>>>> -		__cpu_map_flush(lh_map);
>>>>> -	if (lh_xsk)
>>>>> -		__xsk_map_flush(lh_xsk);
>>>>> +	do {
>>>>> +		redirect = 0;
>>>>> +		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>>>>> +		if (lh_dev)
>>>>> +			__dev_flush(lh_dev, &redirect);
>>>>> +		if (lh_map)
>>>>> +			__cpu_map_flush(lh_map);
>>>>> +		if (lh_xsk)
>>>>> +			__xsk_map_flush(lh_xsk);
>>>>> +	} while (redirect > 0);
>>>>>  }
>>>>>  EXPORT_SYMBOL_GPL(xdp_do_flush);
>>>>>  
>>>>> @@ -4303,20 +4307,24 @@ void xdp_do_check_flushed(struct napi_struct *napi)
>>>>>  {
>>>>>  	struct list_head *lh_map, *lh_dev, *lh_xsk;
>>>>>  	bool missed = false;
>>>>> +	int redirect;
>>>>>  
>>>>> -	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>>>>> -	if (lh_dev) {
>>>>> -		__dev_flush(lh_dev);
>>>>> -		missed = true;
>>>>> -	}
>>>>> -	if (lh_map) {
>>>>> -		__cpu_map_flush(lh_map);
>>>>> -		missed = true;
>>>>> -	}
>>>>> -	if (lh_xsk) {
>>>>> -		__xsk_map_flush(lh_xsk);
>>>>> -		missed = true;
>>>>> -	}
>>>>> +	do {
>>>>> +		redirect = 0;
>>>>> +		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
>>>>> +		if (lh_dev) {
>>>>> +			__dev_flush(lh_dev, &redirect);
>>>>> +			missed = true;
>>>>> +		}
>>>>> +		if (lh_map) {
>>>>> +			__cpu_map_flush(lh_map);
>>>>> +			missed = true;
>>>>> +		}
>>>>> +		if (lh_xsk) {
>>>>> +			__xsk_map_flush(lh_xsk);
>>>>> +			missed = true;
>>>>> +		}
>>>>> +	} while (redirect > 0);
>>>>
>>>> With the change suggested above (so that bq_xmit_all() guarantees the
>>>> flush is completely done), this looping is not needed anymore. However,
>>>> it becomes important in which *order* the flushing is done
>>>> (__dev_flush() should always happen first), so adding a comment to note
>>>> this would be good.
>>>
>>>
>>> I guess, if we remove the loop here and we still want to cover the case of
>>> redirecting from devmap program via cpumap, we need to fetch the lh_map again
>>> after calling __dev_flush, right?
>>> So I think we should no longer use bpf_net_ctx_get_all_used_flush_lists then:
>>>
>>>         lh_dev = bpf_net_ctx_get_dev_flush_list();
>>>         if (lh_dev)
>>>                 __dev_flush(lh_dev);
>>>         lh_map = bpf_net_ctx_get_cpu_map_flush_list();
>>>         if (lh_map)
>>>                 __cpu_map_flush(lh_map);
>>>         lh_xsk = bpf_net_ctx_get_xskmap_flush_list();
>>>         if (lh_xsk)
>>>                 __xsk_map_flush(lh_xsk);
>>>
>>> But bpf_net_ctx_get_all_used_flush_lists also includes additional checks
>>> for IS_ENABLED(CONFIG_BPF_SYSCALL) and IS_ENABLED(CONFIG_XDP_SOCKETS),
>>> so I guess they should be directly included in the xdp_do_flush and
>>> xdp_do_check_flushed?
>>> Then we could remove the bpf_net_ctx_get_all_used_flush_lists.
>>> Or do you have an idea for a more elegant solution?
>> 
>> I think cpumap is fine because that doesn't immediately redirect back to
>> the bulk queue; instead __cpu_map_flush() will put the frames on a
>> per-CPU ring buffer and wake the kworker on that CPU. Which will in turn
>> do another xdp_do_flush() if the cpumap program does a redirect. So in
>> other words, we only need to handle recursion of devmap redirects :)
>
> I likely miss something here, but the scenario I am thinking about is the following:
>
> 1. cpu_map and xsk_map flush list are empty, while the dev flush map has
>    a single frame pending, i.e. in the current implementation after
>    executing bpf_net_ctx_get_all_used_flush_lists:
>    lh_dev = something
>    lh_map = lh_xsk = NULL
>
> 2. __dev_flush gets called and executes a bpf program on the frame
>    that returns with "return bpf_redirect_map(&cpu_map, 0, 0);"
>    and that adds an item to the cpumap flush list.
>
> 3. Since __dev_flush is only able to handle devmap redirects itself,
>    the item is still on the cpumap flush list after __dev_flush
>    has returned.
>
> 4. lh_map, however, is still NULL, so __cpu_map_flush does not
>    get called and thus the kworker is never woken up.
>
> That is at least what I see on the running system that the kworker
> is never woken up, but I might have done something else wrong...

Ah, yes, I see what you mean. Yup, you're right,
bpf_net_ctx_get_all_used_flush_lists() will no longer work, we'll have
to get the others after __dev_flush()

-Toke
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3b94ec161e8c..1b57cbabf199 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2498,7 +2498,7 @@  struct sk_buff;
 struct bpf_dtab_netdev;
 struct bpf_cpu_map_entry;
 
-void __dev_flush(struct list_head *flush_list);
+void __dev_flush(struct list_head *flush_list, int *redirects);
 int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 		    struct net_device *dev_rx);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
@@ -2740,7 +2740,7 @@  static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static inline void __dev_flush(struct list_head *flush_list)
+static inline void __dev_flush(struct list_head *flush_list, int *redirects)
 {
 }
 
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index a7e5452b5d21..fba2c457e727 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -269,9 +269,9 @@  TRACE_EVENT(xdp_devmap_xmit,
 
 	TP_PROTO(const struct net_device *from_dev,
 		 const struct net_device *to_dev,
-		 int sent, int drops, int err),
+		 int sent, int drops, int redirects, int err),
 
-	TP_ARGS(from_dev, to_dev, sent, drops, err),
+	TP_ARGS(from_dev, to_dev, sent, drops, redirects, err),
 
 	TP_STRUCT__entry(
 		__field(int, from_ifindex)
@@ -279,6 +279,7 @@  TRACE_EVENT(xdp_devmap_xmit,
 		__field(int, to_ifindex)
 		__field(int, drops)
 		__field(int, sent)
+		__field(int, redirects)
 		__field(int, err)
 	),
 
@@ -288,16 +289,17 @@  TRACE_EVENT(xdp_devmap_xmit,
 		__entry->to_ifindex	= to_dev->ifindex;
 		__entry->drops		= drops;
 		__entry->sent		= sent;
+		__entry->redirects	= redirects;
 		__entry->err		= err;
 	),
 
 	TP_printk("ndo_xdp_xmit"
 		  " from_ifindex=%d to_ifindex=%d action=%s"
-		  " sent=%d drops=%d"
+		  " sent=%d drops=%d redirects=%d"
 		  " err=%d",
 		  __entry->from_ifindex, __entry->to_ifindex,
 		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
-		  __entry->sent, __entry->drops,
+		  __entry->sent, __entry->drops, __entry->redirects,
 		  __entry->err)
 );
 
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 7878be18e9d2..89bdec49ea40 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -334,7 +334,8 @@  static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
 static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
 				struct xdp_frame **frames, int n,
 				struct net_device *tx_dev,
-				struct net_device *rx_dev)
+				struct net_device *rx_dev,
+				int *redirects)
 {
 	struct xdp_txq_info txq = { .dev = tx_dev };
 	struct xdp_rxq_info rxq = { .dev = rx_dev };
@@ -359,6 +360,13 @@  static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
 			else
 				frames[nframes++] = xdpf;
 			break;
+		case XDP_REDIRECT:
+			err = xdp_do_redirect(rx_dev, &xdp, xdp_prog);
+			if (unlikely(err))
+				xdp_return_frame_rx_napi(xdpf);
+			else
+				*redirects += 1;
+			break;
 		default:
 			bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
 			fallthrough;
@@ -373,7 +381,7 @@  static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
 	return nframes; /* sent frames count */
 }
 
-static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
+static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags, int *redirects)
 {
 	struct net_device *dev = bq->dev;
 	unsigned int cnt = bq->count;
@@ -390,8 +398,10 @@  static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
 		prefetch(xdpf);
 	}
 
+	int new_redirects = 0;
 	if (bq->xdp_prog) {
-		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx);
+		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx,
+				&new_redirects);
 		if (!to_send)
 			goto out;
 	}
@@ -413,19 +423,21 @@  static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
 
 out:
 	bq->count = 0;
-	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
+	*redirects += new_redirects;
+	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent - new_redirects,
+			new_redirects, err);
 }
 
 /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
  * driver before returning from its napi->poll() routine. See the comment above
  * xdp_do_flush() in filter.c.
  */
-void __dev_flush(struct list_head *flush_list)
+void __dev_flush(struct list_head *flush_list, int *redirects)
 {
 	struct xdp_dev_bulk_queue *bq, *tmp;
 
 	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
-		bq_xmit_all(bq, XDP_XMIT_FLUSH);
+		bq_xmit_all(bq, XDP_XMIT_FLUSH, redirects);
 		bq->dev_rx = NULL;
 		bq->xdp_prog = NULL;
 		__list_del_clearprev(&bq->flush_node);
@@ -458,8 +470,17 @@  static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 {
 	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
 
-	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
-		bq_xmit_all(bq, 0);
+	if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) {
+		int redirects = 0;
+
+		bq_xmit_all(bq, 0, &redirects);
+
+		/* according to comment above xdp_do_flush() in
+		 * filter.c, xdp_do_flush is always called at
+		 * the end of the NAPI anyway, so no need to act
+		 * on the redirects here
+		 */
+	}
 
 	/* Ingress dev_rx will be the same for all xdp_frame's in
 	 * bulk_queue, because bq stored per-CPU and must be flushed
diff --git a/net/core/filter.c b/net/core/filter.c
index 8569cd2482ee..b33fc0b1444a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4287,14 +4287,18 @@  static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
 void xdp_do_flush(void)
 {
 	struct list_head *lh_map, *lh_dev, *lh_xsk;
+	int redirect;
 
-	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
-	if (lh_dev)
-		__dev_flush(lh_dev);
-	if (lh_map)
-		__cpu_map_flush(lh_map);
-	if (lh_xsk)
-		__xsk_map_flush(lh_xsk);
+	do {
+		redirect = 0;
+		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
+		if (lh_dev)
+			__dev_flush(lh_dev, &redirect);
+		if (lh_map)
+			__cpu_map_flush(lh_map);
+		if (lh_xsk)
+			__xsk_map_flush(lh_xsk);
+	} while (redirect > 0);
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush);
 
@@ -4303,20 +4307,24 @@  void xdp_do_check_flushed(struct napi_struct *napi)
 {
 	struct list_head *lh_map, *lh_dev, *lh_xsk;
 	bool missed = false;
+	int redirect;
 
-	bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
-	if (lh_dev) {
-		__dev_flush(lh_dev);
-		missed = true;
-	}
-	if (lh_map) {
-		__cpu_map_flush(lh_map);
-		missed = true;
-	}
-	if (lh_xsk) {
-		__xsk_map_flush(lh_xsk);
-		missed = true;
-	}
+	do {
+		redirect = 0;
+		bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
+		if (lh_dev) {
+			__dev_flush(lh_dev, &redirect);
+			missed = true;
+		}
+		if (lh_map) {
+			__cpu_map_flush(lh_map);
+			missed = true;
+		}
+		if (lh_xsk) {
+			__xsk_map_flush(lh_xsk);
+			missed = true;
+		}
+	} while (redirect > 0);
 
 	WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
 		  napi->poll);