Message ID | 20210122074652.2981711-2-liuhangbin@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Series | xdp: add a new helper for dev map multicast support |
Context | Check | Description |
---|---|---|
netdev/cover_letter | success |
netdev/fixes_present | success |
netdev/patch_count | success |
netdev/tree_selection | success | Clearly marked for bpf-next |
netdev/subject_prefix | success |
netdev/cc_maintainers | warning | 8 maintainers not CCed: davem@davemloft.net songliubraving@fb.com andrii@kernel.org hawk@kernel.org kpsingh@kernel.org kuba@kernel.org kafai@fb.com yhs@fb.com |
netdev/source_inline | success | Was 0 now: 0 |
netdev/verify_signedoff | success |
netdev/module_param | success | Was 0 now: 0 |
netdev/build_32bit | success | Errors and warnings before: 1 this patch: 1 |
netdev/kdoc | success | Errors and warnings before: 0 this patch: 0 |
netdev/verify_fixes | success |
netdev/checkpatch | success | total: 0 errors, 0 warnings, 0 checks, 207 lines checked |
netdev/build_allmodconfig_warn | success | Errors and warnings before: 1 this patch: 1 |
netdev/header_inline | success |
netdev/stable | success | Stable not CCed |
On Fri, Jan 22, 2021 at 03:46:47PM +0800, Hangbin Liu wrote: > From: Jesper Dangaard Brouer <brouer@redhat.com> > > This changes the devmap XDP program support to run the program when the > bulk queue is flushed instead of before the frame is enqueued. This has > a couple of benefits: > > - It "sorts" the packets by destination devmap entry, and then runs the > same BPF program on all the packets in sequence. This ensures that we > keep the XDP program and destination device properties hot in I-cache. > > - It makes the multicast implementation simpler because it can just > enqueue packets using bq_enqueue() without having to deal with the > devmap program at all. > > The drawback is that if the devmap program drops the packet, the enqueue > step is redundant. However, arguably this is mostly visible in a > micro-benchmark, and with more mixed traffic the I-cache benefit should > win out. The performance impact of just this patch is as follows: > > Using xdp_redirect_map(with a 2nd xdp_prog patch[1]) in sample/bpf and send > pkts via pktgen cmd: > ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64 > > There are about +/- 0.1M deviation for native testing, the performance > improved for the base-case, but some drop back with xdp devmap prog attached. > > Version | Test | Generic | Native | Native + 2nd xdp_prog > 5.10 rc6 | xdp_redirect_map i40e->i40e | 2.0M | 9.1M | 8.0M > 5.10 rc6 | xdp_redirect_map i40e->veth | 1.7M | 11.0M | 9.7M > 5.10 rc6 + patch | xdp_redirect_map i40e->i40e | 2.0M | 9.5M | 7.5M > 5.10 rc6 + patch | xdp_redirect_map i40e->veth | 1.7M | 11.6M | 9.1M > > [1] https://lore.kernel.org/bpf/20210122025007.2968381-1-liuhangbin@gmail.com > > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> > Signed-off-by: Hangbin Liu <liuhangbin@gmail.com> > > --- > v16: > a) refactor bq_xmit_all logic and remove error label > > v15: > a) do not use unlikely when checking bq->xdp_prog > b) return sent frames for dev_map_bpf_prog_run() > > v14: no update, only rebase the code > v13: pass in xdp_prog through __xdp_enqueue() > v2-v12: no this patch > --- > kernel/bpf/devmap.c | 136 ++++++++++++++++++++++++++------------------ > 1 file changed, 81 insertions(+), 55 deletions(-) > > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c > index f6e9c68afdd4..c24fcffbbfad 100644 > --- a/kernel/bpf/devmap.c > +++ b/kernel/bpf/devmap.c > @@ -57,6 +57,7 @@ struct xdp_dev_bulk_queue { > struct list_head flush_node; > struct net_device *dev; > struct net_device *dev_rx; > + struct bpf_prog *xdp_prog; > unsigned int count; > }; > > @@ -327,46 +328,95 @@ bool dev_map_can_have_prog(struct bpf_map *map) > return false; > } > > +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, > + struct xdp_frame **frames, int n, > + struct net_device *dev) > +{ > + struct xdp_txq_info txq = { .dev = dev }; > + struct xdp_buff xdp; > + int i, nframes = 0; > + > + for (i = 0; i < n; i++) { > + struct xdp_frame *xdpf = frames[i]; > + u32 act; > + int err; > + > + xdp_convert_frame_to_buff(xdpf, &xdp); > + xdp.txq = &txq; > + > + act = bpf_prog_run_xdp(xdp_prog, &xdp); > + switch (act) { > + case XDP_PASS: > + err = xdp_update_frame_from_buff(&xdp, xdpf); > + if (unlikely(err < 0)) > + xdp_return_frame_rx_napi(xdpf); > + else > + frames[nframes++] = xdpf; > + break; > + default: > + bpf_warn_invalid_xdp_action(act); > + fallthrough; > + case XDP_ABORTED: > + trace_xdp_exception(dev, xdp_prog, act); > + fallthrough; > + case XDP_DROP: > + xdp_return_frame_rx_napi(xdpf); > + 
break; > + } > + } > + return nframes; /* sent frames count */ > +} > + > static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) > { > struct net_device *dev = bq->dev; > - int sent = 0, drops = 0, err = 0; > + unsigned int cnt = bq->count; > + int drops = 0, err = 0; > + int to_sent = cnt; Hmm if I would be super picky then I'd like to have this variable as "to_send", as we spoke. Could you change that? With that, you can add my: Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com> to next revision. > + int sent = cnt; > int i; > > - if (unlikely(!bq->count)) > + if (unlikely(!cnt)) > return; > > - for (i = 0; i < bq->count; i++) { > + for (i = 0; i < cnt; i++) { > struct xdp_frame *xdpf = bq->q[i]; > > prefetch(xdpf); > } > > - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); > + if (bq->xdp_prog) { > + to_sent = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); > + if (!to_sent) { > + sent = 0; > + goto out; > + } > + drops = cnt - to_sent; > + } > + > + sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_sent, bq->q, flags); > if (sent < 0) { > err = sent; > sent = 0; > - goto error; > + > + /* If ndo_xdp_xmit fails with an errno, no frames have been > + * xmit'ed and it's our responsibility to them free all. > + */ > + for (i = 0; i < cnt - drops; i++) { > + struct xdp_frame *xdpf = bq->q[i]; > + > + xdp_return_frame_rx_napi(xdpf); > + } > } > - drops = bq->count - sent; > out: > + drops = cnt - sent; > bq->count = 0; > > trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); > bq->dev_rx = NULL; > + bq->xdp_prog = NULL; One more question, do you really have to do that per each bq_xmit_all call? Couldn't you clear it in __dev_flush ? Or IOW - what's the rationale behind storing xdp_prog in xdp_dev_bulk_queue. Why can't you propagate the dst->xdp_prog and rely on that without that local pointer? You probably have an answer for that, so maybe include it in commit message. BTW same question for clearing dev_rx. To me this will be the same for all bq_xmit_all() calls that will happen within same napi. > __list_del_clearprev(&bq->flush_node); > return; > -error: > - /* If ndo_xdp_xmit fails with an errno, no frames have been > - * xmit'ed and it's our responsibility to them free all. > - */ > - for (i = 0; i < bq->count; i++) { > - struct xdp_frame *xdpf = bq->q[i]; > - > - xdp_return_frame_rx_napi(xdpf); > - drops++; > - } > - goto out; > } > > /* __dev_flush is called from xdp_do_flush() which _must_ be signaled > @@ -408,7 +458,7 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) > * Thus, safe percpu variable access. > */ > static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, > - struct net_device *dev_rx) > + struct net_device *dev_rx, struct bpf_prog *xdp_prog) > { > struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); > struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); > @@ -423,6 +473,14 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, > if (!bq->dev_rx) > bq->dev_rx = dev_rx; > > + /* Store (potential) xdp_prog that run before egress to dev as > + * part of bulk_queue. This will be same xdp_prog for all > + * xdp_frame's in bulk_queue, because this per-CPU store must > + * be flushed from net_device drivers NAPI func end. 
> + */ > + if (!bq->xdp_prog) > + bq->xdp_prog = xdp_prog; > + > bq->q[bq->count++] = xdpf; > > if (!bq->flush_node.prev) > @@ -430,7 +488,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, > } > > static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, > - struct net_device *dev_rx) > + struct net_device *dev_rx, > + struct bpf_prog *xdp_prog) > { > struct xdp_frame *xdpf; > int err; > @@ -446,42 +505,14 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, > if (unlikely(!xdpf)) > return -EOVERFLOW; > > - bq_enqueue(dev, xdpf, dev_rx); > + bq_enqueue(dev, xdpf, dev_rx, xdp_prog); > return 0; > } > > -static struct xdp_buff *dev_map_run_prog(struct net_device *dev, > - struct xdp_buff *xdp, > - struct bpf_prog *xdp_prog) > -{ > - struct xdp_txq_info txq = { .dev = dev }; > - u32 act; > - > - xdp_set_data_meta_invalid(xdp); > - xdp->txq = &txq; > - > - act = bpf_prog_run_xdp(xdp_prog, xdp); > - switch (act) { > - case XDP_PASS: > - return xdp; > - case XDP_DROP: > - break; > - default: > - bpf_warn_invalid_xdp_action(act); > - fallthrough; > - case XDP_ABORTED: > - trace_xdp_exception(dev, xdp_prog, act); > - break; > - } > - > - xdp_return_buff(xdp); > - return NULL; > -} > - > int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, > struct net_device *dev_rx) > { > - return __xdp_enqueue(dev, xdp, dev_rx); > + return __xdp_enqueue(dev, xdp, dev_rx, NULL); > } > > int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > @@ -489,12 +520,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, > { > struct net_device *dev = dst->dev; > > - if (dst->xdp_prog) { > - xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); > - if (!xdp) > - return 0; > - } > - return __xdp_enqueue(dev, xdp, dev_rx); > + return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); > } > > int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, > -- > 2.26.2 >
Maciej Fijalkowski <maciej.fijalkowski@intel.com> writes: > On Fri, Jan 22, 2021 at 03:46:47PM +0800, Hangbin Liu wrote: >> From: Jesper Dangaard Brouer <brouer@redhat.com> >> >> This changes the devmap XDP program support to run the program when the >> bulk queue is flushed instead of before the frame is enqueued. This has >> a couple of benefits: >> >> - It "sorts" the packets by destination devmap entry, and then runs the >> same BPF program on all the packets in sequence. This ensures that we >> keep the XDP program and destination device properties hot in I-cache. >> >> - It makes the multicast implementation simpler because it can just >> enqueue packets using bq_enqueue() without having to deal with the >> devmap program at all. >> >> The drawback is that if the devmap program drops the packet, the enqueue >> step is redundant. However, arguably this is mostly visible in a >> micro-benchmark, and with more mixed traffic the I-cache benefit should >> win out. The performance impact of just this patch is as follows: >> >> Using xdp_redirect_map(with a 2nd xdp_prog patch[1]) in sample/bpf and send >> pkts via pktgen cmd: >> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64 >> >> There are about +/- 0.1M deviation for native testing, the performance >> improved for the base-case, but some drop back with xdp devmap prog attached. >> >> Version | Test | Generic | Native | Native + 2nd xdp_prog >> 5.10 rc6 | xdp_redirect_map i40e->i40e | 2.0M | 9.1M | 8.0M >> 5.10 rc6 | xdp_redirect_map i40e->veth | 1.7M | 11.0M | 9.7M >> 5.10 rc6 + patch | xdp_redirect_map i40e->i40e | 2.0M | 9.5M | 7.5M >> 5.10 rc6 + patch | xdp_redirect_map i40e->veth | 1.7M | 11.6M | 9.1M >> >> [1] https://lore.kernel.org/bpf/20210122025007.2968381-1-liuhangbin@gmail.com >> >> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> >> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com> >> >> --- >> v16: >> a) refactor bq_xmit_all logic and remove error label >> >> v15: >> a) do not use unlikely when checking bq->xdp_prog >> b) return sent frames for dev_map_bpf_prog_run() >> >> v14: no update, only rebase the code >> v13: pass in xdp_prog through __xdp_enqueue() >> v2-v12: no this patch >> --- >> kernel/bpf/devmap.c | 136 ++++++++++++++++++++++++++------------------ >> 1 file changed, 81 insertions(+), 55 deletions(-) >> >> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c >> index f6e9c68afdd4..c24fcffbbfad 100644 >> --- a/kernel/bpf/devmap.c >> +++ b/kernel/bpf/devmap.c >> @@ -57,6 +57,7 @@ struct xdp_dev_bulk_queue { >> struct list_head flush_node; >> struct net_device *dev; >> struct net_device *dev_rx; >> + struct bpf_prog *xdp_prog; >> unsigned int count; >> }; >> >> @@ -327,46 +328,95 @@ bool dev_map_can_have_prog(struct bpf_map *map) >> return false; >> } >> >> +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, >> + struct xdp_frame **frames, int n, >> + struct net_device *dev) >> +{ >> + struct xdp_txq_info txq = { .dev = dev }; >> + struct xdp_buff xdp; >> + int i, nframes = 0; >> + >> + for (i = 0; i < n; i++) { >> + struct xdp_frame *xdpf = frames[i]; >> + u32 act; >> + int err; >> + >> + xdp_convert_frame_to_buff(xdpf, &xdp); >> + xdp.txq = &txq; >> + >> + act = bpf_prog_run_xdp(xdp_prog, &xdp); >> + switch (act) { >> + case XDP_PASS: >> + err = xdp_update_frame_from_buff(&xdp, xdpf); >> + if (unlikely(err < 0)) >> + xdp_return_frame_rx_napi(xdpf); >> + else >> + frames[nframes++] = xdpf; >> + break; >> + default: >> + bpf_warn_invalid_xdp_action(act); >> + 
fallthrough; >> + case XDP_ABORTED: >> + trace_xdp_exception(dev, xdp_prog, act); >> + fallthrough; >> + case XDP_DROP: >> + xdp_return_frame_rx_napi(xdpf); >> + break; >> + } >> + } >> + return nframes; /* sent frames count */ >> +} >> + >> static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) >> { >> struct net_device *dev = bq->dev; >> - int sent = 0, drops = 0, err = 0; >> + unsigned int cnt = bq->count; >> + int drops = 0, err = 0; >> + int to_sent = cnt; > > Hmm if I would be super picky then I'd like to have this variable as > "to_send", as we spoke. Could you change that? > > With that, you can add my: > > Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com> > > to next revision. > >> + int sent = cnt; >> int i; >> >> - if (unlikely(!bq->count)) >> + if (unlikely(!cnt)) >> return; >> >> - for (i = 0; i < bq->count; i++) { >> + for (i = 0; i < cnt; i++) { >> struct xdp_frame *xdpf = bq->q[i]; >> >> prefetch(xdpf); >> } >> >> - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); >> + if (bq->xdp_prog) { >> + to_sent = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); >> + if (!to_sent) { >> + sent = 0; >> + goto out; >> + } >> + drops = cnt - to_sent; >> + } >> + >> + sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_sent, bq->q, flags); >> if (sent < 0) { >> err = sent; >> sent = 0; >> - goto error; >> + >> + /* If ndo_xdp_xmit fails with an errno, no frames have been >> + * xmit'ed and it's our responsibility to them free all. >> + */ >> + for (i = 0; i < cnt - drops; i++) { >> + struct xdp_frame *xdpf = bq->q[i]; >> + >> + xdp_return_frame_rx_napi(xdpf); >> + } >> } >> - drops = bq->count - sent; >> out: >> + drops = cnt - sent; >> bq->count = 0; >> >> trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); >> bq->dev_rx = NULL; >> + bq->xdp_prog = NULL; > > One more question, do you really have to do that per each bq_xmit_all > call? Couldn't you clear it in __dev_flush ? > > Or IOW - what's the rationale behind storing xdp_prog in > xdp_dev_bulk_queue. Why can't you propagate the dst->xdp_prog and rely on > that without that local pointer? > > You probably have an answer for that, so maybe include it in commit > message. > > BTW same question for clearing dev_rx. To me this will be the same for all > bq_xmit_all() calls that will happen within same napi. I think you're right: When bq_xmit_all() is called from bq_enqueue(), another packet will always be enqueued immediately after, so clearing out all of those things in bq_xmit_all() is redundant. This also includes the list_del on bq->flush_node, BTW. And while we're getting into e micro-optimisations: In bq_enqueue() we have two checks: if (!bq->dev_rx) bq->dev_rx = dev_rx; bq->q[bq->count++] = xdpf; if (!bq->flush_node.prev) list_add(&bq->flush_node, flush_list); those two if() checks can be collapsed into one, since the list and the dev_rx field are only ever modified together. This will also be the case for bq->xdp_prog, so putting all three under the same check in bq_enqueue() and only clearing them in __dev_flush() would be a win, I suppose - nice catch! :) -Toke
On Fri, Jan 22, 2021 at 02:38:40PM +0100, Toke Høiland-Jørgensen wrote: > >> out: > >> + drops = cnt - sent; > >> bq->count = 0; > >> > >> trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); > >> bq->dev_rx = NULL; > >> + bq->xdp_prog = NULL; > > > > One more question, do you really have to do that per each bq_xmit_all > > call? Couldn't you clear it in __dev_flush ? > > > > Or IOW - what's the rationale behind storing xdp_prog in > > xdp_dev_bulk_queue. Why can't you propagate the dst->xdp_prog and rely on > > that without that local pointer? > > > > You probably have an answer for that, so maybe include it in commit > > message. > > > > BTW same question for clearing dev_rx. To me this will be the same for all > > bq_xmit_all() calls that will happen within same napi. > > I think you're right: When bq_xmit_all() is called from bq_enqueue(), > another packet will always be enqueued immediately after, so clearing > out all of those things in bq_xmit_all() is redundant. This also > includes the list_del on bq->flush_node, BTW. > > And while we're getting into e micro-optimisations: In bq_enqueue() we > have two checks: > > if (!bq->dev_rx) > bq->dev_rx = dev_rx; > > bq->q[bq->count++] = xdpf; > > if (!bq->flush_node.prev) > list_add(&bq->flush_node, flush_list); > > > those two if() checks can be collapsed into one, since the list and the > dev_rx field are only ever modified together. This will also be the case > for bq->xdp_prog, so putting all three under the same check in > bq_enqueue() and only clearing them in __dev_flush() would be a win, I > suppose - nice catch! :) Thanks for the advice, so how about modify it like: diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index bc38f7193149..217e09533097 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -413,9 +413,6 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) bq->count = 0; trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); - bq->dev_rx = NULL; - bq->xdp_prog = NULL; - __list_del_clearprev(&bq->flush_node); return; } @@ -434,8 +431,12 @@ void __dev_flush(void) struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq, *tmp; - list_for_each_entry_safe(bq, tmp, flush_list, flush_node) + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { bq_xmit_all(bq, XDP_XMIT_FLUSH); + bq->dev_rx = NULL; + bq->xdp_prog = NULL; + __list_del_clearprev(&bq->flush_node); + } } /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or @@ -469,22 +470,17 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed * from net_device drivers NAPI func end. + * + * Do the same with xdp_prog and flush_list since these fields + * are modified together. */ - if (!bq->dev_rx) + if (!bq->dev_rx) { bq->dev_rx = dev_rx; - - /* Store (potential) xdp_prog that run before egress to dev as - * part of bulk_queue. This will be same xdp_prog for all - * xdp_frame's in bulk_queue, because this per-CPU store must - * be flushed from net_device drivers NAPI func end. - */ - if (!bq->xdp_prog) bq->xdp_prog = xdp_prog; + list_add(&bq->flush_node, flush_list); + } bq->q[bq->count++] = xdpf; - - if (!bq->flush_node.prev) - list_add(&bq->flush_node, flush_list); } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
Hangbin Liu <liuhangbin@gmail.com> writes: > On Fri, Jan 22, 2021 at 02:38:40PM +0100, Toke Høiland-Jørgensen wrote: >> >> out: >> >> + drops = cnt - sent; >> >> bq->count = 0; >> >> >> >> trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); >> >> bq->dev_rx = NULL; >> >> + bq->xdp_prog = NULL; >> > >> > One more question, do you really have to do that per each bq_xmit_all >> > call? Couldn't you clear it in __dev_flush ? >> > >> > Or IOW - what's the rationale behind storing xdp_prog in >> > xdp_dev_bulk_queue. Why can't you propagate the dst->xdp_prog and rely on >> > that without that local pointer? >> > >> > You probably have an answer for that, so maybe include it in commit >> > message. >> > >> > BTW same question for clearing dev_rx. To me this will be the same for all >> > bq_xmit_all() calls that will happen within same napi. >> >> I think you're right: When bq_xmit_all() is called from bq_enqueue(), >> another packet will always be enqueued immediately after, so clearing >> out all of those things in bq_xmit_all() is redundant. This also >> includes the list_del on bq->flush_node, BTW. >> >> And while we're getting into e micro-optimisations: In bq_enqueue() we >> have two checks: >> >> if (!bq->dev_rx) >> bq->dev_rx = dev_rx; >> >> bq->q[bq->count++] = xdpf; >> >> if (!bq->flush_node.prev) >> list_add(&bq->flush_node, flush_list); >> >> >> those two if() checks can be collapsed into one, since the list and the >> dev_rx field are only ever modified together. This will also be the case >> for bq->xdp_prog, so putting all three under the same check in >> bq_enqueue() and only clearing them in __dev_flush() would be a win, I >> suppose - nice catch! :) > > Thanks for the advice, so how about modify it like: Yup, exactly! :) -Toke
On Mon, Jan 25, 2021 at 12:21:26PM +0100, Toke Høiland-Jørgensen wrote: > Hangbin Liu <liuhangbin@gmail.com> writes: > > > On Fri, Jan 22, 2021 at 02:38:40PM +0100, Toke Høiland-Jørgensen wrote: > >> >> out: > >> >> + drops = cnt - sent; > >> >> bq->count = 0; > >> >> > >> >> trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); > >> >> bq->dev_rx = NULL; > >> >> + bq->xdp_prog = NULL; > >> > > >> > One more question, do you really have to do that per each bq_xmit_all > >> > call? Couldn't you clear it in __dev_flush ? > >> > > >> > Or IOW - what's the rationale behind storing xdp_prog in > >> > xdp_dev_bulk_queue. Why can't you propagate the dst->xdp_prog and rely on > >> > that without that local pointer? > >> > > >> > You probably have an answer for that, so maybe include it in commit > >> > message. > >> > > >> > BTW same question for clearing dev_rx. To me this will be the same for all > >> > bq_xmit_all() calls that will happen within same napi. > >> > >> I think you're right: When bq_xmit_all() is called from bq_enqueue(), > >> another packet will always be enqueued immediately after, so clearing > >> out all of those things in bq_xmit_all() is redundant. This also > >> includes the list_del on bq->flush_node, BTW. > >> > >> And while we're getting into e micro-optimisations: In bq_enqueue() we > >> have two checks: > >> > >> if (!bq->dev_rx) > >> bq->dev_rx = dev_rx; > >> > >> bq->q[bq->count++] = xdpf; > >> > >> if (!bq->flush_node.prev) > >> list_add(&bq->flush_node, flush_list); > >> > >> > >> those two if() checks can be collapsed into one, since the list and the > >> dev_rx field are only ever modified together. This will also be the case > >> for bq->xdp_prog, so putting all three under the same check in > >> bq_enqueue() and only clearing them in __dev_flush() would be a win, I > >> suppose - nice catch! :) Huh, nice further optimization! :) Of course I agree on that. > > > > Thanks for the advice, so how about modify it like: > > Yup, exactly! :) > > -Toke >
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f6e9c68afdd4..c24fcffbbfad 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -57,6 +57,7 @@ struct xdp_dev_bulk_queue { struct list_head flush_node; struct net_device *dev; struct net_device *dev_rx; + struct bpf_prog *xdp_prog; unsigned int count; }; @@ -327,46 +328,95 @@ bool dev_map_can_have_prog(struct bpf_map *map) return false; } +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, + struct xdp_frame **frames, int n, + struct net_device *dev) +{ + struct xdp_txq_info txq = { .dev = dev }; + struct xdp_buff xdp; + int i, nframes = 0; + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + u32 act; + int err; + + xdp_convert_frame_to_buff(xdpf, &xdp); + xdp.txq = &txq; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + switch (act) { + case XDP_PASS: + err = xdp_update_frame_from_buff(&xdp, xdpf); + if (unlikely(err < 0)) + xdp_return_frame_rx_napi(xdpf); + else + frames[nframes++] = xdpf; + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + fallthrough; + case XDP_DROP: + xdp_return_frame_rx_napi(xdpf); + break; + } + } + return nframes; /* sent frames count */ +} + static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; - int sent = 0, drops = 0, err = 0; + unsigned int cnt = bq->count; + int drops = 0, err = 0; + int to_sent = cnt; + int sent = cnt; int i; - if (unlikely(!bq->count)) + if (unlikely(!cnt)) return; - for (i = 0; i < bq->count; i++) { + for (i = 0; i < cnt; i++) { struct xdp_frame *xdpf = bq->q[i]; prefetch(xdpf); } - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); + if (bq->xdp_prog) { + to_sent = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); + if (!to_sent) { + sent = 0; + goto out; + } + drops = cnt - to_sent; + } + + sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_sent, bq->q, flags); if (sent < 0) { err = sent; sent = 0; - goto error; + + /* If ndo_xdp_xmit fails with an errno, no frames have been + * xmit'ed and it's our responsibility to them free all. + */ + for (i = 0; i < cnt - drops; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + xdp_return_frame_rx_napi(xdpf); + } } - drops = bq->count - sent; out: + drops = cnt - sent; bq->count = 0; trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); bq->dev_rx = NULL; + bq->xdp_prog = NULL; __list_del_clearprev(&bq->flush_node); return; -error: - /* If ndo_xdp_xmit fails with an errno, no frames have been - * xmit'ed and it's our responsibility to them free all. - */ - for (i = 0; i < bq->count; i++) { - struct xdp_frame *xdpf = bq->q[i]; - - xdp_return_frame_rx_napi(xdpf); - drops++; - } - goto out; } /* __dev_flush is called from xdp_do_flush() which _must_ be signaled @@ -408,7 +458,7 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) * Thus, safe percpu variable access. */ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, - struct net_device *dev_rx) + struct net_device *dev_rx, struct bpf_prog *xdp_prog) { struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); @@ -423,6 +473,14 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, if (!bq->dev_rx) bq->dev_rx = dev_rx; + /* Store (potential) xdp_prog that run before egress to dev as + * part of bulk_queue. 
This will be same xdp_prog for all + * xdp_frame's in bulk_queue, because this per-CPU store must + * be flushed from net_device drivers NAPI func end. + */ + if (!bq->xdp_prog) + bq->xdp_prog = xdp_prog; + bq->q[bq->count++] = xdpf; if (!bq->flush_node.prev) @@ -430,7 +488,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, - struct net_device *dev_rx) + struct net_device *dev_rx, + struct bpf_prog *xdp_prog) { struct xdp_frame *xdpf; int err; @@ -446,42 +505,14 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, if (unlikely(!xdpf)) return -EOVERFLOW; - bq_enqueue(dev, xdpf, dev_rx); + bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } -static struct xdp_buff *dev_map_run_prog(struct net_device *dev, - struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) -{ - struct xdp_txq_info txq = { .dev = dev }; - u32 act; - - xdp_set_data_meta_invalid(xdp); - xdp->txq = &txq; - - act = bpf_prog_run_xdp(xdp_prog, xdp); - switch (act) { - case XDP_PASS: - return xdp; - case XDP_DROP: - break; - default: - bpf_warn_invalid_xdp_action(act); - fallthrough; - case XDP_ABORTED: - trace_xdp_exception(dev, xdp_prog, act); - break; - } - - xdp_return_buff(xdp); - return NULL; -} - int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { - return __xdp_enqueue(dev, xdp, dev_rx); + return __xdp_enqueue(dev, xdp, dev_rx, NULL); } int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, @@ -489,12 +520,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; - if (dst->xdp_prog) { - xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); - if (!xdp) - return 0; - } - return __xdp_enqueue(dev, xdp, dev_rx); + return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
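
For readers unfamiliar with per-entry devmap programs (the "2nd xdp_prog" referenced in the benchmarks above), below is a minimal, hypothetical sketch of what such a program pair might look like. It is not part of this patch or thread; the map layout, section names and key scheme are illustrative only and follow the libbpf conventions of the 5.10 era, where SEC("xdp_devmap/") loads a program with expected_attach_type BPF_XDP_DEVMAP.

```c
/* Hypothetical illustration only -- not from the patch or samples/bpf.
 * A devmap whose 8-byte values (struct bpf_devmap_val) can carry a
 * per-entry program fd; with this patch, bq_xmit_all() runs that
 * program on the whole bulk at flush time instead of per-frame at
 * enqueue time.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_devmap_val));
	__uint(max_entries, 32);
} tx_port SEC(".maps");

/* Egress ("2nd") program: executed on every xdp_frame redirected to the
 * devmap entry it is attached to, just before ndo_xdp_xmit(). */
SEC("xdp_devmap/egress")
int xdp_devmap_prog(struct xdp_md *ctx)
{
	/* e.g. rewrite the Ethernet header for the egress port here */
	return XDP_PASS;
}

/* Ingress program: redirect every packet to the device stored at key 0. */
SEC("xdp")
int xdp_redirect_prog(struct xdp_md *ctx)
{
	return bpf_redirect_map(&tx_port, 0, 0);
}

char _license[] SEC("license") = "GPL";
```

User space would then populate each devmap entry with a struct bpf_devmap_val holding the egress ifindex and, optionally, the fd of the egress program, e.g. { .ifindex = ifindex, .bpf_prog.fd = prog_fd }; entries without a program fd are transmitted unmodified, exactly as before.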