| Message ID | 24703dbc3477a4b3aaf908f6226a566d27969f83.1646755129.git.lorenzo@kernel.org |
|---|---|
| State | Superseded |
| Delegated to | BPF |
| Series | introduce xdp frags support to veth driver |
Lorenzo Bianconi <lorenzo@kernel.org> writes:

> Introduce veth_convert_xdp_buff_from_skb routine in order to
> convert a non-linear skb into an xdp buffer. If the received skb
> is cloned or shared, veth_convert_xdp_buff_from_skb will copy it
> into a new skb composed of order-0 pages for the linear and the
> fragmented area. Moreover veth_convert_xdp_buff_from_skb guarantees
> we have enough headroom for xdp.
> This is a preliminary patch to allow attaching xdp programs with frags
> support on veth devices.
>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>

It's cool that we can do this! A few comments below:

> ---
>  drivers/net/veth.c | 174 ++++++++++++++++++++++++++++++---------------
>  net/core/xdp.c     |   1 +
>  2 files changed, 119 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index b77ce3fdcfe8..47b21b1d2fd9 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -433,21 +433,6 @@ static void veth_set_multicast_list(struct net_device *dev)
>  {
>  }
>
> -static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
> -				      int buflen)
> -{
> -	struct sk_buff *skb;
> -
> -	skb = build_skb(head, buflen);
> -	if (!skb)
> -		return NULL;
> -
> -	skb_reserve(skb, headroom);
> -	skb_put(skb, len);
> -
> -	return skb;
> -}
> -
>  static int veth_select_rxq(struct net_device *dev)
>  {
>  	return smp_processor_id() % dev->real_num_rx_queues;
> @@ -695,72 +680,143 @@ static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
>  	}
>  }
>
> -static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> -					struct sk_buff *skb,
> -					struct veth_xdp_tx_bq *bq,
> -					struct veth_stats *stats)
> +static void veth_xdp_get(struct xdp_buff *xdp)
>  {
> -	u32 pktlen, headroom, act, metalen, frame_sz;
> -	void *orig_data, *orig_data_end;
> -	struct bpf_prog *xdp_prog;
> -	int mac_len, delta, off;
> -	struct xdp_buff xdp;
> +	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
> +	int i;
>
> -	skb_prepare_for_gro(skb);
> +	get_page(virt_to_page(xdp->data));
> +	if (likely(!xdp_buff_has_frags(xdp)))
> +		return;
>
> -	rcu_read_lock();
> -	xdp_prog = rcu_dereference(rq->xdp_prog);
> -	if (unlikely(!xdp_prog)) {
> -		rcu_read_unlock();
> -		goto out;
> -	}
> +	for (i = 0; i < sinfo->nr_frags; i++)
> +		__skb_frag_ref(&sinfo->frags[i]);
> +}
>
> -	mac_len = skb->data - skb_mac_header(skb);
> -	pktlen = skb->len + mac_len;
> -	headroom = skb_headroom(skb) - mac_len;
> +static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
> +					  struct xdp_buff *xdp,
> +					  struct sk_buff **pskb)
> +{

nit: It's not really "converting" an skb into an xdp_buff, since the
xdp_buff lives on the stack; so maybe 'veth_init_xdp_buff_from_skb()'?

> +	struct sk_buff *skb = *pskb;
> +	u32 frame_sz;
>
>  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> +	    skb_shinfo(skb)->nr_frags) {

So this always clones the skb if it has frags? Is that really needed?

Also, there's a lot of memory allocation and copying going on here; have
you measured the performance?

> +		u32 size, len, max_head_size, off;
>  		struct sk_buff *nskb;
> -		int size, head_off;
> -		void *head, *start;
>  		struct page *page;
> +		int i, head_off;
>
> -		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
> -		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> -		if (size > PAGE_SIZE)
> +		/* We need a private copy of the skb and data buffers since
> +		 * the ebpf program can modify it. We segment the original skb
> +		 * into order-0 pages without linearizing it.
> +		 *
> +		 * Make sure we have enough space for linear and paged area
> +		 */
> +		max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
> +						  VETH_XDP_HEADROOM);
> +		if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
>  			goto drop;
>
> +		/* Allocate skb head */
>  		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
>  		if (!page)
>  			goto drop;
>
> -		head = page_address(page);
> -		start = head + VETH_XDP_HEADROOM;
> -		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
> -			page_frag_free(head);
> +		nskb = build_skb(page_address(page), PAGE_SIZE);
> +		if (!nskb) {
> +			put_page(page);
>  			goto drop;
>  		}
>
> -		nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len,
> -				      skb->len, PAGE_SIZE);
> -		if (!nskb) {
> -			page_frag_free(head);
> +		skb_reserve(nskb, VETH_XDP_HEADROOM);
> +		size = min_t(u32, skb->len, max_head_size);
> +		if (skb_copy_bits(skb, 0, nskb->data, size)) {
> +			consume_skb(nskb);
>  			goto drop;
>  		}
> +		skb_put(nskb, size);
>
>  		skb_copy_header(nskb, skb);
>  		head_off = skb_headroom(nskb) - skb_headroom(skb);
>  		skb_headers_offset_update(nskb, head_off);
> +
> +		/* Allocate paged area of new skb */
> +		off = size;
> +		len = skb->len - off;
> +
> +		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
> +			page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
> +			if (!page) {
> +				consume_skb(nskb);
> +				goto drop;
> +			}
> +
> +			size = min_t(u32, len, PAGE_SIZE);
> +			skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
> +			if (skb_copy_bits(skb, off, page_address(page),
> +					  size)) {
> +				consume_skb(nskb);
> +				goto drop;
> +			}
> +
> +			len -= size;
> +			off += size;
> +		}
> +
>  		consume_skb(skb);
>  		skb = nskb;
> +	} else if (skb_headroom(skb) < XDP_PACKET_HEADROOM &&
> +		   pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) {
> +		goto drop;
>  	}
>
>  	/* SKB "head" area always have tailroom for skb_shared_info */
>  	frame_sz = skb_end_pointer(skb) - skb->head;
>  	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> -	xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq);
> -	xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true);
> +	xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
> +	xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
> +			 skb_headlen(skb), true);
> +
> +	if (skb_is_nonlinear(skb)) {
> +		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
> +		xdp_buff_set_frags_flag(xdp);
> +	} else {
> +		xdp_buff_clear_frags_flag(xdp);
> +	}
> +	*pskb = skb;
> +
> +	return 0;
> +drop:
> +	consume_skb(skb);
> +	*pskb = NULL;
> +
> +	return -ENOMEM;
> +}
> +
> +static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> +					struct sk_buff *skb,
> +					struct veth_xdp_tx_bq *bq,
> +					struct veth_stats *stats)
> +{
> +	void *orig_data, *orig_data_end;
> +	struct bpf_prog *xdp_prog;
> +	struct xdp_buff xdp;
> +	u32 act, metalen;
> +	int off;
> +
> +	skb_prepare_for_gro(skb);
> +
> +	rcu_read_lock();
> +	xdp_prog = rcu_dereference(rq->xdp_prog);
> +	if (unlikely(!xdp_prog)) {
> +		rcu_read_unlock();
> +		goto out;
> +	}
> +
> +	__skb_push(skb, skb->data - skb_mac_header(skb));
> +	if (veth_convert_xdp_buff_from_skb(rq, &xdp, &skb))
> +		goto drop;
>
>  	orig_data = xdp.data;
>  	orig_data_end = xdp.data_end;
> @@ -771,7 +827,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>  	case XDP_PASS:
>  		break;
>  	case XDP_TX:
> -		get_page(virt_to_page(xdp.data));
> +		veth_xdp_get(&xdp);
>  		consume_skb(skb);
>  		xdp.rxq->mem = rq->xdp_mem;
>  		if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
> @@ -783,7 +839,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>  		rcu_read_unlock();
>  		goto xdp_xmit;
>  	case XDP_REDIRECT:
> -		get_page(virt_to_page(xdp.data));
> +		veth_xdp_get(&xdp);
>  		consume_skb(skb);
>  		xdp.rxq->mem = rq->xdp_mem;
>  		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
> @@ -806,18 +862,24 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>  	rcu_read_unlock();
>
>  	/* check if bpf_xdp_adjust_head was used */
> -	delta = orig_data - xdp.data;
> -	off = mac_len + delta;
> +	off = orig_data - xdp.data;
>  	if (off > 0)
>  		__skb_push(skb, off);
>  	else if (off < 0)
>  		__skb_pull(skb, -off);
> -	skb->mac_header -= delta;
> +
> +	skb_reset_mac_header(skb);
>
>  	/* check if bpf_xdp_adjust_tail was used */
>  	off = xdp.data_end - orig_data_end;
>  	if (off != 0)
>  		__skb_put(skb, off); /* positive on grow, negative on shrink */
> +
> +	if (xdp_buff_has_frags(&xdp))
> +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> +	else
> +		skb->data_len = 0;

We can remove entire frags using xdp_adjust_tail, right? Will that get
propagated in the right way to the skb frags due to the dual use of
skb_shared_info, or?
> Lorenzo Bianconi <lorenzo@kernel.org> writes:
>
> > Introduce veth_convert_xdp_buff_from_skb routine in order to
> > convert a non-linear skb into an xdp buffer. If the received skb
> > is cloned or shared, veth_convert_xdp_buff_from_skb will copy it
> > into a new skb composed of order-0 pages for the linear and the
> > fragmented area. Moreover veth_convert_xdp_buff_from_skb guarantees
> > we have enough headroom for xdp.
> > This is a preliminary patch to allow attaching xdp programs with frags
> > support on veth devices.
> >
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
>
> It's cool that we can do this! A few comments below:

Hi Toke,

thx for the review :)

[...]

> > +static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
> > +					    struct xdp_buff *xdp,
> > +					    struct sk_buff **pskb)
> > +{
>
> nit: It's not really "converting" an skb into an xdp_buff, since the
> xdp_buff lives on the stack; so maybe 'veth_init_xdp_buff_from_skb()'?

I kept the previous naming convention used for xdp_convert_frame_to_buff()
(my goal would be to move it into xdp.c and reuse this routine for the
generic-xdp use case), but I am fine with veth_init_xdp_buff_from_skb().

> > +	struct sk_buff *skb = *pskb;
> > +	u32 frame_sz;
> >
> >  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> > -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> > +	    skb_shinfo(skb)->nr_frags) {
>
> So this always clones the skb if it has frags? Is that really needed?

if we look at skb_cow_data(), the paged area is always considered not
writable

> Also, there's a lot of memory allocation and copying going on here; have
> you measured the performance?

even in the previous implementation we always reallocate the skb if the
conditions above are verified, so I do not expect any difference in the
single-buffer use case, but I will run some performance tests.

> [...]
> > +
> > +	if (xdp_buff_has_frags(&xdp))
> > +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> > +	else
> > +		skb->data_len = 0;
>
> We can remove entire frags using xdp_adjust_tail, right? Will that get
> propagated in the right way to the skb frags due to the dual use of
> skb_shared_info, or?

bpf_xdp_frags_shrink_tail() can remove entire frags and it will modify
metadata contained in the skb_shared_info (e.g. nr_frags or the frag
size of the given page). We should consider the data_len field in this
case. Agree?

Regards,
Lorenzo
Lorenzo Bianconi <lorenzo.bianconi@redhat.com> writes:

>> Lorenzo Bianconi <lorenzo@kernel.org> writes:
>>
>> > Introduce veth_convert_xdp_buff_from_skb routine in order to
>> > convert a non-linear skb into an xdp buffer. If the received skb
>> > is cloned or shared, veth_convert_xdp_buff_from_skb will copy it
>> > into a new skb composed of order-0 pages for the linear and the
>> > fragmented area. Moreover veth_convert_xdp_buff_from_skb guarantees
>> > we have enough headroom for xdp.
>> > This is a preliminary patch to allow attaching xdp programs with frags
>> > support on veth devices.
>> >
>> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
>>
>> It's cool that we can do this! A few comments below:
>
> Hi Toke,
>
> thx for the review :)
>
> [...]
>
>> > +static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
>> > +					      struct xdp_buff *xdp,
>> > +					      struct sk_buff **pskb)
>> > +{
>>
>> nit: It's not really "converting" an skb into an xdp_buff, since the
>> xdp_buff lives on the stack; so maybe 'veth_init_xdp_buff_from_skb()'?
>
> I kept the previous naming convention used for xdp_convert_frame_to_buff()
> (my goal would be to move it into xdp.c and reuse this routine for the
> generic-xdp use case), but I am fine with veth_init_xdp_buff_from_skb().

Consistency is probably good, but right now we have functions of the
form 'xdp_convert_X_to_Y()' and 'xdp_update_Y_from_X()'. So to follow
that you'd have either 'veth_update_xdp_buff_from_skb()' or
'veth_convert_skb_to_xdp_buff()' :)

>> > +	struct sk_buff *skb = *pskb;
>> > +	u32 frame_sz;
>> >
>> >  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
>> > -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
>> > +	    skb_shinfo(skb)->nr_frags) {
>>
>> So this always clones the skb if it has frags? Is that really needed?
>
> if we look at skb_cow_data(), the paged area is always considered not
> writable

Ah, right, did not know that. Seems a bit odd, but OK.

>> Also, there's a lot of memory allocation and copying going on here; have
>> you measured the performance?
>
> even in the previous implementation we always reallocate the skb if the
> conditions above are verified, so I do not expect any difference in the
> single-buffer use case, but I will run some performance tests.

No, I wouldn't expect any difference for the single-buffer case, but I
would also be interested in how big the overhead is of having to copy
the whole jumbo-frame?

BTW, just noticed one other change - before we had:

> -	headroom = skb_headroom(skb) - mac_len;
>  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {

And in your patch that becomes:

> +	} else if (skb_headroom(skb) < XDP_PACKET_HEADROOM &&
> +		   pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) {
> +		goto drop;

So the mac_len subtraction disappeared; that seems wrong?

>> > +
>> > +	if (xdp_buff_has_frags(&xdp))
>> > +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
>> > +	else
>> > +		skb->data_len = 0;
>>
>> We can remove entire frags using xdp_adjust_tail, right? Will that get
>> propagated in the right way to the skb frags due to the dual use of
>> skb_shared_info, or?
>
> bpf_xdp_frags_shrink_tail() can remove entire frags and it will modify
> metadata contained in the skb_shared_info (e.g. nr_frags or the frag
> size of the given page). We should consider the data_len field in this
> case. Agree?

Right, that's what I assumed; makes sense. But adding a comment
mentioning this above the update of data_len might be helpful? :)

-Toke
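For reference, the two existing helpers behind the naming pattern Toke
mentions are declared in include/net/xdp.h roughly as follows (prototypes
paraphrased here for reference, not new API):

```c
/* Existing naming pattern in include/net/xdp.h:
 * convert_X_to_Y vs. update_Y_from_X.
 */
void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp);
int xdp_update_frame_from_buff(struct xdp_buff *xdp,
			       struct xdp_frame *xdp_frame);
```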
> Lorenzo Bianconi <lorenzo.bianconi@redhat.com> writes:
>
> >> Lorenzo Bianconi <lorenzo@kernel.org> writes:
> >>
> >> > Introduce veth_convert_xdp_buff_from_skb routine in order to
> >> > convert a non-linear skb into an xdp buffer. If the received skb
> >> > is cloned or shared, veth_convert_xdp_buff_from_skb will copy it
> >> > into a new skb composed of order-0 pages for the linear and the
> >> > fragmented area. Moreover veth_convert_xdp_buff_from_skb guarantees
> >> > we have enough headroom for xdp.
> >> > This is a preliminary patch to allow attaching xdp programs with frags
> >> > support on veth devices.
> >> >
> >> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> >>
> >> It's cool that we can do this! A few comments below:
> >
> > Hi Toke,
> >
> > thx for the review :)
> >
> > [...]
> >
> >> > +static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
> >> > +					        struct xdp_buff *xdp,
> >> > +					        struct sk_buff **pskb)
> >> > +{
> >>
> >> nit: It's not really "converting" an skb into an xdp_buff, since the
> >> xdp_buff lives on the stack; so maybe 'veth_init_xdp_buff_from_skb()'?
> >
> > I kept the previous naming convention used for xdp_convert_frame_to_buff()
> > (my goal would be to move it into xdp.c and reuse this routine for the
> > generic-xdp use case), but I am fine with veth_init_xdp_buff_from_skb().
>
> Consistency is probably good, but right now we have functions of the
> form 'xdp_convert_X_to_Y()' and 'xdp_update_Y_from_X()'. So to follow
> that you'd have either 'veth_update_xdp_buff_from_skb()' or
> 'veth_convert_skb_to_xdp_buff()' :)

ack, I am fine with veth_convert_skb_to_xdp_buff()

> >> > +	struct sk_buff *skb = *pskb;
> >> > +	u32 frame_sz;
> >> >
> >> >  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> >> > -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> >> > +	    skb_shinfo(skb)->nr_frags) {
> >>
> >> So this always clones the skb if it has frags? Is that really needed?
> >
> > if we look at skb_cow_data(), the paged area is always considered not
> > writable
>
> Ah, right, did not know that. Seems a bit odd, but OK.
>
> >> Also, there's a lot of memory allocation and copying going on here; have
> >> you measured the performance?
> >
> > even in the previous implementation we always reallocate the skb if the
> > conditions above are verified, so I do not expect any difference in the
> > single-buffer use case, but I will run some performance tests.
>
> No, I wouldn't expect any difference for the single-buffer case, but I
> would also be interested in how big the overhead is of having to copy
> the whole jumbo-frame?

oh ok, I got what you mean. I guess we can compare the tcp throughput for
the legacy skb mode (when no program is attached on the veth pair) and xdp
mode (when we load a simple xdp program that just returns xdp_pass) when
jumbo frames are enabled. I would expect a performance penalty but let's
see.

> BTW, just noticed one other change - before we had:
>
> > -	headroom = skb_headroom(skb) - mac_len;
> >  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> > -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
>
> And in your patch that becomes:
>
> > +	} else if (skb_headroom(skb) < XDP_PACKET_HEADROOM &&
> > +		   pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) {
> > +		goto drop;
>
> So the mac_len subtraction disappeared; that seems wrong?

we call __skb_push() before running veth_convert_xdp_buff_from_skb() in
veth_xdp_rcv_skb(), so skb_headroom() is already measured from the mac
header there.

> >> > +
> >> > +	if (xdp_buff_has_frags(&xdp))
> >> > +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> >> > +	else
> >> > +		skb->data_len = 0;
> >>
> >> We can remove entire frags using xdp_adjust_tail, right? Will that get
> >> propagated in the right way to the skb frags due to the dual use of
> >> skb_shared_info, or?
> >
> > bpf_xdp_frags_shrink_tail() can remove entire frags and it will modify
> > metadata contained in the skb_shared_info (e.g. nr_frags or the frag
> > size of the given page). We should consider the data_len field in this
> > case. Agree?
>
> Right, that's what I assumed; makes sense. But adding a comment
> mentioning this above the update of data_len might be helpful? :)

ack, will do.

Regards,
Lorenzo

> -Toke
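For illustration, the comment Lorenzo agrees to add could look roughly
like the sketch below, placed above the existing data_len update
(hypothetical wording, not code from a later revision):

```c
	/* bpf_xdp_frags_shrink_tail() can drop whole frags and updates
	 * nr_frags and the per-frag sizes in skb_shared_info, which the
	 * skb shares with the xdp_buff; resync skb->data_len with the
	 * xdp_frags_size the program left behind.
	 */
	if (xdp_buff_has_frags(&xdp))
		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
	else
		skb->data_len = 0;
```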
> > Lorenzo Bianconi <lorenzo.bianconi@redhat.com> writes:
> >
> > >> Lorenzo Bianconi <lorenzo@kernel.org> writes:
> > >>
> > >> > Introduce veth_convert_xdp_buff_from_skb routine in order to
> > >> > convert a non-linear skb into an xdp buffer. If the received skb
> > >> > is cloned or shared, veth_convert_xdp_buff_from_skb will copy it
> > >> > into a new skb composed of order-0 pages for the linear and the
> > >> > fragmented area. Moreover veth_convert_xdp_buff_from_skb guarantees
> > >> > we have enough headroom for xdp.
> > >> > This is a preliminary patch to allow attaching xdp programs with frags
> > >> > support on veth devices.
> > >> >
> > >> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > >>
> > >> It's cool that we can do this! A few comments below:
> > >
> > > Hi Toke,
> > >
> > > thx for the review :)
> > >
> > > [...]
> > >
> > >> > +static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
> > >> > +					        struct xdp_buff *xdp,
> > >> > +					        struct sk_buff **pskb)
> > >> > +{
> > >>
> > >> nit: It's not really "converting" an skb into an xdp_buff, since the
> > >> xdp_buff lives on the stack; so maybe 'veth_init_xdp_buff_from_skb()'?
> > >
> > > I kept the previous naming convention used for xdp_convert_frame_to_buff()
> > > (my goal would be to move it into xdp.c and reuse this routine for the
> > > generic-xdp use case), but I am fine with veth_init_xdp_buff_from_skb().
> >
> > Consistency is probably good, but right now we have functions of the
> > form 'xdp_convert_X_to_Y()' and 'xdp_update_Y_from_X()'. So to follow
> > that you'd have either 'veth_update_xdp_buff_from_skb()' or
> > 'veth_convert_skb_to_xdp_buff()' :)
>
> ack, I am fine with veth_convert_skb_to_xdp_buff()
>
> > >> > +	struct sk_buff *skb = *pskb;
> > >> > +	u32 frame_sz;
> > >> >
> > >> >  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> > >> > -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> > >> > +	    skb_shinfo(skb)->nr_frags) {
> > >>
> > >> So this always clones the skb if it has frags? Is that really needed?
> > >
> > > if we look at skb_cow_data(), the paged area is always considered not
> > > writable
> >
> > Ah, right, did not know that. Seems a bit odd, but OK.
> >
> > >> Also, there's a lot of memory allocation and copying going on here; have
> > >> you measured the performance?
> > >
> > > even in the previous implementation we always reallocate the skb if the
> > > conditions above are verified, so I do not expect any difference in the
> > > single-buffer use case, but I will run some performance tests.
> >
> > No, I wouldn't expect any difference for the single-buffer case, but I
> > would also be interested in how big the overhead is of having to copy
> > the whole jumbo-frame?
>
> oh ok, I got what you mean. I guess we can compare the tcp throughput for
> the legacy skb mode (when no program is attached on the veth pair) and xdp
> mode (when we load a simple xdp program that just returns xdp_pass) when
> jumbo frames are enabled. I would expect a performance penalty but let's
> see.

I ran the tests described above and I got the following results:

- skb mode, mtu 1500B (TSO/GSO off): ~16.8 Gbps
- xdp mode, mtu 1500B (XDP_PASS):    ~9.52 Gbps
- skb mode, mtu 32KB (TSO/GSO off):  ~41 Gbps
- xdp mode, mtu 32KB (XDP_PASS):     ~25 Gbps

the (expected) performance penalty ratio (due to the copy) is quite
constant

Regards,
Lorenzo

> > BTW, just noticed one other change - before we had:
> >
> > > -	headroom = skb_headroom(skb) - mac_len;
> > >  	if (skb_shared(skb) || skb_head_is_locked(skb) ||
> > > -	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> >
> > And in your patch that becomes:
> >
> > > +	} else if (skb_headroom(skb) < XDP_PACKET_HEADROOM &&
> > > +		   pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) {
> > > +		goto drop;
> >
> > So the mac_len subtraction disappeared; that seems wrong?
>
> we call __skb_push() before running veth_convert_xdp_buff_from_skb() in
> veth_xdp_rcv_skb(), so skb_headroom() is already measured from the mac
> header there.
>
> > >> > +
> > >> > +	if (xdp_buff_has_frags(&xdp))
> > >> > +		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> > >> > +	else
> > >> > +		skb->data_len = 0;
> > >>
> > >> We can remove entire frags using xdp_adjust_tail, right? Will that get
> > >> propagated in the right way to the skb frags due to the dual use of
> > >> skb_shared_info, or?
> > >
> > > bpf_xdp_frags_shrink_tail() can remove entire frags and it will modify
> > > metadata contained in the skb_shared_info (e.g. nr_frags or the frag
> > > size of the given page). We should consider the data_len field in this
> > > case. Agree?
> >
> > Right, that's what I assumed; makes sense. But adding a comment
> > mentioning this above the update of data_len might be helpful? :)
>
> ack, will do.
>
> Regards,
> Lorenzo
>
> > -Toke
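For context, the "simple xdp program that just returns xdp_pass" used in a
test like this can be as small as the sketch below. The file name is
illustrative, and SEC("xdp.frags") assumes a libbpf recent enough to map
that section name to the BPF_F_XDP_HAS_FRAGS load flag:

```c
// xdp_pass.bpf.c - minimal pass-through program for the benchmark above
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* "xdp.frags" declares frags support, so the program can stay attached
 * while the veth MTU is larger than a single page.
 */
SEC("xdp.frags")
int xdp_pass(struct xdp_md *ctx)
{
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
```

Attached to one end of the veth pair with a frags-aware loader, this
exercises the copy path discussed above while doing no packet work itself.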
On Thu, 10 Mar 2022 20:06:40 +0100 Toke Høiland-Jørgensen wrote:
> >> So this always clones the skb if it has frags? Is that really needed?
> >
> > if we look at skb_cow_data(), the paged area is always considered not
> > writable
>
> Ah, right, did not know that. Seems a bit odd, but OK.

Yeah, I think I pointed that out. I may well be wrong; AFAICT frags
which are not writable are not marked in any clear way. We have
SKBFL_SHARED_FRAG, which seems pretty close, but it's documented as an
indication that the frag can be written under our feet, not that we
can't write to it. Subtly different. And (as documented) it's only used
when doing SW csums, as far as I can grep. Maybe someone else knows the
semantics.
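To summarize the copy decision being debated, the guard in the patch reads
like this (the comments are editorial glosses on the thread, not from the
patch itself):

```c
	if (skb_shared(skb) ||		 /* another user holds this skb */
	    skb_head_is_locked(skb) ||	 /* head is not a page fragment,
					  * or the skb is a clone, so the
					  * head must not be handed to XDP */
	    skb_shinfo(skb)->nr_frags) { /* paged data is treated as
					  * read-only, cf. skb_cow_data() */
		/* reallocate head and frags into private order-0 pages */
	}
```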
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index b77ce3fdcfe8..47b21b1d2fd9 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -433,21 +433,6 @@ static void veth_set_multicast_list(struct net_device *dev)
 {
 }
 
-static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
-				      int buflen)
-{
-	struct sk_buff *skb;
-
-	skb = build_skb(head, buflen);
-	if (!skb)
-		return NULL;
-
-	skb_reserve(skb, headroom);
-	skb_put(skb, len);
-
-	return skb;
-}
-
 static int veth_select_rxq(struct net_device *dev)
 {
 	return smp_processor_id() % dev->real_num_rx_queues;
@@ -695,72 +680,143 @@ static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
 	}
 }
 
-static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
-					struct sk_buff *skb,
-					struct veth_xdp_tx_bq *bq,
-					struct veth_stats *stats)
+static void veth_xdp_get(struct xdp_buff *xdp)
 {
-	u32 pktlen, headroom, act, metalen, frame_sz;
-	void *orig_data, *orig_data_end;
-	struct bpf_prog *xdp_prog;
-	int mac_len, delta, off;
-	struct xdp_buff xdp;
+	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+	int i;
 
-	skb_prepare_for_gro(skb);
+	get_page(virt_to_page(xdp->data));
+	if (likely(!xdp_buff_has_frags(xdp)))
+		return;
 
-	rcu_read_lock();
-	xdp_prog = rcu_dereference(rq->xdp_prog);
-	if (unlikely(!xdp_prog)) {
-		rcu_read_unlock();
-		goto out;
-	}
+	for (i = 0; i < sinfo->nr_frags; i++)
+		__skb_frag_ref(&sinfo->frags[i]);
+}
 
-	mac_len = skb->data - skb_mac_header(skb);
-	pktlen = skb->len + mac_len;
-	headroom = skb_headroom(skb) - mac_len;
+static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
+					  struct xdp_buff *xdp,
+					  struct sk_buff **pskb)
+{
+	struct sk_buff *skb = *pskb;
+	u32 frame_sz;
 
 	if (skb_shared(skb) || skb_head_is_locked(skb) ||
-	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
+	    skb_shinfo(skb)->nr_frags) {
+		u32 size, len, max_head_size, off;
 		struct sk_buff *nskb;
-		int size, head_off;
-		void *head, *start;
 		struct page *page;
+		int i, head_off;
 
-		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
-		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-		if (size > PAGE_SIZE)
+		/* We need a private copy of the skb and data buffers since
+		 * the ebpf program can modify it. We segment the original skb
+		 * into order-0 pages without linearizing it.
+		 *
+		 * Make sure we have enough space for linear and paged area
+		 */
+		max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
+						  VETH_XDP_HEADROOM);
+		if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
 			goto drop;
 
+		/* Allocate skb head */
 		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
 		if (!page)
 			goto drop;
 
-		head = page_address(page);
-		start = head + VETH_XDP_HEADROOM;
-		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
-			page_frag_free(head);
+		nskb = build_skb(page_address(page), PAGE_SIZE);
+		if (!nskb) {
+			put_page(page);
 			goto drop;
 		}
 
-		nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len,
-				      skb->len, PAGE_SIZE);
-		if (!nskb) {
-			page_frag_free(head);
+		skb_reserve(nskb, VETH_XDP_HEADROOM);
+		size = min_t(u32, skb->len, max_head_size);
+		if (skb_copy_bits(skb, 0, nskb->data, size)) {
+			consume_skb(nskb);
 			goto drop;
 		}
+		skb_put(nskb, size);
 
 		skb_copy_header(nskb, skb);
 		head_off = skb_headroom(nskb) - skb_headroom(skb);
 		skb_headers_offset_update(nskb, head_off);
+
+		/* Allocate paged area of new skb */
+		off = size;
+		len = skb->len - off;
+
+		for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
+			page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+			if (!page) {
+				consume_skb(nskb);
+				goto drop;
+			}
+
+			size = min_t(u32, len, PAGE_SIZE);
+			skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
+			if (skb_copy_bits(skb, off, page_address(page),
+					  size)) {
+				consume_skb(nskb);
+				goto drop;
+			}
+
+			len -= size;
+			off += size;
+		}
+
 		consume_skb(skb);
 		skb = nskb;
+	} else if (skb_headroom(skb) < XDP_PACKET_HEADROOM &&
+		   pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) {
+		goto drop;
 	}
 
 	/* SKB "head" area always have tailroom for skb_shared_info */
 	frame_sz = skb_end_pointer(skb) - skb->head;
 	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq);
-	xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true);
+	xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
+	xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
+			 skb_headlen(skb), true);
+
+	if (skb_is_nonlinear(skb)) {
+		skb_shinfo(skb)->xdp_frags_size = skb->data_len;
+		xdp_buff_set_frags_flag(xdp);
+	} else {
+		xdp_buff_clear_frags_flag(xdp);
+	}
+	*pskb = skb;
+
+	return 0;
+drop:
+	consume_skb(skb);
+	*pskb = NULL;
+
+	return -ENOMEM;
+}
+
+static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
+					struct sk_buff *skb,
+					struct veth_xdp_tx_bq *bq,
+					struct veth_stats *stats)
+{
+	void *orig_data, *orig_data_end;
+	struct bpf_prog *xdp_prog;
+	struct xdp_buff xdp;
+	u32 act, metalen;
+	int off;
+
+	skb_prepare_for_gro(skb);
+
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(rq->xdp_prog);
+	if (unlikely(!xdp_prog)) {
+		rcu_read_unlock();
+		goto out;
+	}
+
+	__skb_push(skb, skb->data - skb_mac_header(skb));
+	if (veth_convert_xdp_buff_from_skb(rq, &xdp, &skb))
+		goto drop;
 
 	orig_data = xdp.data;
 	orig_data_end = xdp.data_end;
@@ -771,7 +827,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
 	case XDP_PASS:
 		break;
 	case XDP_TX:
-		get_page(virt_to_page(xdp.data));
+		veth_xdp_get(&xdp);
 		consume_skb(skb);
 		xdp.rxq->mem = rq->xdp_mem;
 		if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
@@ -783,7 +839,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
 		rcu_read_unlock();
 		goto xdp_xmit;
 	case XDP_REDIRECT:
-		get_page(virt_to_page(xdp.data));
+		veth_xdp_get(&xdp);
 		consume_skb(skb);
 		xdp.rxq->mem = rq->xdp_mem;
 		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
@@ -806,18 +862,24 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
 	rcu_read_unlock();
 
 	/* check if bpf_xdp_adjust_head was used */
-	delta = orig_data - xdp.data;
-	off = mac_len + delta;
+	off = orig_data - xdp.data;
 	if (off > 0)
 		__skb_push(skb, off);
 	else if (off < 0)
 		__skb_pull(skb, -off);
-	skb->mac_header -= delta;
+
+	skb_reset_mac_header(skb);
 
 	/* check if bpf_xdp_adjust_tail was used */
 	off = xdp.data_end - orig_data_end;
 	if (off != 0)
 		__skb_put(skb, off); /* positive on grow, negative on shrink */
+
+	if (xdp_buff_has_frags(&xdp))
+		skb->data_len = skb_shinfo(skb)->xdp_frags_size;
+	else
+		skb->data_len = 0;
+
 	skb->protocol = eth_type_trans(skb, rq->dev);
 
 	metalen = xdp.data - xdp.data_meta;
@@ -833,7 +895,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
 	return NULL;
 err_xdp:
 	rcu_read_unlock();
-	page_frag_free(xdp.data);
+	xdp_return_buff(&xdp);
 xdp_xmit:
 	return NULL;
 }
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 361df312ee7f..b5f2d428d856 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -528,6 +528,7 @@ void xdp_return_buff(struct xdp_buff *xdp)
 out:
 	__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
 }
+EXPORT_SYMBOL_GPL(xdp_return_buff);
 
 /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
 void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
Introduce veth_convert_xdp_buff_from_skb routine in order to
convert a non-linear skb into an xdp buffer. If the received skb
is cloned or shared, veth_convert_xdp_buff_from_skb will copy it
into a new skb composed of order-0 pages for the linear and the
fragmented area. Moreover veth_convert_xdp_buff_from_skb guarantees
we have enough headroom for xdp.
This is a preliminary patch to allow attaching xdp programs with frags
support on veth devices.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 drivers/net/veth.c | 174 ++++++++++++++++++++++++++++++---------------
 net/core/xdp.c     |   1 +
 2 files changed, 119 insertions(+), 56 deletions(-)