Message ID | 20220208035510.1200-2-dongli.zhang@oracle.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | tun/tap: use kfree_skb_reason() to trace dropped skb | expand |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Guessing tree name failed - patch did not apply |
On Mon, Feb 7, 2022 at 7:55 PM Dongli Zhang <dongli.zhang@oracle.com> wrote: > > The TAP can be used as vhost-net backend. E.g., the tap_handle_frame() is > the interface to forward the skb from TAP to vhost-net/virtio-net. > > However, there are many "goto drop" in the TAP driver. Therefore, the > kfree_skb_reason() is involved at each "goto drop" to help userspace > ftrace/ebpf to track the reason for the loss of packets > > Cc: Joao Martins <joao.m.martins@oracle.com> > Cc: Joe Jin <joe.jin@oracle.com> > Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com> > --- > drivers/net/tap.c | 30 ++++++++++++++++++++++-------- > include/linux/skbuff.h | 5 +++++ > include/trace/events/skb.h | 5 +++++ > 3 files changed, 32 insertions(+), 8 deletions(-) > > diff --git a/drivers/net/tap.c b/drivers/net/tap.c > index 8e3a28ba6b28..232572289e63 100644 > --- a/drivers/net/tap.c > +++ b/drivers/net/tap.c > @@ -322,6 +322,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) > struct tap_dev *tap; > struct tap_queue *q; > netdev_features_t features = TAP_FEATURES; > + int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; > > tap = tap_dev_get_rcu(dev); > if (!tap) > @@ -343,12 +344,16 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) > struct sk_buff *segs = __skb_gso_segment(skb, features, false); > struct sk_buff *next; > > - if (IS_ERR(segs)) > + if (IS_ERR(segs)) { > + drop_reason = SKB_DROP_REASON_SKB_GSO_SEGMENT; > goto drop; > + } > > if (!segs) { > - if (ptr_ring_produce(&q->ring, skb)) > + if (ptr_ring_produce(&q->ring, skb)) { > + drop_reason = SKB_DROP_REASON_PTR_FULL; PTR_FULL is strange .... How about FULL_RING, or FULL_QUEUE ?
On 2/7/22 7:55 PM, Dongli Zhang wrote: > diff --git a/drivers/net/tap.c b/drivers/net/tap.c > index 8e3a28ba6b28..232572289e63 100644 > --- a/drivers/net/tap.c > +++ b/drivers/net/tap.c > @@ -322,6 +322,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) > struct tap_dev *tap; > struct tap_queue *q; > netdev_features_t features = TAP_FEATURES; > + int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; maybe I missed an exit path, but I believe drop_reason is always set before a goto jump, so this init is not needed. > > tap = tap_dev_get_rcu(dev); > if (!tap) > @@ -343,12 +344,16 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) > struct sk_buff *segs = __skb_gso_segment(skb, features, false); > struct sk_buff *next; > > - if (IS_ERR(segs)) > + if (IS_ERR(segs)) { > + drop_reason = SKB_DROP_REASON_SKB_GSO_SEGMENT; This reason points to a line of code, not the real reason for the drop. If you unwind __skb_gso_segment the only failure there is ENOMEM. The reason code needs to be meaningful to users, not just code references. > goto drop; > + } > > if (!segs) { > - if (ptr_ring_produce(&q->ring, skb)) > + if (ptr_ring_produce(&q->ring, skb)) { > + drop_reason = SKB_DROP_REASON_PTR_FULL; similar comment to Eric - PTR_FULL needs to be more helpful. > goto drop; > + } > goto wake_up; > } > > @@ -369,10 +374,14 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) > */ > if (skb->ip_summed == CHECKSUM_PARTIAL && > !(features & NETIF_F_CSUM_MASK) && > - skb_checksum_help(skb)) > + skb_checksum_help(skb)) { > + drop_reason = SKB_DROP_REASON_SKB_CHECKSUM; That is not helpful explanation of the root cause; it is more of a code reference. > goto drop; > - if (ptr_ring_produce(&q->ring, skb)) > + } > + if (ptr_ring_produce(&q->ring, skb)) { > + drop_reason = SKB_DROP_REASON_PTR_FULL; ditto above comment > goto drop; > + } > } > > wake_up: > @@ -383,7 +392,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) > /* Count errors/drops only here, thus don't care about args. */ > if (tap->count_rx_dropped) > tap->count_rx_dropped(tap); > - kfree_skb(skb); > + kfree_skb_reason(skb, drop_reason); > return RX_HANDLER_CONSUMED; > } > EXPORT_SYMBOL_GPL(tap_handle_frame); > @@ -632,6 +641,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, > int depth; > bool zerocopy = false; > size_t linear; > + int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; > > if (q->flags & IFF_VNET_HDR) { > vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); > @@ -696,8 +706,10 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, > else > err = skb_copy_datagram_from_iter(skb, 0, from, len); > > - if (err) > + if (err) { > + drop_reason = SKB_DROP_REASON_SKB_COPY_DATA; As mentioned above, plus unwind the above functions and give a more explicit description of why the above fails. > goto err_kfree; > + } > > skb_set_network_header(skb, ETH_HLEN); > skb_reset_mac_header(skb); > @@ -706,8 +718,10 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, > if (vnet_hdr_len) { > err = virtio_net_hdr_to_skb(skb, &vnet_hdr, > tap_is_little_endian(q)); > - if (err) > + if (err) { > + drop_reason = SKB_DROP_REASON_VIRTNET_HDR; and here too.
Hi Eric, On 2/7/22 8:32 PM, Eric Dumazet wrote: > On Mon, Feb 7, 2022 at 7:55 PM Dongli Zhang <dongli.zhang@oracle.com> wrote: >> >> The TAP can be used as vhost-net backend. E.g., the tap_handle_frame() is >> the interface to forward the skb from TAP to vhost-net/virtio-net. >> >> However, there are many "goto drop" in the TAP driver. Therefore, the >> kfree_skb_reason() is involved at each "goto drop" to help userspace >> ftrace/ebpf to track the reason for the loss of packets >> >> Cc: Joao Martins <joao.m.martins@oracle.com> >> Cc: Joe Jin <joe.jin@oracle.com> >> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com> >> --- >> drivers/net/tap.c | 30 ++++++++++++++++++++++-------- >> include/linux/skbuff.h | 5 +++++ >> include/trace/events/skb.h | 5 +++++ >> 3 files changed, 32 insertions(+), 8 deletions(-) >> >> diff --git a/drivers/net/tap.c b/drivers/net/tap.c >> index 8e3a28ba6b28..232572289e63 100644 >> --- a/drivers/net/tap.c >> +++ b/drivers/net/tap.c >> @@ -322,6 +322,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) >> struct tap_dev *tap; >> struct tap_queue *q; >> netdev_features_t features = TAP_FEATURES; >> + int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; >> >> tap = tap_dev_get_rcu(dev); >> if (!tap) >> @@ -343,12 +344,16 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) >> struct sk_buff *segs = __skb_gso_segment(skb, features, false); >> struct sk_buff *next; >> >> - if (IS_ERR(segs)) >> + if (IS_ERR(segs)) { >> + drop_reason = SKB_DROP_REASON_SKB_GSO_SEGMENT; >> goto drop; >> + } >> >> if (!segs) { >> - if (ptr_ring_produce(&q->ring, skb)) >> + if (ptr_ring_produce(&q->ring, skb)) { >> + drop_reason = SKB_DROP_REASON_PTR_FULL; > > PTR_FULL is strange .... How about FULL_RING, or FULL_QUEUE ? > I will use FULL_RING. Thank you very much! Dongli Zhang
Hi David, On 2/7/22 8:47 PM, David Ahern wrote: > On 2/7/22 7:55 PM, Dongli Zhang wrote: >> diff --git a/drivers/net/tap.c b/drivers/net/tap.c >> index 8e3a28ba6b28..232572289e63 100644 >> --- a/drivers/net/tap.c >> +++ b/drivers/net/tap.c >> @@ -322,6 +322,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) >> struct tap_dev *tap; >> struct tap_queue *q; >> netdev_features_t features = TAP_FEATURES; >> + int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; > > maybe I missed an exit path, but I believe drop_reason is always set > before a goto jump, so this init is not needed. For tun/tap, the drop_reason is always set. I will avoid initialing the 'drop_reason'. > >> >> tap = tap_dev_get_rcu(dev); >> if (!tap) >> @@ -343,12 +344,16 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) >> struct sk_buff *segs = __skb_gso_segment(skb, features, false); >> struct sk_buff *next; >> >> - if (IS_ERR(segs)) >> + if (IS_ERR(segs)) { >> + drop_reason = SKB_DROP_REASON_SKB_GSO_SEGMENT; > > This reason points to a line of code, not the real reason for the drop. > If you unwind __skb_gso_segment the only failure there is ENOMEM. The > reason code needs to be meaningful to users, not just code references. The __skb_gso_segment()->skb_mac_gso_segment() may return error as well. Here I prefer to introduce a new reason because __skb_gso_segment() and skb_gso_segment() are called at many locations. E.g., to just select a driver that I never use. 2100 static void r8152_csum_workaround(struct r8152 *tp, struct sk_buff *skb, 2101 struct sk_buff_head *list) ... ... 2109 segs = skb_gso_segment(skb, features); 2110 if (IS_ERR(segs) || !segs) 2111 goto drop; ... ... 2130 drop: 2131 stats = &tp->netdev->stats; 2132 stats->tx_dropped++; 2133 dev_kfree_skb(skb); 2134 } There are many such patterns. That's why I introduce a new reason for skb gso segmentation so that developers may re-use it. > > >> goto drop; >> + } >> >> if (!segs) { >> - if (ptr_ring_produce(&q->ring, skb)) >> + if (ptr_ring_produce(&q->ring, skb)) { >> + drop_reason = SKB_DROP_REASON_PTR_FULL; > > similar comment to Eric - PTR_FULL needs to be more helpful. I will use 'FULL_RING' as suggested by Eric. > >> goto drop; >> + } >> goto wake_up; >> } >> >> @@ -369,10 +374,14 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) >> */ >> if (skb->ip_summed == CHECKSUM_PARTIAL && >> !(features & NETIF_F_CSUM_MASK) && >> - skb_checksum_help(skb)) >> + skb_checksum_help(skb)) { >> + drop_reason = SKB_DROP_REASON_SKB_CHECKSUM; > > That is not helpful explanation of the root cause; it is more of a code > reference. To expand a function may generate a deep call graph. Any modification to the callees in the call graph may have impact in the future. The skb_checksum_help() is commonly used in the linux kernel to decide if there is any error, e.g., 809 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, 810 int (*output)(struct net *, struct sock *, struct sk_buff *)) 811 { ... ... 859 if (skb->ip_summed == CHECKSUM_PARTIAL && 860 (err = skb_checksum_help(skb))) 861 goto fail; ... ... 985 fail: 986 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 987 IPSTATS_MIB_FRAGFAILS); 988 kfree_skb(skb); 989 return err; 990 } That's why I prefer to introduce a new reason, which can be used by developers for other subsystem/drivers. > > >> goto drop; >> - if (ptr_ring_produce(&q->ring, skb)) >> + } >> + if (ptr_ring_produce(&q->ring, skb)) { >> + drop_reason = SKB_DROP_REASON_PTR_FULL; > > ditto above comment I will use 'FULL_RING' as suggested by Eric. > >> goto drop; >> + } >> } >> >> wake_up: >> @@ -383,7 +392,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) >> /* Count errors/drops only here, thus don't care about args. */ >> if (tap->count_rx_dropped) >> tap->count_rx_dropped(tap); >> - kfree_skb(skb); >> + kfree_skb_reason(skb, drop_reason); >> return RX_HANDLER_CONSUMED; >> } >> EXPORT_SYMBOL_GPL(tap_handle_frame); >> @@ -632,6 +641,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, >> int depth; >> bool zerocopy = false; >> size_t linear; >> + int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; >> >> if (q->flags & IFF_VNET_HDR) { >> vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); >> @@ -696,8 +706,10 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, >> else >> err = skb_copy_datagram_from_iter(skb, 0, from, len); >> >> - if (err) >> + if (err) { >> + drop_reason = SKB_DROP_REASON_SKB_COPY_DATA; > > As mentioned above, plus unwind the above functions and give a more > explicit description of why the above fails. The __zerocopy_sg_from_iter() and skb_copy_datagram_from_iter() are commonly used by linux kernel. That's why a new reason is introduced. E.g., 4924 int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) 4925 { ... ... 4955 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 4956 if (err) 4957 goto err_free; ... ... 4969 err_free: 4970 kfree_skb(skb); After expanding the function, one of failure is due to copy_from_iter(). I think COPY_DATA is able to represent the failure at many locations involving the function to copy data. Any other places involving data copy failure (e.g., due to privilege issue, use-after-free, length issue) may re-use this reason. > >> goto err_kfree; >> + } >> >> skb_set_network_header(skb, ETH_HLEN); >> skb_reset_mac_header(skb); >> @@ -706,8 +718,10 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, >> if (vnet_hdr_len) { >> err = virtio_net_hdr_to_skb(skb, &vnet_hdr, >> tap_is_little_endian(q)); >> - if (err) >> + if (err) { >> + drop_reason = SKB_DROP_REASON_VIRTNET_HDR; > > and here too. > The virtio_net_hdr_to_skb() may return error for a variety of reasons. To simply return -EFAULT does not help. Indeed it may help at TAP/TUN if this is the only place returning -EFAULT. However, it is not helpful when there are two "goto drop;" returning the same -EFAULT. Here I am trying to introduce a virtio-net specific reason, saying the virtio-net header is invalid. Perhaps SKB_DROP_REASON_PV_HDR (for all PV drivers) or SKB_DROP_REASON_INVALID_METADATA. In my opinion, the kfree_skb_reason() is not for admin, but for developer. The admin helps run tcpdump and narrows down the location and reason that the pakcket is dropped, while the developer conducts code analysis to identify the specific reason. Therefore, I think the kfree_skb_reason() should: 1. Be friendly to user (admin) to collect data (e.g., via dropwatch/ebpf/ftrace). 2. Be friendly to developer to analyze the issue. Thank you very much! Dongli Zhang
diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 8e3a28ba6b28..232572289e63 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -322,6 +322,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) struct tap_dev *tap; struct tap_queue *q; netdev_features_t features = TAP_FEATURES; + int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; tap = tap_dev_get_rcu(dev); if (!tap) @@ -343,12 +344,16 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) struct sk_buff *segs = __skb_gso_segment(skb, features, false); struct sk_buff *next; - if (IS_ERR(segs)) + if (IS_ERR(segs)) { + drop_reason = SKB_DROP_REASON_SKB_GSO_SEGMENT; goto drop; + } if (!segs) { - if (ptr_ring_produce(&q->ring, skb)) + if (ptr_ring_produce(&q->ring, skb)) { + drop_reason = SKB_DROP_REASON_PTR_FULL; goto drop; + } goto wake_up; } @@ -369,10 +374,14 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) */ if (skb->ip_summed == CHECKSUM_PARTIAL && !(features & NETIF_F_CSUM_MASK) && - skb_checksum_help(skb)) + skb_checksum_help(skb)) { + drop_reason = SKB_DROP_REASON_SKB_CHECKSUM; goto drop; - if (ptr_ring_produce(&q->ring, skb)) + } + if (ptr_ring_produce(&q->ring, skb)) { + drop_reason = SKB_DROP_REASON_PTR_FULL; goto drop; + } } wake_up: @@ -383,7 +392,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) /* Count errors/drops only here, thus don't care about args. */ if (tap->count_rx_dropped) tap->count_rx_dropped(tap); - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); return RX_HANDLER_CONSUMED; } EXPORT_SYMBOL_GPL(tap_handle_frame); @@ -632,6 +641,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, int depth; bool zerocopy = false; size_t linear; + int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (q->flags & IFF_VNET_HDR) { vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); @@ -696,8 +706,10 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, else err = skb_copy_datagram_from_iter(skb, 0, from, len); - if (err) + if (err) { + drop_reason = SKB_DROP_REASON_SKB_COPY_DATA; goto err_kfree; + } skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_header(skb); @@ -706,8 +718,10 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, if (vnet_hdr_len) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, tap_is_little_endian(q)); - if (err) + if (err) { + drop_reason = SKB_DROP_REASON_VIRTNET_HDR; goto err_kfree; + } } skb_probe_transport_header(skb); @@ -738,7 +752,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, return total_len; err_kfree: - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); err: rcu_read_lock(); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8a636e678902..16c30d2e20dc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -320,6 +320,11 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_CSUM, SKB_DROP_REASON_SOCKET_FILTER, SKB_DROP_REASON_UDP_CSUM, + SKB_DROP_REASON_SKB_GSO_SEGMENT, + SKB_DROP_REASON_SKB_CHECKSUM, + SKB_DROP_REASON_SKB_COPY_DATA, + SKB_DROP_REASON_PTR_FULL, + SKB_DROP_REASON_VIRTNET_HDR, SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index a8a64b97504d..bf1509c31cea 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -16,6 +16,11 @@ EM(SKB_DROP_REASON_TCP_CSUM, TCP_CSUM) \ EM(SKB_DROP_REASON_SOCKET_FILTER, SOCKET_FILTER) \ EM(SKB_DROP_REASON_UDP_CSUM, UDP_CSUM) \ + EM(SKB_DROP_REASON_SKB_GSO_SEGMENT, SKB_GSO_SEGMENT) \ + EM(SKB_DROP_REASON_SKB_CHECKSUM, SKB_CHECKSUM) \ + EM(SKB_DROP_REASON_SKB_COPY_DATA, SKB_COPY_DATA) \ + EM(SKB_DROP_REASON_PTR_FULL, PTR_FULL) \ + EM(SKB_DROP_REASON_VIRTNET_HDR, VIRTNET_HDR) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM
The TAP can be used as vhost-net backend. E.g., the tap_handle_frame() is the interface to forward the skb from TAP to vhost-net/virtio-net. However, there are many "goto drop" in the TAP driver. Therefore, the kfree_skb_reason() is involved at each "goto drop" to help userspace ftrace/ebpf to track the reason for the loss of packets Cc: Joao Martins <joao.m.martins@oracle.com> Cc: Joe Jin <joe.jin@oracle.com> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com> --- drivers/net/tap.c | 30 ++++++++++++++++++++++-------- include/linux/skbuff.h | 5 +++++ include/trace/events/skb.h | 5 +++++ 3 files changed, 32 insertions(+), 8 deletions(-)