diff mbox series

[net-next] packet: Account for VLAN_HLEN in csum_start when virtio_net_hdr is enabled

Message ID 20231123183835.635210-1-mkp@redhat.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net-next] packet: Account for VLAN_HLEN in csum_start when virtio_net_hdr is enabled | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/codegen success Generated files up to date
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1117 this patch: 1117
netdev/cc_maintainers success CCed 5 of 5 maintainers
netdev/build_clang success Errors and warnings before: 1142 this patch: 1142
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 1144 this patch: 1144
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 61 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Mike Pattrick Nov. 23, 2023, 6:38 p.m. UTC
Af_packet provides checksum offload offsets to usermode applications
through struct virtio_net_hdr when PACKET_VNET_HDR is enabled on the
socket. For skbuffs with a vlan being sent to a SOCK_RAW socket,
af_packet will include the link level header and so csum_start needs
to be adjusted accordingly.

Fixes: fd3a88625844 ("net: in virtio_net_hdr only add VLAN_HLEN to csum_start if payload holds vlan")
Signed-off-by: Mike Pattrick <mkp@redhat.com>
---
 net/packet/af_packet.c | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

Comments

Willem de Bruijn Nov. 23, 2023, 9:24 p.m. UTC | #1
Mike Pattrick wrote:
> Af_packet provides checksum offload offsets to usermode applications
> through struct virtio_net_hdr when PACKET_VNET_HDR is enabled on the
> socket. For skbuffs with a vlan being sent to a SOCK_RAW socket,
> af_packet will include the link level header and so csum_start needs
> to be adjusted accordingly.

Is this patch based on observing an incorrect offset in a workload,
or on code inspection?

As the referenced patch mentions, VLAN_HLEN adjustment is needed
in macvtap because it pulls the vlan header from skb->vlan_tci. At
which point skb->csum_start is wrong.

"Commit f09e2249c4f5 ("macvtap: restore vlan header on user read")
 added this feature to macvtap. Commit 3ce9b20f1971 ("macvtap: Fix
 csum_start when VLAN tags are present") then fixed up csum_start."

But the commit also mentions "Virtio, packet and uml do not insert
the vlan header in the user buffer.". This situation has not changed.

Packet sockets may receive packets with VLAN headers present, but
unless they were inserted manually before passing to user, as macvtap
does, this does not affect csum_start.

Packet sockets support reading those skb->vlan_tci stored VLAN
headers using AUXDATA.

> Fixes: fd3a88625844 ("net: in virtio_net_hdr only add VLAN_HLEN to csum_start if payload holds vlan")

The fix should target net, not net-next.

> Signed-off-by: Mike Pattrick <mkp@redhat.com>
> ---
>  net/packet/af_packet.c | 36 ++++++++++++++++++++++++++----------
>  1 file changed, 26 insertions(+), 10 deletions(-)
> 
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index a84e00b5904b..f6b602ffe383 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -2092,15 +2092,23 @@ static unsigned int run_filter(struct sk_buff *skb,
>  }
>  
>  static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
> -			   size_t *len, int vnet_hdr_sz)
> +			   size_t *len, int vnet_hdr_sz,
> +			   const struct sock *sk)
>  {
>  	struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };
> +	int vlan_hlen;
>  
>  	if (*len < vnet_hdr_sz)
>  		return -EINVAL;
>  	*len -= vnet_hdr_sz;
>  
> -	if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0))
> +	if (sk->sk_type == SOCK_RAW && skb_vlan_tag_present(skb))
> +		vlan_hlen = VLAN_HLEN;
> +	else
> +		vlan_hlen = 0;
> +
> +	if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr,
> +				    vio_le(), true, vlan_hlen))
>  		return -EINVAL;
>  
>  	return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz);
> @@ -2368,13 +2376,21 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
>  		__set_bit(slot_id, po->rx_ring.rx_owner_map);
>  	}
>  
> -	if (vnet_hdr_sz &&
> -	    virtio_net_hdr_from_skb(skb, h.raw + macoff -
> -				    sizeof(struct virtio_net_hdr),
> -				    vio_le(), true, 0)) {
> -		if (po->tp_version == TPACKET_V3)
> -			prb_clear_blk_fill_status(&po->rx_ring);
> -		goto drop_n_account;
> +	if (vnet_hdr_sz) {
> +		int vlan_hlen;
> +
> +		if (sk->sk_type == SOCK_RAW && skb_vlan_tag_present(skb))
> +			vlan_hlen = VLAN_HLEN;
> +		else
> +			vlan_hlen = 0;
> +
> +		if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
> +					    sizeof(struct virtio_net_hdr),
> +					    vio_le(), true, vlan_hlen)) {
> +			if (po->tp_version == TPACKET_V3)
> +				prb_clear_blk_fill_status(&po->rx_ring);
> +			goto drop_n_account;
> +		}
>  	}
>  
>  	if (po->tp_version <= TPACKET_V2) {
> @@ -3464,7 +3480,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
>  	packet_rcv_try_clear_pressure(pkt_sk(sk));
>  
>  	if (vnet_hdr_len) {
> -		err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
> +		err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len, sk);
>  		if (err)
>  			goto out_free;
>  	}
> -- 
> 2.40.1
>
Mike Pattrick Nov. 23, 2023, 10:53 p.m. UTC | #2
On Thu, Nov 23, 2023 at 4:25 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Mike Pattrick wrote:
> > Af_packet provides checksum offload offsets to usermode applications
> > through struct virtio_net_hdr when PACKET_VNET_HDR is enabled on the
> > socket. For skbuffs with a vlan being sent to a SOCK_RAW socket,
> > af_packet will include the link level header and so csum_start needs
> > to be adjusted accordingly.
>
> Is this patch based on observing an incorrect offset in a workload,
> or on code inspection?

Based on an incorrect offset in a workload. The setup involved sending
vxlan traffic though a veth interface configured with a vlan. The
vnet_hdr's csum_start value was off by 4, and this problem went away
when the vlan was removed.

I'll take another look at this patch.

>
> As the referenced patch mentions, VLAN_HLEN adjustment is needed
> in macvtap because it pulls the vlan header from skb->vlan_tci. At
> which point skb->csum_start is wrong.
>
> "Commit f09e2249c4f5 ("macvtap: restore vlan header on user read")
>  added this feature to macvtap. Commit 3ce9b20f1971 ("macvtap: Fix
>  csum_start when VLAN tags are present") then fixed up csum_start."
>
> But the commit also mentions "Virtio, packet and uml do not insert
> the vlan header in the user buffer.". This situation has not changed.
>
> Packet sockets may receive packets with VLAN headers present, but
> unless they were inserted manually before passing to user, as macvtap
> does, this does not affect csum_start.
>
> Packet sockets support reading those skb->vlan_tci stored VLAN
> headers using AUXDATA.
>
> > Fixes: fd3a88625844 ("net: in virtio_net_hdr only add VLAN_HLEN to csum_start if payload holds vlan")
>
> The fix should target net, not net-next.
>
> > Signed-off-by: Mike Pattrick <mkp@redhat.com>
> > ---
> >  net/packet/af_packet.c | 36 ++++++++++++++++++++++++++----------
> >  1 file changed, 26 insertions(+), 10 deletions(-)
> >
> > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > index a84e00b5904b..f6b602ffe383 100644
> > --- a/net/packet/af_packet.c
> > +++ b/net/packet/af_packet.c
> > @@ -2092,15 +2092,23 @@ static unsigned int run_filter(struct sk_buff *skb,
> >  }
> >
> >  static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
> > -                        size_t *len, int vnet_hdr_sz)
> > +                        size_t *len, int vnet_hdr_sz,
> > +                        const struct sock *sk)
> >  {
> >       struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };
> > +     int vlan_hlen;
> >
> >       if (*len < vnet_hdr_sz)
> >               return -EINVAL;
> >       *len -= vnet_hdr_sz;
> >
> > -     if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0))
> > +     if (sk->sk_type == SOCK_RAW && skb_vlan_tag_present(skb))
> > +             vlan_hlen = VLAN_HLEN;
> > +     else
> > +             vlan_hlen = 0;
> > +
> > +     if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr,
> > +                                 vio_le(), true, vlan_hlen))
> >               return -EINVAL;
> >
> >       return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz);
> > @@ -2368,13 +2376,21 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
> >               __set_bit(slot_id, po->rx_ring.rx_owner_map);
> >       }
> >
> > -     if (vnet_hdr_sz &&
> > -         virtio_net_hdr_from_skb(skb, h.raw + macoff -
> > -                                 sizeof(struct virtio_net_hdr),
> > -                                 vio_le(), true, 0)) {
> > -             if (po->tp_version == TPACKET_V3)
> > -                     prb_clear_blk_fill_status(&po->rx_ring);
> > -             goto drop_n_account;
> > +     if (vnet_hdr_sz) {
> > +             int vlan_hlen;
> > +
> > +             if (sk->sk_type == SOCK_RAW && skb_vlan_tag_present(skb))
> > +                     vlan_hlen = VLAN_HLEN;
> > +             else
> > +                     vlan_hlen = 0;
> > +
> > +             if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
> > +                                         sizeof(struct virtio_net_hdr),
> > +                                         vio_le(), true, vlan_hlen)) {
> > +                     if (po->tp_version == TPACKET_V3)
> > +                             prb_clear_blk_fill_status(&po->rx_ring);
> > +                     goto drop_n_account;
> > +             }
> >       }
> >
> >       if (po->tp_version <= TPACKET_V2) {
> > @@ -3464,7 +3480,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
> >       packet_rcv_try_clear_pressure(pkt_sk(sk));
> >
> >       if (vnet_hdr_len) {
> > -             err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
> > +             err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len, sk);
> >               if (err)
> >                       goto out_free;
> >       }
> > --
> > 2.40.1
> >
>
>
Willem de Bruijn Nov. 24, 2023, 12:05 a.m. UTC | #3
Mike Pattrick wrote:
> On Thu, Nov 23, 2023 at 4:25 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Mike Pattrick wrote:
> > > Af_packet provides checksum offload offsets to usermode applications
> > > through struct virtio_net_hdr when PACKET_VNET_HDR is enabled on the
> > > socket. For skbuffs with a vlan being sent to a SOCK_RAW socket,
> > > af_packet will include the link level header and so csum_start needs
> > > to be adjusted accordingly.
> >
> > Is this patch based on observing an incorrect offset in a workload,
> > or on code inspection?
> 
> Based on an incorrect offset in a workload. The setup involved sending
> vxlan traffic though a veth interface configured with a vlan. The
> vnet_hdr's csum_start value was off by 4, and this problem went away
> when the vlan was removed.
> 
> I'll take another look at this patch.

This is a vlan device on top of a veth device? On which device and at
which point (ingress or egress) are you receiving the packet over the
packet socket?

From a quick glance, in all cases that I see the VLAN tag is kept in
skb->vlan_tci, so is never part of the packet payload.

But checksum offload with VXLAN can be non-trivial on its own. If
type & SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_TUNNEL_REMCSUM, say. Then
csum_start will point to the checksum in vxlanhdr.
Willem de Bruijn Nov. 24, 2023, 12:19 a.m. UTC | #4
Willem de Bruijn wrote:
> Mike Pattrick wrote:
> > On Thu, Nov 23, 2023 at 4:25 PM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Mike Pattrick wrote:
> > > > Af_packet provides checksum offload offsets to usermode applications
> > > > through struct virtio_net_hdr when PACKET_VNET_HDR is enabled on the
> > > > socket. For skbuffs with a vlan being sent to a SOCK_RAW socket,
> > > > af_packet will include the link level header and so csum_start needs
> > > > to be adjusted accordingly.
> > >
> > > Is this patch based on observing an incorrect offset in a workload,
> > > or on code inspection?
> > 
> > Based on an incorrect offset in a workload. The setup involved sending
> > vxlan traffic though a veth interface configured with a vlan. The
> > vnet_hdr's csum_start value was off by 4, and this problem went away
> > when the vlan was removed.
> > 
> > I'll take another look at this patch.
> 
> This is a vlan device on top of a veth device? On which device and at
> which point (ingress or egress) are you receiving the packet over the
> packet socket?
> 
> From a quick glance, in all cases that I see the VLAN tag is kept in
> skb->vlan_tci, so is never part of the packet payload.
> 
> But checksum offload with VXLAN can be non-trivial on its own. If
> type & SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_TUNNEL_REMCSUM, say. Then
> csum_start will point to the checksum in vxlanhdr.

This last statement is incorrect. But SKB_GSO_UDP_TUNNEL_CSUM might
be relevant to unexpected csum_start.
Suman Ghosh Nov. 24, 2023, 2:12 a.m. UTC | #5
> static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff
>*skb,
>-			   size_t *len, int vnet_hdr_sz)
>+			   size_t *len, int vnet_hdr_sz,
>+			   const struct sock *sk)
> {
> 	struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };
>+	int vlan_hlen;
>
> 	if (*len < vnet_hdr_sz)
> 		return -EINVAL;
> 	*len -= vnet_hdr_sz;
>
>-	if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr
>*)&vnet_hdr, vio_le(), true, 0))
>+	if (sk->sk_type == SOCK_RAW && skb_vlan_tag_present(skb))
>+		vlan_hlen = VLAN_HLEN;
>+	else
>+		vlan_hlen = 0;
[Suman] A minor nit. You can initialize vlan_hdr = 0 and get rid of the else.
>+
>+	if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr
>*)&vnet_hdr,
>+				    vio_le(), true, vlan_hlen))
> 		return -EINVAL;
>
> 	return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz); @@ -
>2368,13 +2376,21 @@ static int tpacket_rcv(struct sk_buff *skb, struct
>net_device *dev,
> 		__set_bit(slot_id, po->rx_ring.rx_owner_map);
> 	}
>
>-	if (vnet_hdr_sz &&
>-	    virtio_net_hdr_from_skb(skb, h.raw + macoff -
>-				    sizeof(struct virtio_net_hdr),
>-				    vio_le(), true, 0)) {
>-		if (po->tp_version == TPACKET_V3)
>-			prb_clear_blk_fill_status(&po->rx_ring);
>-		goto drop_n_account;
>+	if (vnet_hdr_sz) {
>+		int vlan_hlen;
>+
>+		if (sk->sk_type == SOCK_RAW && skb_vlan_tag_present(skb))
>+			vlan_hlen = VLAN_HLEN;
>+		else
>+			vlan_hlen = 0;
[Suman] Same as previous comment.
>+
>+		if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
>+					    sizeof(struct virtio_net_hdr),
>+					    vio_le(), true, vlan_hlen)) {
>+			if (po->tp_version == TPACKET_V3)
>+				prb_clear_blk_fill_status(&po->rx_ring);
>+			goto drop_n_account;
>+		}
> 	}
>
> 	if (po->tp_version <= TPACKET_V2) {
>@@ -3464,7 +3480,7 @@ static int packet_recvmsg(struct socket *sock,
>struct msghdr *msg, size_t len,
> 	packet_rcv_try_clear_pressure(pkt_sk(sk));
>
> 	if (vnet_hdr_len) {
>-		err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
>+		err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len, sk);
> 		if (err)
> 			goto out_free;
> 	}
>--
>2.40.1
>
Mike Pattrick Nov. 24, 2023, 6:23 a.m. UTC | #6
On Thu, Nov 23, 2023 at 7:06 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Mike Pattrick wrote:
> > On Thu, Nov 23, 2023 at 4:25 PM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Mike Pattrick wrote:
> > > > Af_packet provides checksum offload offsets to usermode applications
> > > > through struct virtio_net_hdr when PACKET_VNET_HDR is enabled on the
> > > > socket. For skbuffs with a vlan being sent to a SOCK_RAW socket,
> > > > af_packet will include the link level header and so csum_start needs
> > > > to be adjusted accordingly.
> > >
> > > Is this patch based on observing an incorrect offset in a workload,
> > > or on code inspection?
> >
> > Based on an incorrect offset in a workload. The setup involved sending
> > vxlan traffic though a veth interface configured with a vlan. The
> > vnet_hdr's csum_start value was off by 4, and this problem went away
> > when the vlan was removed.
> >
> > I'll take another look at this patch.
>
> This is a vlan device on top of a veth device? On which device and at
> which point (ingress or egress) are you receiving the packet over the
> packet socket?

Just for maximum clarity I'll include the extracted commands below,
but roughly there is a vlan device on top of a vxlan device on top of
a vlan device on top of a veth, in a namespace.

ip netns add at_ns0
ip netns exec at_ns0 ip link add dev at_vxlan1 type vxlan remote
172.31.1.100 id 0 dstport 4789
ip netns exec at_ns0 ip addr add dev at_vxlan1 10.2.1.1/24
ip netns exec at_ns0 ip link set dev at_vxlan1 mtu 1450 up
ip link add p0 type veth peer name ovs-p0
ethtool -K p0 sg on
ethtool -K p0 tso on
ip link set p0 netns at_ns0
ip link set dev ovs-p0 up
ip netns exec at_ns0 ip addr add "172.31.2.1/24" dev p0
ip netns exec at_ns0 ip link set dev p0 up
ip netns exec at_ns0 ip link add link at_vxlan1 name at_vxlan1.100
type vlan proto 802.1q id 100
ip netns exec at_ns0 ip link set dev at_vxlan1.100 up
ip netns exec at_ns0 ip addr add dev at_vxlan1.100 "10.1.1.1/24"
ip netns exec at_ns0 ip link add link p0 name p0.42 type vlan proto 802.1q id 42
ip netns exec at_ns0 ip link set dev p0.42 up
ip netns exec at_ns0 ip addr add dev p0.42 "172.31.1.1/24"
ip addr add "172.31.1.100/24" dev p0
ip link set dev p0 up
ip netns exec at_ns0 ping 10.1.1.100

An AF_PACKET socket on ovs-p0 receives the incorrect csum_start.
Setting up the same with a geneve tunnel and udpcsum enabled produces
the same result. Removing vlan 100 also yields an incorrect
csum_start. Removing only vlan 42 yields a correct csum_start.

>
> From a quick glance, in all cases that I see the VLAN tag is kept in
> skb->vlan_tci, so is never part of the packet payload.
>
> But checksum offload with VXLAN can be non-trivial on its own. If
> type & SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_TUNNEL_REMCSUM, say. Then
> csum_start will point to the checksum in vxlanhdr.
>
Willem de Bruijn Nov. 24, 2023, 4:24 p.m. UTC | #7
Mike Pattrick wrote:
> On Thu, Nov 23, 2023 at 7:06 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Mike Pattrick wrote:
> > > On Thu, Nov 23, 2023 at 4:25 PM Willem de Bruijn
> > > <willemdebruijn.kernel@gmail.com> wrote:
> > > >
> > > > Mike Pattrick wrote:
> > > > > Af_packet provides checksum offload offsets to usermode applications
> > > > > through struct virtio_net_hdr when PACKET_VNET_HDR is enabled on the
> > > > > socket. For skbuffs with a vlan being sent to a SOCK_RAW socket,
> > > > > af_packet will include the link level header and so csum_start needs
> > > > > to be adjusted accordingly.
> > > >
> > > > Is this patch based on observing an incorrect offset in a workload,
> > > > or on code inspection?
> > >
> > > Based on an incorrect offset in a workload. The setup involved sending
> > > vxlan traffic though a veth interface configured with a vlan. The
> > > vnet_hdr's csum_start value was off by 4, and this problem went away
> > > when the vlan was removed.
> > >
> > > I'll take another look at this patch.
> >
> > This is a vlan device on top of a veth device? On which device and at
> > which point (ingress or egress) are you receiving the packet over the
> > packet socket?
> 
> Just for maximum clarity I'll include the extracted commands below,
> but roughly there is a vlan device on top of a vxlan device on top of
> a vlan device on top of a veth, in a namespace.
> 
> ip netns add at_ns0
> ip netns exec at_ns0 ip link add dev at_vxlan1 type vxlan remote
> 172.31.1.100 id 0 dstport 4789
> ip netns exec at_ns0 ip addr add dev at_vxlan1 10.2.1.1/24
> ip netns exec at_ns0 ip link set dev at_vxlan1 mtu 1450 up
> ip link add p0 type veth peer name ovs-p0
> ethtool -K p0 sg on
> ethtool -K p0 tso on
> ip link set p0 netns at_ns0
> ip link set dev ovs-p0 up
> ip netns exec at_ns0 ip addr add "172.31.2.1/24" dev p0
> ip netns exec at_ns0 ip link set dev p0 up
> ip netns exec at_ns0 ip link add link at_vxlan1 name at_vxlan1.100
> type vlan proto 802.1q id 100
> ip netns exec at_ns0 ip link set dev at_vxlan1.100 up
> ip netns exec at_ns0 ip addr add dev at_vxlan1.100 "10.1.1.1/24"
> ip netns exec at_ns0 ip link add link p0 name p0.42 type vlan proto 802.1q id 42
> ip netns exec at_ns0 ip link set dev p0.42 up
> ip netns exec at_ns0 ip addr add dev p0.42 "172.31.1.1/24"
> ip addr add "172.31.1.100/24" dev p0
> ip link set dev p0 up

Are these two lines intended to be ovs-p0 rather than p0?
As p0 lives inside the netns (and already has an address)

> ip netns exec at_ns0 ping 10.1.1.100
> 
> An AF_PACKET socket on ovs-p0 receives the incorrect csum_start.
> Setting up the same with a geneve tunnel and udpcsum enabled produces
> the same result. Removing vlan 100 also yields an incorrect
> csum_start. Removing only vlan 42 yields a correct csum_start.

I wonder if a simple checksum offloaded UDP packet sent over a vlan
directly on top of veth, like .100 has a correct offset before your
patch, but incorrect offset after.

The issue you are encountering might be that vxlan does not support
vlan offload, so validate_xmit_vlan will pull the vlan into the
header with __vlan_hwaccel_push_inside, but this does not adjust
csum_start. Not sure, but perhaps you can instrument your kernel
and see where the offset goes wrong. It is not inside the pf_packet
socket, in any case.
 
> >
> > From a quick glance, in all cases that I see the VLAN tag is kept in
> > skb->vlan_tci, so is never part of the packet payload.
> >
> > But checksum offload with VXLAN can be non-trivial on its own. If
> > type & SKB_GSO_UDP_TUNNEL_CSUM | SKB_GSO_TUNNEL_REMCSUM, say. Then
> > csum_start will point to the checksum in vxlanhdr.
> >
>
diff mbox series

Patch

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index a84e00b5904b..f6b602ffe383 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2092,15 +2092,23 @@  static unsigned int run_filter(struct sk_buff *skb,
 }
 
 static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
-			   size_t *len, int vnet_hdr_sz)
+			   size_t *len, int vnet_hdr_sz,
+			   const struct sock *sk)
 {
 	struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };
+	int vlan_hlen;
 
 	if (*len < vnet_hdr_sz)
 		return -EINVAL;
 	*len -= vnet_hdr_sz;
 
-	if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0))
+	if (sk->sk_type == SOCK_RAW && skb_vlan_tag_present(skb))
+		vlan_hlen = VLAN_HLEN;
+	else
+		vlan_hlen = 0;
+
+	if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr,
+				    vio_le(), true, vlan_hlen))
 		return -EINVAL;
 
 	return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz);
@@ -2368,13 +2376,21 @@  static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		__set_bit(slot_id, po->rx_ring.rx_owner_map);
 	}
 
-	if (vnet_hdr_sz &&
-	    virtio_net_hdr_from_skb(skb, h.raw + macoff -
-				    sizeof(struct virtio_net_hdr),
-				    vio_le(), true, 0)) {
-		if (po->tp_version == TPACKET_V3)
-			prb_clear_blk_fill_status(&po->rx_ring);
-		goto drop_n_account;
+	if (vnet_hdr_sz) {
+		int vlan_hlen;
+
+		if (sk->sk_type == SOCK_RAW && skb_vlan_tag_present(skb))
+			vlan_hlen = VLAN_HLEN;
+		else
+			vlan_hlen = 0;
+
+		if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
+					    sizeof(struct virtio_net_hdr),
+					    vio_le(), true, vlan_hlen)) {
+			if (po->tp_version == TPACKET_V3)
+				prb_clear_blk_fill_status(&po->rx_ring);
+			goto drop_n_account;
+		}
 	}
 
 	if (po->tp_version <= TPACKET_V2) {
@@ -3464,7 +3480,7 @@  static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	packet_rcv_try_clear_pressure(pkt_sk(sk));
 
 	if (vnet_hdr_len) {
-		err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
+		err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len, sk);
 		if (err)
 			goto out_free;
 	}