Message ID | 20230327153641.204660-1-amy.saq@antgroup.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [v6] net/packet: support mergeable feature of virtio | expand |
沈安琪(凛玥) wrote: > From: Jianfeng Tan <henry.tjf@antgroup.com> > > Packet sockets, like tap, can be used as the backend for kernel vhost. > In packet sockets, virtio net header size is currently hardcoded to be > the size of struct virtio_net_hdr, which is 10 bytes; however, it is not > always the case: some virtio features, such as mrg_rxbuf, need virtio > net header to be 12-byte long. > > Mergeable buffers, as a virtio feature, is worthy of supporting: packets > that are larger than one-mbuf size will be dropped in vhost worker's > handle_rx if mrg_rxbuf feature is not used, but large packets > cannot be avoided and increasing mbuf's size is not economical. > > With this virtio feature enabled by virtio-user, packet sockets with > hardcoded 10-byte virtio net header will parse mac head incorrectly in > packet_snd by taking the last two bytes of virtio net header as part of > mac header. > This incorrect mac header parsing will cause packet to be dropped due to > invalid ether head checking in later under-layer device packet receiving. > > By adding extra field vnet_hdr_sz with utilizing holes in struct > packet_sock to record currently used virtio net header size and supporting > extra sockopt PACKET_VNET_HDR_SZ to set specified vnet_hdr_sz, packet > sockets can know the exact length of virtio net header that virtio user > gives. > In packet_snd, tpacket_snd and packet_recvmsg, instead of using > hardcoded virtio net header size, it can get the exact vnet_hdr_sz from > corresponding packet_sock, and parse mac header correctly based on this > information to avoid the packets being mistakenly dropped. > > Signed-off-by: Jianfeng Tan <henry.tjf@antgroup.com> > Co-developed-by: Anqi Shen <amy.saq@antgroup.com> > Signed-off-by: Anqi Shen <amy.saq@antgroup.com> > --- > > V5 -> V6: > * rebase patch based on 6.3-rc2 > > include/uapi/linux/if_packet.h | 1 + > net/packet/af_packet.c | 88 +++++++++++++++++++++------------- > net/packet/diag.c | 2 +- > net/packet/internal.h | 2 +- > 4 files changed, 59 insertions(+), 34 deletions(-) > > diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h > index 78c981d6a9d4..9efc42382fdb 100644 > --- a/include/uapi/linux/if_packet.h > +++ b/include/uapi/linux/if_packet.h > @@ -59,6 +59,7 @@ struct sockaddr_ll { > #define PACKET_ROLLOVER_STATS 21 > #define PACKET_FANOUT_DATA 22 > #define PACKET_IGNORE_OUTGOING 23 > +#define PACKET_VNET_HDR_SZ 24 > > #define PACKET_FANOUT_HASH 0 > #define PACKET_FANOUT_LB 1 > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c > index 497193f73030..b13536767cbe 100644 > --- a/net/packet/af_packet.c > +++ b/net/packet/af_packet.c > @@ -2090,18 +2090,18 @@ static unsigned int run_filter(struct sk_buff *skb, > } > > @@ -3925,11 +3938,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, > if (copy_from_sockptr(&val, optval, sizeof(val))) > return -EFAULT; > > + hdr_len = val ? sizeof(struct virtio_net_hdr) : 0; > + if (optname == PACKET_VNET_HDR_SZ) { > + if (val && val != sizeof(struct virtio_net_hdr) && > + val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) > + return -EINVAL; > + hdr_len = val; > + } > + Since the below requires a change, I'd prefer if (optname == PACKET_VNET_HDR_SZ) { ... } else { hdr_len = val ? sizeof(struct virtio_net_hdr) : 0; } Rather than first doing that and then modifying val in the case of PACKET_VNET_HDR_SZ. > lock_sock(sk); > if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { > ret = -EBUSY; > } else { > - packet_sock_flag_set(po, PACKET_SOCK_HAS_VNET_HDR, val); > + po->vnet_hdr_sz = hdr_len; Needs to use WRITE_ONCE to match the READ_ONCE elsewhere > ret = 0; > } > release_sock(sk); > @@ -4062,7 +4083,10 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, > val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); > break; > case PACKET_VNET_HDR: > - val = packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR); > + val = !!po->vnet_hdr_sz; > + break; > + case PACKET_VNET_HDR_SZ: > + val = po->vnet_hdr_sz; > break; > case PACKET_VERSION: > val = po->tp_version; > diff --git a/net/packet/diag.c b/net/packet/diag.c > index de4ced5cf3e8..5cf13cf0b862 100644 > --- a/net/packet/diag.c > +++ b/net/packet/diag.c > @@ -27,7 +27,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) > pinfo.pdi_flags |= PDI_AUXDATA; > if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV)) > pinfo.pdi_flags |= PDI_ORIGDEV; > - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) > + if (po->vnet_hdr_sz) > pinfo.pdi_flags |= PDI_VNETHDR; > if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) > pinfo.pdi_flags |= PDI_LOSS; > diff --git a/net/packet/internal.h b/net/packet/internal.h > index 27930f69f368..63f4865202c1 100644 > --- a/net/packet/internal.h > +++ b/net/packet/internal.h > @@ -118,6 +118,7 @@ struct packet_sock { > struct mutex pg_vec_lock; > unsigned long flags; > int ifindex; /* bound device */ > + u8 vnet_hdr_sz; Did you use pahole to find a spot that is currently padding? This looks good in principle, an int followed by __be16. > __be16 num; > struct packet_rollover *rollover; > struct packet_mclist *mclist; > @@ -139,7 +140,6 @@ enum packet_sock_flags { > PACKET_SOCK_AUXDATA, > PACKET_SOCK_TX_HAS_OFF, > PACKET_SOCK_TP_LOSS, > - PACKET_SOCK_HAS_VNET_HDR, > PACKET_SOCK_RUNNING, > PACKET_SOCK_PRESSURE, > PACKET_SOCK_QDISC_BYPASS, > -- > 2.19.1.6.gb485710b >
在 2023/3/29 上午12:30, Willem de Bruijn 写道: > 沈安琪(凛玥) wrote: >> From: Jianfeng Tan <henry.tjf@antgroup.com> >> >> Packet sockets, like tap, can be used as the backend for kernel vhost. >> In packet sockets, virtio net header size is currently hardcoded to be >> the size of struct virtio_net_hdr, which is 10 bytes; however, it is not >> always the case: some virtio features, such as mrg_rxbuf, need virtio >> net header to be 12-byte long. >> >> Mergeable buffers, as a virtio feature, is worthy of supporting: packets >> that are larger than one-mbuf size will be dropped in vhost worker's >> handle_rx if mrg_rxbuf feature is not used, but large packets >> cannot be avoided and increasing mbuf's size is not economical. >> >> With this virtio feature enabled by virtio-user, packet sockets with >> hardcoded 10-byte virtio net header will parse mac head incorrectly in >> packet_snd by taking the last two bytes of virtio net header as part of >> mac header. >> This incorrect mac header parsing will cause packet to be dropped due to >> invalid ether head checking in later under-layer device packet receiving. >> >> By adding extra field vnet_hdr_sz with utilizing holes in struct >> packet_sock to record currently used virtio net header size and supporting >> extra sockopt PACKET_VNET_HDR_SZ to set specified vnet_hdr_sz, packet >> sockets can know the exact length of virtio net header that virtio user >> gives. >> In packet_snd, tpacket_snd and packet_recvmsg, instead of using >> hardcoded virtio net header size, it can get the exact vnet_hdr_sz from >> corresponding packet_sock, and parse mac header correctly based on this >> information to avoid the packets being mistakenly dropped. >> >> Signed-off-by: Jianfeng Tan <henry.tjf@antgroup.com> >> Co-developed-by: Anqi Shen <amy.saq@antgroup.com> >> Signed-off-by: Anqi Shen <amy.saq@antgroup.com> >> --- >> >> V5 -> V6: >> * rebase patch based on 6.3-rc2 >> >> include/uapi/linux/if_packet.h | 1 + >> net/packet/af_packet.c | 88 +++++++++++++++++++++------------- >> net/packet/diag.c | 2 +- >> net/packet/internal.h | 2 +- >> 4 files changed, 59 insertions(+), 34 deletions(-) >> >> diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h >> index 78c981d6a9d4..9efc42382fdb 100644 >> --- a/include/uapi/linux/if_packet.h >> +++ b/include/uapi/linux/if_packet.h >> @@ -59,6 +59,7 @@ struct sockaddr_ll { >> #define PACKET_ROLLOVER_STATS 21 >> #define PACKET_FANOUT_DATA 22 >> #define PACKET_IGNORE_OUTGOING 23 >> +#define PACKET_VNET_HDR_SZ 24 >> >> #define PACKET_FANOUT_HASH 0 >> #define PACKET_FANOUT_LB 1 >> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c >> index 497193f73030..b13536767cbe 100644 >> --- a/net/packet/af_packet.c >> +++ b/net/packet/af_packet.c >> @@ -2090,18 +2090,18 @@ static unsigned int run_filter(struct sk_buff *skb, >> } >> >> @@ -3925,11 +3938,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, >> if (copy_from_sockptr(&val, optval, sizeof(val))) >> return -EFAULT; >> >> + hdr_len = val ? sizeof(struct virtio_net_hdr) : 0; >> + if (optname == PACKET_VNET_HDR_SZ) { >> + if (val && val != sizeof(struct virtio_net_hdr) && >> + val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) >> + return -EINVAL; >> + hdr_len = val; >> + } >> + > Since the below requires a change, I'd prefer > > if (optname == PACKET_VNET_HDR_SZ) { > ... > } else { > hdr_len = val ? sizeof(struct virtio_net_hdr) : 0; > } > > Rather than first doing that and then modifying val in the case of > PACKET_VNET_HDR_SZ. Good point. We will address this comment in the next version of patch. >> lock_sock(sk); >> if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { >> ret = -EBUSY; >> } else { >> - packet_sock_flag_set(po, PACKET_SOCK_HAS_VNET_HDR, val); >> + po->vnet_hdr_sz = hdr_len; > Needs to use WRITE_ONCE to match the READ_ONCE elsewhere Thanks for pointing out. We will fix this in the next version of patch. > >> ret = 0; >> } >> release_sock(sk); >> @@ -4062,7 +4083,10 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, >> val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); >> break; >> case PACKET_VNET_HDR: >> - val = packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR); >> + val = !!po->vnet_hdr_sz; >> + break; >> + case PACKET_VNET_HDR_SZ: >> + val = po->vnet_hdr_sz; >> break; >> case PACKET_VERSION: >> val = po->tp_version; >> diff --git a/net/packet/diag.c b/net/packet/diag.c >> index de4ced5cf3e8..5cf13cf0b862 100644 >> --- a/net/packet/diag.c >> +++ b/net/packet/diag.c >> @@ -27,7 +27,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) >> pinfo.pdi_flags |= PDI_AUXDATA; >> if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV)) >> pinfo.pdi_flags |= PDI_ORIGDEV; >> - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) >> + if (po->vnet_hdr_sz) >> pinfo.pdi_flags |= PDI_VNETHDR; >> if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) >> pinfo.pdi_flags |= PDI_LOSS; >> diff --git a/net/packet/internal.h b/net/packet/internal.h >> index 27930f69f368..63f4865202c1 100644 >> --- a/net/packet/internal.h >> +++ b/net/packet/internal.h >> @@ -118,6 +118,7 @@ struct packet_sock { >> struct mutex pg_vec_lock; >> unsigned long flags; >> int ifindex; /* bound device */ >> + u8 vnet_hdr_sz; > Did you use pahole to find a spot that is currently padding? > > This looks good in principle, an int followed by __be16. Yes, we have checked this spot with pahole to make sure it is a right place to add vnet_hdr_sz. Here's the result: struct packet_sock { struct sock sk; /* 0 776 */ /* --- cacheline 12 boundary (768 bytes) was 8 bytes ago --- */ struct packet_fanout * fanout; /* 776 8 */ union tpacket_stats_u stats; /* 784 12 */ /* XXX 4 bytes hole, try to pack */ struct packet_ring_buffer rx_ring; /* 800 200 */ /* --- cacheline 15 boundary (960 bytes) was 40 bytes ago --- */ struct packet_ring_buffer tx_ring; /* 1000 200 */ /* --- cacheline 18 boundary (1152 bytes) was 48 bytes ago --- */ int copy_thresh; /* 1200 4 */ spinlock_t bind_lock; /* 1204 4 */ struct mutex pg_vec_lock; /* 1208 32 */ /* --- cacheline 19 boundary (1216 bytes) was 24 bytes ago --- */ long unsigned int flags; /* 1240 8 */ int ifindex; /* 1248 4 */ u8 vnet_hdr_sz; /* 1252 1 */ /* XXX 1 byte hole, try to pack */ __be16 num; /* 1254 2 */ struct packet_rollover * rollover; /* 1256 8 */ struct packet_mclist * mclist; /* 1264 8 */ atomic_t mapped; /* 1272 4 */ enum tpacket_versions tp_version; /* 1276 4 */ /* --- cacheline 20 boundary (1280 bytes) --- */ unsigned int tp_hdrlen; /* 1280 4 */ unsigned int tp_reserve; /* 1284 4 */ unsigned int tp_tstamp; /* 1288 4 */ /* XXX 4 bytes hole, try to pack */ struct completion skb_completion; /* 1296 32 */ struct net_device * cached_dev; /* 1328 8 */ /* XXX 8 bytes hole, try to pack */ /* --- cacheline 21 boundary (1344 bytes) --- */ struct packet_type prot_hook; /* 1344 72 */ /* XXX 56 bytes hole, try to pack */ /* --- cacheline 23 boundary (1472 bytes) --- */ atomic_t tp_drops; /* 1472 4 */ /* Force padding: */ atomic_t :32; atomic_t :32; atomic_t :32; atomic_t :32; atomic_t :32; atomic_t :32; atomic_t :32; atomic_t :32; atomic_t :32; atomic_t :32; atomic_t :32; atomic_t :32; atomic_t :32; atomic_t :32; atomic_t :32; /* size: 1536, cachelines: 24, members: 23 */ /* sum members: 1403, holes: 5, sum holes: 73 */ /* padding: 60 */ }; >> __be16 num; >> struct packet_rollover *rollover; >> struct packet_mclist *mclist; >> @@ -139,7 +140,6 @@ enum packet_sock_flags { >> PACKET_SOCK_AUXDATA, >> PACKET_SOCK_TX_HAS_OFF, >> PACKET_SOCK_TP_LOSS, >> - PACKET_SOCK_HAS_VNET_HDR, >> PACKET_SOCK_RUNNING, >> PACKET_SOCK_PRESSURE, >> PACKET_SOCK_QDISC_BYPASS, >> -- >> 2.19.1.6.gb485710b >> >>
在 2023/3/29 下午8:14, Anqi Shen 写道: We will soon send out the next version of patch that addresses the following comments. Before we doing that, I wonder whether there is any other comment/discussion on this version of patch. Again, thanks for all the comments and discussion on this patch so far. > 在 2023/3/29 上午12:30, Willem de Bruijn 写道: >> 沈安琪(凛玥) wrote: >>> From: Jianfeng Tan <henry.tjf@antgroup.com> >>> >>> Packet sockets, like tap, can be used as the backend for kernel vhost. >>> In packet sockets, virtio net header size is currently hardcoded to be >>> the size of struct virtio_net_hdr, which is 10 bytes; however, it is >>> not >>> always the case: some virtio features, such as mrg_rxbuf, need virtio >>> net header to be 12-byte long. >>> >>> Mergeable buffers, as a virtio feature, is worthy of supporting: >>> packets >>> that are larger than one-mbuf size will be dropped in vhost worker's >>> handle_rx if mrg_rxbuf feature is not used, but large packets >>> cannot be avoided and increasing mbuf's size is not economical. >>> >>> With this virtio feature enabled by virtio-user, packet sockets with >>> hardcoded 10-byte virtio net header will parse mac head incorrectly in >>> packet_snd by taking the last two bytes of virtio net header as part of >>> mac header. >>> This incorrect mac header parsing will cause packet to be dropped >>> due to >>> invalid ether head checking in later under-layer device packet >>> receiving. >>> >>> By adding extra field vnet_hdr_sz with utilizing holes in struct >>> packet_sock to record currently used virtio net header size and >>> supporting >>> extra sockopt PACKET_VNET_HDR_SZ to set specified vnet_hdr_sz, packet >>> sockets can know the exact length of virtio net header that virtio user >>> gives. >>> In packet_snd, tpacket_snd and packet_recvmsg, instead of using >>> hardcoded virtio net header size, it can get the exact vnet_hdr_sz from >>> corresponding packet_sock, and parse mac header correctly based on this >>> information to avoid the packets being mistakenly dropped. >>> >>> Signed-off-by: Jianfeng Tan <henry.tjf@antgroup.com> >>> Co-developed-by: Anqi Shen <amy.saq@antgroup.com> >>> Signed-off-by: Anqi Shen <amy.saq@antgroup.com> >>> --- >>> >>> V5 -> V6: >>> * rebase patch based on 6.3-rc2 >>> >>> include/uapi/linux/if_packet.h | 1 + >>> net/packet/af_packet.c | 88 >>> +++++++++++++++++++++------------- >>> net/packet/diag.c | 2 +- >>> net/packet/internal.h | 2 +- >>> 4 files changed, 59 insertions(+), 34 deletions(-) >>> >>> diff --git a/include/uapi/linux/if_packet.h >>> b/include/uapi/linux/if_packet.h >>> index 78c981d6a9d4..9efc42382fdb 100644 >>> --- a/include/uapi/linux/if_packet.h >>> +++ b/include/uapi/linux/if_packet.h >>> @@ -59,6 +59,7 @@ struct sockaddr_ll { >>> #define PACKET_ROLLOVER_STATS 21 >>> #define PACKET_FANOUT_DATA 22 >>> #define PACKET_IGNORE_OUTGOING 23 >>> +#define PACKET_VNET_HDR_SZ 24 >>> #define PACKET_FANOUT_HASH 0 >>> #define PACKET_FANOUT_LB 1 >>> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c >>> index 497193f73030..b13536767cbe 100644 >>> --- a/net/packet/af_packet.c >>> +++ b/net/packet/af_packet.c >>> @@ -2090,18 +2090,18 @@ static unsigned int run_filter(struct >>> sk_buff *skb, >>> } >>> @@ -3925,11 +3938,19 @@ packet_setsockopt(struct socket *sock, int >>> level, int optname, sockptr_t optval, >>> if (copy_from_sockptr(&val, optval, sizeof(val))) >>> return -EFAULT; >>> + hdr_len = val ? sizeof(struct virtio_net_hdr) : 0; >>> + if (optname == PACKET_VNET_HDR_SZ) { >>> + if (val && val != sizeof(struct virtio_net_hdr) && >>> + val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) >>> + return -EINVAL; >>> + hdr_len = val; >>> + } >>> + >> Since the below requires a change, I'd prefer >> >> if (optname == PACKET_VNET_HDR_SZ) { >> ... >> } else { >> hdr_len = val ? sizeof(struct virtio_net_hdr) : 0; >> } >> >> Rather than first doing that and then modifying val in the case of >> PACKET_VNET_HDR_SZ. > > > Good point. We will address this comment in the next version of patch. > > >>> lock_sock(sk); >>> if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { >>> ret = -EBUSY; >>> } else { >>> - packet_sock_flag_set(po, PACKET_SOCK_HAS_VNET_HDR, val); >>> + po->vnet_hdr_sz = hdr_len; >> Needs to use WRITE_ONCE to match the READ_ONCE elsewhere > > > Thanks for pointing out. We will fix this in the next version of patch. > > >> >>> ret = 0; >>> } >>> release_sock(sk); >>> @@ -4062,7 +4083,10 @@ static int packet_getsockopt(struct socket >>> *sock, int level, int optname, >>> val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); >>> break; >>> case PACKET_VNET_HDR: >>> - val = packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR); >>> + val = !!po->vnet_hdr_sz; >>> + break; >>> + case PACKET_VNET_HDR_SZ: >>> + val = po->vnet_hdr_sz; >>> break; >>> case PACKET_VERSION: >>> val = po->tp_version; >>> diff --git a/net/packet/diag.c b/net/packet/diag.c >>> index de4ced5cf3e8..5cf13cf0b862 100644 >>> --- a/net/packet/diag.c >>> +++ b/net/packet/diag.c >>> @@ -27,7 +27,7 @@ static int pdiag_put_info(const struct packet_sock >>> *po, struct sk_buff *nlskb) >>> pinfo.pdi_flags |= PDI_AUXDATA; >>> if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV)) >>> pinfo.pdi_flags |= PDI_ORIGDEV; >>> - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) >>> + if (po->vnet_hdr_sz) >>> pinfo.pdi_flags |= PDI_VNETHDR; >>> if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) >>> pinfo.pdi_flags |= PDI_LOSS; >>> diff --git a/net/packet/internal.h b/net/packet/internal.h >>> index 27930f69f368..63f4865202c1 100644 >>> --- a/net/packet/internal.h >>> +++ b/net/packet/internal.h >>> @@ -118,6 +118,7 @@ struct packet_sock { >>> struct mutex pg_vec_lock; >>> unsigned long flags; >>> int ifindex; /* bound device */ >>> + u8 vnet_hdr_sz; >> Did you use pahole to find a spot that is currently padding? >> >> This looks good in principle, an int followed by __be16. > > > Yes, we have checked this spot with pahole to make sure it is a right > place to add vnet_hdr_sz. > > Here's the result: > > struct packet_sock { > struct sock sk; /* 0 776 */ > /* --- cacheline 12 boundary (768 bytes) was 8 bytes ago --- */ > struct packet_fanout * fanout; /* 776 8 */ > union tpacket_stats_u stats; /* 784 12 */ > > /* XXX 4 bytes hole, try to pack */ > > struct packet_ring_buffer rx_ring; /* 800 200 */ > /* --- cacheline 15 boundary (960 bytes) was 40 bytes ago --- */ > struct packet_ring_buffer tx_ring; /* 1000 200 */ > /* --- cacheline 18 boundary (1152 bytes) was 48 bytes ago --- */ > int copy_thresh; /* 1200 4 */ > spinlock_t bind_lock; /* 1204 4 */ > struct mutex pg_vec_lock; /* 1208 32 */ > /* --- cacheline 19 boundary (1216 bytes) was 24 bytes ago --- */ > long unsigned int flags; /* 1240 8 */ > int ifindex; /* 1248 4 */ > u8 vnet_hdr_sz; /* 1252 1 */ > > /* XXX 1 byte hole, try to pack */ > > __be16 num; /* 1254 2 */ > struct packet_rollover * rollover; /* 1256 8 */ > struct packet_mclist * mclist; /* 1264 8 */ > atomic_t mapped; /* 1272 4 */ > enum tpacket_versions tp_version; /* 1276 4 */ > /* --- cacheline 20 boundary (1280 bytes) --- */ > unsigned int tp_hdrlen; /* 1280 4 */ > unsigned int tp_reserve; /* 1284 4 */ > unsigned int tp_tstamp; /* 1288 4 */ > > /* XXX 4 bytes hole, try to pack */ > > struct completion skb_completion; /* 1296 32 */ > struct net_device * cached_dev; /* 1328 8 */ > > /* XXX 8 bytes hole, try to pack */ > > /* --- cacheline 21 boundary (1344 bytes) --- */ > struct packet_type prot_hook; /* 1344 72 */ > > /* XXX 56 bytes hole, try to pack */ > > /* --- cacheline 23 boundary (1472 bytes) --- */ > atomic_t tp_drops; /* 1472 4 */ > > /* Force padding: */ > atomic_t :32; > atomic_t :32; > atomic_t :32; > atomic_t :32; > atomic_t :32; > atomic_t :32; > atomic_t :32; > atomic_t :32; > atomic_t :32; > atomic_t :32; > atomic_t :32; > atomic_t :32; > atomic_t :32; > atomic_t :32; > atomic_t :32; > > /* size: 1536, cachelines: 24, members: 23 */ > /* sum members: 1403, holes: 5, sum holes: 73 */ > /* padding: 60 */ > }; > > >>> __be16 num; >>> struct packet_rollover *rollover; >>> struct packet_mclist *mclist; >>> @@ -139,7 +140,6 @@ enum packet_sock_flags { >>> PACKET_SOCK_AUXDATA, >>> PACKET_SOCK_TX_HAS_OFF, >>> PACKET_SOCK_TP_LOSS, >>> - PACKET_SOCK_HAS_VNET_HDR, >>> PACKET_SOCK_RUNNING, >>> PACKET_SOCK_PRESSURE, >>> PACKET_SOCK_QDISC_BYPASS, >>> -- >>> 2.19.1.6.gb485710b >>> >>>
diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 78c981d6a9d4..9efc42382fdb 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -59,6 +59,7 @@ struct sockaddr_ll { #define PACKET_ROLLOVER_STATS 21 #define PACKET_FANOUT_DATA 22 #define PACKET_IGNORE_OUTGOING 23 +#define PACKET_VNET_HDR_SZ 24 #define PACKET_FANOUT_HASH 0 #define PACKET_FANOUT_LB 1 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 497193f73030..b13536767cbe 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2090,18 +2090,18 @@ static unsigned int run_filter(struct sk_buff *skb, } static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, - size_t *len) + size_t *len, int vnet_hdr_sz) { - struct virtio_net_hdr vnet_hdr; + struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 }; - if (*len < sizeof(vnet_hdr)) + if (*len < vnet_hdr_sz) return -EINVAL; - *len -= sizeof(vnet_hdr); + *len -= vnet_hdr_sz; - if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0)) + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0)) return -EINVAL; - return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); + return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz); } /* @@ -2251,6 +2251,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, bool is_drop_n_account = false; unsigned int slot_id = 0; bool do_vnet = false; + int vnet_hdr_sz; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing @@ -2308,8 +2309,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + po->tp_reserve; - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { - netoff += sizeof(struct virtio_net_hdr); + vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); + if (vnet_hdr_sz) { + netoff += vnet_hdr_sz; do_vnet = true; } macoff = netoff - maclen; @@ -2551,16 +2553,26 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) } static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, - struct virtio_net_hdr *vnet_hdr) + struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz) { - if (*len < sizeof(*vnet_hdr)) + int ret; + + if (*len < vnet_hdr_sz) return -EINVAL; - *len -= sizeof(*vnet_hdr); + *len -= vnet_hdr_sz; if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) return -EFAULT; - return __packet_snd_vnet_parse(vnet_hdr, *len); + ret = __packet_snd_vnet_parse(vnet_hdr, *len); + if (ret) + return ret; + + /* move iter to point to the start of mac header */ + if (vnet_hdr_sz != sizeof(struct virtio_net_hdr)) + iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr)); + + return 0; } static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, @@ -2722,6 +2734,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) void *ph; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); + int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); unsigned char *addr = NULL; int tp_len, size_max; void *data; @@ -2779,8 +2792,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); - if ((size_max > dev->mtu + reserve + VLAN_HLEN) && - !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) + if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) size_max = dev->mtu + reserve + VLAN_HLEN; reinit_completion(&po->skb_completion); @@ -2809,10 +2821,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { + + if (vnet_hdr_sz) { vnet_hdr = data; - data += sizeof(*vnet_hdr); - tp_len -= sizeof(*vnet_hdr); + data += vnet_hdr_sz; + tp_len -= vnet_hdr_sz; if (tp_len < 0 || __packet_snd_vnet_parse(vnet_hdr, tp_len)) { tp_len = -EINVAL; @@ -2837,7 +2850,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) addr, hlen, copylen, &sockc); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && - !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR) && + !vnet_hdr_sz && !packet_extra_vlan_len_allowed(dev, skb)) tp_len = -EMSGSIZE; @@ -2856,7 +2869,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } } - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { + if (vnet_hdr_sz) { if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { tp_len = -EINVAL; goto tpacket_error; @@ -2946,7 +2959,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); - bool has_vnet_hdr = false; + int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); int hlen, tlen, linear; int extra_len = 0; @@ -2990,11 +3003,11 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) { - err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); + + if (vnet_hdr_sz) { + err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz); if (err) goto out_unlock; - has_vnet_hdr = true; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { @@ -3064,11 +3077,11 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) packet_parse_headers(skb, sock); - if (has_vnet_hdr) { + if (vnet_hdr_sz) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; - len += sizeof(vnet_hdr); + len += vnet_hdr_sz; virtio_net_hdr_set_proto(skb, &vnet_hdr); } @@ -3408,7 +3421,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; - int vnet_hdr_len = 0; + int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz); unsigned int origlen = 0; err = -EINVAL; @@ -3449,11 +3462,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, packet_rcv_try_clear_pressure(pkt_sk(sk)); - if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_HAS_VNET_HDR)) { - err = packet_rcv_vnet(msg, skb, &len); + if (vnet_hdr_len) { + err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len); if (err) goto out_free; - vnet_hdr_len = sizeof(struct virtio_net_hdr); } /* You lose any data beyond the buffer you gave. If it worries @@ -3915,8 +3927,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, return 0; } case PACKET_VNET_HDR: + case PACKET_VNET_HDR_SZ: { - int val; + int val, hdr_len; if (sock->type != SOCK_RAW) return -EINVAL; @@ -3925,11 +3938,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; + hdr_len = val ? sizeof(struct virtio_net_hdr) : 0; + if (optname == PACKET_VNET_HDR_SZ) { + if (val && val != sizeof(struct virtio_net_hdr) && + val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) + return -EINVAL; + hdr_len = val; + } + lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { - packet_sock_flag_set(po, PACKET_SOCK_HAS_VNET_HDR, val); + po->vnet_hdr_sz = hdr_len; ret = 0; } release_sock(sk); @@ -4062,7 +4083,10 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); break; case PACKET_VNET_HDR: - val = packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR); + val = !!po->vnet_hdr_sz; + break; + case PACKET_VNET_HDR_SZ: + val = po->vnet_hdr_sz; break; case PACKET_VERSION: val = po->tp_version; diff --git a/net/packet/diag.c b/net/packet/diag.c index de4ced5cf3e8..5cf13cf0b862 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -27,7 +27,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) pinfo.pdi_flags |= PDI_AUXDATA; if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV)) pinfo.pdi_flags |= PDI_ORIGDEV; - if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) + if (po->vnet_hdr_sz) pinfo.pdi_flags |= PDI_VNETHDR; if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) pinfo.pdi_flags |= PDI_LOSS; diff --git a/net/packet/internal.h b/net/packet/internal.h index 27930f69f368..63f4865202c1 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -118,6 +118,7 @@ struct packet_sock { struct mutex pg_vec_lock; unsigned long flags; int ifindex; /* bound device */ + u8 vnet_hdr_sz; __be16 num; struct packet_rollover *rollover; struct packet_mclist *mclist; @@ -139,7 +140,6 @@ enum packet_sock_flags { PACKET_SOCK_AUXDATA, PACKET_SOCK_TX_HAS_OFF, PACKET_SOCK_TP_LOSS, - PACKET_SOCK_HAS_VNET_HDR, PACKET_SOCK_RUNNING, PACKET_SOCK_PRESSURE, PACKET_SOCK_QDISC_BYPASS,