diff mbox series

[1/2] net/packet: add socketopt to set/get vnet_hdr_sz

Message ID 1675946595-103034-2-git-send-email-amy.saq@antgroup.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series net/packet: support of specifying virtio net header size | expand

Checks

Context Check Description
netdev/tree_selection success Guessed tree name to be net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix warning Target tree name not specified in the subject
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 5334 this patch: 5171
netdev/cc_maintainers warning 3 maintainers not CCed: edumazet@google.com pabeni@redhat.com kuba@kernel.org
netdev/build_clang fail Errors and warnings before: 1084 this patch: 982
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 5549 this patch: 5395
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

沈安琪(凛玥) Feb. 9, 2023, 12:43 p.m. UTC
From: "Jianfeng Tan" <henry.tjf@antgroup.com>

Raw socket can be used as the backend for kernel vhost, like tap.
However, in current raw socket implementation, it use hardcoded virtio
net header length, which will cause error mac header parsing when some
virtio features that need virtio net header other than 10-byte are used.

By adding extra field vnet_hdr_sz in packet_sock to record virtio net
header size that current raw socket should use and supporting extra
sockopt PACKET_VNET_HDR_SZ to allow user level set specified vnet header
size to current socket, raw socket will know the exact virtio net header
size it should use instead of hardcoding to avoid incorrect header
parsing.

Signed-off-by: Jianfeng Tan <henry.tjf@antgroup.com>
Co-developed-by: Anqi Shen <amy.saq@antgroup.com>
Signed-off-by: Anqi Shen <amy.saq@antgroup.com>
---
 include/uapi/linux/if_packet.h |  1 +
 net/packet/af_packet.c         | 34 ++++++++++++++++++++++++++++++++++
 net/packet/internal.h          |  3 ++-
 3 files changed, 37 insertions(+), 1 deletion(-)

Comments

Willem de Bruijn Feb. 9, 2023, 2:45 p.m. UTC | #1
On Thu, Feb 9, 2023 at 7:43 AM 沈安琪(凛玥) <amy.saq@antgroup.com> wrote:
>
> From: "Jianfeng Tan" <henry.tjf@antgroup.com>
>
> Raw socket can be used as the backend for kernel vhost, like tap.

Please refer to PF_PACKET sockets as packet sockets.

"raw" sockets is ambiguous: it is also used to refer to type SOCK_RAW
of other socket families.

> However, in current raw socket implementation, it use hardcoded virtio
> net header length, which will cause error mac header parsing when some
> virtio features that need virtio net header other than 10-byte are used.

This series only adds support for skipping past two extra bytes.

The 2-byte field num_buffers in virtio_net_hdr_mrg_rxbuf is used for
coalesced buffers. That is not a feature packet sockets support.

How do you intend to use this? Only ever with num_buffers == 1?

We have to make ABI changes sparingly. It would take another setsockopt
to signal actual use of this feature.

If adding an extended struct, then this also needs to be documented in
the UAPI headers.

> By adding extra field vnet_hdr_sz in packet_sock to record virtio net
> header size that current raw socket should use and supporting extra
> sockopt PACKET_VNET_HDR_SZ to allow user level set specified vnet header
> size to current socket, raw socket will know the exact virtio net header
> size it should use instead of hardcoding to avoid incorrect header
> parsing.
>
> Signed-off-by: Jianfeng Tan <henry.tjf@antgroup.com>
> Co-developed-by: Anqi Shen <amy.saq@antgroup.com>
> Signed-off-by: Anqi Shen <amy.saq@antgroup.com>
> ---
>  include/uapi/linux/if_packet.h |  1 +
>  net/packet/af_packet.c         | 34 ++++++++++++++++++++++++++++++++++
>  net/packet/internal.h          |  3 ++-
>  3 files changed, 37 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
> index 78c981d..9efc423 100644
> --- a/include/uapi/linux/if_packet.h
> +++ b/include/uapi/linux/if_packet.h
> @@ -59,6 +59,7 @@ struct sockaddr_ll {
>  #define PACKET_ROLLOVER_STATS          21
>  #define PACKET_FANOUT_DATA             22
>  #define PACKET_IGNORE_OUTGOING         23
> +#define PACKET_VNET_HDR_SZ             24
>
>  #define PACKET_FANOUT_HASH             0
>  #define PACKET_FANOUT_LB               1
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index 8ffb19c..8389f18 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -3936,11 +3936,42 @@ static void packet_flush_mclist(struct sock *sk)
>                         ret = -EBUSY;
>                 } else {
>                         po->has_vnet_hdr = !!val;
> +                       /* set vnet_hdr_sz to default value */
> +                       if (po->has_vnet_hdr)
> +                               po->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
> +                       else
> +                               po->vnet_hdr_sz = 0;
>                         ret = 0;
>                 }
>                 release_sock(sk);
>                 return ret;
>         }
> +       case PACKET_VNET_HDR_SZ:
> +       {
> +               int val;
> +
> +               if (sock->type != SOCK_RAW)
> +                       return -EINVAL;
> +               if (optlen < sizeof(val))
> +                       return -EINVAL;
> +               if (copy_from_user(&val, optval, sizeof(val)))
> +                       return -EFAULT;

This duplicates the code in PACKET_VNET_HR. I'd prefer:

        case PACKET_VNET_HDR:
        case PACKET_VNET_HDR_SZ:

        .. sanity checks and copy from user ..

        if (optname = PACKET_VNET_HDR)
                val = sizeof(struct virtio_net_hdr);

And move the check for valid lengths before taking the lock.

> +
> +               lock_sock(sk);
> +               if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
> +                       ret = -EBUSY;
> +               } else {
> +                       if (val == sizeof(struct virtio_net_hdr) ||
> +                           val == sizeof(struct virtio_net_hdr_mrg_rxbuf)) {
> +                               po->vnet_hdr_sz = val;
> +                               ret = 0;
> +                       } else {
> +                               ret = -EINVAL;
> +                       }
> +               }
> +               release_sock(sk);
> +               return ret;
> +       }
>         case PACKET_TIMESTAMP:
>         {
>                 int val;
> @@ -4070,6 +4101,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
>         case PACKET_VNET_HDR:
>                 val = po->has_vnet_hdr;
>                 break;
> +       case PACKET_VNET_HDR_SZ:
> +               val = po->vnet_hdr_sz;
> +               break;
>         case PACKET_VERSION:
>                 val = po->tp_version;
>                 break;
> diff --git a/net/packet/internal.h b/net/packet/internal.h
> index 48af35b..e27b47d 100644
> --- a/net/packet/internal.h
> +++ b/net/packet/internal.h
> @@ -121,7 +121,8 @@ struct packet_sock {
>                                 origdev:1,
>                                 has_vnet_hdr:1,
>                                 tp_loss:1,
> -                               tp_tx_has_off:1;
> +                               tp_tx_has_off:1,
> +                               vnet_hdr_sz:8;  /* vnet header size should use */

This location looks fine from the point of view of using holes in the
struct:


        /* --- cacheline 12 boundary (768 bytes) --- */
        struct packet_ring_buffer  rx_ring;              /*   768   200 */
        /* --- cacheline 15 boundary (960 bytes) was 8 bytes ago --- */
        struct packet_ring_buffer  tx_ring;              /*   968   200 */
        /* --- cacheline 18 boundary (1152 bytes) was 16 bytes ago --- */
        int                        copy_thresh;          /*  1168     4 */
        spinlock_t                 bind_lock;            /*  1172     4 */
        struct mutex               pg_vec_lock;          /*  1176    32 */
        unsigned int               running;              /*  1208     4 */
        unsigned int               auxdata:1;            /*  1212: 0  4 */
        unsigned int               origdev:1;            /*  1212: 1  4 */
        unsigned int               has_vnet_hdr:1;       /*  1212: 2  4 */
        unsigned int               tp_loss:1;            /*  1212: 3  4 */
        unsigned int               tp_tx_has_off:1;      /*  1212: 4  4 */

        /* XXX 27 bits hole, try to pack */

        /* --- cacheline 19 boundary (1216 bytes) --- */

>         int                     pressure;
>         int                     ifindex;        /* bound device         */
>         __be16                  num;
> --
> 1.8.3.1
>
沈安琪(凛玥) Feb. 10, 2023, 4 a.m. UTC | #2
在 2023/2/9 下午10:45, Willem de Bruijn 写道:
> On Thu, Feb 9, 2023 at 7:43 AM 沈安琪(凛玥) <amy.saq@antgroup.com> wrote:
>> From: "Jianfeng Tan" <henry.tjf@antgroup.com>
>>
>> Raw socket can be used as the backend for kernel vhost, like tap.
> Please refer to PF_PACKET sockets as packet sockets.
>
> "raw" sockets is ambiguous: it is also used to refer to type SOCK_RAW
> of other socket families.
>
>> However, in current raw socket implementation, it use hardcoded virtio
>> net header length, which will cause error mac header parsing when some
>> virtio features that need virtio net header other than 10-byte are used.
> This series only adds support for skipping past two extra bytes.
>
> The 2-byte field num_buffers in virtio_net_hdr_mrg_rxbuf is used for
> coalesced buffers. That is not a feature packet sockets support.
>
> How do you intend to use this? Only ever with num_buffers == 1?


This 2-byte field will later be used to record how many buffers it can 
use when virtio mergeable is enable in drivers/vhost/net.c:1231.

Here we need to skip this 2-byte field in af_packet since otherwise, the 
number of buffers info will overwrite the mac header info.


>
> We have to make ABI changes sparingly. It would take another setsockopt
> to signal actual use of this feature.
>
> If adding an extended struct, then this also needs to be documented in
> the UAPI headers.
>
>> By adding extra field vnet_hdr_sz in packet_sock to record virtio net
>> header size that current raw socket should use and supporting extra
>> sockopt PACKET_VNET_HDR_SZ to allow user level set specified vnet header
>> size to current socket, raw socket will know the exact virtio net header
>> size it should use instead of hardcoding to avoid incorrect header
>> parsing.
>>
>> Signed-off-by: Jianfeng Tan <henry.tjf@antgroup.com>
>> Co-developed-by: Anqi Shen <amy.saq@antgroup.com>
>> Signed-off-by: Anqi Shen <amy.saq@antgroup.com>
>> ---
>>   include/uapi/linux/if_packet.h |  1 +
>>   net/packet/af_packet.c         | 34 ++++++++++++++++++++++++++++++++++
>>   net/packet/internal.h          |  3 ++-
>>   3 files changed, 37 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
>> index 78c981d..9efc423 100644
>> --- a/include/uapi/linux/if_packet.h
>> +++ b/include/uapi/linux/if_packet.h
>> @@ -59,6 +59,7 @@ struct sockaddr_ll {
>>   #define PACKET_ROLLOVER_STATS          21
>>   #define PACKET_FANOUT_DATA             22
>>   #define PACKET_IGNORE_OUTGOING         23
>> +#define PACKET_VNET_HDR_SZ             24
>>
>>   #define PACKET_FANOUT_HASH             0
>>   #define PACKET_FANOUT_LB               1
>> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
>> index 8ffb19c..8389f18 100644
>> --- a/net/packet/af_packet.c
>> +++ b/net/packet/af_packet.c
>> @@ -3936,11 +3936,42 @@ static void packet_flush_mclist(struct sock *sk)
>>                          ret = -EBUSY;
>>                  } else {
>>                          po->has_vnet_hdr = !!val;
>> +                       /* set vnet_hdr_sz to default value */
>> +                       if (po->has_vnet_hdr)
>> +                               po->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
>> +                       else
>> +                               po->vnet_hdr_sz = 0;
>>                          ret = 0;
>>                  }
>>                  release_sock(sk);
>>                  return ret;
>>          }
>> +       case PACKET_VNET_HDR_SZ:
>> +       {
>> +               int val;
>> +
>> +               if (sock->type != SOCK_RAW)
>> +                       return -EINVAL;
>> +               if (optlen < sizeof(val))
>> +                       return -EINVAL;
>> +               if (copy_from_user(&val, optval, sizeof(val)))
>> +                       return -EFAULT;
> This duplicates the code in PACKET_VNET_HR. I'd prefer:
>
>          case PACKET_VNET_HDR:
>          case PACKET_VNET_HDR_SZ:
>
>          .. sanity checks and copy from user ..
>
>          if (optname = PACKET_VNET_HDR)
>                  val = sizeof(struct virtio_net_hdr);
>
> And move the check for valid lengths before taking the lock.


Thanks for pointing out. It makes sense and we will address in the next 
version of patch.


>
>> +
>> +               lock_sock(sk);
>> +               if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
>> +                       ret = -EBUSY;
>> +               } else {
>> +                       if (val == sizeof(struct virtio_net_hdr) ||
>> +                           val == sizeof(struct virtio_net_hdr_mrg_rxbuf)) {
>> +                               po->vnet_hdr_sz = val;
>> +                               ret = 0;
>> +                       } else {
>> +                               ret = -EINVAL;
>> +                       }
>> +               }
>> +               release_sock(sk);
>> +               return ret;
>> +       }
>>          case PACKET_TIMESTAMP:
>>          {
>>                  int val;
>> @@ -4070,6 +4101,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
>>          case PACKET_VNET_HDR:
>>                  val = po->has_vnet_hdr;
>>                  break;
>> +       case PACKET_VNET_HDR_SZ:
>> +               val = po->vnet_hdr_sz;
>> +               break;
>>          case PACKET_VERSION:
>>                  val = po->tp_version;
>>                  break;
>> diff --git a/net/packet/internal.h b/net/packet/internal.h
>> index 48af35b..e27b47d 100644
>> --- a/net/packet/internal.h
>> +++ b/net/packet/internal.h
>> @@ -121,7 +121,8 @@ struct packet_sock {
>>                                  origdev:1,
>>                                  has_vnet_hdr:1,
>>                                  tp_loss:1,
>> -                               tp_tx_has_off:1;
>> +                               tp_tx_has_off:1,
>> +                               vnet_hdr_sz:8;  /* vnet header size should use */
> This location looks fine from the point of view of using holes in the
> struct:
>
>
>          /* --- cacheline 12 boundary (768 bytes) --- */
>          struct packet_ring_buffer  rx_ring;              /*   768   200 */
>          /* --- cacheline 15 boundary (960 bytes) was 8 bytes ago --- */
>          struct packet_ring_buffer  tx_ring;              /*   968   200 */
>          /* --- cacheline 18 boundary (1152 bytes) was 16 bytes ago --- */
>          int                        copy_thresh;          /*  1168     4 */
>          spinlock_t                 bind_lock;            /*  1172     4 */
>          struct mutex               pg_vec_lock;          /*  1176    32 */
>          unsigned int               running;              /*  1208     4 */
>          unsigned int               auxdata:1;            /*  1212: 0  4 */
>          unsigned int               origdev:1;            /*  1212: 1  4 */
>          unsigned int               has_vnet_hdr:1;       /*  1212: 2  4 */
>          unsigned int               tp_loss:1;            /*  1212: 3  4 */
>          unsigned int               tp_tx_has_off:1;      /*  1212: 4  4 */
>
>          /* XXX 27 bits hole, try to pack */
>
>          /* --- cacheline 19 boundary (1216 bytes) --- */
>
>>          int                     pressure;
>>          int                     ifindex;        /* bound device         */
>>          __be16                  num;
>> --
>> 1.8.3.1
>>
diff mbox series

Patch

diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index 78c981d..9efc423 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -59,6 +59,7 @@  struct sockaddr_ll {
 #define PACKET_ROLLOVER_STATS		21
 #define PACKET_FANOUT_DATA		22
 #define PACKET_IGNORE_OUTGOING		23
+#define PACKET_VNET_HDR_SZ		24
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8ffb19c..8389f18 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3936,11 +3936,42 @@  static void packet_flush_mclist(struct sock *sk)
 			ret = -EBUSY;
 		} else {
 			po->has_vnet_hdr = !!val;
+			/* set vnet_hdr_sz to default value */
+			if (po->has_vnet_hdr)
+				po->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
+			else
+				po->vnet_hdr_sz = 0;
 			ret = 0;
 		}
 		release_sock(sk);
 		return ret;
 	}
+	case PACKET_VNET_HDR_SZ:
+	{
+		int val;
+
+		if (sock->type != SOCK_RAW)
+			return -EINVAL;
+		if (optlen < sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		lock_sock(sk);
+		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
+			ret = -EBUSY;
+		} else {
+			if (val == sizeof(struct virtio_net_hdr) ||
+			    val == sizeof(struct virtio_net_hdr_mrg_rxbuf)) {
+				po->vnet_hdr_sz = val;
+				ret = 0;
+			} else {
+				ret = -EINVAL;
+			}
+		}
+		release_sock(sk);
+		return ret;
+	}
 	case PACKET_TIMESTAMP:
 	{
 		int val;
@@ -4070,6 +4101,9 @@  static int packet_getsockopt(struct socket *sock, int level, int optname,
 	case PACKET_VNET_HDR:
 		val = po->has_vnet_hdr;
 		break;
+	case PACKET_VNET_HDR_SZ:
+		val = po->vnet_hdr_sz;
+		break;
 	case PACKET_VERSION:
 		val = po->tp_version;
 		break;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 48af35b..e27b47d 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -121,7 +121,8 @@  struct packet_sock {
 				origdev:1,
 				has_vnet_hdr:1,
 				tp_loss:1,
-				tp_tx_has_off:1;
+				tp_tx_has_off:1,
+				vnet_hdr_sz:8;	/* vnet header size should use */
 	int			pressure;
 	int			ifindex;	/* bound device		*/
 	__be16			num;