diff mbox series

[net-next] tun: Pad virtio headers

Message ID 20250213-buffers-v1-1-ec4a0821957a@daynix.com (mailing list archive)
State New
Headers show
Series [net-next] tun: Pad virtio headers | expand

Commit Message

Akihiko Odaki Feb. 13, 2025, 6:54 a.m. UTC
tun used to simply advance iov_iter when it needs to pad virtio header,
which leaves the garbage in the buffer as is. This is especially
problematic when tun starts to allow enabling the hash reporting
feature; even if the feature is enabled, the packet may lack a hash
value and may contain a hole in the virtio header because the packet
arrived before the feature gets enabled or does not contain the
header fields to be hashed. If the hole is not filled with zero, it is
impossible to tell if the packet lacks a hash value.

In theory, a user of tun can fill the buffer with zero before calling
read() to avoid such a problem, but leaving the garbage in the buffer is
awkward anyway so fill the buffer in tun.

The specification also says the device MUST set num_buffers to 1 when
the field is present so set it when the specified header size is big
enough to contain the field.

Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
---
 drivers/net/tap.c      |  2 +-
 drivers/net/tun.c      |  6 ++++--
 drivers/net/tun_vnet.h | 14 +++++++++-----
 3 files changed, 14 insertions(+), 8 deletions(-)


---
base-commit: f54eab84fc17ef79b701e29364b7d08ca3a1d2f6
change-id: 20250116-buffers-96e14bf023fc
prerequisite-change-id: 20241230-tun-66e10a49b0c7:v6
prerequisite-patch-id: 871dc5f146fb6b0e3ec8612971a8e8190472c0fb
prerequisite-patch-id: 2797ed249d32590321f088373d4055ff3f430a0e
prerequisite-patch-id: ea3370c72d4904e2f0536ec76ba5d26784c0cede
prerequisite-patch-id: 837e4cf5d6b451424f9b1639455e83a260c4440d
prerequisite-patch-id: ea701076f57819e844f5a35efe5cbc5712d3080d
prerequisite-patch-id: 701646fb43ad04cc64dd2bf13c150ccbe6f828ce
prerequisite-patch-id: 53176dae0c003f5b6c114d43f936cf7140d31bb5

Best regards,

Comments

Michael S. Tsirkin Feb. 13, 2025, 7:18 a.m. UTC | #1
Commit log needs some work.

So my understanding is, this patch does not do much functionally,
but makes adding the hash feature easier. OK.

On Thu, Feb 13, 2025 at 03:54:06PM +0900, Akihiko Odaki wrote:
> tun used to simply advance iov_iter when it needs to pad virtio header,
> which leaves the garbage in the buffer as is. This is especially
> problematic

I think you mean "this will become especially problematic"

> when tun starts to allow enabling the hash reporting
> feature; even if the feature is enabled, the packet may lack a hash
> value and may contain a hole in the virtio header because the packet
> arrived before the feature gets enabled or does not contain the
> header fields to be hashed. If the hole is not filled with zero, it is
> impossible to tell if the packet lacks a hash value.
> 
> In theory, a user of tun can fill the buffer with zero before calling
> read() to avoid such a problem, but leaving the garbage in the buffer is
> awkward anyway so fill the buffer in tun.


What is missing here is description of what the patch does.
I think it is
"Replace advancing the iterator with writing zeros".

There could be performance cost to the dirtying extra cache lines, though.
Could you try checking that please?

I think we should mention the risks of the patch, too.
Maybe:

	Also in theory, a user might have initialized the buffer
	to some non-zero value, expecting tun to skip writing it.
	As this was never a documented feature, this seems unlikely.


> 
> The specification also says the device MUST set num_buffers to 1 when
> the field is present so set it when the specified header size is big
> enough to contain the field.

This part I dislike. tun has no idea what the number of buffers is.
Why 1 specifically?


> 
> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
> ---
>  drivers/net/tap.c      |  2 +-
>  drivers/net/tun.c      |  6 ++++--
>  drivers/net/tun_vnet.h | 14 +++++++++-----
>  3 files changed, 14 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
> index 1287e241f4454fb8ec4975bbaded5fbaa88e3cc8..d96009153c316f669e626d95002e5fe8add3a1b2 100644
> --- a/drivers/net/tap.c
> +++ b/drivers/net/tap.c
> @@ -711,7 +711,7 @@ static ssize_t tap_put_user(struct tap_queue *q,
>  	int total;
>  
>  	if (q->flags & IFF_VNET_HDR) {
> -		struct virtio_net_hdr vnet_hdr;
> +		struct virtio_net_hdr_v1 vnet_hdr;
>  
>  		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
>  
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index b14231a743915c2851eaae49d757b763ec4a8841..a3aed7e42c63d8b8f523c0141c7d970ab185178c 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1987,7 +1987,9 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
>  	ssize_t ret;
>  
>  	if (tun->flags & IFF_VNET_HDR) {
> -		struct virtio_net_hdr gso = { 0 };
> +		struct virtio_net_hdr_v1 gso = {
> +			.num_buffers = cpu_to_tun_vnet16(tun->flags, 1)
> +		};
>  
>  		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
>  		ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso);
> @@ -2040,7 +2042,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>  	}
>  
>  	if (vnet_hdr_sz) {
> -		struct virtio_net_hdr gso;
> +		struct virtio_net_hdr_v1 gso;
>  
>  		ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso);
>  		if (ret)
> diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h
> index fd7411c4447ffb180e032fe3e22f6709c30da8e9..b4f406f522728f92266898969831c26a87930f6a 100644
> --- a/drivers/net/tun_vnet.h
> +++ b/drivers/net/tun_vnet.h
> @@ -135,15 +135,17 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags,
>  }
>  
>  static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
> -				   const struct virtio_net_hdr *hdr)
> +				   const struct virtio_net_hdr_v1 *hdr)
>  {
> +	int content_sz = MIN(sizeof(*hdr), sz);
> +
>  	if (unlikely(iov_iter_count(iter) < sz))
>  		return -EINVAL;
>  
> -	if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)))
> +	if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz))
>  		return -EFAULT;
>  
> -	iov_iter_advance(iter, sz - sizeof(*hdr));
> +	iov_iter_zero(sz - content_sz, iter);
>  
>  	return 0;
>  }
> @@ -157,11 +159,11 @@ static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb,
>  static inline int tun_vnet_hdr_from_skb(unsigned int flags,
>  					const struct net_device *dev,
>  					const struct sk_buff *skb,
> -					struct virtio_net_hdr *hdr)
> +					struct virtio_net_hdr_v1 *hdr)
>  {
>  	int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
>  
> -	if (virtio_net_hdr_from_skb(skb, hdr,
> +	if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr,
>  				    tun_vnet_is_little_endian(flags), true,
>  				    vlan_hlen)) {
>  		struct skb_shared_info *sinfo = skb_shinfo(skb);
> @@ -179,6 +181,8 @@ static inline int tun_vnet_hdr_from_skb(unsigned int flags,
>  		return -EINVAL;
>  	}
>  
> +	hdr->num_buffers = cpu_to_tun_vnet16(flags, 1);
> +
>  	return 0;
>  }
>  
> 
> ---
> base-commit: f54eab84fc17ef79b701e29364b7d08ca3a1d2f6
> change-id: 20250116-buffers-96e14bf023fc
> prerequisite-change-id: 20241230-tun-66e10a49b0c7:v6
> prerequisite-patch-id: 871dc5f146fb6b0e3ec8612971a8e8190472c0fb
> prerequisite-patch-id: 2797ed249d32590321f088373d4055ff3f430a0e
> prerequisite-patch-id: ea3370c72d4904e2f0536ec76ba5d26784c0cede
> prerequisite-patch-id: 837e4cf5d6b451424f9b1639455e83a260c4440d
> prerequisite-patch-id: ea701076f57819e844f5a35efe5cbc5712d3080d
> prerequisite-patch-id: 701646fb43ad04cc64dd2bf13c150ccbe6f828ce
> prerequisite-patch-id: 53176dae0c003f5b6c114d43f936cf7140d31bb5
> 
> Best regards,
> -- 
> Akihiko Odaki <akihiko.odaki@daynix.com>
Akihiko Odaki Feb. 13, 2025, 9:23 a.m. UTC | #2
On 2025/02/13 16:18, Michael S. Tsirkin wrote:
> 
> Commit log needs some work.
> 
> So my understanding is, this patch does not do much functionally,
> but makes adding the hash feature easier. OK.
> 
> On Thu, Feb 13, 2025 at 03:54:06PM +0900, Akihiko Odaki wrote:
>> tun used to simply advance iov_iter when it needs to pad virtio header,
>> which leaves the garbage in the buffer as is. This is especially
>> problematic
> 
> I think you mean "this will become especially problematic"
> 
>> when tun starts to allow enabling the hash reporting
>> feature; even if the feature is enabled, the packet may lack a hash
>> value and may contain a hole in the virtio header because the packet
>> arrived before the feature gets enabled or does not contain the
>> header fields to be hashed. If the hole is not filled with zero, it is
>> impossible to tell if the packet lacks a hash value.
>>
>> In theory, a user of tun can fill the buffer with zero before calling
>> read() to avoid such a problem, but leaving the garbage in the buffer is
>> awkward anyway so fill the buffer in tun.
> 
> 
> What is missing here is description of what the patch does.
> I think it is
> "Replace advancing the iterator with writing zeros".
> 
> There could be performance cost to the dirtying extra cache lines, though.
> Could you try checking that please?

It will not dirty extra cache lines; an explanation follows later. 
Because of that, any benchmark is likely to show only noise, but if 
you have an idea of workloads that should be tested, please tell me.

> 
> I think we should mention the risks of the patch, too.
> Maybe:
> 
> 	Also in theory, a user might have initialized the buffer
> 	to some non-zero value, expecting tun to skip writing it.
> 	As this was never a documented feature, this seems unlikely.
 > >
>>
>> The specification also says the device MUST set num_buffers to 1 when
>> the field is present so set it when the specified header size is big
>> enough to contain the field.
> 
> This part I dislike. tun has no idea what the number of buffers is.
> Why 1 specifically?

That's a valid point. I rewrote the commit log to clarify, but perhaps 
we can drop the code to set the num_buffers as "[PATCH] vhost/net: Set 
num_buffers for virtio 1.0" already landed.

Below is the rewritten commit log, which incorporates your suggestions 
and is extended to cover the performance implication and reason the 
num_buffers initialization:

tun simply advances iov_iter when it needs to pad virtio header,
which leaves the garbage in the buffer as is. This will become
especially problematic when tun starts to allow enabling the hash
reporting feature; even if the feature is enabled, the packet may lack a
hash value and may contain a hole in the virtio header because the
packet arrived before the feature gets enabled or does not contain the
header fields to be hashed. If the hole is not filled with zero, it is
impossible to tell if the packet lacks a hash value.

In theory, a user of tun can fill the buffer with zero before calling
read() to avoid such a problem, but leaving the garbage in the buffer is
awkward anyway so replace advancing the iterator with writing zeros.

A user might have initialized the buffer to some non-zero value,
expecting tun to skip writing it. As this was never a documented
feature, this seems unlikely. Neither is there a non-zero value that can
be determined and set before receiving the packet; the only exception
is the num_buffers field, which is expected to be 1 for version 1 when
VIRTIO_NET_F_HASH_REPORT is not negotiated. This field is specifically
set to 1 instead of 0.

The overhead of filling the hole in the header is negligible as the
entire header is already placed on the cache when a header size defined
in the current specification is used even if the cache line is small
(16 bytes for example).

Below are the header sizes possible with the current specification:
a) 10 bytes if the legacy interface is used
b) 12 bytes if the modern interface is used
c) 20 bytes if VIRTIO_NET_F_HASH_REPORT is negotiated

a) and b) obviously fit in a cache line. c) uses one extra cache line,
but the cache line also contains the first 12 bytes of the packet so
it is always placed on the cache.
Michael S. Tsirkin Feb. 13, 2025, 3:43 p.m. UTC | #3
On Thu, Feb 13, 2025 at 06:23:55PM +0900, Akihiko Odaki wrote:
> On 2025/02/13 16:18, Michael S. Tsirkin wrote:
> > 
> > Commit log needs some work.
> > 
> > So my understanding is, this patch does not do much functionally,
> > but makes adding the hash feature easier. OK.
> > 
> > On Thu, Feb 13, 2025 at 03:54:06PM +0900, Akihiko Odaki wrote:
> > > tun used to simply advance iov_iter when it needs to pad virtio header,
> > > which leaves the garbage in the buffer as is. This is especially
> > > problematic
> > 
> > I think you mean "this will become especially problematic"
> > 
> > > when tun starts to allow enabling the hash reporting
> > > feature; even if the feature is enabled, the packet may lack a hash
> > > value and may contain a hole in the virtio header because the packet
> > > arrived before the feature gets enabled or does not contain the
> > > header fields to be hashed. If the hole is not filled with zero, it is
> > > impossible to tell if the packet lacks a hash value.
> > > 
> > > In theory, a user of tun can fill the buffer with zero before calling
> > > read() to avoid such a problem, but leaving the garbage in the buffer is
> > > awkward anyway so fill the buffer in tun.
> > 
> > 
> > What is missing here is description of what the patch does.
> > I think it is
> > "Replace advancing the iterator with writing zeros".
> > 
> > There could be performance cost to the dirtying extra cache lines, though.
> > Could you try checking that please?
> 
> It will not dirty extra cache lines; an explanation follows later. Because
> of that, any benchmark are likely to show only noises, but if you have an
> idea of workloads that should be tested, please tell me.

pktgen usually



> > 
> > I think we should mention the risks of the patch, too.
> > Maybe:
> > 
> > 	Also in theory, a user might have initialized the buffer
> > 	to some non-zero value, expecting tun to skip writing it.
> > 	As this was never a documented feature, this seems unlikely.
> > >
> > > 
> > > The specification also says the device MUST set num_buffers to 1 when
> > > the field is present so set it when the specified header size is big
> > > enough to contain the field.
> > 
> > This part I dislike. tun has no idea what the number of buffers is.
> > Why 1 specifically?
> 
> That's a valid point. I rewrote the commit log to clarify, but perhaps we
> can drop the code to set the num_buffers as "[PATCH] vhost/net: Set
> num_buffers for virtio 1.0" already landed.


I think I'd prefer that second option. it allows userspace
to reliably detect the new behaviour, by setting the value
to != 0.


> 
> Below is the rewritten commit log, which incorporates your suggestions and
> is extended to cover the performance implication and reason the num_buffers
> initialization:
> 
> tun simply advances iov_iter when it needs to pad virtio header,
> which leaves the garbage in the buffer as is. This will become
> especially problematic when tun starts to allow enabling the hash
> reporting feature; even if the feature is enabled, the packet may lack a
> hash value and may contain a hole in the virtio header because the
> packet arrived before the feature gets enabled or does not contain the
> header fields to be hashed. If the hole is not filled with zero, it is
> impossible to tell if the packet lacks a hash value.
> 
> In theory, a user of tun can fill the buffer with zero before calling
> read() to avoid such a problem, but leaving the garbage in the buffer is
> awkward anyway so replace advancing the iterator with writing zeros.
> 
> A user might have initialized the buffer to some non-zero value,
> expecting tun to skip writing it. As this was never a documented
> feature, this seems unlikely. Neither is there a non-zero value that can
> be determined and set before receiving the packet; the only exception
> is the num_buffers field, which is expected to be 1 for version 1 when
> VIRTIO_NET_F_HASH_REPORT is not negotiated.

you need mergeable buffers instead i presume.

> This field is specifically
> set to 1 instead of 0.
> 
> The overhead of filling the hole in the header is negligible as the
> entire header is already placed on the cache when a header size defined


what does this mean?

> in the current specification is used even if the cache line is small
> (16 bytes for example).
> 
> Below are the header sizes possible with the current specification:
> a) 10 bytes if the legacy interface is used
> b) 12 bytes if the modern interface is used
> c) 20 bytes if VIRTIO_NET_F_HASH_REPORT is negotiated
> 
> a) and b) obviously fit in a cache line. c) uses one extra cache line,
> but the cache line also contains the first 12 bytes of the packet so
> it is always placed on the cache.


Hmm. But it could be clean so shared. write makes it dirty and so
not shared.
Akihiko Odaki Feb. 15, 2025, 5:25 a.m. UTC | #4
On 2025/02/14 0:43, Michael S. Tsirkin wrote:
> On Thu, Feb 13, 2025 at 06:23:55PM +0900, Akihiko Odaki wrote:
>> On 2025/02/13 16:18, Michael S. Tsirkin wrote:
>>>
>>> Commit log needs some work.
>>>
>>> So my understanding is, this patch does not do much functionally,
>>> but makes adding the hash feature easier. OK.
>>>
>>> On Thu, Feb 13, 2025 at 03:54:06PM +0900, Akihiko Odaki wrote:
>>>> tun used to simply advance iov_iter when it needs to pad virtio header,
>>>> which leaves the garbage in the buffer as is. This is especially
>>>> problematic
>>>
>>> I think you mean "this will become especially problematic"
>>>
>>>> when tun starts to allow enabling the hash reporting
>>>> feature; even if the feature is enabled, the packet may lack a hash
>>>> value and may contain a hole in the virtio header because the packet
>>>> arrived before the feature gets enabled or does not contain the
>>>> header fields to be hashed. If the hole is not filled with zero, it is
>>>> impossible to tell if the packet lacks a hash value.
>>>>
>>>> In theory, a user of tun can fill the buffer with zero before calling
>>>> read() to avoid such a problem, but leaving the garbage in the buffer is
>>>> awkward anyway so fill the buffer in tun.
>>>
>>>
>>> What is missing here is description of what the patch does.
>>> I think it is
>>> "Replace advancing the iterator with writing zeros".
>>>
>>> There could be performance cost to the dirtying extra cache lines, though.
>>> Could you try checking that please?
>>
>> It will not dirty extra cache lines; an explanation follows later. Because
>> of that, any benchmark are likely to show only noises, but if you have an
>> idea of workloads that should be tested, please tell me.
> 
> pktgen usually

I tried it but it didn't give meaningful results, so I may be doing 
something wrong. It didn't show an obvious performance regression at 
least. I ran the following procedure:

1. create a veth pair
2. connect it to macvtap
3. run Fedora 41 on QEMU with vhost=on
4. run samples/pktgen/pktgen_sample01_simple.sh for the other end of veth
5. observe the rx packet rate of macvtap with ifstat for 60 seconds

ifstat showed that it received:
532K packets / 60 seconds without this patch
578K packets / 60 seconds with this patch

This is an 8.6 % uplift, not a degradation. I guess it's just noise.

Below are actual commands I ran:

The commands I set up the veth pair and macvtap is as follows:

ip link add veth_host type veth peer name veth_guest
ip link set veth_host up
ip link set veth_guest up
ip link add link macvtap0 link veth_guest type macvtap
ip link set macvtap0 address 02:00:00:01:00:00 mtu 1486 up
ip address add 10.0.2.0 dev veth_host
ip route add 10.0.2.1 dev veth_host

The command for the pktgen is:
samples/pktgen/pktgen_sample01_simple.sh -i veth_host -d 10.0.2.1 -m 
02:00:00:01:00:00 -n 0

After I started pktgen, I ran: ifstat -d 60 macvtap0
I waited 60 seconds, and observed the rx rate with: ifstat -as macvtap0

> 
> 
> 
>>>
>>> I think we should mention the risks of the patch, too.
>>> Maybe:
>>>
>>> 	Also in theory, a user might have initialized the buffer
>>> 	to some non-zero value, expecting tun to skip writing it.
>>> 	As this was never a documented feature, this seems unlikely.
>>>>
>>>>
>>>> The specification also says the device MUST set num_buffers to 1 when
>>>> the field is present so set it when the specified header size is big
>>>> enough to contain the field.
>>>
>>> This part I dislike. tun has no idea what the number of buffers is.
>>> Why 1 specifically?
>>
>> That's a valid point. I rewrote the commit log to clarify, but perhaps we
>> can drop the code to set the num_buffers as "[PATCH] vhost/net: Set
>> num_buffers for virtio 1.0" already landed.
> 
> 
> I think I'd prefer that second option. it allows userspace
> to reliably detect the new behaviour, by setting the value
> to != 0.

I'll leave num_buffers zero in the next version.

> 
> 
>>
>> Below is the rewritten commit log, which incorporates your suggestions and
>> is extended to cover the performance implication and reason the num_buffers
>> initialization:
>>
>> tun simply advances iov_iter when it needs to pad virtio header,
>> which leaves the garbage in the buffer as is. This will become
>> especially problematic when tun starts to allow enabling the hash
>> reporting feature; even if the feature is enabled, the packet may lack a
>> hash value and may contain a hole in the virtio header because the
>> packet arrived before the feature gets enabled or does not contain the
>> header fields to be hashed. If the hole is not filled with zero, it is
>> impossible to tell if the packet lacks a hash value.
>>
>> In theory, a user of tun can fill the buffer with zero before calling
>> read() to avoid such a problem, but leaving the garbage in the buffer is
>> awkward anyway so replace advancing the iterator with writing zeros.
>>
>> A user might have initialized the buffer to some non-zero value,
>> expecting tun to skip writing it. As this was never a documented
>> feature, this seems unlikely. Neither is there a non-zero value that can
>> be determined and set before receiving the packet; the only exception
>> is the num_buffers field, which is expected to be 1 for version 1 when
>> VIRTIO_NET_F_HASH_REPORT is not negotiated.
> 
> you need mergeable buffers instead i presume.
> 
>> This field is specifically
>> set to 1 instead of 0.
>>
>> The overhead of filling the hole in the header is negligible as the
>> entire header is already placed on the cache when a header size defined
> 
> 
> what does this mean?

The current specification says the header size is 20 bytes or less. tun 
already makes all cache lines where the header will be written dirty for 
such a header size so we are not making another cache line dirty.

> 
>> in the current specification is used even if the cache line is small
>> (16 bytes for example).
>>
>> Below are the header sizes possible with the current specification:
>> a) 10 bytes if the legacy interface is used
>> b) 12 bytes if the modern interface is used
>> c) 20 bytes if VIRTIO_NET_F_HASH_REPORT is negotiated
>>
>> a) and b) obviously fit in a cache line. c) uses one extra cache line,
>> but the cache line also contains the first 12 bytes of the packet so
>> it is always placed on the cache.
> 
> 
> Hmm. But it could be clean so shared. write makes it dirty and so
> not shared.

The packet is not just located after the header in the buffer but tun 
writes the packet there. It makes the cache line dirty even without this 
change.

>
Jason Wang Feb. 17, 2025, 2:57 a.m. UTC | #5
On Sat, Feb 15, 2025 at 1:25 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>
> On 2025/02/14 0:43, Michael S. Tsirkin wrote:
> > On Thu, Feb 13, 2025 at 06:23:55PM +0900, Akihiko Odaki wrote:
> >> On 2025/02/13 16:18, Michael S. Tsirkin wrote:
> >>>
> >>> Commit log needs some work.
> >>>
> >>> So my understanding is, this patch does not do much functionally,
> >>> but makes adding the hash feature easier. OK.
> >>>
> >>> On Thu, Feb 13, 2025 at 03:54:06PM +0900, Akihiko Odaki wrote:
> >>>> tun used to simply advance iov_iter when it needs to pad virtio header,
> >>>> which leaves the garbage in the buffer as is. This is especially
> >>>> problematic
> >>>
> >>> I think you mean "this will become especially problematic"
> >>>
> >>>> when tun starts to allow enabling the hash reporting
> >>>> feature; even if the feature is enabled, the packet may lack a hash
> >>>> value and may contain a hole in the virtio header because the packet
> >>>> arrived before the feature gets enabled or does not contain the
> >>>> header fields to be hashed. If the hole is not filled with zero, it is
> >>>> impossible to tell if the packet lacks a hash value.
> >>>>
> >>>> In theory, a user of tun can fill the buffer with zero before calling
> >>>> read() to avoid such a problem, but leaving the garbage in the buffer is
> >>>> awkward anyway so fill the buffer in tun.
> >>>
> >>>
> >>> What is missing here is description of what the patch does.
> >>> I think it is
> >>> "Replace advancing the iterator with writing zeros".
> >>>
> >>> There could be performance cost to the dirtying extra cache lines, though.
> >>> Could you try checking that please?
> >>
> >> It will not dirty extra cache lines; an explanation follows later. Because
> >> of that, any benchmark are likely to show only noises, but if you have an
> >> idea of workloads that should be tested, please tell me.
> >
> > pktgen usually
>
> I tried it but it didn't give meaningful results so I may be doing
> wrong. It didn't show an obvious performance regression at least. I ran
> the following procedure:
>
> 1. create a veth pair
> 2. connect it to macvtap
> 3. run Fedora 41 on QEMU with vhost=on
> 4. run samples/pktgen/pktgen_sample01_simple.sh for the other end of veth
> 5. observe the rx packet rate of macvtap with ifstat for 60 seconds
>
> ifstat showed that it received:
> 532K packets / 60 seconds without this patch
> 578K packets / 60 seconds with this patch

The pps seems to be too poor.

If you want to test, I would suggest to use:

pktgen on the host with DPDK testpmd + virtio user:

https://doc.dpdk.org/guides/howto/virtio_user_as_exception_path.html

Thanks

>
> This is 8.6 % uplift, not degradation. I guess it's just a noise.
>
> Below are actual commands I ran:
>
> The commands I set up the veth pair and macvtap is as follows:
>
> ip link add veth_host type veth peer name veth_guest
> ip link set veth_host up
> ip link set veth_guest up
> ip link add link macvtap0 link veth_guest type macvtap
> ip link set macvtap0 address 02:00:00:01:00:00 mtu 1486 up
> ip address add 10.0.2.0 dev veth_host
> ip route add 10.0.2.1 dev veth_host
>
> The command for the pktgen is:
> samples/pktgen/pktgen_sample01_simple.sh -i veth_host -d 10.0.2.1 -m
> 02:00:00:01:00:00 -n 0
>
> After I started pktgen, I ran: ifstat -d 60 macvtap0
> I waited 60 seconds, and observed the rx rate with: ifstat -as macvtap0
>
> >
> >
> >
> >>>
> >>> I think we should mention the risks of the patch, too.
> >>> Maybe:
> >>>
> >>>     Also in theory, a user might have initialized the buffer
> >>>     to some non-zero value, expecting tun to skip writing it.
> >>>     As this was never a documented feature, this seems unlikely.
> >>>>
> >>>>
> >>>> The specification also says the device MUST set num_buffers to 1 when
> >>>> the field is present so set it when the specified header size is big
> >>>> enough to contain the field.
> >>>
> >>> This part I dislike. tun has no idea what the number of buffers is.
> >>> Why 1 specifically?
> >>
> >> That's a valid point. I rewrote the commit log to clarify, but perhaps we
> >> can drop the code to set the num_buffers as "[PATCH] vhost/net: Set
> >> num_buffers for virtio 1.0" already landed.
> >
> >
> > I think I'd prefer that second option. it allows userspace
> > to reliably detect the new behaviour, by setting the value
> > to != 0.
>
> I'll leave num_buffers zero in the next version.
>
> >
> >
> >>
> >> Below is the rewritten commit log, which incorporates your suggestions and
> >> is extended to cover the performance implication and reason the num_buffers
> >> initialization:
> >>
> >> tun simply advances iov_iter when it needs to pad virtio header,
> >> which leaves the garbage in the buffer as is. This will become
> >> especially problematic when tun starts to allow enabling the hash
> >> reporting feature; even if the feature is enabled, the packet may lack a
> >> hash value and may contain a hole in the virtio header because the
> >> packet arrived before the feature gets enabled or does not contain the
> >> header fields to be hashed. If the hole is not filled with zero, it is
> >> impossible to tell if the packet lacks a hash value.
> >>
> >> In theory, a user of tun can fill the buffer with zero before calling
> >> read() to avoid such a problem, but leaving the garbage in the buffer is
> >> awkward anyway so replace advancing the iterator with writing zeros.
> >>
> >> A user might have initialized the buffer to some non-zero value,
> >> expecting tun to skip writing it. As this was never a documented
> >> feature, this seems unlikely. Neither is there a non-zero value that can
> >> be determined and set before receiving the packet; the only exception
> >> is the num_buffers field, which is expected to be 1 for version 1 when
> >> VIRTIO_NET_F_HASH_REPORT is not negotiated.
> >
> > you need mergeable buffers instead i presume.
> >
> >> This field is specifically
> >> set to 1 instead of 0.
> >>
> >> The overhead of filling the hole in the header is negligible as the
> >> entire header is already placed on the cache when a header size defined
> >
> >
> > what does this mean?
>
> The current specification says the header size is 20 bytes or less. tun
> already makes all cache lines where the header will be written dirty for
> such a header size so we are not making another cache line dirty.
>
> >
> >> in the current specification is used even if the cache line is small
> >> (16 bytes for example).
> >>
> >> Below are the header sizes possible with the current specification:
> >> a) 10 bytes if the legacy interface is used
> >> b) 12 bytes if the modern interface is used
> >> c) 20 bytes if VIRTIO_NET_F_HASH_REPORT is negotiated
> >>
> >> a) and b) obviously fit in a cache line. c) uses one extra cache line,
> >> but the cache line also contains the first 12 bytes of the packet so
> >> it is always placed on the cache.
> >
> >
> > Hmm. But it could be clean so shared. write makes it dirty and so
> > not shared.
>
> The packet is not just located after the header in the buffer but tun
> writes the packet there. It makes the cache line dirty even without this
> change.
>
> >
>
diff mbox series

Patch

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 1287e241f4454fb8ec4975bbaded5fbaa88e3cc8..d96009153c316f669e626d95002e5fe8add3a1b2 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -711,7 +711,7 @@  static ssize_t tap_put_user(struct tap_queue *q,
 	int total;
 
 	if (q->flags & IFF_VNET_HDR) {
-		struct virtio_net_hdr vnet_hdr;
+		struct virtio_net_hdr_v1 vnet_hdr;
 
 		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b14231a743915c2851eaae49d757b763ec4a8841..a3aed7e42c63d8b8f523c0141c7d970ab185178c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1987,7 +1987,9 @@  static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 	ssize_t ret;
 
 	if (tun->flags & IFF_VNET_HDR) {
-		struct virtio_net_hdr gso = { 0 };
+		struct virtio_net_hdr_v1 gso = {
+			.num_buffers = cpu_to_tun_vnet16(tun->flags, 1)
+		};
 
 		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
 		ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso);
@@ -2040,7 +2042,7 @@  static ssize_t tun_put_user(struct tun_struct *tun,
 	}
 
 	if (vnet_hdr_sz) {
-		struct virtio_net_hdr gso;
+		struct virtio_net_hdr_v1 gso;
 
 		ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso);
 		if (ret)
diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h
index fd7411c4447ffb180e032fe3e22f6709c30da8e9..b4f406f522728f92266898969831c26a87930f6a 100644
--- a/drivers/net/tun_vnet.h
+++ b/drivers/net/tun_vnet.h
@@ -135,15 +135,17 @@  static inline int tun_vnet_hdr_get(int sz, unsigned int flags,
 }
 
 static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
-				   const struct virtio_net_hdr *hdr)
+				   const struct virtio_net_hdr_v1 *hdr)
 {
+	int content_sz = MIN(sizeof(*hdr), sz);
+
 	if (unlikely(iov_iter_count(iter) < sz))
 		return -EINVAL;
 
-	if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)))
+	if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz))
 		return -EFAULT;
 
-	iov_iter_advance(iter, sz - sizeof(*hdr));
+	iov_iter_zero(sz - content_sz, iter);
 
 	return 0;
 }
@@ -157,11 +159,11 @@  static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb,
 static inline int tun_vnet_hdr_from_skb(unsigned int flags,
 					const struct net_device *dev,
 					const struct sk_buff *skb,
-					struct virtio_net_hdr *hdr)
+					struct virtio_net_hdr_v1 *hdr)
 {
 	int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
 
-	if (virtio_net_hdr_from_skb(skb, hdr,
+	if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr,
 				    tun_vnet_is_little_endian(flags), true,
 				    vlan_hlen)) {
 		struct skb_shared_info *sinfo = skb_shinfo(skb);
@@ -179,6 +181,8 @@  static inline int tun_vnet_hdr_from_skb(unsigned int flags,
 		return -EINVAL;
 	}
 
+	hdr->num_buffers = cpu_to_tun_vnet16(flags, 1);
+
 	return 0;
 }