
[RFC,net,2/3] virtio-net: allow usage of vrings smaller than MAX_SKB_FRAGS + 2

Message ID 20230430131518.2708471-3-alvaro.karsz@solid-run.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Series virtio-net: allow usage of small vrings

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net
netdev/fixes_present fail Series targets non-next tree, but doesn't contain any Fixes tags
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/cc_maintainers success CCed 8 of 8 maintainers
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch warning WARNING: line length of 93 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Alvaro Karsz April 30, 2023, 1:15 p.m. UTC
At the moment, if a network device uses vrings with less than
MAX_SKB_FRAGS + 2 entries, the device won't be functional.

The following condition vq->num_free >= 2 + MAX_SKB_FRAGS will always
evaluate to false, leading to TX timeouts.

This patch introduces a new variable, single_pkt_max_descs, that holds
the max number of descriptors we may need to handle a single packet.

This patch also detects the small vring during probe, blocks some
features that can't be used with small vrings, and fails probe,
leading to a reset and features re-negotiation.

Features that can't be used with small vrings:
GRO features (VIRTIO_NET_F_GUEST_*):
When we use small vrings, we may not have enough entries in the ring to
chain page size buffers and form a 64K buffer.
So we may need to allocate 64k of contiguous memory, which may be too
much when the system is stressed.

This patch also fixes the MTU size in small vring cases to be up to the
default one, 1500B.

Signed-off-by: Alvaro Karsz <alvaro.karsz@solid-run.com>
---
 drivers/net/virtio_net.c | 149 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 144 insertions(+), 5 deletions(-)
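
To make the failure mode concrete, here is a stand-alone sketch of the arithmetic (illustration only; it assumes MAX_SKB_FRAGS == 17, the usual value with 4K pages, and an example ring of 8 entries):

#include <stdio.h>

#define MAX_SKB_FRAGS 17

int main(void)
{
	unsigned int vring_size = 8;        /* a "small" ring: 8 < 17 + 2 */
	unsigned int num_free = vring_size; /* best case: the ring is fully idle */

	/* The driver wakes the TX queue only when this condition holds;
	 * with an 8-entry ring it never can, so the queue stays stopped. */
	if (num_free >= 2 + MAX_SKB_FRAGS)
		printf("TX queue can make progress\n");
	else
		printf("TX stalls: %u free < %u needed -> watchdog timeout\n",
		       num_free, 2 + MAX_SKB_FRAGS);
	return 0;
}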

Comments

Michael S. Tsirkin April 30, 2023, 2:05 p.m. UTC | #1
On Sun, Apr 30, 2023 at 04:15:17PM +0300, Alvaro Karsz wrote:
> At the moment, if a network device uses vrings with less than
> MAX_SKB_FRAGS + 2 entries, the device won't be functional.
> 
> The following condition vq->num_free >= 2 + MAX_SKB_FRAGS will always
> evaluate to false, leading to TX timeouts.
> 
> This patch introduces a new variable, single_pkt_max_descs, that holds
> the max number of descriptors we may need to handle a single packet.
> 
> This patch also detects the small vring during probe, blocks some
> features that can't be used with small vrings, and fails probe,
> leading to a reset and features re-negotiation.
> 
> Features that can't be used with small vrings:
> GRO features (VIRTIO_NET_F_GUEST_*):
> When we use small vrings, we may not have enough entries in the ring to
> chain page size buffers and form a 64K buffer.
> So we may need to allocate 64k of contiguous memory, which may be too
> much when the system is stressed.
> 
> This patch also fixes the MTU size in small vring cases to be up to the
> default one, 1500B.

and then it should clear VIRTIO_NET_F_MTU?

> Signed-off-by: Alvaro Karsz <alvaro.karsz@solid-run.com>




> ---
>  drivers/net/virtio_net.c | 149 +++++++++++++++++++++++++++++++++++++--
>  1 file changed, 144 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 8d8038538fc..b4441d63890 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -103,6 +103,8 @@ struct virtnet_rq_stats {
>  #define VIRTNET_SQ_STAT(m)	offsetof(struct virtnet_sq_stats, m)
>  #define VIRTNET_RQ_STAT(m)	offsetof(struct virtnet_rq_stats, m)
>  
> +#define IS_SMALL_VRING(size)	((size) < MAX_SKB_FRAGS + 2)
> +
>  static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
>  	{ "packets",		VIRTNET_SQ_STAT(packets) },
>  	{ "bytes",		VIRTNET_SQ_STAT(bytes) },
> @@ -268,6 +270,12 @@ struct virtnet_info {
>  	/* Does the affinity hint is set for virtqueues? */
>  	bool affinity_hint_set;
>  
> +	/* How many ring descriptors we may need to transmit a single packet */
> +	u16 single_pkt_max_descs;
> +
> +	/* Do we have virtqueues with small vrings? */
> +	bool svring;
> +
>  	/* CPU hotplug instances for online & dead */
>  	struct hlist_node node;
>  	struct hlist_node node_dead;

worth checking that all these layout changes don't push useful things to
a different cache line. can you add that analysis?

I see confusion here wrt whether some rings are "small"? all of them?
some rx rings? some tx rings? names should make it clear.
also do we really need bool svring? can't we just check single_pkt_max_descs
all the time?

> @@ -455,6 +463,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  	unsigned int copy, hdr_len, hdr_padded_len;
>  	struct page *page_to_free = NULL;
>  	int tailroom, shinfo_size;
> +	u16 max_frags = MAX_SKB_FRAGS;
>  	char *p, *hdr_p, *buf;
>  
>  	p = page_address(page) + offset;
> @@ -520,7 +529,10 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  	 * tries to receive more than is possible. This is usually
>  	 * the case of a broken device.
>  	 */
> -	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
> +	if (unlikely(vi->svring))
> +		max_frags = 1;
> +
> +	if (unlikely(len > max_frags * PAGE_SIZE)) {
>  		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
>  		dev_kfree_skb(skb);
>  		return NULL;
> @@ -612,7 +624,7 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
>  	 * Since most packets only take 1 or 2 ring slots, stopping the queue
>  	 * early means 16 slots are typically wasted.
>  	 */
> -	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
> +	if (sq->vq->num_free < vi->single_pkt_max_descs) {
>  		netif_stop_subqueue(dev, qnum);
>  		if (use_napi) {
>  			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
> @@ -620,7 +632,7 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
>  		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
>  			/* More just got used, free them then recheck. */
>  			free_old_xmit_skbs(sq, false);
> -			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
> +			if (sq->vq->num_free >= vi->single_pkt_max_descs) {
>  				netif_start_subqueue(dev, qnum);
>  				virtqueue_disable_cb(sq->vq);
>  			}
> @@ -1108,6 +1120,10 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
>  		return 0;
>  
>  	if (*num_buf > 1) {
> +		/* Small vring - can't be more than 1 buffer */
> +		if (unlikely(vi->svring))
> +			return -EINVAL;
> +
>  		/* If we want to build multi-buffer xdp, we need
>  		 * to specify that the flags of xdp_buff have the
>  		 * XDP_FLAGS_HAS_FRAG bit.
> @@ -1828,7 +1844,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
>  			free_old_xmit_skbs(sq, true);
>  		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>  
> -		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> +		if (sq->vq->num_free >= vi->single_pkt_max_descs)
>  			netif_tx_wake_queue(txq);
>  
>  		__netif_tx_unlock(txq);
> @@ -1919,7 +1935,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>  	virtqueue_disable_cb(sq->vq);
>  	free_old_xmit_skbs(sq, true);
>  
> -	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> +	if (sq->vq->num_free >= vi->single_pkt_max_descs)
>  		netif_tx_wake_queue(txq);
>  
>  	opaque = virtqueue_enable_cb_prepare(sq->vq);
> @@ -3862,6 +3878,15 @@ static bool virtnet_check_guest_gso(const struct virtnet_info *vi)
>  		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO6));
>  }
>  
> +static bool virtnet_check_host_gso(const struct virtnet_info *vi)
> +{
> +	return virtio_has_feature(vi->vdev, VIRTIO_NET_F_HOST_TSO4) ||
> +		virtio_has_feature(vi->vdev, VIRTIO_NET_F_HOST_TSO6) ||
> +		virtio_has_feature(vi->vdev, VIRTIO_NET_F_HOST_ECN) ||
> +		virtio_has_feature(vi->vdev, VIRTIO_NET_F_HOST_UFO) ||
> +		virtio_has_feature(vi->vdev, VIRTIO_NET_F_HOST_USO);
> +}
> +
>  static void virtnet_set_big_packets(struct virtnet_info *vi, const int mtu)
>  {
>  	bool guest_gso = virtnet_check_guest_gso(vi);
> @@ -3876,6 +3901,112 @@ static void virtnet_set_big_packets(struct virtnet_info *vi, const int mtu)
>  	}
>  }
>  
> +static u16 virtnet_calc_max_descs(struct virtnet_info *vi)
> +{
> +	if (vi->svring) {
> +		if (virtnet_check_host_gso(vi))
> +			return 4; /* 1 fragment + linear part + virtio header + GSO header */
> +		else
> +			return 3; /* 1 fragment + linear part + virtio header */
> +	} else {
> +		return MAX_SKB_FRAGS + 2;
> +	}
> +}
> +
> +static bool virtnet_uses_svring(struct virtnet_info *vi)
> +{
> +	u32 i;
> +
> +	/* If a transmit/receive virtqueue is small,
> +	 * we cannot handle fragmented packets.
> +	 */
> +	for (i = 0; i < vi->max_queue_pairs; i++) {
> +		if (IS_SMALL_VRING(virtqueue_get_vring_size(vi->sq[i].vq)) ||
> +		    IS_SMALL_VRING(virtqueue_get_vring_size(vi->rq[i].vq)))
> +			return true;
> +	}
> +
> +	return false;
> +}

I see even if only some rings are too small we force everything to use
small ones. Wouldn't it be better to just disable small ones in this
case? That would not need a reset.


> +
> +/* Function returns the number of features it blocked */

We don't need the # though. Make it bool?

> +static int virtnet_block_svring_unsupported(struct virtio_device *vdev)
> +{
> +	int cnt = 0;
> +	/* Block Virtio guest GRO features.
> +	 * Asking Linux to allocate 64k of contiguous memory is too much,
> +	 * especially when the system is stressed.
> +	 *
> +	 * If VIRTIO_NET_F_MRG_RXBUF is negotiated we can allocate smaller
> +	 * buffers, but since the ring is small, the buffers can be quite big.
> +	 *
> +	 */
> +	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4)) {
> +		virtio_block_feature(vdev, VIRTIO_NET_F_GUEST_TSO4);
> +		cnt++;
> +	}
> +	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) {
> +		virtio_block_feature(vdev, VIRTIO_NET_F_GUEST_TSO6);
> +		cnt++;
> +	}
> +	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN)) {
> +		virtio_block_feature(vdev, VIRTIO_NET_F_GUEST_ECN);
> +		cnt++;
> +	}
> +	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO)) {
> +		virtio_block_feature(vdev, VIRTIO_NET_F_GUEST_UFO);
> +		cnt++;
> +	}
> +	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_USO4)) {
> +		virtio_block_feature(vdev, VIRTIO_NET_F_GUEST_USO4);
> +		cnt++;
> +	}
> +	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_USO6)) {
> +		virtio_block_feature(vdev, VIRTIO_NET_F_GUEST_USO6);
> +		cnt++;
> +	}
> +
> +	return cnt;
> +}
> +
> +static int virtnet_fixup_svring(struct virtnet_info *vi)
> +{
> +	int i;
> +	/* Do we use small vrings?
> +	 * If not, nothing we need to do.
> +	 */
> +	vi->svring = virtnet_uses_svring(vi);
> +	if (!vi->svring)
> +		return 0;
> +
> +	/* Some features can't be used with small vrings.
> +	 * Block those and return an error.
> +	 * This will trigger a reprobe without the blocked
> +	 * features.
> +	 */
> +	if (virtnet_block_svring_unsupported(vi->vdev))
> +		return -EOPNOTSUPP;
> +
> +	/* Disable NETIF_F_SG */
> +	vi->dev->hw_features &= ~NETIF_F_SG;
> +
> +	/* Don't use MTU bigger than default */
> +	if (vi->dev->max_mtu > ETH_DATA_LEN)
> +		vi->dev->max_mtu = ETH_DATA_LEN;
> +	if (vi->dev->mtu > ETH_DATA_LEN)
> +		vi->dev->mtu = ETH_DATA_LEN;
> +
> +	/* Don't use big packets */
> +	vi->big_packets = false;
> +	vi->big_packets_num_skbfrags = 1;
> +
> +	/* Fix min_buf_len for receive virtqueues */
> +	for (i = 0; i < vi->max_queue_pairs; i++)
> +		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> +
> +	return 0;
> +}
> +
>  static int virtnet_probe(struct virtio_device *vdev)
>  {
>  	int i, err = -ENOMEM;
> @@ -4061,6 +4192,14 @@ static int virtnet_probe(struct virtio_device *vdev)
>  	if (err)
>  		goto free;
>  
> +	/* Do required fixups in case we are using small vrings */
> +	err = virtnet_fixup_svring(vi);
> +	if (err)
> +		goto free_vqs;
> +
> +	/* Calculate the max. number of descriptors we may need to transmit a single packet */
> +	vi->single_pkt_max_descs = virtnet_calc_max_descs(vi);
> +
>  #ifdef CONFIG_SYSFS
>  	if (vi->mergeable_rx_bufs)
>  		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
> -- 
> 2.34.1
Alvaro Karsz April 30, 2023, 6:54 p.m. UTC | #2
> > At the moment, if a network device uses vrings with less than
> > MAX_SKB_FRAGS + 2 entries, the device won't be functional.
> >
> > The following condition vq->num_free >= 2 + MAX_SKB_FRAGS will always
> > evaluate to false, leading to TX timeouts.
> >
> > This patch introduces a new variable, single_pkt_max_descs, that holds
> > the max number of descriptors we may need to handle a single packet.
> >
> > This patch also detects the small vring during probe, blocks some
> > features that can't be used with small vrings, and fails probe,
> > leading to a reset and features re-negotiation.
> >
> > Features that can't be used with small vrings:
> > GRO features (VIRTIO_NET_F_GUEST_*):
> > When we use small vrings, we may not have enough entries in the ring to
> > chain page size buffers and form a 64K buffer.
> > So we may need to allocate 64k of contiguous memory, which may be too
> > much when the system is stressed.
> >
> > This patch also fixes the MTU size in small vring cases to be up to the
> > default one, 1500B.
> 
> and then it should clear VIRTIO_NET_F_MTU?
> 

Following [1], I was thinking of accepting the feature and letting the device figure out that it can't transmit a big packet, since the RX buffers are not big enough (without VIRTIO_NET_F_MRG_RXBUF).
But, I think that we may need to block the MTU feature after all.
Quoting the spec:

A driver SHOULD negotiate VIRTIO_NET_F_MTU if the device offers it.
If the driver negotiates VIRTIO_NET_F_MTU, it MUST supply enough receive buffers to receive at least one receive packet of size mtu (plus low level ethernet header length) with gso_type NONE or ECN.

So, if VIRTIO_NET_F_MTU is negotiated, we MUST supply enough receive buffers.
So I think that blocking VIRTIO_NET_F_MTU should be the way to go, if mtu > 1500.

[1] https://lore.kernel.org/lkml/20230417031052-mutt-send-email-mst@kernel.org/
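
For reference, a sketch of what that blocking could look like at probe time (assumption: it reuses the virtio_block_feature() helper introduced in patch 1/3 of this series; not part of this patch):

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		u16 mtu = virtio_cread16(vdev,
					 offsetof(struct virtio_net_config, mtu));

		/* A small vring cannot supply receive buffers large enough
		 * for an MTU above the default, so drop the feature and let
		 * the reprobe renegotiate without it. */
		if (mtu > ETH_DATA_LEN)
			virtio_block_feature(vdev, VIRTIO_NET_F_MTU);
	}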

> > +     /* How many ring descriptors we may need to transmit a single packet */
> > +     u16 single_pkt_max_descs;
> > +
> > +     /* Do we have virtqueues with small vrings? */
> > +     bool svring;
> > +
> >       /* CPU hotplug instances for online & dead */
> >       struct hlist_node node;
> >       struct hlist_node node_dead;
> 
> worth checking that all these layout changes don't push useful things to
> a different cache line. can you add that analysis?
> 

Good point.
I think that we can just move these to the bottom of the struct.
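
A sketch of that relocation (abridged; the real struct has more members, and pahole -C virtnet_info on the built object is the usual way to confirm that hot fields keep their cache lines):

struct virtnet_info {
	/* ... existing members, offsets unchanged ... */

	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;

	/* new members appended at the tail, away from hot cache lines */

	/* How many ring descriptors we may need to transmit a single packet */
	u16 single_pkt_max_descs;

	/* Do we have virtqueues with small vrings? */
	bool svring;
};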

> 
> I see confusion here wrt whether some rings are "small"? all of them?
> some rx rings? some tx rings? names should make it clear.

The small vring is a device attribute, not a vq attribute. It blocks features, which affects the entire device.
Maybe we can call it "small vring mode".

> also do we really need bool svring? can't we just check single_pkt_max_descs
> all the time?
> 

We can work without the bool; we could always check whether single_pkt_max_descs != MAX_SKB_FRAGS + 2.
It doesn't really matter to me; I was thinking it may be more readable this way.
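
If the bool goes away, a small helper along these lines (name hypothetical, sketch only) would keep the call sites readable:

/* Derive "small vring mode" from single_pkt_max_descs instead of
 * storing a separate flag in struct virtnet_info. */
static inline bool virtnet_is_svring(const struct virtnet_info *vi)
{
	return vi->single_pkt_max_descs != MAX_SKB_FRAGS + 2;
}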

> > +static bool virtnet_uses_svring(struct virtnet_info *vi)
> > +{
> > +     u32 i;
> > +
> > +     /* If a transmit/receive virtqueue is small,
> > +      * we cannot handle fragmented packets.
> > +      */
> > +     for (i = 0; i < vi->max_queue_pairs; i++) {
> > +             if (IS_SMALL_VRING(virtqueue_get_vring_size(vi->sq[i].vq)) ||
> > +                 IS_SMALL_VRING(virtqueue_get_vring_size(vi->rq[i].vq)))
> > +                     return true;
> > +     }
> > +
> > +     return false;
> > +}
> 
> I see even if only some rings are too small we force everything to use
> small ones. Wouldn't it be better to just disable small ones in this
> case? That would not need a reset.
> 

I'm not sure. It may complicate things.

What if all TX vqs are small?
What if all RX vqs are small?
What if we end up with an unbalanced number of TX vqs and RX vqs? is this allowed by the spec?
What if we end up disabling the RX default vq (receiveq1)?

I guess we could do it, after checking some conditions.
Maybe we can do it in a follow up patch?
Do you think it's important for it to be included from day 1?

I think that the question is: what's more important, to use all the vqs while blocking some features, or to use part of the vqs without blocking features?

> > +
> > +/* Function returns the number of features it blocked */
> 
> We don't need the # though. Make it bool?
> 

Sure.
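
A bool, table-driven variant might look like this (sketch, under the assumption that virtio_block_feature() from patch 1/3 keeps its current signature):

/* Returns true if any feature had to be blocked. */
static bool virtnet_block_svring_unsupported(struct virtio_device *vdev)
{
	static const unsigned int feats[] = {
		VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
		VIRTIO_NET_F_GUEST_ECN,  VIRTIO_NET_F_GUEST_UFO,
		VIRTIO_NET_F_GUEST_USO4, VIRTIO_NET_F_GUEST_USO6,
	};
	bool blocked = false;
	int i;

	for (i = 0; i < ARRAY_SIZE(feats); i++) {
		if (virtio_has_feature(vdev, feats[i])) {
			virtio_block_feature(vdev, feats[i]);
			blocked = true;
		}
	}

	return blocked;
}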
Michael S. Tsirkin May 1, 2023, 10:20 a.m. UTC | #3
On Sun, Apr 30, 2023 at 06:54:08PM +0000, Alvaro Karsz wrote:
> > > At the moment, if a network device uses vrings with less than
> > > MAX_SKB_FRAGS + 2 entries, the device won't be functional.
> > >
> > > The following condition vq->num_free >= 2 + MAX_SKB_FRAGS will always
> > > evaluate to false, leading to TX timeouts.
> > >
> > > This patch introduces a new variable, single_pkt_max_descs, that holds
> > > the max number of descriptors we may need to handle a single packet.
> > >
> > > This patch also detects the small vring during probe, blocks some
> > > features that can't be used with small vrings, and fails probe,
> > > leading to a reset and features re-negotiation.
> > >
> > > Features that can't be used with small vrings:
> > > GRO features (VIRTIO_NET_F_GUEST_*):
> > > When we use small vrings, we may not have enough entries in the ring to
> > > chain page size buffers and form a 64K buffer.
> > > So we may need to allocate 64k of contiguous memory, which may be too
> > > much when the system is stressed.
> > >
> > > This patch also fixes the MTU size in small vring cases to be up to the
> > > default one, 1500B.
> > 
> > and then it should clear VIRTIO_NET_F_MTU?
> > 
> 
> Following [1], I was thinking of accepting the feature and letting the device figure out that it can't transmit a big packet, since the RX buffers are not big enough (without VIRTIO_NET_F_MRG_RXBUF).
> But, I think that we may need to block the MTU feature after all.
> Quoting the spec:
> 
> A driver SHOULD negotiate VIRTIO_NET_F_MTU if the device offers it.
> If the driver negotiates VIRTIO_NET_F_MTU, it MUST supply enough receive buffers to receive at least one receive packet of size mtu (plus low level ethernet header length) with gso_type NONE or ECN.
> 
> So, if VIRTIO_NET_F_MTU is negotiated, we MUST supply enough receive buffers.
> So I think that blocking VIRTIO_NET_F_MTU should be the way to go, if mtu > 1500.
> 
> [1] https://lore.kernel.org/lkml/20230417031052-mutt-send-email-mst@kernel.org/


First, up to 4k should not be a problem. Even jumbo frames, e.g. 9k,
are highly likely to succeed. And at probe time, which is often boot
time, even 64k isn't a problem ...

Hmm. We could allocate large buffers at probe time. Reuse them and
copy data over.

IOW, reusing the GOOD_COPY_LEN flow for this case. Not yet sure how I feel
about this. OTOH it removes the need for the whole feature blocking
approach, does it not?
WDYT?
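
A rough sketch of that flow, for concreteness (the names sq->bounce and BOUNCE_LEN are hypothetical, the virtio-net header handling is elided, and nothing here is part of the series):

/* Hypothetical: one buffer per send queue, allocated once at probe
 * time, so the copy path never has to allocate under memory pressure. */
static int virtnet_xmit_copy(struct send_queue *sq, struct sk_buff *skb)
{
	struct scatterlist sg;

	if (skb->len > BOUNCE_LEN)
		return -ENOSPC;

	/* Flatten the fragmented skb into the preallocated buffer... */
	skb_copy_bits(skb, 0, sq->bounce, skb->len);

	/* ...so the packet consumes a single descriptor. */
	sg_init_one(&sg, sq->bounce, skb->len);

	return virtqueue_add_outbuf(sq->vq, &sg, 1, skb, GFP_ATOMIC);
}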


> > > +     /* How many ring descriptors we may need to transmit a single packet */
> > > +     u16 single_pkt_max_descs;
> > > +
> > > +     /* Do we have virtqueues with small vrings? */
> > > +     bool svring;
> > > +
> > >       /* CPU hotplug instances for online & dead */
> > >       struct hlist_node node;
> > >       struct hlist_node node_dead;
> > 
> > worth checking that all these layout changes don't push useful things to
> > a different cache line. can you add that analysis?
> > 
> 
> Good point.
> I think that we can just move these to the bottom of the struct.
> 
> > 
> > I see confusion here wrt whether some rings are "small"? all of them?
> > some rx rings? some tx rings? names should make it clear.
> 
> The small vring is a device attribute, not a vq attribute. It blocks features, which affects the entire device.
> Maybe we can call it "small vring mode".
> 
> > also do we really need bool svring? can't we just check single_pkt_max_descs
> > all the time?
> > 
> 
> We can work without the bool; we could always check whether single_pkt_max_descs != MAX_SKB_FRAGS + 2.
> It doesn't really matter to me; I was thinking it may be more readable this way.
> 
> > > +static bool virtnet_uses_svring(struct virtnet_info *vi)
> > > +{
> > > +     u32 i;
> > > +
> > > +     /* If a transmit/receive virtqueue is small,
> > > +      * we cannot handle fragmented packets.
> > > +      */
> > > +     for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +             if (IS_SMALL_VRING(virtqueue_get_vring_size(vi->sq[i].vq)) ||
> > > +                 IS_SMALL_VRING(virtqueue_get_vring_size(vi->rq[i].vq)))
> > > +                     return true;
> > > +     }
> > > +
> > > +     return false;
> > > +}
> > 
> > I see even if only some rings are too small we force everything to use
> > small ones. Wouldn't it be better to just disable small ones in this
> > case? That would not need a reset.
> > 
> 
> I'm not sure. It may complicate things.
> 
> What if all TX vqs are small?
> What if all RX vqs are small?
> What if we end up with an unbalanced number of TX vqs and RX vqs? is this allowed by the spec?
> What if we end up disabling the RX default vq (receiveq1)?
> 
> I guess we could do it, after checking some conditions.
> Maybe we can do it in a follow up patch?
> Do you think it's important for it to be included from day 1?
> 
> I think that the question is: what's more important, to use all the vqs while blocking some features, or to use part of the vqs without blocking features?
> 
> > > +
> > > +/* Function returns the number of features it blocked */
> > 
> > We don't need the # though. Make it bool?
> > 
> 
> Sure.
>
Alvaro Karsz May 1, 2023, 11:59 a.m. UTC | #4
> First, up to 4k should not be a problem. Even jumbo frames, e.g. 9k,
> are highly likely to succeed. And at probe time, which is often boot
> time, even 64k isn't a problem ...
> 
> Hmm. We could allocate large buffers at probe time. Reuse them and
> copy data over.
> 
> IOW, reusing the GOOD_COPY_LEN flow for this case. Not yet sure how I feel
> about this. OTOH it removes the need for the whole feature blocking
> approach, does it not?
> WDYT?
> 

It could work...

In order to completely remove the feature blocking approach, we'll need to let the control commands fail (as you mentioned in the other patch).
I'm not sure I like it; it means many warnings from virtnet...
And it means accepting features that we know for sure are not going to work.
Michael S. Tsirkin June 2, 2023, 11:30 a.m. UTC | #5
On Mon, May 01, 2023 at 11:59:42AM +0000, Alvaro Karsz wrote:
> > First, up to 4k should not be a problem. Even jumbo frames, e.g. 9k,
> > are highly likely to succeed. And at probe time, which is often boot
> > time, even 64k isn't a problem ...
> > 
> > Hmm. We could allocate large buffers at probe time. Reuse them and
> > copy data over.
> > 
> > IOW, reusing the GOOD_COPY_LEN flow for this case. Not yet sure how I feel
> > about this. OTOH it removes the need for the whole feature blocking
> > approach, does it not?
> > WDYT?
> > 
> 
> It could work...
> 
> In order to completely remove the feature blocking approach, we'll need to let the control commands fail (as you mentioned in the other patch).
> I'm not sure I like it; it means many warnings from virtnet...
> And it means accepting features that we know for sure are not going to work.
>

Well, they will work, yes? Just with an extra copy.
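
As an illustration, the extra copy could be as simple as linearizing oversized skbs on the transmit path (sketch only, not part of the series):

/* Sketch: when the fragment list would not fit in the small ring,
 * skb_linearize() performs the extra copy, collapsing all fragments
 * into the skb's linear area. */
if (skb_shinfo(skb)->nr_frags + 2 > virtqueue_get_vring_size(sq->vq)) {
	if (skb_linearize(skb))
		goto drop;	/* -ENOMEM: no contiguous memory right now */
}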

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8d8038538fc..b4441d63890 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -103,6 +103,8 @@  struct virtnet_rq_stats {
 #define VIRTNET_SQ_STAT(m)	offsetof(struct virtnet_sq_stats, m)
 #define VIRTNET_RQ_STAT(m)	offsetof(struct virtnet_rq_stats, m)
 
+#define IS_SMALL_VRING(size)	((size) < MAX_SKB_FRAGS + 2)
+
 static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
 	{ "packets",		VIRTNET_SQ_STAT(packets) },
 	{ "bytes",		VIRTNET_SQ_STAT(bytes) },
@@ -268,6 +270,12 @@  struct virtnet_info {
 	/* Does the affinity hint is set for virtqueues? */
 	bool affinity_hint_set;
 
+	/* How many ring descriptors we may need to transmit a single packet */
+	u16 single_pkt_max_descs;
+
+	/* Do we have virtqueues with small vrings? */
+	bool svring;
+
 	/* CPU hotplug instances for online & dead */
 	struct hlist_node node;
 	struct hlist_node node_dead;
@@ -455,6 +463,7 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	unsigned int copy, hdr_len, hdr_padded_len;
 	struct page *page_to_free = NULL;
 	int tailroom, shinfo_size;
+	u16 max_frags = MAX_SKB_FRAGS;
 	char *p, *hdr_p, *buf;
 
 	p = page_address(page) + offset;
@@ -520,7 +529,10 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	 * tries to receive more than is possible. This is usually
 	 * the case of a broken device.
 	 */
-	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
+	if (unlikely(vi->svring))
+		max_frags = 1;
+
+	if (unlikely(len > max_frags * PAGE_SIZE)) {
 		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
 		dev_kfree_skb(skb);
 		return NULL;
@@ -612,7 +624,7 @@  static void check_sq_full_and_disable(struct virtnet_info *vi,
 	 * Since most packets only take 1 or 2 ring slots, stopping the queue
 	 * early means 16 slots are typically wasted.
 	 */
-	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
+	if (sq->vq->num_free < vi->single_pkt_max_descs) {
 		netif_stop_subqueue(dev, qnum);
 		if (use_napi) {
 			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
@@ -620,7 +632,7 @@  static void check_sq_full_and_disable(struct virtnet_info *vi,
 		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
 			/* More just got used, free them then recheck. */
 			free_old_xmit_skbs(sq, false);
-			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
+			if (sq->vq->num_free >= vi->single_pkt_max_descs) {
 				netif_start_subqueue(dev, qnum);
 				virtqueue_disable_cb(sq->vq);
 			}
@@ -1108,6 +1120,10 @@  static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
 		return 0;
 
 	if (*num_buf > 1) {
+		/* Small vring - can't be more than 1 buffer */
+		if (unlikely(vi->svring))
+			return -EINVAL;
+
 		/* If we want to build multi-buffer xdp, we need
 		 * to specify that the flags of xdp_buff have the
 		 * XDP_FLAGS_HAS_FRAG bit.
@@ -1828,7 +1844,7 @@  static void virtnet_poll_cleantx(struct receive_queue *rq)
 			free_old_xmit_skbs(sq, true);
 		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
 
-		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
+		if (sq->vq->num_free >= vi->single_pkt_max_descs)
 			netif_tx_wake_queue(txq);
 
 		__netif_tx_unlock(txq);
@@ -1919,7 +1935,7 @@  static int virtnet_poll_tx(struct napi_struct *napi, int budget)
 	virtqueue_disable_cb(sq->vq);
 	free_old_xmit_skbs(sq, true);
 
-	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
+	if (sq->vq->num_free >= vi->single_pkt_max_descs)
 		netif_tx_wake_queue(txq);
 
 	opaque = virtqueue_enable_cb_prepare(sq->vq);
@@ -3862,6 +3878,15 @@  static bool virtnet_check_guest_gso(const struct virtnet_info *vi)
 		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO6));
 }
 
+static bool virtnet_check_host_gso(const struct virtnet_info *vi)
+{
+	return virtio_has_feature(vi->vdev, VIRTIO_NET_F_HOST_TSO4) ||
+		virtio_has_feature(vi->vdev, VIRTIO_NET_F_HOST_TSO6) ||
+		virtio_has_feature(vi->vdev, VIRTIO_NET_F_HOST_ECN) ||
+		virtio_has_feature(vi->vdev, VIRTIO_NET_F_HOST_UFO) ||
+		virtio_has_feature(vi->vdev, VIRTIO_NET_F_HOST_USO);
+}
+
 static void virtnet_set_big_packets(struct virtnet_info *vi, const int mtu)
 {
 	bool guest_gso = virtnet_check_guest_gso(vi);
@@ -3876,6 +3901,112 @@  static void virtnet_set_big_packets(struct virtnet_info *vi, const int mtu)
 	}
 }
 
+static u16 virtnet_calc_max_descs(struct virtnet_info *vi)
+{
+	if (vi->svring) {
+		if (virtnet_check_host_gso(vi))
+			return 4; /* 1 fragment + linear part + virtio header + GSO header */
+		else
+			return 3; /* 1 fragment + linear part + virtio header */
+	} else {
+		return MAX_SKB_FRAGS + 2;
+	}
+}
+
+static bool virtnet_uses_svring(struct virtnet_info *vi)
+{
+	u32 i;
+
+	/* If a transmit/receive virtqueue is small,
+	 * we cannot handle fragmented packets.
+	 */
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		if (IS_SMALL_VRING(virtqueue_get_vring_size(vi->sq[i].vq)) ||
+		    IS_SMALL_VRING(virtqueue_get_vring_size(vi->rq[i].vq)))
+			return true;
+	}
+
+	return false;
+}
+
+/* Function returns the number of features it blocked */
+static int virtnet_block_svring_unsupported(struct virtio_device *vdev)
+{
+	int cnt = 0;
+	/* Block Virtio guest GRO features.
+	 * Asking Linux to allocate 64k of contiguous memory is too much,
+	 * especially when the system is stressed.
+	 *
+	 * If VIRTIO_NET_F_MRG_RXBUF is negotiated we can allocate smaller
+	 * buffers, but since the ring is small, the buffers can be quite big.
+	 *
+	 */
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4)) {
+		virtio_block_feature(vdev, VIRTIO_NET_F_GUEST_TSO4);
+		cnt++;
+	}
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) {
+		virtio_block_feature(vdev, VIRTIO_NET_F_GUEST_TSO6);
+		cnt++;
+	}
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN)) {
+		virtio_block_feature(vdev, VIRTIO_NET_F_GUEST_ECN);
+		cnt++;
+	}
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO)) {
+		virtio_block_feature(vdev, VIRTIO_NET_F_GUEST_UFO);
+		cnt++;
+	}
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_USO4)) {
+		virtio_block_feature(vdev, VIRTIO_NET_F_GUEST_USO4);
+		cnt++;
+	}
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_USO6)) {
+		virtio_block_feature(vdev, VIRTIO_NET_F_GUEST_USO6);
+		cnt++;
+	}
+
+	return cnt;
+}
+
+static int virtnet_fixup_svring(struct virtnet_info *vi)
+{
+	int i;
+	/* Do we use small vrings?
+	 * If not, nothing we need to do.
+	 */
+	vi->svring = virtnet_uses_svring(vi);
+	if (!vi->svring)
+		return 0;
+
+	/* Some features can't be used with small vrings.
+	 * Block those and return an error.
+	 * This will trigger a reprobe without the blocked
+	 * features.
+	 */
+	if (virtnet_block_svring_unsupported(vi->vdev))
+		return -EOPNOTSUPP;
+
+	/* Disable NETIF_F_SG */
+	vi->dev->hw_features &= ~NETIF_F_SG;
+
+	/* Don't use MTU bigger than default */
+	if (vi->dev->max_mtu > ETH_DATA_LEN)
+		vi->dev->max_mtu = ETH_DATA_LEN;
+	if (vi->dev->mtu > ETH_DATA_LEN)
+		vi->dev->mtu = ETH_DATA_LEN;
+
+	/* Don't use big packets */
+	vi->big_packets = false;
+	vi->big_packets_num_skbfrags = 1;
+
+	/* Fix min_buf_len for receive virtqueues */
+	for (i = 0; i < vi->max_queue_pairs; i++)
+		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
+
+	return 0;
+}
+
 static int virtnet_probe(struct virtio_device *vdev)
 {
 	int i, err = -ENOMEM;
@@ -4061,6 +4192,14 @@  static int virtnet_probe(struct virtio_device *vdev)
 	if (err)
 		goto free;
 
+	/* Do required fixups in case we are using small vrings */
+	err = virtnet_fixup_svring(vi);
+	if (err)
+		goto free_vqs;
+
+	/* Calculate the max. number of descriptors we may need to transmit a single packet */
+	vi->single_pkt_max_descs = virtnet_calc_max_descs(vi);
+
 #ifdef CONFIG_SYSFS
 	if (vi->mergeable_rx_bufs)
 		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;