diff mbox series

[net-next] virtio_net: add support for Byte Queue Limits

Message ID 20240509114615.317450-1-jiri@resnulli.us (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net-next] virtio_net: add support for Byte Queue Limits | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 926 this patch: 926
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 1 maintainers not CCed: bpf@vger.kernel.org
netdev/build_clang success Errors and warnings before: 937 this patch: 937
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 937 this patch: 937
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 112 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-05-10--18-00 (tests: 1014)

Commit Message

Jiri Pirko May 9, 2024, 11:46 a.m. UTC
From: Jiri Pirko <jiri@nvidia.com>

Add support for Byte Queue Limits (BQL).

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
---
 drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

Comments

Michael S. Tsirkin May 9, 2024, 12:41 p.m. UTC | #1
On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> From: Jiri Pirko <jiri@nvidia.com>
> 
> Add support for Byte Queue Limits (BQL).
> 
> Signed-off-by: Jiri Pirko <jiri@nvidia.com>

Can we get more detail on the benefits you observe etc?
Thanks!

> ---
>  drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
>  1 file changed, 20 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 218a446c4c27..c53d6dc6d332 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -84,7 +84,9 @@ struct virtnet_stat_desc {
>  
>  struct virtnet_sq_free_stats {
>  	u64 packets;
> +	u64 xdp_packets;
>  	u64 bytes;
> +	u64 xdp_bytes;
>  };
>  
>  struct virtnet_sq_stats {
> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
>  	void *ptr;
>  
>  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> -		++stats->packets;
> -
>  		if (!is_xdp_frame(ptr)) {
>  			struct sk_buff *skb = ptr;
>  
>  			pr_debug("Sent skb %p\n", skb);
>  
> +			stats->packets++;
>  			stats->bytes += skb->len;
>  			napi_consume_skb(skb, in_napi);
>  		} else {
>  			struct xdp_frame *frame = ptr_to_xdp(ptr);
>  
> -			stats->bytes += xdp_get_frame_len(frame);
> +			stats->xdp_packets++;
> +			stats->xdp_bytes += xdp_get_frame_len(frame);
>  			xdp_return_frame(frame);
>  		}
>  	}
> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
>  	virtnet_rq_free_buf(vi, rq, buf);
>  }
>  
> -static void free_old_xmit(struct send_queue *sq, bool in_napi)
> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> +			  bool in_napi)
>  {
>  	struct virtnet_sq_free_stats stats = {0};
>  
> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi)
>  	/* Avoid overhead when no packets have been processed
>  	 * happens when called speculatively from start_xmit.
>  	 */
> -	if (!stats.packets)
> +	if (!stats.packets && !stats.xdp_packets)
>  		return;
>  
> +	netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
> +
>  	u64_stats_update_begin(&sq->stats.syncp);
>  	u64_stats_add(&sq->stats.bytes, stats.bytes);
>  	u64_stats_add(&sq->stats.packets, stats.packets);
> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
>  	 * early means 16 slots are typically wasted.
>  	 */
>  	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
> -		netif_stop_subqueue(dev, qnum);
> +		struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> +
> +		netif_tx_stop_queue(txq);
>  		if (use_napi) {
>  			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
>  				virtqueue_napi_schedule(&sq->napi, sq->vq);
>  		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
>  			/* More just got used, free them then recheck. */
> -			free_old_xmit(sq, false);
> +			free_old_xmit(sq, txq, false);
>  			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
>  				netif_start_subqueue(dev, qnum);
>  				virtqueue_disable_cb(sq->vq);
> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
>  
>  		do {
>  			virtqueue_disable_cb(sq->vq);
> -			free_old_xmit(sq, true);
> +			free_old_xmit(sq, txq, true);
>  		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>  
>  		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>  	txq = netdev_get_tx_queue(vi->dev, index);
>  	__netif_tx_lock(txq, raw_smp_processor_id());
>  	virtqueue_disable_cb(sq->vq);
> -	free_old_xmit(sq, true);
> +	free_old_xmit(sq, txq, true);
>  
>  	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>  		netif_tx_wake_queue(txq);
> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	struct send_queue *sq = &vi->sq[qnum];
>  	int err;
>  	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> -	bool kick = !netdev_xmit_more();
> +	bool xmit_more = netdev_xmit_more();
>  	bool use_napi = sq->napi.weight;
>  
>  	/* Free up any pending old buffers before queueing new ones. */
> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  		if (use_napi)
>  			virtqueue_disable_cb(sq->vq);
>  
> -		free_old_xmit(sq, false);
> +		free_old_xmit(sq, txq, false);
>  
> -	} while (use_napi && kick &&
> +	} while (use_napi && !xmit_more &&
>  	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>  
>  	/* timestamp packet in software */
> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  
>  	check_sq_full_and_disable(vi, dev, sq);
>  
> -	if (kick || netif_xmit_stopped(txq)) {
> +	if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) {
>  		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
>  			u64_stats_update_begin(&sq->stats.syncp);
>  			u64_stats_inc(&sq->stats.kicks);
> -- 
> 2.44.0
Jiri Pirko May 9, 2024, 1:31 p.m. UTC | #2
Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> From: Jiri Pirko <jiri@nvidia.com>
>> 
>> Add support for Byte Queue Limits (BQL).
>> 
>> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>
>Can we get more detail on the benefits you observe etc?
>Thanks!

More info about the BQL in general is here:
https://lwn.net/Articles/469652/
Michael S. Tsirkin May 9, 2024, 2:28 p.m. UTC | #3
On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> >> From: Jiri Pirko <jiri@nvidia.com>
> >> 
> >> Add support for Byte Queue Limits (BQL).
> >> 
> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> >
> >Can we get more detail on the benefits you observe etc?
> >Thanks!
> 
> More info about the BQL in general is here:
> https://lwn.net/Articles/469652/

I know about BQL in general. We discussed BQL for virtio in the past;
mostly I got the feedback from net core maintainers that it likely won't
benefit virtio.

So I'm asking, what kind of benefit do you observe?
Jason Wang May 10, 2024, 4:25 a.m. UTC | #4
On Thu, May 9, 2024 at 7:46 PM Jiri Pirko <jiri@resnulli.us> wrote:
>
> From: Jiri Pirko <jiri@nvidia.com>
>
> Add support for Byte Queue Limits (BQL).
>
> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> ---

Does this work for non-TX-NAPI mode (it doesn't seem obvious to me)?

Thanks
Jason Wang May 10, 2024, 4:25 a.m. UTC | #5
On Thu, May 9, 2024 at 10:28 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> > Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> > >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> > >> From: Jiri Pirko <jiri@nvidia.com>
> > >>
> > >> Add support for Byte Queue Limits (BQL).
> > >>
> > >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> > >
> > >Can we get more detail on the benefits you observe etc?
> > >Thanks!
> >
> > More info about the BQL in general is here:
> > https://lwn.net/Articles/469652/
>
> I know about BQL in general. We discussed BQL for virtio in the past
> mostly I got the feedback from net core maintainers that it likely won't
> benefit virtio.
>
> So I'm asking, what kind of benefit do you observe?

Yes, a benchmark is more than welcome.

Thanks

>
> --
> MST
>
Heng Qi May 10, 2024, 7:11 a.m. UTC | #6
On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
> From: Jiri Pirko <jiri@nvidia.com>
> 
> Add support for Byte Queue Limits (BQL).

Historically both Jason and Michael have attempted to support BQL
for virtio-net, for example:

https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/

These discussions focused primarily on two points:

1. BQL is based on TX NAPI, so feeding the completion statistics to BQL
has to depend on the use_napi check. When NAPI mode is switched to
orphan mode, some of that accounting is lost, leaving BQL temporarily
inaccurate (see the sketch below).

2. Once TX DIM is supported, orphan mode may be removed and TX
interrupts become more reasonable, which gives BQL a solid basis.

Thanks.
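
A hypothetical sketch of the NAPI-gated accounting point 1 describes (this
is not what the posted patch does; free_old_xmit() and the stats structure
are the driver's own, the conditional and the reset helper shown here are
illustrative):

static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
			  bool in_napi)
{
	struct virtnet_sq_free_stats stats = {0};

	__free_old_xmit(sq, in_napi, &stats);

	/* Only the TX NAPI completion path can feed BQL reliably; in
	 * orphan mode part of the accounting would be lost, leaving BQL
	 * temporarily inaccurate.
	 */
	if (sq->napi.weight)
		netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
}

/* Whenever the TX mode is switched, stale in-flight accounting has to be
 * dropped so that a BQL-stopped queue cannot stay stopped forever.
 */
static void virtnet_tx_mode_switch(struct net_device *dev, unsigned int qidx)
{
	netdev_tx_reset_queue(netdev_get_tx_queue(dev, qidx));
}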

> 
> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> ---
>  drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
>  1 file changed, 20 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 218a446c4c27..c53d6dc6d332 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -84,7 +84,9 @@ struct virtnet_stat_desc {
>  
>  struct virtnet_sq_free_stats {
>  	u64 packets;
> +	u64 xdp_packets;
>  	u64 bytes;
> +	u64 xdp_bytes;
>  };
>  
>  struct virtnet_sq_stats {
> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
>  	void *ptr;
>  
>  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> -		++stats->packets;
> -
>  		if (!is_xdp_frame(ptr)) {
>  			struct sk_buff *skb = ptr;
>  
>  			pr_debug("Sent skb %p\n", skb);
>  
> +			stats->packets++;
>  			stats->bytes += skb->len;
>  			napi_consume_skb(skb, in_napi);
>  		} else {
>  			struct xdp_frame *frame = ptr_to_xdp(ptr);
>  
> -			stats->bytes += xdp_get_frame_len(frame);
> +			stats->xdp_packets++;
> +			stats->xdp_bytes += xdp_get_frame_len(frame);
>  			xdp_return_frame(frame);
>  		}
>  	}
> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
>  	virtnet_rq_free_buf(vi, rq, buf);
>  }
>  
> -static void free_old_xmit(struct send_queue *sq, bool in_napi)
> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
> +			  bool in_napi)
>  {
>  	struct virtnet_sq_free_stats stats = {0};
>  
> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi)
>  	/* Avoid overhead when no packets have been processed
>  	 * happens when called speculatively from start_xmit.
>  	 */
> -	if (!stats.packets)
> +	if (!stats.packets && !stats.xdp_packets)
>  		return;
>  
> +	netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
> +
>  	u64_stats_update_begin(&sq->stats.syncp);
>  	u64_stats_add(&sq->stats.bytes, stats.bytes);
>  	u64_stats_add(&sq->stats.packets, stats.packets);
> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
>  	 * early means 16 slots are typically wasted.
>  	 */
>  	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
> -		netif_stop_subqueue(dev, qnum);
> +		struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> +
> +		netif_tx_stop_queue(txq);
>  		if (use_napi) {
>  			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
>  				virtqueue_napi_schedule(&sq->napi, sq->vq);
>  		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
>  			/* More just got used, free them then recheck. */
> -			free_old_xmit(sq, false);
> +			free_old_xmit(sq, txq, false);
>  			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
>  				netif_start_subqueue(dev, qnum);
>  				virtqueue_disable_cb(sq->vq);
> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
>  
>  		do {
>  			virtqueue_disable_cb(sq->vq);
> -			free_old_xmit(sq, true);
> +			free_old_xmit(sq, txq, true);
>  		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>  
>  		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>  	txq = netdev_get_tx_queue(vi->dev, index);
>  	__netif_tx_lock(txq, raw_smp_processor_id());
>  	virtqueue_disable_cb(sq->vq);
> -	free_old_xmit(sq, true);
> +	free_old_xmit(sq, txq, true);
>  
>  	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>  		netif_tx_wake_queue(txq);
> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	struct send_queue *sq = &vi->sq[qnum];
>  	int err;
>  	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
> -	bool kick = !netdev_xmit_more();
> +	bool xmit_more = netdev_xmit_more();
>  	bool use_napi = sq->napi.weight;
>  
>  	/* Free up any pending old buffers before queueing new ones. */
> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  		if (use_napi)
>  			virtqueue_disable_cb(sq->vq);
>  
> -		free_old_xmit(sq, false);
> +		free_old_xmit(sq, txq, false);
>  
> -	} while (use_napi && kick &&
> +	} while (use_napi && !xmit_more &&
>  	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>  
>  	/* timestamp packet in software */
> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  
>  	check_sq_full_and_disable(vi, dev, sq);
>  
> -	if (kick || netif_xmit_stopped(txq)) {
> +	if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) {
>  		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
>  			u64_stats_update_begin(&sq->stats.syncp);
>  			u64_stats_inc(&sq->stats.kicks);
> -- 
> 2.44.0
> 
>
Jiri Pirko May 10, 2024, 10:35 a.m. UTC | #7
Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote:
>On Thu,  9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote:
>> From: Jiri Pirko <jiri@nvidia.com>
>> 
>> Add support for Byte Queue Limits (BQL).
>
>Historically both Jason and Michael have attempted to support BQL
>for virtio-net, for example:
>
>https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>
>These discussions focus primarily on:
>
>1. BQL is based on napi tx. Therefore, the transfer of statistical information
>needs to rely on the judgment of use_napi. When the napi mode is switched to
>orphan, some statistical information will be lost, resulting in temporary
>inaccuracy in BQL.
>
>2. If tx dim is supported, orphan mode may be removed and tx irq will be more
>reasonable. This provides good support for BQL.

Thanks for the pointers, will check that out.


>
>Thanks.
>
>> 
>> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> ---
>>  drivers/net/virtio_net.c | 33 ++++++++++++++++++++-------------
>>  1 file changed, 20 insertions(+), 13 deletions(-)
>> 
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 218a446c4c27..c53d6dc6d332 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -84,7 +84,9 @@ struct virtnet_stat_desc {
>>  
>>  struct virtnet_sq_free_stats {
>>  	u64 packets;
>> +	u64 xdp_packets;
>>  	u64 bytes;
>> +	u64 xdp_bytes;
>>  };
>>  
>>  struct virtnet_sq_stats {
>> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi,
>>  	void *ptr;
>>  
>>  	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
>> -		++stats->packets;
>> -
>>  		if (!is_xdp_frame(ptr)) {
>>  			struct sk_buff *skb = ptr;
>>  
>>  			pr_debug("Sent skb %p\n", skb);
>>  
>> +			stats->packets++;
>>  			stats->bytes += skb->len;
>>  			napi_consume_skb(skb, in_napi);
>>  		} else {
>>  			struct xdp_frame *frame = ptr_to_xdp(ptr);
>>  
>> -			stats->bytes += xdp_get_frame_len(frame);
>> +			stats->xdp_packets++;
>> +			stats->xdp_bytes += xdp_get_frame_len(frame);
>>  			xdp_return_frame(frame);
>>  		}
>>  	}
>> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
>>  	virtnet_rq_free_buf(vi, rq, buf);
>>  }
>>  
>> -static void free_old_xmit(struct send_queue *sq, bool in_napi)
>> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
>> +			  bool in_napi)
>>  {
>>  	struct virtnet_sq_free_stats stats = {0};
>>  
>> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi)
>>  	/* Avoid overhead when no packets have been processed
>>  	 * happens when called speculatively from start_xmit.
>>  	 */
>> -	if (!stats.packets)
>> +	if (!stats.packets && !stats.xdp_packets)
>>  		return;
>>  
>> +	netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
>> +
>>  	u64_stats_update_begin(&sq->stats.syncp);
>>  	u64_stats_add(&sq->stats.bytes, stats.bytes);
>>  	u64_stats_add(&sq->stats.packets, stats.packets);
>> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
>>  	 * early means 16 slots are typically wasted.
>>  	 */
>>  	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
>> -		netif_stop_subqueue(dev, qnum);
>> +		struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>> +
>> +		netif_tx_stop_queue(txq);
>>  		if (use_napi) {
>>  			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
>>  				virtqueue_napi_schedule(&sq->napi, sq->vq);
>>  		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
>>  			/* More just got used, free them then recheck. */
>> -			free_old_xmit(sq, false);
>> +			free_old_xmit(sq, txq, false);
>>  			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
>>  				netif_start_subqueue(dev, qnum);
>>  				virtqueue_disable_cb(sq->vq);
>> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
>>  
>>  		do {
>>  			virtqueue_disable_cb(sq->vq);
>> -			free_old_xmit(sq, true);
>> +			free_old_xmit(sq, txq, true);
>>  		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>>  
>>  		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>>  	txq = netdev_get_tx_queue(vi->dev, index);
>>  	__netif_tx_lock(txq, raw_smp_processor_id());
>>  	virtqueue_disable_cb(sq->vq);
>> -	free_old_xmit(sq, true);
>> +	free_old_xmit(sq, txq, true);
>>  
>>  	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>>  		netif_tx_wake_queue(txq);
>> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>>  	struct send_queue *sq = &vi->sq[qnum];
>>  	int err;
>>  	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>> -	bool kick = !netdev_xmit_more();
>> +	bool xmit_more = netdev_xmit_more();
>>  	bool use_napi = sq->napi.weight;
>>  
>>  	/* Free up any pending old buffers before queueing new ones. */
>> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>>  		if (use_napi)
>>  			virtqueue_disable_cb(sq->vq);
>>  
>> -		free_old_xmit(sq, false);
>> +		free_old_xmit(sq, txq, false);
>>  
>> -	} while (use_napi && kick &&
>> +	} while (use_napi && !xmit_more &&
>>  	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>>  
>>  	/* timestamp packet in software */
>> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>>  
>>  	check_sq_full_and_disable(vi, dev, sq);
>>  
>> -	if (kick || netif_xmit_stopped(txq)) {
>> +	if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) {
>>  		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
>>  			u64_stats_update_begin(&sq->stats.syncp);
>>  			u64_stats_inc(&sq->stats.kicks);
>> -- 
>> 2.44.0
>> 
>>
Jiri Pirko May 10, 2024, 10:37 a.m. UTC | #8
Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >> From: Jiri Pirko <jiri@nvidia.com>
>> >> 
>> >> Add support for Byte Queue Limits (BQL).
>> >> 
>> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >
>> >Can we get more detail on the benefits you observe etc?
>> >Thanks!
>> 
>> More info about the BQL in general is here:
>> https://lwn.net/Articles/469652/
>
>I know about BQL in general. We discussed BQL for virtio in the past
>mostly I got the feedback from net core maintainers that it likely won't
>benefit virtio.

Do you have some link to that, or is it this thread:
https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/

I don't see why virtio should be any different from other
drivers/devices that benefit from BQL. HOL blocking is the same here as
everywhere.

>
>So I'm asking, what kind of benefit do you observe?

I don't have measurements at hand, will attach them to v2.

Thanks!

>
>-- 
>MST
>
Michael S. Tsirkin May 10, 2024, 10:52 a.m. UTC | #9
On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> >> >> From: Jiri Pirko <jiri@nvidia.com>
> >> >> 
> >> >> Add support for Byte Queue Limits (BQL).
> >> >> 
> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> >> >
> >> >Can we get more detail on the benefits you observe etc?
> >> >Thanks!
> >> 
> >> More info about the BQL in general is here:
> >> https://lwn.net/Articles/469652/
> >
> >I know about BQL in general. We discussed BQL for virtio in the past
> >mostly I got the feedback from net core maintainers that it likely won't
> >benefit virtio.
> 
> Do you have some link to that, or is it this thread:
> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/


A quick search on lore turned up this, for example:
https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/




> I don't see why virtio should be any different from other
> drivers/devices that benefit from bql. HOL blocking is the same here are
> everywhere.
> 
> >
> >So I'm asking, what kind of benefit do you observe?
> 
> I don't have measurements at hand, will attach them to v2.
> 
> Thanks!
> 
> >
> >-- 
> >MST
> >
Jiri Pirko May 10, 2024, 11:11 a.m. UTC | #10
Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >> >> From: Jiri Pirko <jiri@nvidia.com>
>> >> >> 
>> >> >> Add support for Byte Queue Limits (BQL).
>> >> >> 
>> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >> >
>> >> >Can we get more detail on the benefits you observe etc?
>> >> >Thanks!
>> >> 
>> >> More info about the BQL in general is here:
>> >> https://lwn.net/Articles/469652/
>> >
>> >I know about BQL in general. We discussed BQL for virtio in the past
>> >mostly I got the feedback from net core maintainers that it likely won't
>> >benefit virtio.
>> 
>> Do you have some link to that, or is it this thread:
>> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>
>
>A quick search on lore turned up this, for example:
>https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/

Says:
"Note that NIC with many TX queues make BQL almost useless, only adding extra
 overhead."

But virtio can have one tx queue; I guess that could be quite a common
configuration in a lot of deployments.


>
>
>
>
>> I don't see why virtio should be any different from other
>> drivers/devices that benefit from bql. HOL blocking is the same here are
>> everywhere.
>> 
>> >
>> >So I'm asking, what kind of benefit do you observe?
>> 
>> I don't have measurements at hand, will attach them to v2.
>> 
>> Thanks!
>> 
>> >
>> >-- 
>> >MST
>> >
>
Michael S. Tsirkin May 10, 2024, 11:27 a.m. UTC | #11
On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
> >> >> >> 
> >> >> >> Add support for Byte Queue Limits (BQL).
> >> >> >> 
> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> >> >> >
> >> >> >Can we get more detail on the benefits you observe etc?
> >> >> >Thanks!
> >> >> 
> >> >> More info about the BQL in general is here:
> >> >> https://lwn.net/Articles/469652/
> >> >
> >> >I know about BQL in general. We discussed BQL for virtio in the past
> >> >mostly I got the feedback from net core maintainers that it likely won't
> >> >benefit virtio.
> >> 
> >> Do you have some link to that, or is it this thread:
> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> >
> >
> >A quick search on lore turned up this, for example:
> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
> 
> Says:
> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>  overhead."
> 
> But virtio can have one tx queue, I guess that could be quite common
> configuration in lot of deployments.

Not sure we should worry about performance for these though.
What I am saying is this should come with some benchmarking
results.


> 
> >
> >
> >
> >
> >> I don't see why virtio should be any different from other
> >> drivers/devices that benefit from bql. HOL blocking is the same here are
> >> everywhere.
> >> 
> >> >
> >> >So I'm asking, what kind of benefit do you observe?
> >> 
> >> I don't have measurements at hand, will attach them to v2.
> >> 
> >> Thanks!
> >> 
> >> >
> >> >-- 
> >> >MST
> >> >
> >
Jiri Pirko May 10, 2024, 11:36 a.m. UTC | #12
Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
>> >> >> >> 
>> >> >> >> Add support for Byte Queue Limits (BQL).
>> >> >> >> 
>> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >> >> >
>> >> >> >Can we get more detail on the benefits you observe etc?
>> >> >> >Thanks!
>> >> >> 
>> >> >> More info about the BQL in general is here:
>> >> >> https://lwn.net/Articles/469652/
>> >> >
>> >> >I know about BQL in general. We discussed BQL for virtio in the past
>> >> >mostly I got the feedback from net core maintainers that it likely won't
>> >> >benefit virtio.
>> >> 
>> >> Do you have some link to that, or is it this thread:
>> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>> >
>> >
>> >A quick search on lore turned up this, for example:
>> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>> 
>> Says:
>> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>>  overhead."
>> 
>> But virtio can have one tx queue, I guess that could be quite common
>> configuration in lot of deployments.
>
>Not sure we should worry about performance for these though.

Well, queues may be a scarce resource sometimes; even in those cases, you
want to perform well.

>What I am saying is this should come with some benchmarking
>results.

Sure, I got the message.

>
>
>> 
>> >
>> >
>> >
>> >
>> >> I don't see why virtio should be any different from other
>> >> drivers/devices that benefit from bql. HOL blocking is the same here are
>> >> everywhere.
>> >> 
>> >> >
>> >> >So I'm asking, what kind of benefit do you observe?
>> >> 
>> >> I don't have measurements at hand, will attach them to v2.
>> >> 
>> >> Thanks!
>> >> 
>> >> >
>> >> >-- 
>> >> >MST
>> >> >
>> >
>
Jiri Pirko May 15, 2024, 7:34 a.m. UTC | #13
Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
>> >> >> >> 
>> >> >> >> Add support for Byte Queue Limits (BQL).
>> >> >> >> 
>> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >> >> >
>> >> >> >Can we get more detail on the benefits you observe etc?
>> >> >> >Thanks!
>> >> >> 
>> >> >> More info about the BQL in general is here:
>> >> >> https://lwn.net/Articles/469652/
>> >> >
>> >> >I know about BQL in general. We discussed BQL for virtio in the past
>> >> >mostly I got the feedback from net core maintainers that it likely won't
>> >> >benefit virtio.
>> >> 
>> >> Do you have some link to that, or is it this thread:
>> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>> >
>> >
>> >A quick search on lore turned up this, for example:
>> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>> 
>> Says:
>> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>>  overhead."
>> 
>> But virtio can have one tx queue, I guess that could be quite common
>> configuration in lot of deployments.
>
>Not sure we should worry about performance for these though.
>What I am saying is this should come with some benchmarking
>results.

I did some measurements with VDPA, backed by ConnectX6dx NIC, single
queue pair:

super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
nice -n 20 netperf -H $ip -l 10 -t TCP_RR

RR result with no bql:
29.95
32.74
28.77

RR result with bql:
222.98
159.81
197.88
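
(netperf TCP_RR reports transactions per second here, so higher is better;
the 200 concurrent TCP_STREAM flows keep the single queue saturated, which
is what exposes the head-of-line latency that BQL bounds.)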



>
>
>> 
>> >
>> >
>> >
>> >
>> >> I don't see why virtio should be any different from other
>> >> drivers/devices that benefit from bql. HOL blocking is the same here are
>> >> everywhere.
>> >> 
>> >> >
>> >> >So I'm asking, what kind of benefit do you observe?
>> >> 
>> >> I don't have measurements at hand, will attach them to v2.
>> >> 
>> >> Thanks!
>> >> 
>> >> >
>> >> >-- 
>> >> >MST
>> >> >
>> >
>
Michael S. Tsirkin May 15, 2024, 8:20 a.m. UTC | #14
On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
> >> >> >> >> 
> >> >> >> >> Add support for Byte Queue Limits (BQL).
> >> >> >> >> 
> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> >> >> >> >
> >> >> >> >Can we get more detail on the benefits you observe etc?
> >> >> >> >Thanks!
> >> >> >> 
> >> >> >> More info about the BQL in general is here:
> >> >> >> https://lwn.net/Articles/469652/
> >> >> >
> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
> >> >> >mostly I got the feedback from net core maintainers that it likely won't
> >> >> >benefit virtio.
> >> >> 
> >> >> Do you have some link to that, or is it this thread:
> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> >> >
> >> >
> >> >A quick search on lore turned up this, for example:
> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
> >> 
> >> Says:
> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
> >>  overhead."
> >> 
> >> But virtio can have one tx queue, I guess that could be quite common
> >> configuration in lot of deployments.
> >
> >Not sure we should worry about performance for these though.
> >What I am saying is this should come with some benchmarking
> >results.
> 
> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
> queue pair:
> 
> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
> 
> RR result with no bql:
> 29.95
> 32.74
> 28.77
> 
> RR result with bql:
> 222.98
> 159.81
> 197.88
> 

Okay. And on the other hand, any measurable degradation with
multiqueue and when testing throughput?


> 
> >
> >
> >> 
> >> >
> >> >
> >> >
> >> >
> >> >> I don't see why virtio should be any different from other
> >> >> drivers/devices that benefit from bql. HOL blocking is the same here are
> >> >> everywhere.
> >> >> 
> >> >> >
> >> >> >So I'm asking, what kind of benefit do you observe?
> >> >> 
> >> >> I don't have measurements at hand, will attach them to v2.
> >> >> 
> >> >> Thanks!
> >> >> 
> >> >> >
> >> >> >-- 
> >> >> >MST
> >> >> >
> >> >
> >
Jiri Pirko May 15, 2024, 10:12 a.m. UTC | #15
Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
>> >> >> >> >> 
>> >> >> >> >> Add support for Byte Queue Limits (BQL).
>> >> >> >> >> 
>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >> >> >> >
>> >> >> >> >Can we get more detail on the benefits you observe etc?
>> >> >> >> >Thanks!
>> >> >> >> 
>> >> >> >> More info about the BQL in general is here:
>> >> >> >> https://lwn.net/Articles/469652/
>> >> >> >
>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
>> >> >> >mostly I got the feedback from net core maintainers that it likely won't
>> >> >> >benefit virtio.
>> >> >> 
>> >> >> Do you have some link to that, or is it this thread:
>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>> >> >
>> >> >
>> >> >A quick search on lore turned up this, for example:
>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>> >> 
>> >> Says:
>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>> >>  overhead."
>> >> 
>> >> But virtio can have one tx queue, I guess that could be quite common
>> >> configuration in lot of deployments.
>> >
>> >Not sure we should worry about performance for these though.
>> >What I am saying is this should come with some benchmarking
>> >results.
>> 
>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
>> queue pair:
>> 
>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
>> 
>> RR result with no bql:
>> 29.95
>> 32.74
>> 28.77
>> 
>> RR result with bql:
>> 222.98
>> 159.81
>> 197.88
>> 
>
>Okay. And on the other hand, any measureable degradation with
>multiqueue and when testing throughput?

With multiqueue it depends on whether the flows hit the same queue or not.
If they do, the same results will likely be shown.


>
>
>> 
>> >
>> >
>> >> 
>> >> >
>> >> >
>> >> >
>> >> >
>> >> >> I don't see why virtio should be any different from other
>> >> >> drivers/devices that benefit from bql. HOL blocking is the same here are
>> >> >> everywhere.
>> >> >> 
>> >> >> >
>> >> >> >So I'm asking, what kind of benefit do you observe?
>> >> >> 
>> >> >> I don't have measurements at hand, will attach them to v2.
>> >> >> 
>> >> >> Thanks!
>> >> >> 
>> >> >> >
>> >> >> >-- 
>> >> >> >MST
>> >> >> >
>> >> >
>> >
>
Jiri Pirko May 15, 2024, 12:54 p.m. UTC | #16
Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote:
>Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
>>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
>>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
>>> >> >> >> >> 
>>> >> >> >> >> Add support for Byte Queue Limits (BQL).
>>> >> >> >> >> 
>>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>>> >> >> >> >
>>> >> >> >> >Can we get more detail on the benefits you observe etc?
>>> >> >> >> >Thanks!
>>> >> >> >> 
>>> >> >> >> More info about the BQL in general is here:
>>> >> >> >> https://lwn.net/Articles/469652/
>>> >> >> >
>>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
>>> >> >> >mostly I got the feedback from net core maintainers that it likely won't
>>> >> >> >benefit virtio.
>>> >> >> 
>>> >> >> Do you have some link to that, or is it this thread:
>>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>>> >> >
>>> >> >
>>> >> >A quick search on lore turned up this, for example:
>>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>>> >> 
>>> >> Says:
>>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>>> >>  overhead."
>>> >> 
>>> >> But virtio can have one tx queue, I guess that could be quite common
>>> >> configuration in lot of deployments.
>>> >
>>> >Not sure we should worry about performance for these though.
>>> >What I am saying is this should come with some benchmarking
>>> >results.
>>> 
>>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
>>> queue pair:
>>> 
>>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
>>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
>>> 
>>> RR result with no bql:
>>> 29.95
>>> 32.74
>>> 28.77
>>> 
>>> RR result with bql:
>>> 222.98
>>> 159.81
>>> 197.88
>>> 
>>
>>Okay. And on the other hand, any measureable degradation with
>>multiqueue and when testing throughput?
>
>With multiqueue it depends if the flows hits the same queue or not. If
>they do, the same results will likely be shown.

RR 1q, w/o bql:
29.95
32.74
28.77

RR 1q, with bql:
222.98
159.81
197.88

RR 4q, w/o bql:
355.82
364.58
233.47

RR 4q, with bql:
371.19
255.93
337.77

So the answer to your question is: "no measurable degradation with 4
queues".


>
>
>>
>>
>>> 
>>> >
>>> >
>>> >> 
>>> >> >
>>> >> >
>>> >> >
>>> >> >
>>> >> >> I don't see why virtio should be any different from other
>>> >> >> drivers/devices that benefit from bql. HOL blocking is the same here are
>>> >> >> everywhere.
>>> >> >> 
>>> >> >> >
>>> >> >> >So I'm asking, what kind of benefit do you observe?
>>> >> >> 
>>> >> >> I don't have measurements at hand, will attach them to v2.
>>> >> >> 
>>> >> >> Thanks!
>>> >> >> 
>>> >> >> >
>>> >> >> >-- 
>>> >> >> >MST
>>> >> >> >
>>> >> >
>>> >
>>
Jason Wang May 16, 2024, 4:48 a.m. UTC | #17
On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote:
>
> Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote:
> >Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
> >>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
> >>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
> >>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
> >>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
> >>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
> >>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
> >>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> >>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> >>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> >>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
> >>> >> >> >> >>
> >>> >> >> >> >> Add support for Byte Queue Limits (BQL).
> >>> >> >> >> >>
> >>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> >>> >> >> >> >
> >>> >> >> >> >Can we get more detail on the benefits you observe etc?
> >>> >> >> >> >Thanks!
> >>> >> >> >>
> >>> >> >> >> More info about the BQL in general is here:
> >>> >> >> >> https://lwn.net/Articles/469652/
> >>> >> >> >
> >>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
> >>> >> >> >mostly I got the feedback from net core maintainers that it likely won't
> >>> >> >> >benefit virtio.
> >>> >> >>
> >>> >> >> Do you have some link to that, or is it this thread:
> >>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> >>> >> >
> >>> >> >
> >>> >> >A quick search on lore turned up this, for example:
> >>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
> >>> >>
> >>> >> Says:
> >>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
> >>> >>  overhead."
> >>> >>
> >>> >> But virtio can have one tx queue, I guess that could be quite common
> >>> >> configuration in lot of deployments.
> >>> >
> >>> >Not sure we should worry about performance for these though.
> >>> >What I am saying is this should come with some benchmarking
> >>> >results.
> >>>
> >>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
> >>> queue pair:
> >>>
> >>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
> >>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
> >>>
> >>> RR result with no bql:
> >>> 29.95
> >>> 32.74
> >>> 28.77
> >>>
> >>> RR result with bql:
> >>> 222.98
> >>> 159.81
> >>> 197.88
> >>>
> >>
> >>Okay. And on the other hand, any measureable degradation with
> >>multiqueue and when testing throughput?
> >
> >With multiqueue it depends if the flows hits the same queue or not. If
> >they do, the same results will likely be shown.
>
> RR 1q, w/o bql:
> 29.95
> 32.74
> 28.77
>
> RR 1q, with bql:
> 222.98
> 159.81
> 197.88
>
> RR 4q, w/o bql:
> 355.82
> 364.58
> 233.47
>
> RR 4q, with bql:
> 371.19
> 255.93
> 337.77
>
> So answer to your question is: "no measurable degradation with 4
> queues".

Thanks but I think we also need benchmarks in cases other than vDPA.
For example, a simple virtualization setup.
Jiri Pirko May 16, 2024, 10:54 a.m. UTC | #18
Thu, May 16, 2024 at 06:48:38AM CEST, jasowang@redhat.com wrote:
>On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote:
>>
>> Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote:
>> >Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
>> >>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
>> >>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>> >>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>> >>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>> >>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> >>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>> >>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> >>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
>> >>> >> >> >> >>
>> >>> >> >> >> >> Add support for Byte Queue Limits (BQL).
>> >>> >> >> >> >>
>> >>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >>> >> >> >> >
>> >>> >> >> >> >Can we get more detail on the benefits you observe etc?
>> >>> >> >> >> >Thanks!
>> >>> >> >> >>
>> >>> >> >> >> More info about the BQL in general is here:
>> >>> >> >> >> https://lwn.net/Articles/469652/
>> >>> >> >> >
>> >>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
>> >>> >> >> >mostly I got the feedback from net core maintainers that it likely won't
>> >>> >> >> >benefit virtio.
>> >>> >> >>
>> >>> >> >> Do you have some link to that, or is it this thread:
>> >>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>> >>> >> >
>> >>> >> >
>> >>> >> >A quick search on lore turned up this, for example:
>> >>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>> >>> >>
>> >>> >> Says:
>> >>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>> >>> >>  overhead."
>> >>> >>
>> >>> >> But virtio can have one tx queue, I guess that could be quite common
>> >>> >> configuration in lot of deployments.
>> >>> >
>> >>> >Not sure we should worry about performance for these though.
>> >>> >What I am saying is this should come with some benchmarking
>> >>> >results.
>> >>>
>> >>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
>> >>> queue pair:
>> >>>
>> >>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
>> >>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
>> >>>
>> >>> RR result with no bql:
>> >>> 29.95
>> >>> 32.74
>> >>> 28.77
>> >>>
>> >>> RR result with bql:
>> >>> 222.98
>> >>> 159.81
>> >>> 197.88
>> >>>
>> >>
>> >>Okay. And on the other hand, any measureable degradation with
>> >>multiqueue and when testing throughput?
>> >
>> >With multiqueue it depends if the flows hits the same queue or not. If
>> >they do, the same results will likely be shown.
>>
>> RR 1q, w/o bql:
>> 29.95
>> 32.74
>> 28.77
>>
>> RR 1q, with bql:
>> 222.98
>> 159.81
>> 197.88
>>
>> RR 4q, w/o bql:
>> 355.82
>> 364.58
>> 233.47
>>
>> RR 4q, with bql:
>> 371.19
>> 255.93
>> 337.77
>>
>> So answer to your question is: "no measurable degradation with 4
>> queues".
>
>Thanks but I think we also need benchmarks in cases other than vDPA.
>For example, a simple virtualization setup.

For a virtualization setup, I get this:

VIRT RR 1q, w/0 bql:
49.18
49.75
50.07

VIRT RR 1q, with bql:
51.33
47.88
40.40

No measurable/significant difference.

>
Michael S. Tsirkin May 16, 2024, 12:31 p.m. UTC | #19
On Thu, May 16, 2024 at 12:54:58PM +0200, Jiri Pirko wrote:
> Thu, May 16, 2024 at 06:48:38AM CEST, jasowang@redhat.com wrote:
> >On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote:
> >>
> >> Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote:
> >> >Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
> >> >>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
> >> >>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
> >> >>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
> >> >>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
> >> >>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
> >> >>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
> >> >>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
> >> >>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
> >> >>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
> >> >>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
> >> >>> >> >> >> >>
> >> >>> >> >> >> >> Add support for Byte Queue Limits (BQL).
> >> >>> >> >> >> >>
> >> >>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
> >> >>> >> >> >> >
> >> >>> >> >> >> >Can we get more detail on the benefits you observe etc?
> >> >>> >> >> >> >Thanks!
> >> >>> >> >> >>
> >> >>> >> >> >> More info about the BQL in general is here:
> >> >>> >> >> >> https://lwn.net/Articles/469652/
> >> >>> >> >> >
> >> >>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
> >> >>> >> >> >mostly I got the feedback from net core maintainers that it likely won't
> >> >>> >> >> >benefit virtio.
> >> >>> >> >>
> >> >>> >> >> Do you have some link to that, or is it this thread:
> >> >>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
> >> >>> >> >
> >> >>> >> >
> >> >>> >> >A quick search on lore turned up this, for example:
> >> >>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
> >> >>> >>
> >> >>> >> Says:
> >> >>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
> >> >>> >>  overhead."
> >> >>> >>
> >> >>> >> But virtio can have one tx queue, I guess that could be quite common
> >> >>> >> configuration in lot of deployments.
> >> >>> >
> >> >>> >Not sure we should worry about performance for these though.
> >> >>> >What I am saying is this should come with some benchmarking
> >> >>> >results.
> >> >>>
> >> >>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
> >> >>> queue pair:
> >> >>>
> >> >>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
> >> >>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
> >> >>>
> >> >>> RR result with no bql:
> >> >>> 29.95
> >> >>> 32.74
> >> >>> 28.77
> >> >>>
> >> >>> RR result with bql:
> >> >>> 222.98
> >> >>> 159.81
> >> >>> 197.88
> >> >>>
> >> >>
> >> >>Okay. And on the other hand, any measureable degradation with
> >> >>multiqueue and when testing throughput?
> >> >
> >> >With multiqueue it depends if the flows hits the same queue or not. If
> >> >they do, the same results will likely be shown.
> >>
> >> RR 1q, w/o bql:
> >> 29.95
> >> 32.74
> >> 28.77
> >>
> >> RR 1q, with bql:
> >> 222.98
> >> 159.81
> >> 197.88
> >>
> >> RR 4q, w/o bql:
> >> 355.82
> >> 364.58
> >> 233.47
> >>
> >> RR 4q, with bql:
> >> 371.19
> >> 255.93
> >> 337.77
> >>
> >> So answer to your question is: "no measurable degradation with 4
> >> queues".
> >
> >Thanks but I think we also need benchmarks in cases other than vDPA.
> >For example, a simple virtualization setup.
> 
> For virtualization setup, I get this:
> 
> VIRT RR 1q, w/0 bql:
> 49.18
> 49.75
> 50.07
> 
> VIRT RR 1q, with bql:
> 51.33
> 47.88
> 40.40
> 
> No measurable/significant difference.

Seems the results became much noisier? Also,
I'd expect a regression, if any, to show up in a streaming benchmark.
Jiri Pirko May 16, 2024, 3:25 p.m. UTC | #20
Thu, May 16, 2024 at 02:31:59PM CEST, mst@redhat.com wrote:
>On Thu, May 16, 2024 at 12:54:58PM +0200, Jiri Pirko wrote:
>> Thu, May 16, 2024 at 06:48:38AM CEST, jasowang@redhat.com wrote:
>> >On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote:
>> >>
>> >> Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote:
>> >> >Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote:
>> >> >>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote:
>> >> >>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote:
>> >> >>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote:
>> >> >>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote:
>> >> >>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote:
>> >> >>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote:
>> >> >>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote:
>> >> >>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote:
>> >> >>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote:
>> >> >>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com>
>> >> >>> >> >> >> >>
>> >> >>> >> >> >> >> Add support for Byte Queue Limits (BQL).
>> >> >>> >> >> >> >>
>> >> >>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com>
>> >> >>> >> >> >> >
>> >> >>> >> >> >> >Can we get more detail on the benefits you observe etc?
>> >> >>> >> >> >> >Thanks!
>> >> >>> >> >> >>
>> >> >>> >> >> >> More info about the BQL in general is here:
>> >> >>> >> >> >> https://lwn.net/Articles/469652/
>> >> >>> >> >> >
>> >> >>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past
>> >> >>> >> >> >mostly I got the feedback from net core maintainers that it likely won't
>> >> >>> >> >> >benefit virtio.
>> >> >>> >> >>
>> >> >>> >> >> Do you have some link to that, or is it this thread:
>> >> >>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/
>> >> >>> >> >
>> >> >>> >> >
>> >> >>> >> >A quick search on lore turned up this, for example:
>> >> >>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/
>> >> >>> >>
>> >> >>> >> Says:
>> >> >>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra
>> >> >>> >>  overhead."
>> >> >>> >>
>> >> >>> >> But virtio can have one tx queue, I guess that could be quite common
>> >> >>> >> configuration in lot of deployments.
>> >> >>> >
>> >> >>> >Not sure we should worry about performance for these though.
>> >> >>> >What I am saying is this should come with some benchmarking
>> >> >>> >results.
>> >> >>>
>> >> >>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single
>> >> >>> queue pair:
>> >> >>>
>> >> >>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM &
>> >> >>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR
>> >> >>>
>> >> >>> RR result with no bql:
>> >> >>> 29.95
>> >> >>> 32.74
>> >> >>> 28.77
>> >> >>>
>> >> >>> RR result with bql:
>> >> >>> 222.98
>> >> >>> 159.81
>> >> >>> 197.88
>> >> >>>
>> >> >>
>> >> >>Okay. And on the other hand, any measureable degradation with
>> >> >>multiqueue and when testing throughput?
>> >> >
>> >> >With multiqueue it depends if the flows hits the same queue or not. If
>> >> >they do, the same results will likely be shown.
>> >>
>> >> RR 1q, w/o bql:
>> >> 29.95
>> >> 32.74
>> >> 28.77
>> >>
>> >> RR 1q, with bql:
>> >> 222.98
>> >> 159.81
>> >> 197.88
>> >>
>> >> RR 4q, w/o bql:
>> >> 355.82
>> >> 364.58
>> >> 233.47
>> >>
>> >> RR 4q, with bql:
>> >> 371.19
>> >> 255.93
>> >> 337.77
>> >>
>> >> So answer to your question is: "no measurable degradation with 4
>> >> queues".
>> >
>> >Thanks but I think we also need benchmarks in cases other than vDPA.
>> >For example, a simple virtualization setup.
>> 
>> For virtualization setup, I get this:
>> 
>> VIRT RR 1q, w/0 bql:
>> 49.18
>> 49.75
>> 50.07
>> 
>> VIRT RR 1q, with bql:
>> 51.33
>> 47.88
>> 40.40
>> 
>> No measurable/significant difference.
>
>Seems the results became much noisier? Also

Not enough data to assume that, I believe.


>I'd expect a regression if any to be in a streaming benchmark.

Can you elaborate?


>
>-- 
>MST
>
Michael S. Tsirkin May 16, 2024, 7:04 p.m. UTC | #21
On Thu, May 16, 2024 at 05:25:20PM +0200, Jiri Pirko wrote:
> 
> >I'd expect a regression if any to be in a streaming benchmark.
> 
> Can you elaborate?

BQL does two things that can hurt throughput:
- limits the amount of data in the queue - can limit bandwidth
  if we now get queue underruns
- adds CPU overhead - can limit bandwidth if CPU bound

So checking the result of a streaming benchmark seems important.
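
(While such a streaming run is in flight, the per-queue BQL state can be
watched under /sys/class/net/<dev>/queues/tx-<n>/byte_queue_limits/; the
limit and inflight files there show the computed byte limit and the bytes
currently outstanding, so it is easy to see whether the limit ever drops
low enough to underrun the queue.)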
Jiri Pirko May 17, 2024, 7:52 a.m. UTC | #22
Thu, May 16, 2024 at 09:04:41PM CEST, mst@redhat.com wrote:
>On Thu, May 16, 2024 at 05:25:20PM +0200, Jiri Pirko wrote:
>> 
>> >I'd expect a regression if any to be in a streaming benchmark.
>> 
>> Can you elaborate?
>
>BQL does two things that can hurt throughput:
>- limits amount of data in the queue - can limit bandwidth
>  if we now get queue underruns
>- adds CPU overhead - can limit bandwidth if CPU bound
>
>So checking result of a streaming benchmark seems important.

I understand. Didn't see any degradation in TCP_STREAM netperf. But I
will try to extend the testing.

Thanks!

>
>
>-- 
>MST
>
diff mbox series

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 218a446c4c27..c53d6dc6d332 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -84,7 +84,9 @@  struct virtnet_stat_desc {
 
 struct virtnet_sq_free_stats {
 	u64 packets;
+	u64 xdp_packets;
 	u64 bytes;
+	u64 xdp_bytes;
 };
 
 struct virtnet_sq_stats {
@@ -512,19 +514,19 @@  static void __free_old_xmit(struct send_queue *sq, bool in_napi,
 	void *ptr;
 
 	while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
-		++stats->packets;
-
 		if (!is_xdp_frame(ptr)) {
 			struct sk_buff *skb = ptr;
 
 			pr_debug("Sent skb %p\n", skb);
 
+			stats->packets++;
 			stats->bytes += skb->len;
 			napi_consume_skb(skb, in_napi);
 		} else {
 			struct xdp_frame *frame = ptr_to_xdp(ptr);
 
-			stats->bytes += xdp_get_frame_len(frame);
+			stats->xdp_packets++;
+			stats->xdp_bytes += xdp_get_frame_len(frame);
 			xdp_return_frame(frame);
 		}
 	}
@@ -965,7 +967,8 @@  static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
 	virtnet_rq_free_buf(vi, rq, buf);
 }
 
-static void free_old_xmit(struct send_queue *sq, bool in_napi)
+static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
+			  bool in_napi)
 {
 	struct virtnet_sq_free_stats stats = {0};
 
@@ -974,9 +977,11 @@  static void free_old_xmit(struct send_queue *sq, bool in_napi)
 	/* Avoid overhead when no packets have been processed
 	 * happens when called speculatively from start_xmit.
 	 */
-	if (!stats.packets)
+	if (!stats.packets && !stats.xdp_packets)
 		return;
 
+	netdev_tx_completed_queue(txq, stats.packets, stats.bytes);
+
 	u64_stats_update_begin(&sq->stats.syncp);
 	u64_stats_add(&sq->stats.bytes, stats.bytes);
 	u64_stats_add(&sq->stats.packets, stats.packets);
@@ -1013,13 +1018,15 @@  static void check_sq_full_and_disable(struct virtnet_info *vi,
 	 * early means 16 slots are typically wasted.
 	 */
 	if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
-		netif_stop_subqueue(dev, qnum);
+		struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
+
+		netif_tx_stop_queue(txq);
 		if (use_napi) {
 			if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
 				virtqueue_napi_schedule(&sq->napi, sq->vq);
 		} else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
 			/* More just got used, free them then recheck. */
-			free_old_xmit(sq, false);
+			free_old_xmit(sq, txq, false);
 			if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
 				netif_start_subqueue(dev, qnum);
 				virtqueue_disable_cb(sq->vq);
@@ -2319,7 +2326,7 @@  static void virtnet_poll_cleantx(struct receive_queue *rq)
 
 		do {
 			virtqueue_disable_cb(sq->vq);
-			free_old_xmit(sq, true);
+			free_old_xmit(sq, txq, true);
 		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
 
 		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
@@ -2471,7 +2478,7 @@  static int virtnet_poll_tx(struct napi_struct *napi, int budget)
 	txq = netdev_get_tx_queue(vi->dev, index);
 	__netif_tx_lock(txq, raw_smp_processor_id());
 	virtqueue_disable_cb(sq->vq);
-	free_old_xmit(sq, true);
+	free_old_xmit(sq, txq, true);
 
 	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
 		netif_tx_wake_queue(txq);
@@ -2553,7 +2560,7 @@  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct send_queue *sq = &vi->sq[qnum];
 	int err;
 	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
-	bool kick = !netdev_xmit_more();
+	bool xmit_more = netdev_xmit_more();
 	bool use_napi = sq->napi.weight;
 
 	/* Free up any pending old buffers before queueing new ones. */
@@ -2561,9 +2568,9 @@  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 		if (use_napi)
 			virtqueue_disable_cb(sq->vq);
 
-		free_old_xmit(sq, false);
+		free_old_xmit(sq, txq, false);
 
-	} while (use_napi && kick &&
+	} while (use_napi && !xmit_more &&
 	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
 
 	/* timestamp packet in software */
@@ -2592,7 +2599,7 @@  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	check_sq_full_and_disable(vi, dev, sq);
 
-	if (kick || netif_xmit_stopped(txq)) {
+	if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) {
 		if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
 			u64_stats_update_begin(&sq->stats.syncp);
 			u64_stats_inc(&sq->stats.kicks);
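
For reference, a minimal sketch of the BQL pattern the patch wires up (the
netdev_tx_*() helpers are the real netdevice.h API; the my_*() skeleton
around them is hypothetical):

#include <linux/netdevice.h>

/* Transmit side: account the bytes against the per-queue limit.
 * __netdev_tx_sent_queue() folds in the xmit_more hint and returns
 * whether a doorbell/kick is needed, which is why start_xmit() in the
 * patch switches to it instead of open-coding the kick condition.
 */
static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct netdev_queue *txq;

	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

	/* ... post the skb to the virtqueue/hardware ring here ... */

	if (__netdev_tx_sent_queue(txq, skb->len, netdev_xmit_more())) {
		/* kick/notify the device */
	}

	return NETDEV_TX_OK;
}

/* Completion side: return finished work to BQL. Once the in-flight byte
 * count drops below the computed limit, a queue that BQL stopped is woken
 * again. In the patch this is the netdev_tx_completed_queue() call added
 * to free_old_xmit(); note it reports only skb completions, XDP frames
 * are counted separately and not fed to BQL.
 */
static void my_tx_complete(struct net_device *dev, unsigned int qidx,
			   unsigned int pkts, unsigned int bytes)
{
	netdev_tx_completed_queue(netdev_get_tx_queue(dev, qidx), pkts, bytes);
}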