Message ID | 20240509114615.317450-1-jiri@resnulli.us (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net-next] virtio_net: add support for Byte Queue Limits | expand |
On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: > From: Jiri Pirko <jiri@nvidia.com> > > Add support for Byte Queue Limits (BQL). > > Signed-off-by: Jiri Pirko <jiri@nvidia.com> Can we get more detail on the benefits you observe etc? Thanks! > --- > drivers/net/virtio_net.c | 33 ++++++++++++++++++++------------- > 1 file changed, 20 insertions(+), 13 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 218a446c4c27..c53d6dc6d332 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -84,7 +84,9 @@ struct virtnet_stat_desc { > > struct virtnet_sq_free_stats { > u64 packets; > + u64 xdp_packets; > u64 bytes; > + u64 xdp_bytes; > }; > > struct virtnet_sq_stats { > @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi, > void *ptr; > > while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { > - ++stats->packets; > - > if (!is_xdp_frame(ptr)) { > struct sk_buff *skb = ptr; > > pr_debug("Sent skb %p\n", skb); > > + stats->packets++; > stats->bytes += skb->len; > napi_consume_skb(skb, in_napi); > } else { > struct xdp_frame *frame = ptr_to_xdp(ptr); > > - stats->bytes += xdp_get_frame_len(frame); > + stats->xdp_packets++; > + stats->xdp_bytes += xdp_get_frame_len(frame); > xdp_return_frame(frame); > } > } > @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) > virtnet_rq_free_buf(vi, rq, buf); > } > > -static void free_old_xmit(struct send_queue *sq, bool in_napi) > +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, > + bool in_napi) > { > struct virtnet_sq_free_stats stats = {0}; > > @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi) > /* Avoid overhead when no packets have been processed > * happens when called speculatively from start_xmit. 
> */ > - if (!stats.packets) > + if (!stats.packets && !stats.xdp_packets) > return; > > + netdev_tx_completed_queue(txq, stats.packets, stats.bytes); > + > u64_stats_update_begin(&sq->stats.syncp); > u64_stats_add(&sq->stats.bytes, stats.bytes); > u64_stats_add(&sq->stats.packets, stats.packets); > @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, > * early means 16 slots are typically wasted. > */ > if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { > - netif_stop_subqueue(dev, qnum); > + struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); > + > + netif_tx_stop_queue(txq); > if (use_napi) { > if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) > virtqueue_napi_schedule(&sq->napi, sq->vq); > } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { > /* More just got used, free them then recheck. */ > - free_old_xmit(sq, false); > + free_old_xmit(sq, txq, false); > if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { > netif_start_subqueue(dev, qnum); > virtqueue_disable_cb(sq->vq); > @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq) > > do { > virtqueue_disable_cb(sq->vq); > - free_old_xmit(sq, true); > + free_old_xmit(sq, txq, true); > } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); > > if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) > @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) > txq = netdev_get_tx_queue(vi->dev, index); > __netif_tx_lock(txq, raw_smp_processor_id()); > virtqueue_disable_cb(sq->vq); > - free_old_xmit(sq, true); > + free_old_xmit(sq, txq, true); > > if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) > netif_tx_wake_queue(txq); > @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > struct send_queue *sq = &vi->sq[qnum]; > int err; > struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); > - bool kick = !netdev_xmit_more(); > + bool xmit_more = netdev_xmit_more(); > bool use_napi = sq->napi.weight; > 
> /* Free up any pending old buffers before queueing new ones. */ > @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > if (use_napi) > virtqueue_disable_cb(sq->vq); > > - free_old_xmit(sq, false); > + free_old_xmit(sq, txq, false); > > - } while (use_napi && kick && > + } while (use_napi && !xmit_more && > unlikely(!virtqueue_enable_cb_delayed(sq->vq))); > > /* timestamp packet in software */ > @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > > check_sq_full_and_disable(vi, dev, sq); > > - if (kick || netif_xmit_stopped(txq)) { > + if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) { > if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { > u64_stats_update_begin(&sq->stats.syncp); > u64_stats_inc(&sq->stats.kicks); > -- > 2.44.0
Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: >> From: Jiri Pirko <jiri@nvidia.com> >> >> Add support for Byte Queue Limits (BQL). >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> > >Can we get more detail on the benefits you observe etc? >Thanks! More info about the BQL in general is here: https://lwn.net/Articles/469652/
On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: > Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: > >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: > >> From: Jiri Pirko <jiri@nvidia.com> > >> > >> Add support for Byte Queue Limits (BQL). > >> > >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> > > > >Can we get more detail on the benefits you observe etc? > >Thanks! > > More info about the BQL in general is here: > https://lwn.net/Articles/469652/ I know about BQL in general. We discussed BQL for virtio in the past mostly I got the feedback from net core maintainers that it likely won't benefit virtio. So I'm asking, what kind of benefit do you observe?
On Thu, May 9, 2024 at 7:46 PM Jiri Pirko <jiri@resnulli.us> wrote: > > From: Jiri Pirko <jiri@nvidia.com> > > Add support for Byte Queue Limits (BQL). > > Signed-off-by: Jiri Pirko <jiri@nvidia.com> > --- Does this work for non-tx NAPI mode (it seems not obvious to me)? Thanks
On Thu, May 9, 2024 at 10:28 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: > > Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: > > >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: > > >> From: Jiri Pirko <jiri@nvidia.com> > > >> > > >> Add support for Byte Queue Limits (BQL). > > >> > > >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> > > > > > >Can we get more detail on the benefits you observe etc? > > >Thanks! > > > > More info about the BQL in general is here: > > https://lwn.net/Articles/469652/ > > I know about BQL in general. We discussed BQL for virtio in the past > mostly I got the feedback from net core maintainers that it likely won't > benefit virtio. > > So I'm asking, what kind of benefit do you observe? Yes, a benchmark is more than welcome. Thanks > > -- > MST >
On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: > From: Jiri Pirko <jiri@nvidia.com> > > Add support for Byte Queue Limits (BQL). Historically both Jason and Michael have attempted to support BQL for virtio-net, for example: https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ These discussions focus primarily on: 1. BQL is based on napi tx. Therefore, the transfer of statistical information needs to rely on the judgment of use_napi. When the napi mode is switched to orphan, some statistical information will be lost, resulting in temporary inaccuracy in BQL. 2. If tx dim is supported, orphan mode may be removed and tx irq will be more reasonable. This provides good support for BQL. Thanks. > > Signed-off-by: Jiri Pirko <jiri@nvidia.com> > --- > drivers/net/virtio_net.c | 33 ++++++++++++++++++++------------- > 1 file changed, 20 insertions(+), 13 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 218a446c4c27..c53d6dc6d332 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -84,7 +84,9 @@ struct virtnet_stat_desc { > > struct virtnet_sq_free_stats { > u64 packets; > + u64 xdp_packets; > u64 bytes; > + u64 xdp_bytes; > }; > > struct virtnet_sq_stats { > @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi, > void *ptr; > > while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { > - ++stats->packets; > - > if (!is_xdp_frame(ptr)) { > struct sk_buff *skb = ptr; > > pr_debug("Sent skb %p\n", skb); > > + stats->packets++; > stats->bytes += skb->len; > napi_consume_skb(skb, in_napi); > } else { > struct xdp_frame *frame = ptr_to_xdp(ptr); > > - stats->bytes += xdp_get_frame_len(frame); > + stats->xdp_packets++; > + stats->xdp_bytes += xdp_get_frame_len(frame); > xdp_return_frame(frame); > } > } > @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) > virtnet_rq_free_buf(vi, rq, buf); > 
} > > -static void free_old_xmit(struct send_queue *sq, bool in_napi) > +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, > + bool in_napi) > { > struct virtnet_sq_free_stats stats = {0}; > > @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi) > /* Avoid overhead when no packets have been processed > * happens when called speculatively from start_xmit. > */ > - if (!stats.packets) > + if (!stats.packets && !stats.xdp_packets) > return; > > + netdev_tx_completed_queue(txq, stats.packets, stats.bytes); > + > u64_stats_update_begin(&sq->stats.syncp); > u64_stats_add(&sq->stats.bytes, stats.bytes); > u64_stats_add(&sq->stats.packets, stats.packets); > @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, > * early means 16 slots are typically wasted. > */ > if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { > - netif_stop_subqueue(dev, qnum); > + struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); > + > + netif_tx_stop_queue(txq); > if (use_napi) { > if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) > virtqueue_napi_schedule(&sq->napi, sq->vq); > } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { > /* More just got used, free them then recheck. 
*/ > - free_old_xmit(sq, false); > + free_old_xmit(sq, txq, false); > if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { > netif_start_subqueue(dev, qnum); > virtqueue_disable_cb(sq->vq); > @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq) > > do { > virtqueue_disable_cb(sq->vq); > - free_old_xmit(sq, true); > + free_old_xmit(sq, txq, true); > } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); > > if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) > @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) > txq = netdev_get_tx_queue(vi->dev, index); > __netif_tx_lock(txq, raw_smp_processor_id()); > virtqueue_disable_cb(sq->vq); > - free_old_xmit(sq, true); > + free_old_xmit(sq, txq, true); > > if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) > netif_tx_wake_queue(txq); > @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > struct send_queue *sq = &vi->sq[qnum]; > int err; > struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); > - bool kick = !netdev_xmit_more(); > + bool xmit_more = netdev_xmit_more(); > bool use_napi = sq->napi.weight; > > /* Free up any pending old buffers before queueing new ones. 
*/ > @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > if (use_napi) > virtqueue_disable_cb(sq->vq); > > - free_old_xmit(sq, false); > + free_old_xmit(sq, txq, false); > > - } while (use_napi && kick && > + } while (use_napi && !xmit_more && > unlikely(!virtqueue_enable_cb_delayed(sq->vq))); > > /* timestamp packet in software */ > @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > > check_sq_full_and_disable(vi, dev, sq); > > - if (kick || netif_xmit_stopped(txq)) { > + if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) { > if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { > u64_stats_update_begin(&sq->stats.syncp); > u64_stats_inc(&sq->stats.kicks); > -- > 2.44.0 > >
Fri, May 10, 2024 at 09:11:16AM CEST, hengqi@linux.alibaba.com wrote: >On Thu, 9 May 2024 13:46:15 +0200, Jiri Pirko <jiri@resnulli.us> wrote: >> From: Jiri Pirko <jiri@nvidia.com> >> >> Add support for Byte Queue Limits (BQL). > >Historically both Jason and Michael have attempted to support BQL >for virtio-net, for example: > >https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > >These discussions focus primarily on: > >1. BQL is based on napi tx. Therefore, the transfer of statistical information >needs to rely on the judgment of use_napi. When the napi mode is switched to >orphan, some statistical information will be lost, resulting in temporary >inaccuracy in BQL. > >2. If tx dim is supported, orphan mode may be removed and tx irq will be more >reasonable. This provides good support for BQL. Thanks for the pointers, will check that out. > >Thanks. > >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >> --- >> drivers/net/virtio_net.c | 33 ++++++++++++++++++++------------- >> 1 file changed, 20 insertions(+), 13 deletions(-) >> >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c >> index 218a446c4c27..c53d6dc6d332 100644 >> --- a/drivers/net/virtio_net.c >> +++ b/drivers/net/virtio_net.c >> @@ -84,7 +84,9 @@ struct virtnet_stat_desc { >> >> struct virtnet_sq_free_stats { >> u64 packets; >> + u64 xdp_packets; >> u64 bytes; >> + u64 xdp_bytes; >> }; >> >> struct virtnet_sq_stats { >> @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi, >> void *ptr; >> >> while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { >> - ++stats->packets; >> - >> if (!is_xdp_frame(ptr)) { >> struct sk_buff *skb = ptr; >> >> pr_debug("Sent skb %p\n", skb); >> >> + stats->packets++; >> stats->bytes += skb->len; >> napi_consume_skb(skb, in_napi); >> } else { >> struct xdp_frame *frame = ptr_to_xdp(ptr); >> >> - stats->bytes += xdp_get_frame_len(frame); >> + stats->xdp_packets++; >> + stats->xdp_bytes += 
xdp_get_frame_len(frame); >> xdp_return_frame(frame); >> } >> } >> @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) >> virtnet_rq_free_buf(vi, rq, buf); >> } >> >> -static void free_old_xmit(struct send_queue *sq, bool in_napi) >> +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, >> + bool in_napi) >> { >> struct virtnet_sq_free_stats stats = {0}; >> >> @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi) >> /* Avoid overhead when no packets have been processed >> * happens when called speculatively from start_xmit. >> */ >> - if (!stats.packets) >> + if (!stats.packets && !stats.xdp_packets) >> return; >> >> + netdev_tx_completed_queue(txq, stats.packets, stats.bytes); >> + >> u64_stats_update_begin(&sq->stats.syncp); >> u64_stats_add(&sq->stats.bytes, stats.bytes); >> u64_stats_add(&sq->stats.packets, stats.packets); >> @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, >> * early means 16 slots are typically wasted. >> */ >> if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { >> - netif_stop_subqueue(dev, qnum); >> + struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); >> + >> + netif_tx_stop_queue(txq); >> if (use_napi) { >> if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) >> virtqueue_napi_schedule(&sq->napi, sq->vq); >> } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { >> /* More just got used, free them then recheck. 
*/ >> - free_old_xmit(sq, false); >> + free_old_xmit(sq, txq, false); >> if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { >> netif_start_subqueue(dev, qnum); >> virtqueue_disable_cb(sq->vq); >> @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq) >> >> do { >> virtqueue_disable_cb(sq->vq); >> - free_old_xmit(sq, true); >> + free_old_xmit(sq, txq, true); >> } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); >> >> if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) >> @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) >> txq = netdev_get_tx_queue(vi->dev, index); >> __netif_tx_lock(txq, raw_smp_processor_id()); >> virtqueue_disable_cb(sq->vq); >> - free_old_xmit(sq, true); >> + free_old_xmit(sq, txq, true); >> >> if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) >> netif_tx_wake_queue(txq); >> @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) >> struct send_queue *sq = &vi->sq[qnum]; >> int err; >> struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); >> - bool kick = !netdev_xmit_more(); >> + bool xmit_more = netdev_xmit_more(); >> bool use_napi = sq->napi.weight; >> >> /* Free up any pending old buffers before queueing new ones. 
*/ >> @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) >> if (use_napi) >> virtqueue_disable_cb(sq->vq); >> >> - free_old_xmit(sq, false); >> + free_old_xmit(sq, txq, false); >> >> - } while (use_napi && kick && >> + } while (use_napi && !xmit_more && >> unlikely(!virtqueue_enable_cb_delayed(sq->vq))); >> >> /* timestamp packet in software */ >> @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) >> >> check_sq_full_and_disable(vi, dev, sq); >> >> - if (kick || netif_xmit_stopped(txq)) { >> + if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) { >> if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { >> u64_stats_update_begin(&sq->stats.syncp); >> u64_stats_inc(&sq->stats.kicks); >> -- >> 2.44.0 >> >>
Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: >> >> From: Jiri Pirko <jiri@nvidia.com> >> >> >> >> Add support for Byte Queue Limits (BQL). >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >> > >> >Can we get more detail on the benefits you observe etc? >> >Thanks! >> >> More info about the BQL in general is here: >> https://lwn.net/Articles/469652/ > >I know about BQL in general. We discussed BQL for virtio in the past >mostly I got the feedback from net core maintainers that it likely won't >benefit virtio. Do you have some link to that, or is it this thread: https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ I don't see why virtio should be any different from other drivers/devices that benefit from bql. HOL blocking is the same here as everywhere. > >So I'm asking, what kind of benefit do you observe? I don't have measurements at hand, will attach them to v2. Thanks! > >-- >MST >
On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote: > Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: > >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: > >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: > >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: > >> >> From: Jiri Pirko <jiri@nvidia.com> > >> >> > >> >> Add support for Byte Queue Limits (BQL). > >> >> > >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> > >> > > >> >Can we get more detail on the benefits you observe etc? > >> >Thanks! > >> > >> More info about the BQL in general is here: > >> https://lwn.net/Articles/469652/ > > > >I know about BQL in general. We discussed BQL for virtio in the past > >mostly I got the feedback from net core maintainers that it likely won't > >benefit virtio. > > Do you have some link to that, or is it this thread: > https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ A quick search on lore turned up this, for example: https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/ > I don't see why virtio should be any different from other > drivers/devices that benefit from bql. HOL blocking is the same here are > everywhere. > > > > >So I'm asking, what kind of benefit do you observe? > > I don't have measurements at hand, will attach them to v2. > > Thanks! > > > > >-- > >MST > >
Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote: >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote: >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: >> >> >> From: Jiri Pirko <jiri@nvidia.com> >> >> >> >> >> >> Add support for Byte Queue Limits (BQL). >> >> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >> >> > >> >> >Can we get more detail on the benefits you observe etc? >> >> >Thanks! >> >> >> >> More info about the BQL in general is here: >> >> https://lwn.net/Articles/469652/ >> > >> >I know about BQL in general. We discussed BQL for virtio in the past >> >mostly I got the feedback from net core maintainers that it likely won't >> >benefit virtio. >> >> Do you have some link to that, or is it this thread: >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > > >A quick search on lore turned up this, for example: >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/ Says: "Note that NIC with many TX queues make BQL almost useless, only adding extra overhead." But virtio can have one tx queue, I guess that could be quite common configuration in lot of deployments. > > > > >> I don't see why virtio should be any different from other >> drivers/devices that benefit from bql. HOL blocking is the same here are >> everywhere. >> >> > >> >So I'm asking, what kind of benefit do you observe? >> >> I don't have measurements at hand, will attach them to v2. >> >> Thanks! >> >> > >> >-- >> >MST >> > >
On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote: > Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote: > >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote: > >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: > >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: > >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: > >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: > >> >> >> From: Jiri Pirko <jiri@nvidia.com> > >> >> >> > >> >> >> Add support for Byte Queue Limits (BQL). > >> >> >> > >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> > >> >> > > >> >> >Can we get more detail on the benefits you observe etc? > >> >> >Thanks! > >> >> > >> >> More info about the BQL in general is here: > >> >> https://lwn.net/Articles/469652/ > >> > > >> >I know about BQL in general. We discussed BQL for virtio in the past > >> >mostly I got the feedback from net core maintainers that it likely won't > >> >benefit virtio. > >> > >> Do you have some link to that, or is it this thread: > >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > > > > > >A quick search on lore turned up this, for example: > >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/ > > Says: > "Note that NIC with many TX queues make BQL almost useless, only adding extra > overhead." > > But virtio can have one tx queue, I guess that could be quite common > configuration in lot of deployments. Not sure we should worry about performance for these though. What I am saying is this should come with some benchmarking results. > > > > > > > > > > >> I don't see why virtio should be any different from other > >> drivers/devices that benefit from bql. HOL blocking is the same here are > >> everywhere. > >> > >> > > >> >So I'm asking, what kind of benefit do you observe? > >> > >> I don't have measurements at hand, will attach them to v2. > >> > >> Thanks! 
> >> > >> > > >> >-- > >> >MST > >> > > >
Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote: >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote: >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote: >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote: >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: >> >> >> >> From: Jiri Pirko <jiri@nvidia.com> >> >> >> >> >> >> >> >> Add support for Byte Queue Limits (BQL). >> >> >> >> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >> >> >> > >> >> >> >Can we get more detail on the benefits you observe etc? >> >> >> >Thanks! >> >> >> >> >> >> More info about the BQL in general is here: >> >> >> https://lwn.net/Articles/469652/ >> >> > >> >> >I know about BQL in general. We discussed BQL for virtio in the past >> >> >mostly I got the feedback from net core maintainers that it likely won't >> >> >benefit virtio. >> >> >> >> Do you have some link to that, or is it this thread: >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ >> > >> > >> >A quick search on lore turned up this, for example: >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/ >> >> Says: >> "Note that NIC with many TX queues make BQL almost useless, only adding extra >> overhead." >> >> But virtio can have one tx queue, I guess that could be quite common >> configuration in lot of deployments. > >Not sure we should worry about performance for these though. Well, queues may be scarce resource sometimes, even in those cases, you want to perform. >What I am saying is this should come with some benchmarking >results. Sure, I got the message. > > >> >> > >> > >> > >> > >> >> I don't see why virtio should be any different from other >> >> drivers/devices that benefit from bql. 
HOL blocking is the same here are >> >> everywhere. >> >> >> >> > >> >> >So I'm asking, what kind of benefit do you observe? >> >> >> >> I don't have measurements at hand, will attach them to v2. >> >> >> >> Thanks! >> >> >> >> > >> >> >-- >> >> >MST >> >> > >> > >
Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote: >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote: >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote: >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote: >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: >> >> >> >> From: Jiri Pirko <jiri@nvidia.com> >> >> >> >> >> >> >> >> Add support for Byte Queue Limits (BQL). >> >> >> >> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >> >> >> > >> >> >> >Can we get more detail on the benefits you observe etc? >> >> >> >Thanks! >> >> >> >> >> >> More info about the BQL in general is here: >> >> >> https://lwn.net/Articles/469652/ >> >> > >> >> >I know about BQL in general. We discussed BQL for virtio in the past >> >> >mostly I got the feedback from net core maintainers that it likely won't >> >> >benefit virtio. >> >> >> >> Do you have some link to that, or is it this thread: >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ >> > >> > >> >A quick search on lore turned up this, for example: >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/ >> >> Says: >> "Note that NIC with many TX queues make BQL almost useless, only adding extra >> overhead." >> >> But virtio can have one tx queue, I guess that could be quite common >> configuration in lot of deployments. > >Not sure we should worry about performance for these though. >What I am saying is this should come with some benchmarking >results. 
I did some measurements with VDPA, backed by ConnectX6dx NIC, single queue pair: super_netperf 200 -H $ip -l 45 -t TCP_STREAM & nice -n 20 netperf -H $ip -l 10 -t TCP_RR RR result with no bql: 29.95 32.74 28.77 RR result with bql: 222.98 159.81 197.88 > > >> >> > >> > >> > >> > >> >> I don't see why virtio should be any different from other >> >> drivers/devices that benefit from bql. HOL blocking is the same here are >> >> everywhere. >> >> >> >> > >> >> >So I'm asking, what kind of benefit do you observe? >> >> >> >> I don't have measurements at hand, will attach them to v2. >> >> >> >> Thanks! >> >> >> >> > >> >> >-- >> >> >MST >> >> > >> > >
On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote: > Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote: > >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote: > >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote: > >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote: > >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: > >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: > >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: > >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: > >> >> >> >> From: Jiri Pirko <jiri@nvidia.com> > >> >> >> >> > >> >> >> >> Add support for Byte Queue Limits (BQL). > >> >> >> >> > >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> > >> >> >> > > >> >> >> >Can we get more detail on the benefits you observe etc? > >> >> >> >Thanks! > >> >> >> > >> >> >> More info about the BQL in general is here: > >> >> >> https://lwn.net/Articles/469652/ > >> >> > > >> >> >I know about BQL in general. We discussed BQL for virtio in the past > >> >> >mostly I got the feedback from net core maintainers that it likely won't > >> >> >benefit virtio. > >> >> > >> >> Do you have some link to that, or is it this thread: > >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > >> > > >> > > >> >A quick search on lore turned up this, for example: > >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/ > >> > >> Says: > >> "Note that NIC with many TX queues make BQL almost useless, only adding extra > >> overhead." > >> > >> But virtio can have one tx queue, I guess that could be quite common > >> configuration in lot of deployments. > > > >Not sure we should worry about performance for these though. > >What I am saying is this should come with some benchmarking > >results. 
> > I did some measurements with VDPA, backed by ConnectX6dx NIC, single > queue pair: > > super_netperf 200 -H $ip -l 45 -t TCP_STREAM & > nice -n 20 netperf -H $ip -l 10 -t TCP_RR > > RR result with no bql: > 29.95 > 32.74 > 28.77 > > RR result with bql: > 222.98 > 159.81 > 197.88 > Okay. And on the other hand, any measurable degradation with multiqueue and when testing throughput? > > > > > > >> > >> > > >> > > >> > > >> > > >> >> I don't see why virtio should be any different from other > >> >> drivers/devices that benefit from bql. HOL blocking is the same here are > >> >> everywhere. > >> >> > >> >> > > >> >> >So I'm asking, what kind of benefit do you observe? > >> >> > >> >> I don't have measurements at hand, will attach them to v2. > >> >> > >> >> Thanks! > >> >> > >> >> > > >> >> >-- > >> >> >MST > >> >> > > >> > > >
Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote: >On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote: >> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote: >> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote: >> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote: >> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote: >> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: >> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: >> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: >> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: >> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com> >> >> >> >> >> >> >> >> >> >> Add support for Byte Queue Limits (BQL). >> >> >> >> >> >> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >> >> >> >> > >> >> >> >> >Can we get more detail on the benefits you observe etc? >> >> >> >> >Thanks! >> >> >> >> >> >> >> >> More info about the BQL in general is here: >> >> >> >> https://lwn.net/Articles/469652/ >> >> >> > >> >> >> >I know about BQL in general. We discussed BQL for virtio in the past >> >> >> >mostly I got the feedback from net core maintainers that it likely won't >> >> >> >benefit virtio. >> >> >> >> >> >> Do you have some link to that, or is it this thread: >> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ >> >> > >> >> > >> >> >A quick search on lore turned up this, for example: >> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/ >> >> >> >> Says: >> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra >> >> overhead." >> >> >> >> But virtio can have one tx queue, I guess that could be quite common >> >> configuration in lot of deployments. >> > >> >Not sure we should worry about performance for these though. >> >What I am saying is this should come with some benchmarking >> >results. 
>> >> I did some measurements with VDPA, backed by ConnectX6dx NIC, single >> queue pair: >> >> super_netperf 200 -H $ip -l 45 -t TCP_STREAM & >> nice -n 20 netperf -H $ip -l 10 -t TCP_RR >> >> RR result with no bql: >> 29.95 >> 32.74 >> 28.77 >> >> RR result with bql: >> 222.98 >> 159.81 >> 197.88 >> > >Okay. And on the other hand, any measureable degradation with >multiqueue and when testing throughput? With multiqueue it depends if the flows hit the same queue or not. If they do, the same results will likely be shown. > > >> >> > >> > >> >> >> >> > >> >> > >> >> > >> >> > >> >> >> I don't see why virtio should be any different from other >> >> >> drivers/devices that benefit from bql. HOL blocking is the same here are >> >> >> everywhere. >> >> >> >> >> >> > >> >> >> >So I'm asking, what kind of benefit do you observe? >> >> >> >> >> >> I don't have measurements at hand, will attach them to v2. >> >> >> >> >> >> Thanks! >> >> >> >> >> >> > >> >> >> >-- >> >> >> >MST >> >> >> > >> >> > >> > >
Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote: >Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote: >>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote: >>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote: >>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote: >>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote: >>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote: >>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: >>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: >>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: >>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: >>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com> >>> >> >> >> >> >>> >> >> >> >> Add support for Byte Queue Limits (BQL). >>> >> >> >> >> >>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >>> >> >> >> > >>> >> >> >> >Can we get more detail on the benefits you observe etc? >>> >> >> >> >Thanks! >>> >> >> >> >>> >> >> >> More info about the BQL in general is here: >>> >> >> >> https://lwn.net/Articles/469652/ >>> >> >> > >>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past >>> >> >> >mostly I got the feedback from net core maintainers that it likely won't >>> >> >> >benefit virtio. >>> >> >> >>> >> >> Do you have some link to that, or is it this thread: >>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ >>> >> > >>> >> > >>> >> >A quick search on lore turned up this, for example: >>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/ >>> >> >>> >> Says: >>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra >>> >> overhead." >>> >> >>> >> But virtio can have one tx queue, I guess that could be quite common >>> >> configuration in lot of deployments. 
>>> > >>> >Not sure we should worry about performance for these though. >>> >What I am saying is this should come with some benchmarking >>> >results. >>> >>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single >>> queue pair: >>> >>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM & >>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR >>> >>> RR result with no bql: >>> 29.95 >>> 32.74 >>> 28.77 >>> >>> RR result with bql: >>> 222.98 >>> 159.81 >>> 197.88 >>> >> >>Okay. And on the other hand, any measureable degradation with >>multiqueue and when testing throughput? > >With multiqueue it depends if the flows hits the same queue or not. If >they do, the same results will likely be shown. RR 1q, w/o bql: 29.95 32.74 28.77 RR 1q, with bql: 222.98 159.81 197.88 RR 4q, w/o bql: 355.82 364.58 233.47 RR 4q, with bql: 371.19 255.93 337.77 So answer to your question is: "no measurable degradation with 4 queues". > > >> >> >>> >>> > >>> > >>> >> >>> >> > >>> >> > >>> >> > >>> >> > >>> >> >> I don't see why virtio should be any different from other >>> >> >> drivers/devices that benefit from bql. HOL blocking is the same here are >>> >> >> everywhere. >>> >> >> >>> >> >> > >>> >> >> >So I'm asking, what kind of benefit do you observe? >>> >> >> >>> >> >> I don't have measurements at hand, will attach them to v2. >>> >> >> >>> >> >> Thanks! >>> >> >> >>> >> >> > >>> >> >> >-- >>> >> >> >MST >>> >> >> > >>> >> > >>> > >>
On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote: > > Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote: > >Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote: > >>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote: > >>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote: > >>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote: > >>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote: > >>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote: > >>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: > >>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: > >>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: > >>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: > >>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com> > >>> >> >> >> >> > >>> >> >> >> >> Add support for Byte Queue Limits (BQL). > >>> >> >> >> >> > >>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> > >>> >> >> >> > > >>> >> >> >> >Can we get more detail on the benefits you observe etc? > >>> >> >> >> >Thanks! > >>> >> >> >> > >>> >> >> >> More info about the BQL in general is here: > >>> >> >> >> https://lwn.net/Articles/469652/ > >>> >> >> > > >>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past > >>> >> >> >mostly I got the feedback from net core maintainers that it likely won't > >>> >> >> >benefit virtio. > >>> >> >> > >>> >> >> Do you have some link to that, or is it this thread: > >>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > >>> >> > > >>> >> > > >>> >> >A quick search on lore turned up this, for example: > >>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/ > >>> >> > >>> >> Says: > >>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra > >>> >> overhead." 
> >>> >> > >>> >> But virtio can have one tx queue, I guess that could be quite common > >>> >> configuration in lot of deployments. > >>> > > >>> >Not sure we should worry about performance for these though. > >>> >What I am saying is this should come with some benchmarking > >>> >results. > >>> > >>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single > >>> queue pair: > >>> > >>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM & > >>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR > >>> > >>> RR result with no bql: > >>> 29.95 > >>> 32.74 > >>> 28.77 > >>> > >>> RR result with bql: > >>> 222.98 > >>> 159.81 > >>> 197.88 > >>> > >> > >>Okay. And on the other hand, any measureable degradation with > >>multiqueue and when testing throughput? > > > >With multiqueue it depends if the flows hits the same queue or not. If > >they do, the same results will likely be shown. > > RR 1q, w/o bql: > 29.95 > 32.74 > 28.77 > > RR 1q, with bql: > 222.98 > 159.81 > 197.88 > > RR 4q, w/o bql: > 355.82 > 364.58 > 233.47 > > RR 4q, with bql: > 371.19 > 255.93 > 337.77 > > So answer to your question is: "no measurable degradation with 4 > queues". Thanks but I think we also need benchmarks in cases other than vDPA. For example, a simple virtualization setup.
Thu, May 16, 2024 at 06:48:38AM CEST, jasowang@redhat.com wrote: >On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote: >> >> Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote: >> >Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote: >> >>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote: >> >>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote: >> >>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote: >> >>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote: >> >>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote: >> >>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: >> >>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: >> >>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: >> >>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: >> >>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com> >> >>> >> >> >> >> >> >>> >> >> >> >> Add support for Byte Queue Limits (BQL). >> >>> >> >> >> >> >> >>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >> >>> >> >> >> > >> >>> >> >> >> >Can we get more detail on the benefits you observe etc? >> >>> >> >> >> >Thanks! >> >>> >> >> >> >> >>> >> >> >> More info about the BQL in general is here: >> >>> >> >> >> https://lwn.net/Articles/469652/ >> >>> >> >> > >> >>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past >> >>> >> >> >mostly I got the feedback from net core maintainers that it likely won't >> >>> >> >> >benefit virtio. 
>> >>> >> >> >> >>> >> >> Do you have some link to that, or is it this thread: >> >>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ >> >>> >> > >> >>> >> > >> >>> >> >A quick search on lore turned up this, for example: >> >>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/ >> >>> >> >> >>> >> Says: >> >>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra >> >>> >> overhead." >> >>> >> >> >>> >> But virtio can have one tx queue, I guess that could be quite common >> >>> >> configuration in lot of deployments. >> >>> > >> >>> >Not sure we should worry about performance for these though. >> >>> >What I am saying is this should come with some benchmarking >> >>> >results. >> >>> >> >>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single >> >>> queue pair: >> >>> >> >>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM & >> >>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR >> >>> >> >>> RR result with no bql: >> >>> 29.95 >> >>> 32.74 >> >>> 28.77 >> >>> >> >>> RR result with bql: >> >>> 222.98 >> >>> 159.81 >> >>> 197.88 >> >>> >> >> >> >>Okay. And on the other hand, any measureable degradation with >> >>multiqueue and when testing throughput? >> > >> >With multiqueue it depends if the flows hits the same queue or not. If >> >they do, the same results will likely be shown. >> >> RR 1q, w/o bql: >> 29.95 >> 32.74 >> 28.77 >> >> RR 1q, with bql: >> 222.98 >> 159.81 >> 197.88 >> >> RR 4q, w/o bql: >> 355.82 >> 364.58 >> 233.47 >> >> RR 4q, with bql: >> 371.19 >> 255.93 >> 337.77 >> >> So answer to your question is: "no measurable degradation with 4 >> queues". > >Thanks but I think we also need benchmarks in cases other than vDPA. >For example, a simple virtualization setup. For virtualization setup, I get this: VIRT RR 1q, w/0 bql: 49.18 49.75 50.07 VIRT RR 1q, with bql: 51.33 47.88 40.40 No measurable/significant difference. >
On Thu, May 16, 2024 at 12:54:58PM +0200, Jiri Pirko wrote: > Thu, May 16, 2024 at 06:48:38AM CEST, jasowang@redhat.com wrote: > >On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote: > >> > >> Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote: > >> >Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote: > >> >>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote: > >> >>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote: > >> >>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote: > >> >>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote: > >> >>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote: > >> >>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: > >> >>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: > >> >>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: > >> >>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: > >> >>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com> > >> >>> >> >> >> >> > >> >>> >> >> >> >> Add support for Byte Queue Limits (BQL). > >> >>> >> >> >> >> > >> >>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> > >> >>> >> >> >> > > >> >>> >> >> >> >Can we get more detail on the benefits you observe etc? > >> >>> >> >> >> >Thanks! > >> >>> >> >> >> > >> >>> >> >> >> More info about the BQL in general is here: > >> >>> >> >> >> https://lwn.net/Articles/469652/ > >> >>> >> >> > > >> >>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past > >> >>> >> >> >mostly I got the feedback from net core maintainers that it likely won't > >> >>> >> >> >benefit virtio. 
> >> >>> >> >> > >> >>> >> >> Do you have some link to that, or is it this thread: > >> >>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ > >> >>> >> > > >> >>> >> > > >> >>> >> >A quick search on lore turned up this, for example: > >> >>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/ > >> >>> >> > >> >>> >> Says: > >> >>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra > >> >>> >> overhead." > >> >>> >> > >> >>> >> But virtio can have one tx queue, I guess that could be quite common > >> >>> >> configuration in lot of deployments. > >> >>> > > >> >>> >Not sure we should worry about performance for these though. > >> >>> >What I am saying is this should come with some benchmarking > >> >>> >results. > >> >>> > >> >>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single > >> >>> queue pair: > >> >>> > >> >>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM & > >> >>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR > >> >>> > >> >>> RR result with no bql: > >> >>> 29.95 > >> >>> 32.74 > >> >>> 28.77 > >> >>> > >> >>> RR result with bql: > >> >>> 222.98 > >> >>> 159.81 > >> >>> 197.88 > >> >>> > >> >> > >> >>Okay. And on the other hand, any measureable degradation with > >> >>multiqueue and when testing throughput? > >> > > >> >With multiqueue it depends if the flows hits the same queue or not. If > >> >they do, the same results will likely be shown. > >> > >> RR 1q, w/o bql: > >> 29.95 > >> 32.74 > >> 28.77 > >> > >> RR 1q, with bql: > >> 222.98 > >> 159.81 > >> 197.88 > >> > >> RR 4q, w/o bql: > >> 355.82 > >> 364.58 > >> 233.47 > >> > >> RR 4q, with bql: > >> 371.19 > >> 255.93 > >> 337.77 > >> > >> So answer to your question is: "no measurable degradation with 4 > >> queues". > > > >Thanks but I think we also need benchmarks in cases other than vDPA. > >For example, a simple virtualization setup. 
> > For virtualization setup, I get this: > > VIRT RR 1q, w/o bql: > 49.18 > 49.75 > 50.07 > > VIRT RR 1q, with bql: > 51.33 > 47.88 > 40.40 > > No measurable/significant difference. Seems the results became much noisier? Also I'd expect a regression, if any, to be in a streaming benchmark.
Thu, May 16, 2024 at 02:31:59PM CEST, mst@redhat.com wrote: >On Thu, May 16, 2024 at 12:54:58PM +0200, Jiri Pirko wrote: >> Thu, May 16, 2024 at 06:48:38AM CEST, jasowang@redhat.com wrote: >> >On Wed, May 15, 2024 at 8:54 PM Jiri Pirko <jiri@resnulli.us> wrote: >> >> >> >> Wed, May 15, 2024 at 12:12:51PM CEST, jiri@resnulli.us wrote: >> >> >Wed, May 15, 2024 at 10:20:04AM CEST, mst@redhat.com wrote: >> >> >>On Wed, May 15, 2024 at 09:34:08AM +0200, Jiri Pirko wrote: >> >> >>> Fri, May 10, 2024 at 01:27:08PM CEST, mst@redhat.com wrote: >> >> >>> >On Fri, May 10, 2024 at 01:11:49PM +0200, Jiri Pirko wrote: >> >> >>> >> Fri, May 10, 2024 at 12:52:52PM CEST, mst@redhat.com wrote: >> >> >>> >> >On Fri, May 10, 2024 at 12:37:15PM +0200, Jiri Pirko wrote: >> >> >>> >> >> Thu, May 09, 2024 at 04:28:12PM CEST, mst@redhat.com wrote: >> >> >>> >> >> >On Thu, May 09, 2024 at 03:31:56PM +0200, Jiri Pirko wrote: >> >> >>> >> >> >> Thu, May 09, 2024 at 02:41:39PM CEST, mst@redhat.com wrote: >> >> >>> >> >> >> >On Thu, May 09, 2024 at 01:46:15PM +0200, Jiri Pirko wrote: >> >> >>> >> >> >> >> From: Jiri Pirko <jiri@nvidia.com> >> >> >>> >> >> >> >> >> >> >>> >> >> >> >> Add support for Byte Queue Limits (BQL). >> >> >>> >> >> >> >> >> >> >>> >> >> >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >> >> >>> >> >> >> > >> >> >>> >> >> >> >Can we get more detail on the benefits you observe etc? >> >> >>> >> >> >> >Thanks! >> >> >>> >> >> >> >> >> >>> >> >> >> More info about the BQL in general is here: >> >> >>> >> >> >> https://lwn.net/Articles/469652/ >> >> >>> >> >> > >> >> >>> >> >> >I know about BQL in general. We discussed BQL for virtio in the past >> >> >>> >> >> >mostly I got the feedback from net core maintainers that it likely won't >> >> >>> >> >> >benefit virtio. 
>> >> >>> >> >> >> >> >>> >> >> Do you have some link to that, or is it this thread: >> >> >>> >> >> https://lore.kernel.org/netdev/21384cb5-99a6-7431-1039-b356521e1bc3@redhat.com/ >> >> >>> >> > >> >> >>> >> > >> >> >>> >> >A quick search on lore turned up this, for example: >> >> >>> >> >https://lore.kernel.org/all/a11eee78-b2a1-3dbc-4821-b5f4bfaae819@gmail.com/ >> >> >>> >> >> >> >>> >> Says: >> >> >>> >> "Note that NIC with many TX queues make BQL almost useless, only adding extra >> >> >>> >> overhead." >> >> >>> >> >> >> >>> >> But virtio can have one tx queue, I guess that could be quite common >> >> >>> >> configuration in lot of deployments. >> >> >>> > >> >> >>> >Not sure we should worry about performance for these though. >> >> >>> >What I am saying is this should come with some benchmarking >> >> >>> >results. >> >> >>> >> >> >>> I did some measurements with VDPA, backed by ConnectX6dx NIC, single >> >> >>> queue pair: >> >> >>> >> >> >>> super_netperf 200 -H $ip -l 45 -t TCP_STREAM & >> >> >>> nice -n 20 netperf -H $ip -l 10 -t TCP_RR >> >> >>> >> >> >>> RR result with no bql: >> >> >>> 29.95 >> >> >>> 32.74 >> >> >>> 28.77 >> >> >>> >> >> >>> RR result with bql: >> >> >>> 222.98 >> >> >>> 159.81 >> >> >>> 197.88 >> >> >>> >> >> >> >> >> >>Okay. And on the other hand, any measureable degradation with >> >> >>multiqueue and when testing throughput? >> >> > >> >> >With multiqueue it depends if the flows hits the same queue or not. If >> >> >they do, the same results will likely be shown. >> >> >> >> RR 1q, w/o bql: >> >> 29.95 >> >> 32.74 >> >> 28.77 >> >> >> >> RR 1q, with bql: >> >> 222.98 >> >> 159.81 >> >> 197.88 >> >> >> >> RR 4q, w/o bql: >> >> 355.82 >> >> 364.58 >> >> 233.47 >> >> >> >> RR 4q, with bql: >> >> 371.19 >> >> 255.93 >> >> 337.77 >> >> >> >> So answer to your question is: "no measurable degradation with 4 >> >> queues". >> > >> >Thanks but I think we also need benchmarks in cases other than vDPA. 
>> >For example, a simple virtualization setup. >> >> For virtualization setup, I get this: >> >> VIRT RR 1q, w/0 bql: >> 49.18 >> 49.75 >> 50.07 >> >> VIRT RR 1q, with bql: >> 51.33 >> 47.88 >> 40.40 >> >> No measurable/significant difference. > >Seems the results became much noisier? Also Not enough data to assume that, I believe. >I'd expect a regression if any to be in a streaming benchmark. Can you elaborate? > >-- >MST >
On Thu, May 16, 2024 at 05:25:20PM +0200, Jiri Pirko wrote: > > >I'd expect a regression if any to be in a streaming benchmark. > > Can you elaborate? BQL does two things that can hurt throughput: - limits amount of data in the queue - can limit bandwidth if we now get queue underruns - adds CPU overhead - can limit bandwidth if CPU bound So checking result of a streaming benchmark seems important.
Thu, May 16, 2024 at 09:04:41PM CEST, mst@redhat.com wrote: >On Thu, May 16, 2024 at 05:25:20PM +0200, Jiri Pirko wrote: >> >> >I'd expect a regression if any to be in a streaming benchmark. >> >> Can you elaborate? > >BQL does two things that can hurt throughput: >- limits amount of data in the queue - can limit bandwidth > if we now get queue underruns >- adds CPU overhead - can limit bandwidth if CPU bound > >So checking result of a streaming benchmark seems important. I understand. Didn't see any degradation in TCP_STREAM netperf. But I will try to extend the testing. Thanks! > > >-- >MST >
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 218a446c4c27..c53d6dc6d332 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -84,7 +84,9 @@ struct virtnet_stat_desc { struct virtnet_sq_free_stats { u64 packets; + u64 xdp_packets; u64 bytes; + u64 xdp_bytes; }; struct virtnet_sq_stats { @@ -512,19 +514,19 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi, void *ptr; while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { - ++stats->packets; - if (!is_xdp_frame(ptr)) { struct sk_buff *skb = ptr; pr_debug("Sent skb %p\n", skb); + stats->packets++; stats->bytes += skb->len; napi_consume_skb(skb, in_napi); } else { struct xdp_frame *frame = ptr_to_xdp(ptr); - stats->bytes += xdp_get_frame_len(frame); + stats->xdp_packets++; + stats->xdp_bytes += xdp_get_frame_len(frame); xdp_return_frame(frame); } } @@ -965,7 +967,8 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) virtnet_rq_free_buf(vi, rq, buf); } -static void free_old_xmit(struct send_queue *sq, bool in_napi) +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, + bool in_napi) { struct virtnet_sq_free_stats stats = {0}; @@ -974,9 +977,11 @@ static void free_old_xmit(struct send_queue *sq, bool in_napi) /* Avoid overhead when no packets have been processed * happens when called speculatively from start_xmit. */ - if (!stats.packets) + if (!stats.packets && !stats.xdp_packets) return; + netdev_tx_completed_queue(txq, stats.packets, stats.bytes); + u64_stats_update_begin(&sq->stats.syncp); u64_stats_add(&sq->stats.bytes, stats.bytes); u64_stats_add(&sq->stats.packets, stats.packets); @@ -1013,13 +1018,15 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, * early means 16 slots are typically wasted. 
*/ if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { - netif_stop_subqueue(dev, qnum); + struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); + + netif_tx_stop_queue(txq); if (use_napi) { if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) virtqueue_napi_schedule(&sq->napi, sq->vq); } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { /* More just got used, free them then recheck. */ - free_old_xmit(sq, false); + free_old_xmit(sq, txq, false); if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { netif_start_subqueue(dev, qnum); virtqueue_disable_cb(sq->vq); @@ -2319,7 +2326,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq) do { virtqueue_disable_cb(sq->vq); - free_old_xmit(sq, true); + free_old_xmit(sq, txq, true); } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) @@ -2471,7 +2478,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) txq = netdev_get_tx_queue(vi->dev, index); __netif_tx_lock(txq, raw_smp_processor_id()); virtqueue_disable_cb(sq->vq); - free_old_xmit(sq, true); + free_old_xmit(sq, txq, true); if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) netif_tx_wake_queue(txq); @@ -2553,7 +2560,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) struct send_queue *sq = &vi->sq[qnum]; int err; struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); - bool kick = !netdev_xmit_more(); + bool xmit_more = netdev_xmit_more(); bool use_napi = sq->napi.weight; /* Free up any pending old buffers before queueing new ones. 
*/ @@ -2561,9 +2568,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) if (use_napi) virtqueue_disable_cb(sq->vq); - free_old_xmit(sq, false); + free_old_xmit(sq, txq, false); - } while (use_napi && kick && + } while (use_napi && !xmit_more && unlikely(!virtqueue_enable_cb_delayed(sq->vq))); /* timestamp packet in software */ @@ -2592,7 +2599,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) check_sq_full_and_disable(vi, dev, sq); - if (kick || netif_xmit_stopped(txq)) { + if (__netdev_tx_sent_queue(txq, skb->len, xmit_more)) { if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.kicks);