| Message ID | 20240618144456.1688998-1-jiri@resnulli.us |
|---|---|
| State | Accepted |
| Commit | c8bd1f7f3e61fc6c562c806045f3ccd2cc819c01 |
| Delegated to | Netdev Maintainers |
| Series | [net-next,v3] virtio_net: add support for Byte Queue Limits |
This looks like a sensible way to do this. Yet something to improve: On Tue, Jun 18, 2024 at 04:44:56PM +0200, Jiri Pirko wrote: > From: Jiri Pirko <jiri@nvidia.com> > > Add support for Byte Queue Limits (BQL). > > Tested on qemu emulated virtio_net device with 1, 2 and 4 queues. > Tested with fq_codel and pfifo_fast. Super netperf with 50 threads is > running in background. Netperf TCP_RR results: > > NOBQL FQC 1q: 159.56 159.33 158.50 154.31 agv: 157.925 > NOBQL FQC 2q: 184.64 184.96 174.73 174.15 agv: 179.62 > NOBQL FQC 4q: 994.46 441.96 416.50 499.56 agv: 588.12 > NOBQL PFF 1q: 148.68 148.92 145.95 149.48 agv: 148.2575 > NOBQL PFF 2q: 171.86 171.20 170.42 169.42 agv: 170.725 > NOBQL PFF 4q: 1505.23 1137.23 2488.70 3507.99 agv: 2159.7875 > BQL FQC 1q: 1332.80 1297.97 1351.41 1147.57 agv: 1282.4375 > BQL FQC 2q: 768.30 817.72 864.43 974.40 agv: 856.2125 > BQL FQC 4q: 945.66 942.68 878.51 822.82 agv: 897.4175 > BQL PFF 1q: 149.69 151.49 149.40 147.47 agv: 149.5125 > BQL PFF 2q: 2059.32 798.74 1844.12 381.80 agv: 1270.995 > BQL PFF 4q: 1871.98 4420.02 4916.59 13268.16 agv: 6119.1875 > > Signed-off-by: Jiri Pirko <jiri@nvidia.com> > --- > v2->v3: > - fixed the switch from/to orphan mode while skbs are yet to be > completed by using the second least significant bit in virtqueue > token pointer to indicate skb is orphan. Don't account orphan > skbs in completion. > - reorganized parallel skb/xdp free stats accounting to napi/others. > - fixed kick condition check in orphan mode > v1->v2: > - moved netdev_tx_completed_queue() call into __free_old_xmit(), > propagate use_napi flag to __free_old_xmit() and only call > netdev_tx_completed_queue() in case it is true > - added forgotten call to netdev_tx_reset_queue() > - fixed stats for xdp packets > - fixed bql accounting when __free_old_xmit() is called from xdp path > - handle the !use_napi case in start_xmit() kick section > --- > drivers/net/virtio_net.c | 81 ++++++++++++++++++++++++++++------------ > 1 file changed, 57 insertions(+), 24 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 61a57d134544..9f9b86874173 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -47,7 +47,8 @@ module_param(napi_tx, bool, 0644); > #define VIRTIO_XDP_TX BIT(0) > #define VIRTIO_XDP_REDIR BIT(1) > > -#define VIRTIO_XDP_FLAG BIT(0) > +#define VIRTIO_XDP_FLAG BIT(0) > +#define VIRTIO_ORPHAN_FLAG BIT(1) > > /* RX packet size EWMA. The average packet size is used to determine the packet > * buffer size when refilling RX rings. As the entire RX ring may be refilled > @@ -85,6 +86,8 @@ struct virtnet_stat_desc { > struct virtnet_sq_free_stats { > u64 packets; > u64 bytes; > + u64 napi_packets; > + u64 napi_bytes; > }; > > struct virtnet_sq_stats { > @@ -506,29 +509,50 @@ static struct xdp_frame *ptr_to_xdp(void *ptr) > return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG); > } > > -static void __free_old_xmit(struct send_queue *sq, bool in_napi, > - struct virtnet_sq_free_stats *stats) > +static bool is_orphan_skb(void *ptr) > +{ > + return (unsigned long)ptr & VIRTIO_ORPHAN_FLAG; > +} > + > +static void *skb_to_ptr(struct sk_buff *skb, bool orphan) > +{ > + return (void *)((unsigned long)skb | (orphan ? 
VIRTIO_ORPHAN_FLAG : 0)); > +} > + > +static struct sk_buff *ptr_to_skb(void *ptr) > +{ > + return (struct sk_buff *)((unsigned long)ptr & ~VIRTIO_ORPHAN_FLAG); > +} > + > +static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, > + bool in_napi, struct virtnet_sq_free_stats *stats) > { > unsigned int len; > void *ptr; > > while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { > - ++stats->packets; > - > if (!is_xdp_frame(ptr)) { > - struct sk_buff *skb = ptr; > + struct sk_buff *skb = ptr_to_skb(ptr); > > pr_debug("Sent skb %p\n", skb); > > - stats->bytes += skb->len; > + if (is_orphan_skb(ptr)) { > + stats->packets++; > + stats->bytes += skb->len; > + } else { > + stats->napi_packets++; > + stats->napi_bytes += skb->len; > + } > napi_consume_skb(skb, in_napi); > } else { > struct xdp_frame *frame = ptr_to_xdp(ptr); > > + stats->packets++; > stats->bytes += xdp_get_frame_len(frame); > xdp_return_frame(frame); > } > } > + netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes); Are you sure it's right? You are completing larger and larger number of bytes and packets each time. For example as won't this eventually trigger this inside dql_completed: BUG_ON(count > num_queued - dql->num_completed); ? If I am right the perf testing has to be redone with this fixed ... > } > > /* Converting between virtqueue no. and kernel tx/rx queue no. > @@ -955,21 +979,22 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) > virtnet_rq_free_buf(vi, rq, buf); > } > > -static void free_old_xmit(struct send_queue *sq, bool in_napi) > +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, > + bool in_napi) > { > struct virtnet_sq_free_stats stats = {0}; > > - __free_old_xmit(sq, in_napi, &stats); > + __free_old_xmit(sq, txq, in_napi, &stats); > > /* Avoid overhead when no packets have been processed > * happens when called speculatively from start_xmit. > */ > - if (!stats.packets) > + if (!stats.packets && !stats.napi_packets) > return; > > u64_stats_update_begin(&sq->stats.syncp); > - u64_stats_add(&sq->stats.bytes, stats.bytes); > - u64_stats_add(&sq->stats.packets, stats.packets); > + u64_stats_add(&sq->stats.bytes, stats.bytes + stats.napi_bytes); > + u64_stats_add(&sq->stats.packets, stats.packets + stats.napi_packets); > u64_stats_update_end(&sq->stats.syncp); > } > > @@ -1003,7 +1028,9 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, > * early means 16 slots are typically wasted. > */ > if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { > - netif_stop_subqueue(dev, qnum); > + struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); > + > + netif_tx_stop_queue(txq); > u64_stats_update_begin(&sq->stats.syncp); > u64_stats_inc(&sq->stats.stop); > u64_stats_update_end(&sq->stats.syncp); > @@ -1012,7 +1039,7 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, > virtqueue_napi_schedule(&sq->napi, sq->vq); > } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { > /* More just got used, free them then recheck. */ > - free_old_xmit(sq, false); > + free_old_xmit(sq, txq, false); > if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { > netif_start_subqueue(dev, qnum); > u64_stats_update_begin(&sq->stats.syncp); > @@ -1138,7 +1165,8 @@ static int virtnet_xdp_xmit(struct net_device *dev, > } > > /* Free up any pending old buffers before queueing new ones. 
*/ > - __free_old_xmit(sq, false, &stats); > + __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), > + false, &stats); > > for (i = 0; i < n; i++) { > struct xdp_frame *xdpf = frames[i]; > @@ -2313,7 +2341,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq) > > do { > virtqueue_disable_cb(sq->vq); > - free_old_xmit(sq, true); > + free_old_xmit(sq, txq, true); > } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); > > if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) { > @@ -2412,6 +2440,7 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > goto err_xdp_reg_mem_model; > > virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); > + netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index)); > virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi); > > return 0; > @@ -2471,7 +2500,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) > txq = netdev_get_tx_queue(vi->dev, index); > __netif_tx_lock(txq, raw_smp_processor_id()); > virtqueue_disable_cb(sq->vq); > - free_old_xmit(sq, true); > + free_old_xmit(sq, txq, true); > > if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) { > if (netif_tx_queue_stopped(txq)) { > @@ -2505,7 +2534,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) > return 0; > } > > -static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) > +static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan) > { > struct virtio_net_hdr_mrg_rxbuf *hdr; > const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; > @@ -2549,7 +2578,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) > return num_sg; > num_sg++; > } > - return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC); > + return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, > + skb_to_ptr(skb, orphan), GFP_ATOMIC); > } > > static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > @@ -2559,24 +2589,25 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > struct send_queue *sq = &vi->sq[qnum]; > int err; > struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); > - bool kick = !netdev_xmit_more(); > + bool xmit_more = netdev_xmit_more(); > bool use_napi = sq->napi.weight; > + bool kick; > > /* Free up any pending old buffers before queueing new ones. */ > do { > if (use_napi) > virtqueue_disable_cb(sq->vq); > > - free_old_xmit(sq, false); > + free_old_xmit(sq, txq, false); > > - } while (use_napi && kick && > + } while (use_napi && !xmit_more && > unlikely(!virtqueue_enable_cb_delayed(sq->vq))); > > /* timestamp packet in software */ > skb_tx_timestamp(skb); > > /* Try to transmit */ > - err = xmit_skb(sq, skb); > + err = xmit_skb(sq, skb, !use_napi); > > /* This should not happen! */ > if (unlikely(err)) { > @@ -2598,7 +2629,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) > > check_sq_full_and_disable(vi, dev, sq); > > - if (kick || netif_xmit_stopped(txq)) { > + kick = use_napi ? __netdev_tx_sent_queue(txq, skb->len, xmit_more) : > + !xmit_more || netif_xmit_stopped(txq); > + if (kick) { > if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { > u64_stats_update_begin(&sq->stats.syncp); > u64_stats_inc(&sq->stats.kicks); > -- > 2.45.1
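For context, the review question above is about the BQL contract between the transmit and completion paths. The following is a minimal, hedged sketch of that contract using the generic netdev_tx_sent_queue()/netdev_tx_completed_queue() helpers; the toy_* functions are hypothetical illustrations, not virtio_net code.

```c
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* BQL contract in a nutshell: every byte reported on the xmit path via
 * netdev_tx_sent_queue() must be reported back exactly once on the
 * completion path via netdev_tx_completed_queue().  Reporting more than
 * is outstanding trips BUG_ON(count > num_queued - dql->num_completed)
 * in dql_completed(), which is the check quoted above.
 */
static netdev_tx_t toy_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct netdev_queue *txq;

	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

	/* ... post the skb to the device ring here ... */

	netdev_tx_sent_queue(txq, skb->len);	/* accounts bytes as queued */
	return NETDEV_TX_OK;
}

static void toy_tx_complete(struct net_device *dev, u16 qnum,
			    unsigned int pkts, unsigned int bytes)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);

	/* accounts bytes as completed and may restart a BQL-stopped queue */
	netdev_tx_completed_queue(txq, pkts, bytes);
}
```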
Tue, Jun 18, 2024 at 08:18:12PM CEST, mst@redhat.com wrote: >This looks like a sensible way to do this. >Yet something to improve: > > >On Tue, Jun 18, 2024 at 04:44:56PM +0200, Jiri Pirko wrote: >> From: Jiri Pirko <jiri@nvidia.com> >> [...] >> +static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, >> + bool in_napi, struct virtnet_sq_free_stats *stats) >> { >> unsigned int len; >> void *ptr; >> >> while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { >> - ++stats->packets; >> - >> if (!is_xdp_frame(ptr)) { >> - struct sk_buff *skb = ptr; >> + struct sk_buff *skb = ptr_to_skb(ptr); >> >> pr_debug("Sent skb %p\n", skb); >> >> - stats->bytes += skb->len; >> + if (is_orphan_skb(ptr)) { >> + stats->packets++; >> + stats->bytes += skb->len; >> + } else { >> + stats->napi_packets++; >> + stats->napi_bytes += skb->len; >> + } >> napi_consume_skb(skb, in_napi); >> } else { >> struct xdp_frame *frame = ptr_to_xdp(ptr); >> >> + stats->packets++; >> stats->bytes += xdp_get_frame_len(frame); >> xdp_return_frame(frame); >> } >> } >> + netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes); > >Are you sure it's right? You are completing larger and larger >number of bytes and packets each time. Not sure I get you. __free_old_xmit() is always called with stats zeroed. So this is just sum-up of one queue completion run. I don't see how this could become "larger and larger number" as you describe. > >For example as won't this eventually trigger this inside dql_completed: > > BUG_ON(count > num_queued - dql->num_completed); Nope, I don't see how we can hit it. Do not complete anything else in addition to what was started in xmit(). Am I missing something? > >? > > >If I am right the perf testing has to be redone with this fixed ... > > >> } >> [...]
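Concretely, this is the calling convention being described; a sketch mirroring the patch above, with only the stats handling shown:

```c
/* Each caller starts from a zero-initialized stats struct, so the totals
 * that __free_old_xmit() feeds to netdev_tx_completed_queue() cover
 * exactly one completion run.  Calling __free_old_xmit() twice in a row
 * without re-zeroing the struct would over-report, which is what the
 * comment (or self-contained init) discussed below is meant to prevent.
 */
static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
			  bool in_napi)
{
	struct virtnet_sq_free_stats stats = {0};	/* fresh for this run */

	__free_old_xmit(sq, txq, in_napi, &stats);

	/* ... per-queue u64 stats update elided ... */
}
```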
On Wed, Jun 19, 2024 at 07:45:16AM +0200, Jiri Pirko wrote: > Tue, Jun 18, 2024 at 08:18:12PM CEST, mst@redhat.com wrote: > >This looks like a sensible way to do this. > >Yet something to improve: > > > > > >On Tue, Jun 18, 2024 at 04:44:56PM +0200, Jiri Pirko wrote: > >> From: Jiri Pirko <jiri@nvidia.com> > >> > > [...] > > > >> +static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, > >> + bool in_napi, struct virtnet_sq_free_stats *stats) > >> { > >> unsigned int len; > >> void *ptr; > >> > >> while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { > >> - ++stats->packets; > >> - > >> if (!is_xdp_frame(ptr)) { > >> - struct sk_buff *skb = ptr; > >> + struct sk_buff *skb = ptr_to_skb(ptr); > >> > >> pr_debug("Sent skb %p\n", skb); > >> > >> - stats->bytes += skb->len; > >> + if (is_orphan_skb(ptr)) { > >> + stats->packets++; > >> + stats->bytes += skb->len; > >> + } else { > >> + stats->napi_packets++; > >> + stats->napi_bytes += skb->len; > >> + } > >> napi_consume_skb(skb, in_napi); > >> } else { > >> struct xdp_frame *frame = ptr_to_xdp(ptr); > >> > >> + stats->packets++; > >> stats->bytes += xdp_get_frame_len(frame); > >> xdp_return_frame(frame); > >> } > >> } > >> + netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes); > > > >Are you sure it's right? You are completing larger and larger > >number of bytes and packets each time. > > Not sure I get you. __free_old_xmit() is always called with stats > zeroed. So this is just sum-up of one queue completion run. > I don't see how this could become "larger and larger number" as you > describe. Oh. Right of course. Worth a comment maybe? Just to make sure we remember not to call __free_old_xmit twice in a row without reinitializing stats. Or move the initialization into __free_old_xmit to make it self-contained .. WDYT? > > > > >For example as won't this eventually trigger this inside dql_completed: > > > > BUG_ON(count > num_queued - dql->num_completed); > > Nope, I don't see how we can hit it. Do not complete anything else > in addition to what was started in xmit(). Am I missing something? > > > > > >? > > > > > >If I am right the perf testing has to be redone with this fixed ... > > > > > >> } > >> > > [...]
Wed, Jun 19, 2024 at 09:26:22AM CEST, mst@redhat.com wrote: >On Wed, Jun 19, 2024 at 07:45:16AM +0200, Jiri Pirko wrote: >> Tue, Jun 18, 2024 at 08:18:12PM CEST, mst@redhat.com wrote: >> >This looks like a sensible way to do this. >> >Yet something to improve: >> > >> > >> >On Tue, Jun 18, 2024 at 04:44:56PM +0200, Jiri Pirko wrote: >> >> From: Jiri Pirko <jiri@nvidia.com> >> >> >> >> [...] >> >> >> >> +static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, >> >> + bool in_napi, struct virtnet_sq_free_stats *stats) >> >> { >> >> unsigned int len; >> >> void *ptr; >> >> >> >> while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { >> >> - ++stats->packets; >> >> - >> >> if (!is_xdp_frame(ptr)) { >> >> - struct sk_buff *skb = ptr; >> >> + struct sk_buff *skb = ptr_to_skb(ptr); >> >> >> >> pr_debug("Sent skb %p\n", skb); >> >> >> >> - stats->bytes += skb->len; >> >> + if (is_orphan_skb(ptr)) { >> >> + stats->packets++; >> >> + stats->bytes += skb->len; >> >> + } else { >> >> + stats->napi_packets++; >> >> + stats->napi_bytes += skb->len; >> >> + } >> >> napi_consume_skb(skb, in_napi); >> >> } else { >> >> struct xdp_frame *frame = ptr_to_xdp(ptr); >> >> >> >> + stats->packets++; >> >> stats->bytes += xdp_get_frame_len(frame); >> >> xdp_return_frame(frame); >> >> } >> >> } >> >> + netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes); >> > >> >Are you sure it's right? You are completing larger and larger >> >number of bytes and packets each time. >> >> Not sure I get you. __free_old_xmit() is always called with stats >> zeroed. So this is just sum-up of one queue completion run. >> I don't see how this could become "larger and larger number" as you >> describe. > >Oh. Right of course. Worth a comment maybe? Just to make sure >we remember not to call __free_old_xmit twice in a row >without reinitializing stats. >Or move the initialization into __free_old_xmit to make it >self-contained .. Well, the initialization happens in the caller by {0}, Wouldn't memset in __free_old_xmit() add an extra overhead? IDK. Perhaps a small comment in __free_old_xmit() would do better. One way or another, I think this is parallel to this patchset. Will handle it separatelly if you don't mind. >WDYT? > >> >> > >> >For example as won't this eventually trigger this inside dql_completed: >> > >> > BUG_ON(count > num_queued - dql->num_completed); >> >> Nope, I don't see how we can hit it. Do not complete anything else >> in addition to what was started in xmit(). Am I missing something? >> >> >> > >> >? >> > >> > >> >If I am right the perf testing has to be redone with this fixed ... >> > >> > >> >> } >> >> >> >> [...] >
On Wed, Jun 19, 2024 at 10:05:41AM +0200, Jiri Pirko wrote: > Wed, Jun 19, 2024 at 09:26:22AM CEST, mst@redhat.com wrote: > >On Wed, Jun 19, 2024 at 07:45:16AM +0200, Jiri Pirko wrote: > >> Tue, Jun 18, 2024 at 08:18:12PM CEST, mst@redhat.com wrote: > >> >This looks like a sensible way to do this. > >> >Yet something to improve: > >> > > >> > > >> >On Tue, Jun 18, 2024 at 04:44:56PM +0200, Jiri Pirko wrote: > >> >> From: Jiri Pirko <jiri@nvidia.com> > >> >> > >> > >> [...] > >> > >> > >> >> +static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, > >> >> + bool in_napi, struct virtnet_sq_free_stats *stats) > >> >> { > >> >> unsigned int len; > >> >> void *ptr; > >> >> > >> >> while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { > >> >> - ++stats->packets; > >> >> - > >> >> if (!is_xdp_frame(ptr)) { > >> >> - struct sk_buff *skb = ptr; > >> >> + struct sk_buff *skb = ptr_to_skb(ptr); > >> >> > >> >> pr_debug("Sent skb %p\n", skb); > >> >> > >> >> - stats->bytes += skb->len; > >> >> + if (is_orphan_skb(ptr)) { > >> >> + stats->packets++; > >> >> + stats->bytes += skb->len; > >> >> + } else { > >> >> + stats->napi_packets++; > >> >> + stats->napi_bytes += skb->len; > >> >> + } > >> >> napi_consume_skb(skb, in_napi); > >> >> } else { > >> >> struct xdp_frame *frame = ptr_to_xdp(ptr); > >> >> > >> >> + stats->packets++; > >> >> stats->bytes += xdp_get_frame_len(frame); > >> >> xdp_return_frame(frame); > >> >> } > >> >> } > >> >> + netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes); > >> > > >> >Are you sure it's right? You are completing larger and larger > >> >number of bytes and packets each time. > >> > >> Not sure I get you. __free_old_xmit() is always called with stats > >> zeroed. So this is just sum-up of one queue completion run. > >> I don't see how this could become "larger and larger number" as you > >> describe. > > > >Oh. Right of course. Worth a comment maybe? Just to make sure > >we remember not to call __free_old_xmit twice in a row > >without reinitializing stats. > >Or move the initialization into __free_old_xmit to make it > >self-contained .. > > Well, the initialization happens in the caller by {0}, Wouldn't > memset in __free_old_xmit() add an extra overhead? IDK. > Perhaps a small comment in __free_old_xmit() would do better. > > One way or another, I think this is parallel to this patchset. Will > handle it separatelly if you don't mind. Okay. Acked-by: Michael S. Tsirkin <mst@redhat.com> > >WDYT? > > > >> > >> > > >> >For example as won't this eventually trigger this inside dql_completed: > >> > > >> > BUG_ON(count > num_queued - dql->num_completed); > >> > >> Nope, I don't see how we can hit it. Do not complete anything else > >> in addition to what was started in xmit(). Am I missing something? > >> > >> > >> > > >> >? > >> > > >> > > >> >If I am right the perf testing has to be redone with this fixed ... > >> > > >> > > >> >> } > >> >> > >> > >> [...] > >
On Wed, Jun 19, 2024 at 10:05:41AM +0200, Jiri Pirko wrote: > >Oh. Right of course. Worth a comment maybe? Just to make sure > >we remember not to call __free_old_xmit twice in a row > >without reinitializing stats. > >Or move the initialization into __free_old_xmit to make it > >self-contained .. > > Well, the initialization happens in the caller by {0}, Wouldn't > memset in __free_old_xmit() add an extra overhead? IDK. Well if I did the below the binary is a bit smaller. If you have to respin you can include it. If not I can submit separately. ---- virtio-net: cleanup __free_old_xmit Two call sites of __free_old_xmit zero-initialize stats, doing it inside __free_old_xmit seems to make compiler's job a bit easier: $ size /tmp/orig/virtio_net.o text data bss dec hex filename 65857 3892 100 69849 110d9 /tmp/orig/virtio_net.o $ size /tmp/new/virtio_net.o text data bss dec hex filename 65760 3892 100 69752 11078 /tmp/new/virtio_net.o Couldn't measure any performance impact, unsurprizingly. Signed-off-by: Michael S. Tsirkin <mst@redhat.com> --- diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 283b34d50296..c2ce8de340f7 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -383,6 +383,8 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi, unsigned int len; void *ptr; + stats->bytes = stats->packets = 0; + while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { ++stats->packets; @@ -828,7 +830,7 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) static void free_old_xmit(struct send_queue *sq, bool in_napi) { - struct virtnet_sq_free_stats stats = {0}; + struct virtnet_sq_free_stats stats; __free_old_xmit(sq, in_napi, &stats); @@ -979,7 +981,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { struct virtnet_info *vi = netdev_priv(dev); - struct virtnet_sq_free_stats stats = {0}; + struct virtnet_sq_free_stats stats; struct receive_queue *rq = vi->rq; struct bpf_prog *xdp_prog; struct send_queue *sq;
Wed, Jun 19, 2024 at 10:23:25AM CEST, mst@redhat.com wrote: >On Wed, Jun 19, 2024 at 10:05:41AM +0200, Jiri Pirko wrote: >> >Oh. Right of course. Worth a comment maybe? Just to make sure >> >we remember not to call __free_old_xmit twice in a row >> >without reinitializing stats. >> >Or move the initialization into __free_old_xmit to make it >> >self-contained .. >> >> Well, the initialization happens in the caller by {0}, Wouldn't >> memset in __free_old_xmit() add an extra overhead? IDK. > > >Well if I did the below the binary is a bit smaller. > >If you have to respin you can include it. >If not I can submit separately. Please send it reparately. It's should be a separate patch. Thanks! > >---- > > >virtio-net: cleanup __free_old_xmit > >Two call sites of __free_old_xmit zero-initialize stats, >doing it inside __free_old_xmit seems to make compiler's >job a bit easier: > >$ size /tmp/orig/virtio_net.o > text data bss dec hex filename > 65857 3892 100 69849 110d9 /tmp/orig/virtio_net.o >$ size /tmp/new/virtio_net.o > text data bss dec hex filename > 65760 3892 100 69752 11078 /tmp/new/virtio_net.o > >Couldn't measure any performance impact, unsurprizingly. > >Signed-off-by: Michael S. Tsirkin <mst@redhat.com> > >--- > >diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c >index 283b34d50296..c2ce8de340f7 100644 >--- a/drivers/net/virtio_net.c >+++ b/drivers/net/virtio_net.c >@@ -383,6 +383,8 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi, > unsigned int len; > void *ptr; > >+ stats->bytes = stats->packets = 0; Memset perhaps? >+ > while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { > ++stats->packets; > >@@ -828,7 +830,7 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) > > static void free_old_xmit(struct send_queue *sq, bool in_napi) > { >- struct virtnet_sq_free_stats stats = {0}; >+ struct virtnet_sq_free_stats stats; > > __free_old_xmit(sq, in_napi, &stats); > >@@ -979,7 +981,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > int n, struct xdp_frame **frames, u32 flags) > { > struct virtnet_info *vi = netdev_priv(dev); >- struct virtnet_sq_free_stats stats = {0}; >+ struct virtnet_sq_free_stats stats; > struct receive_queue *rq = vi->rq; > struct bpf_prog *xdp_prog; > struct send_queue *sq; >
Hello: This patch was applied to netdev/net-next.git (main) by Jakub Kicinski <kuba@kernel.org>: On Tue, 18 Jun 2024 16:44:56 +0200 you wrote: > From: Jiri Pirko <jiri@nvidia.com> > > Add support for Byte Queue Limits (BQL). > > Tested on qemu emulated virtio_net device with 1, 2 and 4 queues. > Tested with fq_codel and pfifo_fast. Super netperf with 50 threads is > running in background. Netperf TCP_RR results: > > [...] Here is the summary with links: - [net-next,v3] virtio_net: add support for Byte Queue Limits https://git.kernel.org/netdev/net-next/c/c8bd1f7f3e61 You are awesome, thank you!
On Wed, Jun 19, 2024 at 12:09:45PM +0200, Jiri Pirko wrote: > Wed, Jun 19, 2024 at 10:23:25AM CEST, mst@redhat.com wrote: > >On Wed, Jun 19, 2024 at 10:05:41AM +0200, Jiri Pirko wrote: > >> >Oh. Right of course. Worth a comment maybe? Just to make sure > >> >we remember not to call __free_old_xmit twice in a row > >> >without reinitializing stats. > >> >Or move the initialization into __free_old_xmit to make it > >> >self-contained .. > >> > >> Well, the initialization happens in the caller by {0}, Wouldn't > >> memset in __free_old_xmit() add an extra overhead? IDK. > > > > > >Well if I did the below the binary is a bit smaller. > > > >If you have to respin you can include it. > >If not I can submit separately. > > Please send it reparately. It's should be a separate patch. > > Thanks! > > > > >---- > > > > > >virtio-net: cleanup __free_old_xmit > > > >Two call sites of __free_old_xmit zero-initialize stats, > >doing it inside __free_old_xmit seems to make compiler's > >job a bit easier: > > > >$ size /tmp/orig/virtio_net.o > > text data bss dec hex filename > > 65857 3892 100 69849 110d9 /tmp/orig/virtio_net.o > >$ size /tmp/new/virtio_net.o > > text data bss dec hex filename > > 65760 3892 100 69752 11078 /tmp/new/virtio_net.o > > > >Couldn't measure any performance impact, unsurprizingly. > > > >Signed-off-by: Michael S. Tsirkin <mst@redhat.com> > > > >--- > > > >diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > >index 283b34d50296..c2ce8de340f7 100644 > >--- a/drivers/net/virtio_net.c > >+++ b/drivers/net/virtio_net.c > >@@ -383,6 +383,8 @@ static void __free_old_xmit(struct send_queue *sq, bool in_napi, > > unsigned int len; > > void *ptr; > > > >+ stats->bytes = stats->packets = 0; > > Memset perhaps? Generates the same code and I find it less readable - virtio generally opts for explicit initialization. > >+ > > while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { > > ++stats->packets; > > > >@@ -828,7 +830,7 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) > > > > static void free_old_xmit(struct send_queue *sq, bool in_napi) > > { > >- struct virtnet_sq_free_stats stats = {0}; > >+ struct virtnet_sq_free_stats stats; > > > > __free_old_xmit(sq, in_napi, &stats); > > > >@@ -979,7 +981,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > > int n, struct xdp_frame **frames, u32 flags) > > { > > struct virtnet_info *vi = netdev_priv(dev); > >- struct virtnet_sq_free_stats stats = {0}; > >+ struct virtnet_sq_free_stats stats; > > struct receive_queue *rq = vi->rq; > > struct bpf_prog *xdp_prog; > > struct send_queue *sq; > >
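On the "generates the same code" point: for a struct this small, the obvious ways to zero it typically compile to the same stores, so the choice is mostly about readability. A sketch, using a trimmed, hypothetical stand-in for the real struct:

```c
#include <linux/string.h>
#include <linux/types.h>

/* Trimmed stand-in for virtnet_sq_free_stats, just to illustrate the
 * point; the real struct lives in drivers/net/virtio_net.c.
 */
struct sq_free_stats {
	u64 packets;
	u64 bytes;
};

static void zero_stats_three_ways(struct sq_free_stats *stats)
{
	/* For a struct this small, all three typically compile down to the
	 * same couple of stores.
	 */
	memset(stats, 0, sizeof(*stats));		/* option 1 */
	*stats = (struct sq_free_stats){0};		/* option 2 */
	stats->bytes = stats->packets = 0;		/* option 3, as in the patch above */
}
```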
Hi, On 18.06.2024 16:44, Jiri Pirko wrote: > From: Jiri Pirko <jiri@nvidia.com> > > Add support for Byte Queue Limits (BQL). > > Tested on qemu emulated virtio_net device with 1, 2 and 4 queues. > Tested with fq_codel and pfifo_fast. Super netperf with 50 threads is > running in background. Netperf TCP_RR results: > > NOBQL FQC 1q: 159.56 159.33 158.50 154.31 agv: 157.925 > NOBQL FQC 2q: 184.64 184.96 174.73 174.15 agv: 179.62 > NOBQL FQC 4q: 994.46 441.96 416.50 499.56 agv: 588.12 > NOBQL PFF 1q: 148.68 148.92 145.95 149.48 agv: 148.2575 > NOBQL PFF 2q: 171.86 171.20 170.42 169.42 agv: 170.725 > NOBQL PFF 4q: 1505.23 1137.23 2488.70 3507.99 agv: 2159.7875 > BQL FQC 1q: 1332.80 1297.97 1351.41 1147.57 agv: 1282.4375 > BQL FQC 2q: 768.30 817.72 864.43 974.40 agv: 856.2125 > BQL FQC 4q: 945.66 942.68 878.51 822.82 agv: 897.4175 > BQL PFF 1q: 149.69 151.49 149.40 147.47 agv: 149.5125 > BQL PFF 2q: 2059.32 798.74 1844.12 381.80 agv: 1270.995 > BQL PFF 4q: 1871.98 4420.02 4916.59 13268.16 agv: 6119.1875 > > Signed-off-by: Jiri Pirko <jiri@nvidia.com> > --- > v2->v3: > - fixed the switch from/to orphan mode while skbs are yet to be > completed by using the second least significant bit in virtqueue > token pointer to indicate skb is orphan. Don't account orphan > skbs in completion. > - reorganized parallel skb/xdp free stats accounting to napi/others. > - fixed kick condition check in orphan mode > v1->v2: > - moved netdev_tx_completed_queue() call into __free_old_xmit(), > propagate use_napi flag to __free_old_xmit() and only call > netdev_tx_completed_queue() in case it is true > - added forgotten call to netdev_tx_reset_queue() > - fixed stats for xdp packets > - fixed bql accounting when __free_old_xmit() is called from xdp path > - handle the !use_napi case in start_xmit() kick section > --- > drivers/net/virtio_net.c | 81 ++++++++++++++++++++++++++++------------ > 1 file changed, 57 insertions(+), 24 deletions(-) I've recently found an issue with virtio-net driver and system suspend/resume. Bisecting pointed to the c8bd1f7f3e61 ("virtio_net: add support for Byte Queue Limits") commit and this patch. Once it got merged to linux-next and then Linus trees, the driver occasionally crashes with the following log (captured on QEMU's ARM 32bit 'virt' machine): root@target:~# time rtcwake -s10 -mmem rtcwake: wakeup from "mem" using /dev/rtc0 at Sat Aug 10 12:40:26 2024 PM: suspend entry (s2idle) Filesystems sync: 0.000 seconds Freezing user space processes Freezing user space processes completed (elapsed 0.006 seconds) OOM killer disabled. Freezing remaining freezable tasks Freezing remaining freezable tasks completed (elapsed 0.001 seconds) ------------[ cut here ]------------ kernel BUG at lib/dynamic_queue_limits.c:99! Internal error: Oops - BUG: 0 [#1] SMP ARM Modules linked in: bluetooth ecdh_generic ecc libaes CPU: 1 PID: 1282 Comm: rtcwake Not tainted 6.10.0-rc3-00732-gc8bd1f7f3e61 #15240 Hardware name: Generic DT based system PC is at dql_completed+0x270/0x2cc LR is at __free_old_xmit+0x120/0x198 pc : [<c07ffa54>] lr : [<c0c42bf4>] psr: 80000013 ... Flags: Nzcv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none Control: 10c5387d Table: 43a4406a DAC: 00000051 ... Process rtcwake (pid: 1282, stack limit = 0xfbc21278) Stack: (0xe0805e80 to 0xe0806000) ... 
Call trace: dql_completed from __free_old_xmit+0x120/0x198 __free_old_xmit from free_old_xmit+0x44/0xe4 free_old_xmit from virtnet_poll_tx+0x88/0x1b4 virtnet_poll_tx from __napi_poll+0x2c/0x1d4 __napi_poll from net_rx_action+0x140/0x2b4 net_rx_action from handle_softirqs+0x11c/0x350 handle_softirqs from call_with_stack+0x18/0x20 call_with_stack from do_softirq+0x48/0x50 do_softirq from __local_bh_enable_ip+0xa0/0xa4 __local_bh_enable_ip from virtnet_open+0xd4/0x21c virtnet_open from virtnet_restore+0x94/0x120 virtnet_restore from virtio_device_restore+0x110/0x1f4 virtio_device_restore from dpm_run_callback+0x3c/0x100 dpm_run_callback from device_resume+0x12c/0x2a8 device_resume from dpm_resume+0x12c/0x1e0 dpm_resume from dpm_resume_end+0xc/0x18 dpm_resume_end from suspend_devices_and_enter+0x1f0/0x72c suspend_devices_and_enter from pm_suspend+0x270/0x2a0 pm_suspend from state_store+0x68/0xc8 state_store from kernfs_fop_write_iter+0x10c/0x1cc kernfs_fop_write_iter from vfs_write+0x2b0/0x3dc vfs_write from ksys_write+0x5c/0xd4 ksys_write from ret_fast_syscall+0x0/0x54 Exception stack(0xe8bf1fa8 to 0xe8bf1ff0) ... ---[ end trace 0000000000000000 ]--- Kernel panic - not syncing: Fatal exception in interrupt ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- I have fully reproducible setup for this issue. Reverting it together with f8321fa75102 ("virtio_net: Fix napi_skb_cache_put warning") (due to some code dependencies) fixes this issue on top of Linux v6.11-rc1 and recent linux-next releases. Let me know if I can help debugging this issue further and help fixing. > ... Best regards
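For context on the "kernel BUG at lib/dynamic_queue_limits.c:99" line: in kernels of this vintage that appears to be the same sanity check Michael quoted earlier in the thread. A paraphrased sketch follows (not a verbatim copy; the exact line number varies between trees):

```c
#include <linux/dynamic_queue_limits.h>
#include <linux/bug.h>

/* Paraphrased from dql_completed() in lib/dynamic_queue_limits.c: the
 * completion path reported more than is currently outstanding, i.e.
 * "completed" would overtake "queued" for this queue.
 */
void dql_completed(struct dql *dql, unsigned int count)
{
	unsigned int num_queued = READ_ONCE(dql->num_queued);

	/* Can't complete more than what's in queue */
	BUG_ON(count > num_queued - dql->num_completed);

	/* ... dynamic limit recalculation elided ... */
}
```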
Mon, Aug 12, 2024 at 04:57:24PM CEST, m.szyprowski@samsung.com wrote: >Hi, > >On 18.06.2024 16:44, Jiri Pirko wrote: >> From: Jiri Pirko <jiri@nvidia.com> >> >> Add support for Byte Queue Limits (BQL). >> >> Tested on qemu emulated virtio_net device with 1, 2 and 4 queues. >> Tested with fq_codel and pfifo_fast. Super netperf with 50 threads is >> running in background. Netperf TCP_RR results: >> >> NOBQL FQC 1q: 159.56 159.33 158.50 154.31 agv: 157.925 >> NOBQL FQC 2q: 184.64 184.96 174.73 174.15 agv: 179.62 >> NOBQL FQC 4q: 994.46 441.96 416.50 499.56 agv: 588.12 >> NOBQL PFF 1q: 148.68 148.92 145.95 149.48 agv: 148.2575 >> NOBQL PFF 2q: 171.86 171.20 170.42 169.42 agv: 170.725 >> NOBQL PFF 4q: 1505.23 1137.23 2488.70 3507.99 agv: 2159.7875 >> BQL FQC 1q: 1332.80 1297.97 1351.41 1147.57 agv: 1282.4375 >> BQL FQC 2q: 768.30 817.72 864.43 974.40 agv: 856.2125 >> BQL FQC 4q: 945.66 942.68 878.51 822.82 agv: 897.4175 >> BQL PFF 1q: 149.69 151.49 149.40 147.47 agv: 149.5125 >> BQL PFF 2q: 2059.32 798.74 1844.12 381.80 agv: 1270.995 >> BQL PFF 4q: 1871.98 4420.02 4916.59 13268.16 agv: 6119.1875 >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >> --- >> v2->v3: >> - fixed the switch from/to orphan mode while skbs are yet to be >> completed by using the second least significant bit in virtqueue >> token pointer to indicate skb is orphan. Don't account orphan >> skbs in completion. >> - reorganized parallel skb/xdp free stats accounting to napi/others. >> - fixed kick condition check in orphan mode >> v1->v2: >> - moved netdev_tx_completed_queue() call into __free_old_xmit(), >> propagate use_napi flag to __free_old_xmit() and only call >> netdev_tx_completed_queue() in case it is true >> - added forgotten call to netdev_tx_reset_queue() >> - fixed stats for xdp packets >> - fixed bql accounting when __free_old_xmit() is called from xdp path >> - handle the !use_napi case in start_xmit() kick section >> --- >> drivers/net/virtio_net.c | 81 ++++++++++++++++++++++++++++------------ >> 1 file changed, 57 insertions(+), 24 deletions(-) > >I've recently found an issue with virtio-net driver and system >suspend/resume. Bisecting pointed to the c8bd1f7f3e61 ("virtio_net: add >support for Byte Queue Limits") commit and this patch. Once it got >merged to linux-next and then Linus trees, the driver occasionally >crashes with the following log (captured on QEMU's ARM 32bit 'virt' >machine): > >root@target:~# time rtcwake -s10 -mmem >rtcwake: wakeup from "mem" using /dev/rtc0 at Sat Aug 10 12:40:26 2024 >PM: suspend entry (s2idle) >Filesystems sync: 0.000 seconds >Freezing user space processes >Freezing user space processes completed (elapsed 0.006 seconds) >OOM killer disabled. >Freezing remaining freezable tasks >Freezing remaining freezable tasks completed (elapsed 0.001 seconds) >------------[ cut here ]------------ >kernel BUG at lib/dynamic_queue_limits.c:99! >Internal error: Oops - BUG: 0 [#1] SMP ARM >Modules linked in: bluetooth ecdh_generic ecc libaes >CPU: 1 PID: 1282 Comm: rtcwake Not tainted >6.10.0-rc3-00732-gc8bd1f7f3e61 #15240 >Hardware name: Generic DT based system >PC is at dql_completed+0x270/0x2cc >LR is at __free_old_xmit+0x120/0x198 >pc : [<c07ffa54>] lr : [<c0c42bf4>] psr: 80000013 >... >Flags: Nzcv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none >Control: 10c5387d Table: 43a4406a DAC: 00000051 >... >Process rtcwake (pid: 1282, stack limit = 0xfbc21278) >Stack: (0xe0805e80 to 0xe0806000) >... 
>Call trace: > dql_completed from __free_old_xmit+0x120/0x198 > __free_old_xmit from free_old_xmit+0x44/0xe4 > free_old_xmit from virtnet_poll_tx+0x88/0x1b4 > virtnet_poll_tx from __napi_poll+0x2c/0x1d4 > __napi_poll from net_rx_action+0x140/0x2b4 > net_rx_action from handle_softirqs+0x11c/0x350 > handle_softirqs from call_with_stack+0x18/0x20 > call_with_stack from do_softirq+0x48/0x50 > do_softirq from __local_bh_enable_ip+0xa0/0xa4 > __local_bh_enable_ip from virtnet_open+0xd4/0x21c > virtnet_open from virtnet_restore+0x94/0x120 > virtnet_restore from virtio_device_restore+0x110/0x1f4 > virtio_device_restore from dpm_run_callback+0x3c/0x100 > dpm_run_callback from device_resume+0x12c/0x2a8 > device_resume from dpm_resume+0x12c/0x1e0 > dpm_resume from dpm_resume_end+0xc/0x18 > dpm_resume_end from suspend_devices_and_enter+0x1f0/0x72c > suspend_devices_and_enter from pm_suspend+0x270/0x2a0 > pm_suspend from state_store+0x68/0xc8 > state_store from kernfs_fop_write_iter+0x10c/0x1cc > kernfs_fop_write_iter from vfs_write+0x2b0/0x3dc > vfs_write from ksys_write+0x5c/0xd4 > ksys_write from ret_fast_syscall+0x0/0x54 >Exception stack(0xe8bf1fa8 to 0xe8bf1ff0) >... >---[ end trace 0000000000000000 ]--- >Kernel panic - not syncing: Fatal exception in interrupt >---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- > >I have fully reproducible setup for this issue. Reverting it together >with f8321fa75102 ("virtio_net: Fix napi_skb_cache_put warning") (due to >some code dependencies) fixes this issue on top of Linux v6.11-rc1 and >recent linux-next releases. Let me know if I can help debugging this >issue further and help fixing. Will fix this tomorrow. In the meantime, could you provide full reproduce steps? Thanks! > > > ... > >Best regards >-- >Marek Szyprowski, PhD >Samsung R&D Institute Poland >
On 12.08.2024 18:47, Jiri Pirko wrote: > Mon, Aug 12, 2024 at 04:57:24PM CEST, m.szyprowski@samsung.com wrote: >> On 18.06.2024 16:44, Jiri Pirko wrote: >>> From: Jiri Pirko <jiri@nvidia.com> >>> >>> Add support for Byte Queue Limits (BQL). >>> >>> Tested on qemu emulated virtio_net device with 1, 2 and 4 queues. >>> Tested with fq_codel and pfifo_fast. Super netperf with 50 threads is >>> running in background. Netperf TCP_RR results: >>> >>> NOBQL FQC 1q: 159.56 159.33 158.50 154.31 agv: 157.925 >>> NOBQL FQC 2q: 184.64 184.96 174.73 174.15 agv: 179.62 >>> NOBQL FQC 4q: 994.46 441.96 416.50 499.56 agv: 588.12 >>> NOBQL PFF 1q: 148.68 148.92 145.95 149.48 agv: 148.2575 >>> NOBQL PFF 2q: 171.86 171.20 170.42 169.42 agv: 170.725 >>> NOBQL PFF 4q: 1505.23 1137.23 2488.70 3507.99 agv: 2159.7875 >>> BQL FQC 1q: 1332.80 1297.97 1351.41 1147.57 agv: 1282.4375 >>> BQL FQC 2q: 768.30 817.72 864.43 974.40 agv: 856.2125 >>> BQL FQC 4q: 945.66 942.68 878.51 822.82 agv: 897.4175 >>> BQL PFF 1q: 149.69 151.49 149.40 147.47 agv: 149.5125 >>> BQL PFF 2q: 2059.32 798.74 1844.12 381.80 agv: 1270.995 >>> BQL PFF 4q: 1871.98 4420.02 4916.59 13268.16 agv: 6119.1875 >>> >>> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >>> --- >>> v2->v3: >>> - fixed the switch from/to orphan mode while skbs are yet to be >>> completed by using the second least significant bit in virtqueue >>> token pointer to indicate skb is orphan. Don't account orphan >>> skbs in completion. >>> - reorganized parallel skb/xdp free stats accounting to napi/others. >>> - fixed kick condition check in orphan mode >>> v1->v2: >>> - moved netdev_tx_completed_queue() call into __free_old_xmit(), >>> propagate use_napi flag to __free_old_xmit() and only call >>> netdev_tx_completed_queue() in case it is true >>> - added forgotten call to netdev_tx_reset_queue() >>> - fixed stats for xdp packets >>> - fixed bql accounting when __free_old_xmit() is called from xdp path >>> - handle the !use_napi case in start_xmit() kick section >>> --- >>> drivers/net/virtio_net.c | 81 ++++++++++++++++++++++++++++------------ >>> 1 file changed, 57 insertions(+), 24 deletions(-) >> I've recently found an issue with virtio-net driver and system >> suspend/resume. Bisecting pointed to the c8bd1f7f3e61 ("virtio_net: add >> support for Byte Queue Limits") commit and this patch. Once it got >> merged to linux-next and then Linus trees, the driver occasionally >> crashes with the following log (captured on QEMU's ARM 32bit 'virt' >> machine): >> >> root@target:~# time rtcwake -s10 -mmem >> rtcwake: wakeup from "mem" using /dev/rtc0 at Sat Aug 10 12:40:26 2024 >> PM: suspend entry (s2idle) >> Filesystems sync: 0.000 seconds >> Freezing user space processes >> Freezing user space processes completed (elapsed 0.006 seconds) >> OOM killer disabled. >> Freezing remaining freezable tasks >> Freezing remaining freezable tasks completed (elapsed 0.001 seconds) >> ------------[ cut here ]------------ >> kernel BUG at lib/dynamic_queue_limits.c:99! >> Internal error: Oops - BUG: 0 [#1] SMP ARM >> Modules linked in: bluetooth ecdh_generic ecc libaes >> CPU: 1 PID: 1282 Comm: rtcwake Not tainted >> 6.10.0-rc3-00732-gc8bd1f7f3e61 #15240 >> Hardware name: Generic DT based system >> PC is at dql_completed+0x270/0x2cc >> LR is at __free_old_xmit+0x120/0x198 >> pc : [<c07ffa54>] lr : [<c0c42bf4>] psr: 80000013 >> ... >> Flags: Nzcv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none >> Control: 10c5387d Table: 43a4406a DAC: 00000051 >> ... 
>> Process rtcwake (pid: 1282, stack limit = 0xfbc21278) >> Stack: (0xe0805e80 to 0xe0806000) >> ... >> Call trace: >> dql_completed from __free_old_xmit+0x120/0x198 >> __free_old_xmit from free_old_xmit+0x44/0xe4 >> free_old_xmit from virtnet_poll_tx+0x88/0x1b4 >> virtnet_poll_tx from __napi_poll+0x2c/0x1d4 >> __napi_poll from net_rx_action+0x140/0x2b4 >> net_rx_action from handle_softirqs+0x11c/0x350 >> handle_softirqs from call_with_stack+0x18/0x20 >> call_with_stack from do_softirq+0x48/0x50 >> do_softirq from __local_bh_enable_ip+0xa0/0xa4 >> __local_bh_enable_ip from virtnet_open+0xd4/0x21c >> virtnet_open from virtnet_restore+0x94/0x120 >> virtnet_restore from virtio_device_restore+0x110/0x1f4 >> virtio_device_restore from dpm_run_callback+0x3c/0x100 >> dpm_run_callback from device_resume+0x12c/0x2a8 >> device_resume from dpm_resume+0x12c/0x1e0 >> dpm_resume from dpm_resume_end+0xc/0x18 >> dpm_resume_end from suspend_devices_and_enter+0x1f0/0x72c >> suspend_devices_and_enter from pm_suspend+0x270/0x2a0 >> pm_suspend from state_store+0x68/0xc8 >> state_store from kernfs_fop_write_iter+0x10c/0x1cc >> kernfs_fop_write_iter from vfs_write+0x2b0/0x3dc >> vfs_write from ksys_write+0x5c/0xd4 >> ksys_write from ret_fast_syscall+0x0/0x54 >> Exception stack(0xe8bf1fa8 to 0xe8bf1ff0) >> ... >> ---[ end trace 0000000000000000 ]--- >> Kernel panic - not syncing: Fatal exception in interrupt >> ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- >> >> I have fully reproducible setup for this issue. Reverting it together >> with f8321fa75102 ("virtio_net: Fix napi_skb_cache_put warning") (due to >> some code dependencies) fixes this issue on top of Linux v6.11-rc1 and >> recent linux-next releases. Let me know if I can help debugging this >> issue further and help fixing. > Will fix this tomorrow. In the meantime, could you provide full > reproduce steps? Well, it is easy to reproduce it simply by calling # time rtcwake -s10 -mmem a few times and sooner or later it will cause a kernel panic. Best regards
Mon, Aug 12, 2024 at 06:55:26PM CEST, m.szyprowski@samsung.com wrote: >On 12.08.2024 18:47, Jiri Pirko wrote: >> Mon, Aug 12, 2024 at 04:57:24PM CEST, m.szyprowski@samsung.com wrote: >>> On 18.06.2024 16:44, Jiri Pirko wrote: >>>> From: Jiri Pirko <jiri@nvidia.com> >>>> >>>> Add support for Byte Queue Limits (BQL). >>>> >>>> Tested on qemu emulated virtio_net device with 1, 2 and 4 queues. >>>> Tested with fq_codel and pfifo_fast. Super netperf with 50 threads is >>>> running in background. Netperf TCP_RR results: >>>> >>>> NOBQL FQC 1q: 159.56 159.33 158.50 154.31 agv: 157.925 >>>> NOBQL FQC 2q: 184.64 184.96 174.73 174.15 agv: 179.62 >>>> NOBQL FQC 4q: 994.46 441.96 416.50 499.56 agv: 588.12 >>>> NOBQL PFF 1q: 148.68 148.92 145.95 149.48 agv: 148.2575 >>>> NOBQL PFF 2q: 171.86 171.20 170.42 169.42 agv: 170.725 >>>> NOBQL PFF 4q: 1505.23 1137.23 2488.70 3507.99 agv: 2159.7875 >>>> BQL FQC 1q: 1332.80 1297.97 1351.41 1147.57 agv: 1282.4375 >>>> BQL FQC 2q: 768.30 817.72 864.43 974.40 agv: 856.2125 >>>> BQL FQC 4q: 945.66 942.68 878.51 822.82 agv: 897.4175 >>>> BQL PFF 1q: 149.69 151.49 149.40 147.47 agv: 149.5125 >>>> BQL PFF 2q: 2059.32 798.74 1844.12 381.80 agv: 1270.995 >>>> BQL PFF 4q: 1871.98 4420.02 4916.59 13268.16 agv: 6119.1875 >>>> >>>> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >>>> --- >>>> v2->v3: >>>> - fixed the switch from/to orphan mode while skbs are yet to be >>>> completed by using the second least significant bit in virtqueue >>>> token pointer to indicate skb is orphan. Don't account orphan >>>> skbs in completion. >>>> - reorganized parallel skb/xdp free stats accounting to napi/others. >>>> - fixed kick condition check in orphan mode >>>> v1->v2: >>>> - moved netdev_tx_completed_queue() call into __free_old_xmit(), >>>> propagate use_napi flag to __free_old_xmit() and only call >>>> netdev_tx_completed_queue() in case it is true >>>> - added forgotten call to netdev_tx_reset_queue() >>>> - fixed stats for xdp packets >>>> - fixed bql accounting when __free_old_xmit() is called from xdp path >>>> - handle the !use_napi case in start_xmit() kick section >>>> --- >>>> drivers/net/virtio_net.c | 81 ++++++++++++++++++++++++++++------------ >>>> 1 file changed, 57 insertions(+), 24 deletions(-) >>> I've recently found an issue with virtio-net driver and system >>> suspend/resume. Bisecting pointed to the c8bd1f7f3e61 ("virtio_net: add >>> support for Byte Queue Limits") commit and this patch. Once it got >>> merged to linux-next and then Linus trees, the driver occasionally >>> crashes with the following log (captured on QEMU's ARM 32bit 'virt' >>> machine): >>> >>> root@target:~# time rtcwake -s10 -mmem >>> rtcwake: wakeup from "mem" using /dev/rtc0 at Sat Aug 10 12:40:26 2024 >>> PM: suspend entry (s2idle) >>> Filesystems sync: 0.000 seconds >>> Freezing user space processes >>> Freezing user space processes completed (elapsed 0.006 seconds) >>> OOM killer disabled. >>> Freezing remaining freezable tasks >>> Freezing remaining freezable tasks completed (elapsed 0.001 seconds) >>> ------------[ cut here ]------------ >>> kernel BUG at lib/dynamic_queue_limits.c:99! >>> Internal error: Oops - BUG: 0 [#1] SMP ARM >>> Modules linked in: bluetooth ecdh_generic ecc libaes >>> CPU: 1 PID: 1282 Comm: rtcwake Not tainted >>> 6.10.0-rc3-00732-gc8bd1f7f3e61 #15240 >>> Hardware name: Generic DT based system >>> PC is at dql_completed+0x270/0x2cc >>> LR is at __free_old_xmit+0x120/0x198 >>> pc : [<c07ffa54>] lr : [<c0c42bf4>] psr: 80000013 >>> ... 
>>> Flags: Nzcv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none >>> Control: 10c5387d Table: 43a4406a DAC: 00000051 >>> ... >>> Process rtcwake (pid: 1282, stack limit = 0xfbc21278) >>> Stack: (0xe0805e80 to 0xe0806000) >>> ... >>> Call trace: >>> dql_completed from __free_old_xmit+0x120/0x198 >>> __free_old_xmit from free_old_xmit+0x44/0xe4 >>> free_old_xmit from virtnet_poll_tx+0x88/0x1b4 >>> virtnet_poll_tx from __napi_poll+0x2c/0x1d4 >>> __napi_poll from net_rx_action+0x140/0x2b4 >>> net_rx_action from handle_softirqs+0x11c/0x350 >>> handle_softirqs from call_with_stack+0x18/0x20 >>> call_with_stack from do_softirq+0x48/0x50 >>> do_softirq from __local_bh_enable_ip+0xa0/0xa4 >>> __local_bh_enable_ip from virtnet_open+0xd4/0x21c >>> virtnet_open from virtnet_restore+0x94/0x120 >>> virtnet_restore from virtio_device_restore+0x110/0x1f4 >>> virtio_device_restore from dpm_run_callback+0x3c/0x100 >>> dpm_run_callback from device_resume+0x12c/0x2a8 >>> device_resume from dpm_resume+0x12c/0x1e0 >>> dpm_resume from dpm_resume_end+0xc/0x18 >>> dpm_resume_end from suspend_devices_and_enter+0x1f0/0x72c >>> suspend_devices_and_enter from pm_suspend+0x270/0x2a0 >>> pm_suspend from state_store+0x68/0xc8 >>> state_store from kernfs_fop_write_iter+0x10c/0x1cc >>> kernfs_fop_write_iter from vfs_write+0x2b0/0x3dc >>> vfs_write from ksys_write+0x5c/0xd4 >>> ksys_write from ret_fast_syscall+0x0/0x54 >>> Exception stack(0xe8bf1fa8 to 0xe8bf1ff0) >>> ... >>> ---[ end trace 0000000000000000 ]--- >>> Kernel panic - not syncing: Fatal exception in interrupt >>> ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- >>> >>> I have fully reproducible setup for this issue. Reverting it together >>> with f8321fa75102 ("virtio_net: Fix napi_skb_cache_put warning") (due to >>> some code dependencies) fixes this issue on top of Linux v6.11-rc1 and >>> recent linux-next releases. Let me know if I can help debugging this >>> issue further and help fixing. >> Will fix this tomorrow. In the meantime, could you provide full >> reproduce steps? > >Well, it is easy to reproduce it simply by calling > ># time rtcwake -s10 -mmem > >a few times and sooner or later it will cause a kernel panic. I found the problem. Following patch will help: diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 3f10c72743e9..c6af18948092 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2867,8 +2867,8 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) if (err < 0) goto err_xdp_reg_mem_model; - virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index)); + virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi); return 0; Will submit the patch in a jiff. Thanks! > >Best regards >-- >Marek Szyprowski, PhD >Samsung R&D Institute Poland >
Wed, Aug 14, 2024 at 09:49:57AM CEST, jiri@resnulli.us wrote: >Mon, Aug 12, 2024 at 06:55:26PM CEST, m.szyprowski@samsung.com wrote: >>On 12.08.2024 18:47, Jiri Pirko wrote: >>> Mon, Aug 12, 2024 at 04:57:24PM CEST, m.szyprowski@samsung.com wrote: >>>> On 18.06.2024 16:44, Jiri Pirko wrote: >>>>> From: Jiri Pirko <jiri@nvidia.com> >>>>> >>>>> Add support for Byte Queue Limits (BQL). >>>>> >>>>> Tested on qemu emulated virtio_net device with 1, 2 and 4 queues. >>>>> Tested with fq_codel and pfifo_fast. Super netperf with 50 threads is >>>>> running in background. Netperf TCP_RR results: >>>>> >>>>> NOBQL FQC 1q: 159.56 159.33 158.50 154.31 agv: 157.925 >>>>> NOBQL FQC 2q: 184.64 184.96 174.73 174.15 agv: 179.62 >>>>> NOBQL FQC 4q: 994.46 441.96 416.50 499.56 agv: 588.12 >>>>> NOBQL PFF 1q: 148.68 148.92 145.95 149.48 agv: 148.2575 >>>>> NOBQL PFF 2q: 171.86 171.20 170.42 169.42 agv: 170.725 >>>>> NOBQL PFF 4q: 1505.23 1137.23 2488.70 3507.99 agv: 2159.7875 >>>>> BQL FQC 1q: 1332.80 1297.97 1351.41 1147.57 agv: 1282.4375 >>>>> BQL FQC 2q: 768.30 817.72 864.43 974.40 agv: 856.2125 >>>>> BQL FQC 4q: 945.66 942.68 878.51 822.82 agv: 897.4175 >>>>> BQL PFF 1q: 149.69 151.49 149.40 147.47 agv: 149.5125 >>>>> BQL PFF 2q: 2059.32 798.74 1844.12 381.80 agv: 1270.995 >>>>> BQL PFF 4q: 1871.98 4420.02 4916.59 13268.16 agv: 6119.1875 >>>>> >>>>> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >>>>> --- >>>>> v2->v3: >>>>> - fixed the switch from/to orphan mode while skbs are yet to be >>>>> completed by using the second least significant bit in virtqueue >>>>> token pointer to indicate skb is orphan. Don't account orphan >>>>> skbs in completion. >>>>> - reorganized parallel skb/xdp free stats accounting to napi/others. >>>>> - fixed kick condition check in orphan mode >>>>> v1->v2: >>>>> - moved netdev_tx_completed_queue() call into __free_old_xmit(), >>>>> propagate use_napi flag to __free_old_xmit() and only call >>>>> netdev_tx_completed_queue() in case it is true >>>>> - added forgotten call to netdev_tx_reset_queue() >>>>> - fixed stats for xdp packets >>>>> - fixed bql accounting when __free_old_xmit() is called from xdp path >>>>> - handle the !use_napi case in start_xmit() kick section >>>>> --- >>>>> drivers/net/virtio_net.c | 81 ++++++++++++++++++++++++++++------------ >>>>> 1 file changed, 57 insertions(+), 24 deletions(-) >>>> I've recently found an issue with virtio-net driver and system >>>> suspend/resume. Bisecting pointed to the c8bd1f7f3e61 ("virtio_net: add >>>> support for Byte Queue Limits") commit and this patch. Once it got >>>> merged to linux-next and then Linus trees, the driver occasionally >>>> crashes with the following log (captured on QEMU's ARM 32bit 'virt' >>>> machine): >>>> >>>> root@target:~# time rtcwake -s10 -mmem >>>> rtcwake: wakeup from "mem" using /dev/rtc0 at Sat Aug 10 12:40:26 2024 >>>> PM: suspend entry (s2idle) >>>> Filesystems sync: 0.000 seconds >>>> Freezing user space processes >>>> Freezing user space processes completed (elapsed 0.006 seconds) >>>> OOM killer disabled. >>>> Freezing remaining freezable tasks >>>> Freezing remaining freezable tasks completed (elapsed 0.001 seconds) >>>> ------------[ cut here ]------------ >>>> kernel BUG at lib/dynamic_queue_limits.c:99! 
>>>> Internal error: Oops - BUG: 0 [#1] SMP ARM >>>> Modules linked in: bluetooth ecdh_generic ecc libaes >>>> CPU: 1 PID: 1282 Comm: rtcwake Not tainted >>>> 6.10.0-rc3-00732-gc8bd1f7f3e61 #15240 >>>> Hardware name: Generic DT based system >>>> PC is at dql_completed+0x270/0x2cc >>>> LR is at __free_old_xmit+0x120/0x198 >>>> pc : [<c07ffa54>] lr : [<c0c42bf4>] psr: 80000013 >>>> ... >>>> Flags: Nzcv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none >>>> Control: 10c5387d Table: 43a4406a DAC: 00000051 >>>> ... >>>> Process rtcwake (pid: 1282, stack limit = 0xfbc21278) >>>> Stack: (0xe0805e80 to 0xe0806000) >>>> ... >>>> Call trace: >>>> dql_completed from __free_old_xmit+0x120/0x198 >>>> __free_old_xmit from free_old_xmit+0x44/0xe4 >>>> free_old_xmit from virtnet_poll_tx+0x88/0x1b4 >>>> virtnet_poll_tx from __napi_poll+0x2c/0x1d4 >>>> __napi_poll from net_rx_action+0x140/0x2b4 >>>> net_rx_action from handle_softirqs+0x11c/0x350 >>>> handle_softirqs from call_with_stack+0x18/0x20 >>>> call_with_stack from do_softirq+0x48/0x50 >>>> do_softirq from __local_bh_enable_ip+0xa0/0xa4 >>>> __local_bh_enable_ip from virtnet_open+0xd4/0x21c >>>> virtnet_open from virtnet_restore+0x94/0x120 >>>> virtnet_restore from virtio_device_restore+0x110/0x1f4 >>>> virtio_device_restore from dpm_run_callback+0x3c/0x100 >>>> dpm_run_callback from device_resume+0x12c/0x2a8 >>>> device_resume from dpm_resume+0x12c/0x1e0 >>>> dpm_resume from dpm_resume_end+0xc/0x18 >>>> dpm_resume_end from suspend_devices_and_enter+0x1f0/0x72c >>>> suspend_devices_and_enter from pm_suspend+0x270/0x2a0 >>>> pm_suspend from state_store+0x68/0xc8 >>>> state_store from kernfs_fop_write_iter+0x10c/0x1cc >>>> kernfs_fop_write_iter from vfs_write+0x2b0/0x3dc >>>> vfs_write from ksys_write+0x5c/0xd4 >>>> ksys_write from ret_fast_syscall+0x0/0x54 >>>> Exception stack(0xe8bf1fa8 to 0xe8bf1ff0) >>>> ... >>>> ---[ end trace 0000000000000000 ]--- >>>> Kernel panic - not syncing: Fatal exception in interrupt >>>> ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- >>>> >>>> I have fully reproducible setup for this issue. Reverting it together >>>> with f8321fa75102 ("virtio_net: Fix napi_skb_cache_put warning") (due to >>>> some code dependencies) fixes this issue on top of Linux v6.11-rc1 and >>>> recent linux-next releases. Let me know if I can help debugging this >>>> issue further and help fixing. >>> Will fix this tomorrow. In the meantime, could you provide full >>> reproduce steps? >> >>Well, it is easy to reproduce it simply by calling >> >># time rtcwake -s10 -mmem >> >>a few times and sooner or later it will cause a kernel panic. > >I found the problem. Following patch will help: > > >diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c >index 3f10c72743e9..c6af18948092 100644 >--- a/drivers/net/virtio_net.c >+++ b/drivers/net/virtio_net.c >@@ -2867,8 +2867,8 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > if (err < 0) > goto err_xdp_reg_mem_model; > >- virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); > netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index)); >+ virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); > virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi); Hmm, I have to look at this a bit more. I think this might be accidental fix. 
The thing is, napi can be triggered even if it is disabled: ->__local_bh_enable_ip() -> net_rx_action() -> __napi_poll() Here __napi_poll() calls napi_is_scheduled() and calls virtnet_poll_tx() in case napi is scheduled. napi_is_scheduled() checks NAPI_STATE_SCHED bit in napi state. However, this bit is set previously by netif_napi_add_weight(). > > > ... > >Best regards >-- >Marek Szyprowski, PhD >Samsung R&D Institute Poland > > > return 0; > > >Will submit the patch in a jiff. Thanks! > > > >> >>Best regards >>-- >>Marek Szyprowski, PhD >>Samsung R&D Institute Poland >>
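For reference, the reordering under discussion, as a sketch mirroring the diff earlier in the thread (whether it is a complete fix is exactly what is being questioned here):

```c
/* Mirrors the proposed diff: reset the TX queue's BQL/DQL state before
 * the napi instances are enabled.  Per the analysis above, a TX poll can
 * already run once local BHs are re-enabled inside virtnet_open(), even
 * though napi_enable() has not been called yet, because NAPI_STATE_SCHED
 * is set by netif_napi_add_weight(); hence the caution that this
 * reordering might only be an accidental fix.
 */
static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
{
	/* ... XDP rxq registration elided ... */

	netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index));
	virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
	virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi);

	return 0;
}
```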
On 14.08.2024 09:49, Jiri Pirko wrote: > Mon, Aug 12, 2024 at 06:55:26PM CEST, m.szyprowski@samsung.com wrote: >> On 12.08.2024 18:47, Jiri Pirko wrote: >>> Mon, Aug 12, 2024 at 04:57:24PM CEST, m.szyprowski@samsung.com wrote: >>>> On 18.06.2024 16:44, Jiri Pirko wrote: >>>>> From: Jiri Pirko <jiri@nvidia.com> >>>>> >>>>> Add support for Byte Queue Limits (BQL). >>>>> >>>>> Tested on qemu emulated virtio_net device with 1, 2 and 4 queues. >>>>> Tested with fq_codel and pfifo_fast. Super netperf with 50 threads is >>>>> running in background. Netperf TCP_RR results: >>>>> >>>>> NOBQL FQC 1q: 159.56 159.33 158.50 154.31 agv: 157.925 >>>>> NOBQL FQC 2q: 184.64 184.96 174.73 174.15 agv: 179.62 >>>>> NOBQL FQC 4q: 994.46 441.96 416.50 499.56 agv: 588.12 >>>>> NOBQL PFF 1q: 148.68 148.92 145.95 149.48 agv: 148.2575 >>>>> NOBQL PFF 2q: 171.86 171.20 170.42 169.42 agv: 170.725 >>>>> NOBQL PFF 4q: 1505.23 1137.23 2488.70 3507.99 agv: 2159.7875 >>>>> BQL FQC 1q: 1332.80 1297.97 1351.41 1147.57 agv: 1282.4375 >>>>> BQL FQC 2q: 768.30 817.72 864.43 974.40 agv: 856.2125 >>>>> BQL FQC 4q: 945.66 942.68 878.51 822.82 agv: 897.4175 >>>>> BQL PFF 1q: 149.69 151.49 149.40 147.47 agv: 149.5125 >>>>> BQL PFF 2q: 2059.32 798.74 1844.12 381.80 agv: 1270.995 >>>>> BQL PFF 4q: 1871.98 4420.02 4916.59 13268.16 agv: 6119.1875 >>>>> >>>>> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >>>>> --- >>>>> v2->v3: >>>>> - fixed the switch from/to orphan mode while skbs are yet to be >>>>> completed by using the second least significant bit in virtqueue >>>>> token pointer to indicate skb is orphan. Don't account orphan >>>>> skbs in completion. >>>>> - reorganized parallel skb/xdp free stats accounting to napi/others. >>>>> - fixed kick condition check in orphan mode >>>>> v1->v2: >>>>> - moved netdev_tx_completed_queue() call into __free_old_xmit(), >>>>> propagate use_napi flag to __free_old_xmit() and only call >>>>> netdev_tx_completed_queue() in case it is true >>>>> - added forgotten call to netdev_tx_reset_queue() >>>>> - fixed stats for xdp packets >>>>> - fixed bql accounting when __free_old_xmit() is called from xdp path >>>>> - handle the !use_napi case in start_xmit() kick section >>>>> --- >>>>> drivers/net/virtio_net.c | 81 ++++++++++++++++++++++++++++------------ >>>>> 1 file changed, 57 insertions(+), 24 deletions(-) >>>> I've recently found an issue with virtio-net driver and system >>>> suspend/resume. Bisecting pointed to the c8bd1f7f3e61 ("virtio_net: add >>>> support for Byte Queue Limits") commit and this patch. Once it got >>>> merged to linux-next and then Linus trees, the driver occasionally >>>> crashes with the following log (captured on QEMU's ARM 32bit 'virt' >>>> machine): >>>> >>>> root@target:~# time rtcwake -s10 -mmem >>>> rtcwake: wakeup from "mem" using /dev/rtc0 at Sat Aug 10 12:40:26 2024 >>>> PM: suspend entry (s2idle) >>>> Filesystems sync: 0.000 seconds >>>> Freezing user space processes >>>> Freezing user space processes completed (elapsed 0.006 seconds) >>>> OOM killer disabled. >>>> Freezing remaining freezable tasks >>>> Freezing remaining freezable tasks completed (elapsed 0.001 seconds) >>>> ------------[ cut here ]------------ >>>> kernel BUG at lib/dynamic_queue_limits.c:99! 
>>>> Internal error: Oops - BUG: 0 [#1] SMP ARM >>>> Modules linked in: bluetooth ecdh_generic ecc libaes >>>> CPU: 1 PID: 1282 Comm: rtcwake Not tainted >>>> 6.10.0-rc3-00732-gc8bd1f7f3e61 #15240 >>>> Hardware name: Generic DT based system >>>> PC is at dql_completed+0x270/0x2cc >>>> LR is at __free_old_xmit+0x120/0x198 >>>> pc : [<c07ffa54>] lr : [<c0c42bf4>] psr: 80000013 >>>> ... >>>> Flags: Nzcv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none >>>> Control: 10c5387d Table: 43a4406a DAC: 00000051 >>>> ... >>>> Process rtcwake (pid: 1282, stack limit = 0xfbc21278) >>>> Stack: (0xe0805e80 to 0xe0806000) >>>> ... >>>> Call trace: >>>> dql_completed from __free_old_xmit+0x120/0x198 >>>> __free_old_xmit from free_old_xmit+0x44/0xe4 >>>> free_old_xmit from virtnet_poll_tx+0x88/0x1b4 >>>> virtnet_poll_tx from __napi_poll+0x2c/0x1d4 >>>> __napi_poll from net_rx_action+0x140/0x2b4 >>>> net_rx_action from handle_softirqs+0x11c/0x350 >>>> handle_softirqs from call_with_stack+0x18/0x20 >>>> call_with_stack from do_softirq+0x48/0x50 >>>> do_softirq from __local_bh_enable_ip+0xa0/0xa4 >>>> __local_bh_enable_ip from virtnet_open+0xd4/0x21c >>>> virtnet_open from virtnet_restore+0x94/0x120 >>>> virtnet_restore from virtio_device_restore+0x110/0x1f4 >>>> virtio_device_restore from dpm_run_callback+0x3c/0x100 >>>> dpm_run_callback from device_resume+0x12c/0x2a8 >>>> device_resume from dpm_resume+0x12c/0x1e0 >>>> dpm_resume from dpm_resume_end+0xc/0x18 >>>> dpm_resume_end from suspend_devices_and_enter+0x1f0/0x72c >>>> suspend_devices_and_enter from pm_suspend+0x270/0x2a0 >>>> pm_suspend from state_store+0x68/0xc8 >>>> state_store from kernfs_fop_write_iter+0x10c/0x1cc >>>> kernfs_fop_write_iter from vfs_write+0x2b0/0x3dc >>>> vfs_write from ksys_write+0x5c/0xd4 >>>> ksys_write from ret_fast_syscall+0x0/0x54 >>>> Exception stack(0xe8bf1fa8 to 0xe8bf1ff0) >>>> ... >>>> ---[ end trace 0000000000000000 ]--- >>>> Kernel panic - not syncing: Fatal exception in interrupt >>>> ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- >>>> >>>> I have fully reproducible setup for this issue. Reverting it together >>>> with f8321fa75102 ("virtio_net: Fix napi_skb_cache_put warning") (due to >>>> some code dependencies) fixes this issue on top of Linux v6.11-rc1 and >>>> recent linux-next releases. Let me know if I can help debugging this >>>> issue further and help fixing. >>> Will fix this tomorrow. In the meantime, could you provide full >>> reproduce steps? >> Well, it is easy to reproduce it simply by calling >> >> # time rtcwake -s10 -mmem >> >> a few times and sooner or later it will cause a kernel panic. > I found the problem. Following patch will help: > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 3f10c72743e9..c6af18948092 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -2867,8 +2867,8 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > if (err < 0) > goto err_xdp_reg_mem_model; > > - virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); > netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index)); > + virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); > virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi); > > return 0; > > > Will submit the patch in a jiff. Thanks! Confirmed. The above change fixed this issue in my tests. 
Feel free to add:

Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>

Best regards
On Wed, Aug 14, 2024 at 10:17:15AM +0200, Jiri Pirko wrote: > >diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > >index 3f10c72743e9..c6af18948092 100644 > >--- a/drivers/net/virtio_net.c > >+++ b/drivers/net/virtio_net.c > >@@ -2867,8 +2867,8 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > > if (err < 0) > > goto err_xdp_reg_mem_model; > > > >- virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); > > netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index)); > >+ virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); > > virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi); > > Hmm, I have to look at this a bit more. I think this might be accidental > fix. The thing is, napi can be triggered even if it is disabled: > > ->__local_bh_enable_ip() > -> net_rx_action() > -> __napi_poll() > > Here __napi_poll() calls napi_is_scheduled() and calls virtnet_poll_tx() > in case napi is scheduled. napi_is_scheduled() checks NAPI_STATE_SCHED > bit in napi state. > > However, this bit is set previously by netif_napi_add_weight(). It's actually set in napi_disable too, isn't it? > > > > > > ... > > > >Best regards > >-- > >Marek Szyprowski, PhD > >Samsung R&D Institute Poland > > > > > > > > return 0; > > > > > >Will submit the patch in a jiff. Thanks! > > > > > > > >> > >>Best regards > >>-- > >>Marek Szyprowski, PhD > >>Samsung R&D Institute Poland > >>
Wed, Aug 14, 2024 at 11:43:51AM CEST, mst@redhat.com wrote:
>On Wed, Aug 14, 2024 at 10:17:15AM +0200, Jiri Pirko wrote:
>> >diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> >index 3f10c72743e9..c6af18948092 100644
>> >--- a/drivers/net/virtio_net.c
>> >+++ b/drivers/net/virtio_net.c
>> >@@ -2867,8 +2867,8 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
>> >	if (err < 0)
>> >		goto err_xdp_reg_mem_model;
>> >
>> >-	virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
>> >	netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index));
>> >+	virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
>> >	virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi);
>>
>> Hmm, I have to look at this a bit more. I think this might be accidental
>> fix. The thing is, napi can be triggered even if it is disabled:
>>
>> ->__local_bh_enable_ip()
>>  -> net_rx_action()
>>   -> __napi_poll()
>>
>> Here __napi_poll() calls napi_is_scheduled() and calls virtnet_poll_tx()
>> in case napi is scheduled. napi_is_scheduled() checks NAPI_STATE_SCHED
>> bit in napi state.
>>
>> However, this bit is set previously by netif_napi_add_weight().
>
>It's actually set in napi_disable too, isn't it?

Yes, in both. I actually found exactly what the issue is. After
virtnet_napi_enable() is called, the following path is hit:

__napi_poll()
 -> virtnet_poll()
  -> virtnet_poll_cleantx()
   -> netif_tx_wake_queue()

That wakes the TX queue and allows skbs to be submitted and accounted by
the BQL counters. Then netdev_tx_reset_queue() is called, which resets the
BQL counters and eventually leads to the BUG in dql_completed(). That's why
moving netdev_tx_reset_queue() before virtnet_napi_enable() helped. Will
submit.

>
>>
>> >
>> > > ...
>> >
>> >Best regards
>> >--
>> >Marek Szyprowski, PhD
>> >Samsung R&D Institute Poland
>> >
>>
>>
>> >
>> >	return 0;
>> >
>> >
>> >Will submit the patch in a jiff. Thanks!
>> >
>> >
>> >
>> >>
>> >>Best regards
>> >>--
>> >>Marek Szyprowski, PhD
>> >>Samsung R&D Institute Poland
>> >>
>
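To make the failure mode concrete, below is a small stand-alone model of the BQL bookkeeping involved. The names are invented for illustration; the functions only mimic what netdev_tx_sent_queue(), netdev_tx_completed_queue() and netdev_tx_reset_queue() do to the per-queue counters, with the sanity check from dql_completed() turned into an assert. Queueing bytes, then resetting, then completing the earlier bytes reproduces the underflow described above:

/* build: cc -o bql_race bql_race.c && ./bql_race */
#include <assert.h>
#include <stdio.h>

/* Toy stand-in for the dql counters kept per TX queue. */
struct toy_dql {
	unsigned int num_queued;	/* bytes handed to the device */
	unsigned int num_completed;	/* bytes reported as transmitted */
};

static void toy_sent_queue(struct toy_dql *dql, unsigned int bytes)
{
	dql->num_queued += bytes;	/* what netdev_tx_sent_queue() accounts */
}

static void toy_completed_queue(struct toy_dql *dql, unsigned int bytes)
{
	/* dql_completed() has an equivalent BUG_ON: completions may never
	 * exceed what was queued since the last reset. */
	assert(bytes <= dql->num_queued - dql->num_completed);
	dql->num_completed += bytes;
}

static void toy_reset_queue(struct toy_dql *dql)
{
	dql->num_queued = 0;		/* what netdev_tx_reset_queue() does */
	dql->num_completed = 0;
}

int main(void)
{
	struct toy_dql dql = { 0, 0 };

	toy_sent_queue(&dql, 1500);	 /* napi already enabled, queue woken, skb sent */
	toy_reset_queue(&dql);		 /* netdev_tx_reset_queue() runs too late */
	toy_completed_queue(&dql, 1500); /* completion of the earlier skb: assert fires */
	printf("not reached\n");
	return 0;
}

With the reordering from the patch above, the reset runs before any napi instance can wake the TX queue, so the counters are guaranteed to be empty when transmission starts and the check cannot be violated.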
Wed, Aug 14, 2024 at 11:17:35AM CEST, m.szyprowski@samsung.com wrote: >On 14.08.2024 09:49, Jiri Pirko wrote: >> Mon, Aug 12, 2024 at 06:55:26PM CEST, m.szyprowski@samsung.com wrote: >>> On 12.08.2024 18:47, Jiri Pirko wrote: >>>> Mon, Aug 12, 2024 at 04:57:24PM CEST, m.szyprowski@samsung.com wrote: >>>>> On 18.06.2024 16:44, Jiri Pirko wrote: >>>>>> From: Jiri Pirko <jiri@nvidia.com> >>>>>> >>>>>> Add support for Byte Queue Limits (BQL). >>>>>> >>>>>> Tested on qemu emulated virtio_net device with 1, 2 and 4 queues. >>>>>> Tested with fq_codel and pfifo_fast. Super netperf with 50 threads is >>>>>> running in background. Netperf TCP_RR results: >>>>>> >>>>>> NOBQL FQC 1q: 159.56 159.33 158.50 154.31 agv: 157.925 >>>>>> NOBQL FQC 2q: 184.64 184.96 174.73 174.15 agv: 179.62 >>>>>> NOBQL FQC 4q: 994.46 441.96 416.50 499.56 agv: 588.12 >>>>>> NOBQL PFF 1q: 148.68 148.92 145.95 149.48 agv: 148.2575 >>>>>> NOBQL PFF 2q: 171.86 171.20 170.42 169.42 agv: 170.725 >>>>>> NOBQL PFF 4q: 1505.23 1137.23 2488.70 3507.99 agv: 2159.7875 >>>>>> BQL FQC 1q: 1332.80 1297.97 1351.41 1147.57 agv: 1282.4375 >>>>>> BQL FQC 2q: 768.30 817.72 864.43 974.40 agv: 856.2125 >>>>>> BQL FQC 4q: 945.66 942.68 878.51 822.82 agv: 897.4175 >>>>>> BQL PFF 1q: 149.69 151.49 149.40 147.47 agv: 149.5125 >>>>>> BQL PFF 2q: 2059.32 798.74 1844.12 381.80 agv: 1270.995 >>>>>> BQL PFF 4q: 1871.98 4420.02 4916.59 13268.16 agv: 6119.1875 >>>>>> >>>>>> Signed-off-by: Jiri Pirko <jiri@nvidia.com> >>>>>> --- >>>>>> v2->v3: >>>>>> - fixed the switch from/to orphan mode while skbs are yet to be >>>>>> completed by using the second least significant bit in virtqueue >>>>>> token pointer to indicate skb is orphan. Don't account orphan >>>>>> skbs in completion. >>>>>> - reorganized parallel skb/xdp free stats accounting to napi/others. >>>>>> - fixed kick condition check in orphan mode >>>>>> v1->v2: >>>>>> - moved netdev_tx_completed_queue() call into __free_old_xmit(), >>>>>> propagate use_napi flag to __free_old_xmit() and only call >>>>>> netdev_tx_completed_queue() in case it is true >>>>>> - added forgotten call to netdev_tx_reset_queue() >>>>>> - fixed stats for xdp packets >>>>>> - fixed bql accounting when __free_old_xmit() is called from xdp path >>>>>> - handle the !use_napi case in start_xmit() kick section >>>>>> --- >>>>>> drivers/net/virtio_net.c | 81 ++++++++++++++++++++++++++++------------ >>>>>> 1 file changed, 57 insertions(+), 24 deletions(-) >>>>> I've recently found an issue with virtio-net driver and system >>>>> suspend/resume. Bisecting pointed to the c8bd1f7f3e61 ("virtio_net: add >>>>> support for Byte Queue Limits") commit and this patch. Once it got >>>>> merged to linux-next and then Linus trees, the driver occasionally >>>>> crashes with the following log (captured on QEMU's ARM 32bit 'virt' >>>>> machine): >>>>> >>>>> root@target:~# time rtcwake -s10 -mmem >>>>> rtcwake: wakeup from "mem" using /dev/rtc0 at Sat Aug 10 12:40:26 2024 >>>>> PM: suspend entry (s2idle) >>>>> Filesystems sync: 0.000 seconds >>>>> Freezing user space processes >>>>> Freezing user space processes completed (elapsed 0.006 seconds) >>>>> OOM killer disabled. >>>>> Freezing remaining freezable tasks >>>>> Freezing remaining freezable tasks completed (elapsed 0.001 seconds) >>>>> ------------[ cut here ]------------ >>>>> kernel BUG at lib/dynamic_queue_limits.c:99! 
>>>>> Internal error: Oops - BUG: 0 [#1] SMP ARM >>>>> Modules linked in: bluetooth ecdh_generic ecc libaes >>>>> CPU: 1 PID: 1282 Comm: rtcwake Not tainted >>>>> 6.10.0-rc3-00732-gc8bd1f7f3e61 #15240 >>>>> Hardware name: Generic DT based system >>>>> PC is at dql_completed+0x270/0x2cc >>>>> LR is at __free_old_xmit+0x120/0x198 >>>>> pc : [<c07ffa54>] lr : [<c0c42bf4>] psr: 80000013 >>>>> ... >>>>> Flags: Nzcv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none >>>>> Control: 10c5387d Table: 43a4406a DAC: 00000051 >>>>> ... >>>>> Process rtcwake (pid: 1282, stack limit = 0xfbc21278) >>>>> Stack: (0xe0805e80 to 0xe0806000) >>>>> ... >>>>> Call trace: >>>>> dql_completed from __free_old_xmit+0x120/0x198 >>>>> __free_old_xmit from free_old_xmit+0x44/0xe4 >>>>> free_old_xmit from virtnet_poll_tx+0x88/0x1b4 >>>>> virtnet_poll_tx from __napi_poll+0x2c/0x1d4 >>>>> __napi_poll from net_rx_action+0x140/0x2b4 >>>>> net_rx_action from handle_softirqs+0x11c/0x350 >>>>> handle_softirqs from call_with_stack+0x18/0x20 >>>>> call_with_stack from do_softirq+0x48/0x50 >>>>> do_softirq from __local_bh_enable_ip+0xa0/0xa4 >>>>> __local_bh_enable_ip from virtnet_open+0xd4/0x21c >>>>> virtnet_open from virtnet_restore+0x94/0x120 >>>>> virtnet_restore from virtio_device_restore+0x110/0x1f4 >>>>> virtio_device_restore from dpm_run_callback+0x3c/0x100 >>>>> dpm_run_callback from device_resume+0x12c/0x2a8 >>>>> device_resume from dpm_resume+0x12c/0x1e0 >>>>> dpm_resume from dpm_resume_end+0xc/0x18 >>>>> dpm_resume_end from suspend_devices_and_enter+0x1f0/0x72c >>>>> suspend_devices_and_enter from pm_suspend+0x270/0x2a0 >>>>> pm_suspend from state_store+0x68/0xc8 >>>>> state_store from kernfs_fop_write_iter+0x10c/0x1cc >>>>> kernfs_fop_write_iter from vfs_write+0x2b0/0x3dc >>>>> vfs_write from ksys_write+0x5c/0xd4 >>>>> ksys_write from ret_fast_syscall+0x0/0x54 >>>>> Exception stack(0xe8bf1fa8 to 0xe8bf1ff0) >>>>> ... >>>>> ---[ end trace 0000000000000000 ]--- >>>>> Kernel panic - not syncing: Fatal exception in interrupt >>>>> ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- >>>>> >>>>> I have fully reproducible setup for this issue. Reverting it together >>>>> with f8321fa75102 ("virtio_net: Fix napi_skb_cache_put warning") (due to >>>>> some code dependencies) fixes this issue on top of Linux v6.11-rc1 and >>>>> recent linux-next releases. Let me know if I can help debugging this >>>>> issue further and help fixing. >>>> Will fix this tomorrow. In the meantime, could you provide full >>>> reproduce steps? >>> Well, it is easy to reproduce it simply by calling >>> >>> # time rtcwake -s10 -mmem >>> >>> a few times and sooner or later it will cause a kernel panic. >> I found the problem. Following patch will help: >> >> >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c >> index 3f10c72743e9..c6af18948092 100644 >> --- a/drivers/net/virtio_net.c >> +++ b/drivers/net/virtio_net.c >> @@ -2867,8 +2867,8 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) >> if (err < 0) >> goto err_xdp_reg_mem_model; >> >> - virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); >> netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index)); >> + virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); >> virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi); >> >> return 0; >> >> >> Will submit the patch in a jiff. Thanks! > >Confirmed. The above change fixed this issue in my tests. 
>
>Feel free to add:
>
>Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
>Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>

Thanks!

>
>Best regards
>--
>Marek Szyprowski, PhD
>Samsung R&D Institute Poland
>
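For completeness, the queue bring-up order the thread converges on looks roughly like this in virtnet_enable_queue_pair(); a simplified sketch with error handling and the XDP memory-model setup elided:

static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
{
	/* ... xdp_rxq registration elided ... */

	/* Reset the TX queue's BQL state while nothing can transmit yet:
	 * once the RX napi is enabled, virtnet_poll() -> virtnet_poll_cleantx()
	 * may wake the TX queue and bytes start being accounted, so the
	 * reset has to come first. */
	netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index));
	virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
	virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi);

	return 0;
}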
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 61a57d134544..9f9b86874173 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -47,7 +47,8 @@ module_param(napi_tx, bool, 0644); #define VIRTIO_XDP_TX BIT(0) #define VIRTIO_XDP_REDIR BIT(1) -#define VIRTIO_XDP_FLAG BIT(0) +#define VIRTIO_XDP_FLAG BIT(0) +#define VIRTIO_ORPHAN_FLAG BIT(1) /* RX packet size EWMA. The average packet size is used to determine the packet * buffer size when refilling RX rings. As the entire RX ring may be refilled @@ -85,6 +86,8 @@ struct virtnet_stat_desc { struct virtnet_sq_free_stats { u64 packets; u64 bytes; + u64 napi_packets; + u64 napi_bytes; }; struct virtnet_sq_stats { @@ -506,29 +509,50 @@ static struct xdp_frame *ptr_to_xdp(void *ptr) return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG); } -static void __free_old_xmit(struct send_queue *sq, bool in_napi, - struct virtnet_sq_free_stats *stats) +static bool is_orphan_skb(void *ptr) +{ + return (unsigned long)ptr & VIRTIO_ORPHAN_FLAG; +} + +static void *skb_to_ptr(struct sk_buff *skb, bool orphan) +{ + return (void *)((unsigned long)skb | (orphan ? VIRTIO_ORPHAN_FLAG : 0)); +} + +static struct sk_buff *ptr_to_skb(void *ptr) +{ + return (struct sk_buff *)((unsigned long)ptr & ~VIRTIO_ORPHAN_FLAG); +} + +static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, + bool in_napi, struct virtnet_sq_free_stats *stats) { unsigned int len; void *ptr; while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { - ++stats->packets; - if (!is_xdp_frame(ptr)) { - struct sk_buff *skb = ptr; + struct sk_buff *skb = ptr_to_skb(ptr); pr_debug("Sent skb %p\n", skb); - stats->bytes += skb->len; + if (is_orphan_skb(ptr)) { + stats->packets++; + stats->bytes += skb->len; + } else { + stats->napi_packets++; + stats->napi_bytes += skb->len; + } napi_consume_skb(skb, in_napi); } else { struct xdp_frame *frame = ptr_to_xdp(ptr); + stats->packets++; stats->bytes += xdp_get_frame_len(frame); xdp_return_frame(frame); } } + netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes); } /* Converting between virtqueue no. and kernel tx/rx queue no. @@ -955,21 +979,22 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf) virtnet_rq_free_buf(vi, rq, buf); } -static void free_old_xmit(struct send_queue *sq, bool in_napi) +static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, + bool in_napi) { struct virtnet_sq_free_stats stats = {0}; - __free_old_xmit(sq, in_napi, &stats); + __free_old_xmit(sq, txq, in_napi, &stats); /* Avoid overhead when no packets have been processed * happens when called speculatively from start_xmit. */ - if (!stats.packets) + if (!stats.packets && !stats.napi_packets) return; u64_stats_update_begin(&sq->stats.syncp); - u64_stats_add(&sq->stats.bytes, stats.bytes); - u64_stats_add(&sq->stats.packets, stats.packets); + u64_stats_add(&sq->stats.bytes, stats.bytes + stats.napi_bytes); + u64_stats_add(&sq->stats.packets, stats.packets + stats.napi_packets); u64_stats_update_end(&sq->stats.syncp); } @@ -1003,7 +1028,9 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, * early means 16 slots are typically wasted. 
*/ if (sq->vq->num_free < 2+MAX_SKB_FRAGS) { - netif_stop_subqueue(dev, qnum); + struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); + + netif_tx_stop_queue(txq); u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.stop); u64_stats_update_end(&sq->stats.syncp); @@ -1012,7 +1039,7 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, virtqueue_napi_schedule(&sq->napi, sq->vq); } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) { /* More just got used, free them then recheck. */ - free_old_xmit(sq, false); + free_old_xmit(sq, txq, false); if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) { netif_start_subqueue(dev, qnum); u64_stats_update_begin(&sq->stats.syncp); @@ -1138,7 +1165,8 @@ static int virtnet_xdp_xmit(struct net_device *dev, } /* Free up any pending old buffers before queueing new ones. */ - __free_old_xmit(sq, false, &stats); + __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), + false, &stats); for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; @@ -2313,7 +2341,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq) do { virtqueue_disable_cb(sq->vq); - free_old_xmit(sq, true); + free_old_xmit(sq, txq, true); } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq))); if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) { @@ -2412,6 +2440,7 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) goto err_xdp_reg_mem_model; virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); + netdev_tx_reset_queue(netdev_get_tx_queue(vi->dev, qp_index)); virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, &vi->sq[qp_index].napi); return 0; @@ -2471,7 +2500,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) txq = netdev_get_tx_queue(vi->dev, index); __netif_tx_lock(txq, raw_smp_processor_id()); virtqueue_disable_cb(sq->vq); - free_old_xmit(sq, true); + free_old_xmit(sq, txq, true); if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) { if (netif_tx_queue_stopped(txq)) { @@ -2505,7 +2534,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) return 0; } -static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) +static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan) { struct virtio_net_hdr_mrg_rxbuf *hdr; const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; @@ -2549,7 +2578,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) return num_sg; num_sg++; } - return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC); + return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, + skb_to_ptr(skb, orphan), GFP_ATOMIC); } static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) @@ -2559,24 +2589,25 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) struct send_queue *sq = &vi->sq[qnum]; int err; struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum); - bool kick = !netdev_xmit_more(); + bool xmit_more = netdev_xmit_more(); bool use_napi = sq->napi.weight; + bool kick; /* Free up any pending old buffers before queueing new ones. */ do { if (use_napi) virtqueue_disable_cb(sq->vq); - free_old_xmit(sq, false); + free_old_xmit(sq, txq, false); - } while (use_napi && kick && + } while (use_napi && !xmit_more && unlikely(!virtqueue_enable_cb_delayed(sq->vq))); /* timestamp packet in software */ skb_tx_timestamp(skb); /* Try to transmit */ - err = xmit_skb(sq, skb); + err = xmit_skb(sq, skb, !use_napi); /* This should not happen! 
*/ if (unlikely(err)) { @@ -2598,7 +2629,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) check_sq_full_and_disable(vi, dev, sq); - if (kick || netif_xmit_stopped(txq)) { + kick = use_napi ? __netdev_tx_sent_queue(txq, skb->len, xmit_more) : + !xmit_more || netif_xmit_stopped(txq); + if (kick) { if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { u64_stats_update_begin(&sq->stats.syncp); u64_stats_inc(&sq->stats.kicks);