
[net-next] virtio_net: Prevent misidentified spurious interrupts from killing the irq

Message ID 20240801135639.11400-1-hengqi@linux.alibaba.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Series [net-next] virtio_net: Prevent misidentified spurious interrupts from killing the irq

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 44 this patch: 46
netdev/build_tools success Errors and warnings before: 10 this patch: 10
netdev/cc_maintainers warning 1 maintainers not CCed: virtualization@lists.linux.dev
netdev/build_clang fail Errors and warnings before: 45 this patch: 47
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 72 this patch: 74
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns; WARNING: line length of 82 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc fail Errors and warnings before: 22 this patch: 24
netdev/source_inline success Was 0 now: 0

Commit Message

Heng Qi Aug. 1, 2024, 1:56 p.m. UTC
Michael has effectively reduced the number of spurious interrupts in
commit a7766ef18b33 ("virtio_net: disable cb aggressively") by disabling
irq callbacks before cleaning old buffers.

But it is still possible that the irq is killed by mistake:

  When a delayed tx interrupt arrives, old buffers have already been cleaned
  in other paths (start_xmit and virtnet_poll_cleantx), so the interrupt is
  mistakenly identified as a spurious interrupt in vring_interrupt.

  We should refrain from labeling it as a spurious interrupt; otherwise,
  note_interrupt may inadvertently kill the legitimate irq.

Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
---
 drivers/net/virtio_net.c     |  9 ++++++
 drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++
 include/linux/virtio.h       |  3 ++
 3 files changed, 65 insertions(+)
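
For context on the failure mode: note_interrupt() in kernel/irq/spurious.c
counts handlers that return IRQ_NONE, and once nearly all interrupts in a
100,000-interrupt window are unhandled it disables the line for good. Below
is a condensed, self-contained sketch of that accounting; the window and
threshold values match mainline, everything else is simplified for
illustration:

#include <stdbool.h>

#define SPURIOUS_WINDOW		100000
#define SPURIOUS_THRESHOLD	99900

struct irq_stats {
	unsigned int count;	/* interrupts seen in this window */
	unsigned int unhandled;	/* of those, how many returned IRQ_NONE */
	bool disabled;		/* the line has been killed */
};

static void note_interrupt_sketch(struct irq_stats *s, bool handled)
{
	if (!handled)		/* the handler said "not mine" */
		s->unhandled++;

	if (++s->count < SPURIOUS_WINDOW)
		return;

	/* More than 99.9% of the last 100k interrupts unhandled: kill it */
	if (s->unhandled > SPURIOUS_THRESHOLD)
		s->disabled = true;	/* "Disabling IRQ #n" in the real code */

	s->count = 0;
	s->unhandled = 0;
}

A delayed tx interrupt that finds the ring already cleaned by start_xmit()
or virtnet_poll_cleantx() returns IRQ_NONE from vring_interrupt() and lands
in the unhandled bucket; that accumulation is what the patch tries to stop.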

Comments

Michael S. Tsirkin Aug. 1, 2024, 2:04 p.m. UTC | #1
On Thu, Aug 01, 2024 at 09:56:39PM +0800, Heng Qi wrote:
> Michael has effectively reduced the number of spurious interrupts in
> commit a7766ef18b33 ("virtio_net: disable cb aggressively") by disabling
> irq callbacks before cleaning old buffers.
> 
> But it is still possible that the irq is killed by mistake:
> 
>   When a delayed tx interrupt arrives, old buffers has been cleaned in
>   other paths (start_xmit and virtnet_poll_cleantx), then the interrupt is
>   mistakenly identified as a spurious interrupt in vring_interrupt.
> 
>   We should refrain from labeling it as a spurious interrupt; otherwise,
>   note_interrupt may inadvertently kill the legitimate irq.
> 
> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>


Is this a practical or theoretical issue? Do you observe an issue
and see that this patch fixes it? Or is this from code review?
> ---
>  drivers/net/virtio_net.c     |  9 ++++++
>  drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++
>  include/linux/virtio.h       |  3 ++
>  3 files changed, 65 insertions(+)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 0383a3e136d6..6d8739418203 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2769,6 +2769,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq, int budget)
>  		do {
>  			virtqueue_disable_cb(sq->vq);
>  			free_old_xmit(sq, txq, !!budget);
> +			virtqueue_set_tx_oldbuf_cleaned(sq->vq, true);
>  		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>  
>  		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) {
> @@ -3035,6 +3036,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  
>  		free_old_xmit(sq, txq, false);
>  
> +		if (use_napi)
> +			virtqueue_set_tx_oldbuf_cleaned(sq->vq, true);
> +
>  	} while (use_napi && !xmit_more &&
>  	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>  
> @@ -3044,6 +3048,11 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	/* Try to transmit */
>  	err = xmit_skb(sq, skb, !use_napi);
>  
> +	if (use_napi) {
> +		virtqueue_set_tx_newbuf_sent(sq->vq, true);
> +		virtqueue_set_tx_oldbuf_cleaned(sq->vq, false);
> +	}
> +
>  	/* This should not happen! */
>  	if (unlikely(err)) {
>  		DEV_STATS_INC(dev, tx_fifo_errors);
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index be7309b1e860..fb2afc716371 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -180,6 +180,11 @@ struct vring_virtqueue {
>  	 */
>  	bool do_unmap;
>  
> +	/* Has any new data been sent? */
> +	bool is_tx_newbuf_sent;
> +	/* Is the old data recently sent cleaned up? */
> +	bool is_tx_oldbuf_cleaned;
> +
>  	/* Head of free buffer list. */
>  	unsigned int free_head;
>  	/* Number we've added since last sync. */
> @@ -2092,6 +2097,9 @@ static struct virtqueue *vring_create_virtqueue_packed(
>  	vq->use_dma_api = vring_use_dma_api(vdev);
>  	vq->premapped = false;
>  	vq->do_unmap = vq->use_dma_api;
> +	vq->is_tx_newbuf_sent = false; /* Initially, no new buffer to send. */
> +	vq->is_tx_oldbuf_cleaned = true; /* Initially, no old buffer to clean. */
> +
>  
>  	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
>  		!context;
> @@ -2375,6 +2383,38 @@ bool virtqueue_notify(struct virtqueue *_vq)
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_notify);
>  
> +/**
> + * virtqueue_set_tx_newbuf_sent - set whether there is new tx buf to send.
> + * @_vq: the struct virtqueue
> + *
> + * If is_tx_newbuf_sent and is_tx_oldbuf_cleaned are both true, the
> + * spurious interrupt is caused by polling TX vq in other paths outside
> + * the tx irq callback.
> + */
> +void virtqueue_set_tx_newbuf_sent(struct virtqueue *_vq, bool val)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	vq->is_tx_newbuf_sent = val;
> +}
> +EXPORT_SYMBOL_GPL(virtqueue_set_tx_newbuf_sent);
> +
> +/**
> + * virtqueue_set_tx_oldbuf_cleaned - set whether there is old tx buf to clean.
> + * @_vq: the struct virtqueue
> + *
> + * If is_tx_oldbuf_cleaned and is_tx_newbuf_sent are both true, the
> + * spurious interrupt is caused by polling TX vq in other paths outside
> + * the tx irq callback.
> + */
> +void virtqueue_set_tx_oldbuf_cleaned(struct virtqueue *_vq, bool val)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	vq->is_tx_oldbuf_cleaned = val;
> +}
> +EXPORT_SYMBOL_GPL(virtqueue_set_tx_oldbuf_cleaned);
> +
>  /**
>   * virtqueue_kick - update after add_buf
>   * @vq: the struct virtqueue
> @@ -2572,6 +2612,16 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
>  	struct vring_virtqueue *vq = to_vvq(_vq);
>  
>  	if (!more_used(vq)) {
> +		/* When the delayed TX interrupt arrives, the old buffers are
> +		 * cleaned in other cases(start_xmit and virtnet_poll_cleantx).
> +		 * We'd better not identify it as a spurious interrupt,
> +		 * otherwise note_interrupt may kill the interrupt.
> +		 */
> +		if (unlikely(vq->is_tx_newbuf_sent && vq->is_tx_oldbuf_cleaned)) {
> +			vq->is_tx_newbuf_sent = false;
> +			return IRQ_HANDLED;
> +		}
> +
>  		pr_debug("virtqueue interrupt with no work for %p\n", vq);
>  		return IRQ_NONE;
>  	}
> @@ -2637,6 +2687,9 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
>  	vq->use_dma_api = vring_use_dma_api(vdev);
>  	vq->premapped = false;
>  	vq->do_unmap = vq->use_dma_api;
> +	vq->is_tx_newbuf_sent = false; /* Initially, no new buffer to send. */
> +	vq->is_tx_oldbuf_cleaned = true; /* Initially, no old buffer to clean. */
> +
>  
>  	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
>  		!context;
> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> index ecc5cb7b8c91..ba3be9276c09 100644
> --- a/include/linux/virtio.h
> +++ b/include/linux/virtio.h
> @@ -103,6 +103,9 @@ int virtqueue_resize(struct virtqueue *vq, u32 num,
>  int virtqueue_reset(struct virtqueue *vq,
>  		    void (*recycle)(struct virtqueue *vq, void *buf));
>  
> +void virtqueue_set_tx_newbuf_sent(struct virtqueue *vq, bool val);
> +void virtqueue_set_tx_oldbuf_cleaned(struct virtqueue *vq, bool val);
> +
>  struct virtio_admin_cmd {
>  	__le16 opcode;
>  	__le16 group_type;
> -- 
> 2.32.0.3.g01195cf9f
Jason Wang Aug. 2, 2024, 3:41 a.m. UTC | #2
On Thu, Aug 1, 2024 at 9:56 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
>
> Michael has effectively reduced the number of spurious interrupts in
> commit a7766ef18b33 ("virtio_net: disable cb aggressively") by disabling
> irq callbacks before cleaning old buffers.
>
> But it is still possible that the irq is killed by mistake:
>
>   When a delayed tx interrupt arrives, old buffers has been cleaned in
>   other paths (start_xmit and virtnet_poll_cleantx), then the interrupt is
>   mistakenly identified as a spurious interrupt in vring_interrupt.
>
>   We should refrain from labeling it as a spurious interrupt; otherwise,
>   note_interrupt may inadvertently kill the legitimate irq.

I think the evil comes from doing free_old_xmit() in start_xmit().
I know it is there for performance, but we may need to make the code
work correctly instead of adding endless hacks. Personally, I think
the virtio-net TX path is over-complicated. We probably pay too much
(e.g. there's netif_tx_lock in the TX NAPI path) trying to "optimize"
the performance.

How about just not doing free_old_xmit there and doing it solely in
the TX NAPI?

>
> Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> ---
>  drivers/net/virtio_net.c     |  9 ++++++
>  drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++
>  include/linux/virtio.h       |  3 ++
>  3 files changed, 65 insertions(+)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 0383a3e136d6..6d8739418203 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2769,6 +2769,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq, int budget)
>                 do {
>                         virtqueue_disable_cb(sq->vq);
>                         free_old_xmit(sq, txq, !!budget);
> +                       virtqueue_set_tx_oldbuf_cleaned(sq->vq, true);
>                 } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>
>                 if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) {
> @@ -3035,6 +3036,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>
>                 free_old_xmit(sq, txq, false);
>
> +               if (use_napi)
> +                       virtqueue_set_tx_oldbuf_cleaned(sq->vq, true);
> +
>         } while (use_napi && !xmit_more &&
>                unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
>
> @@ -3044,6 +3048,11 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>         /* Try to transmit */
>         err = xmit_skb(sq, skb, !use_napi);
>
> +       if (use_napi) {
> +               virtqueue_set_tx_newbuf_sent(sq->vq, true);
> +               virtqueue_set_tx_oldbuf_cleaned(sq->vq, false);
> +       }
> +
>         /* This should not happen! */
>         if (unlikely(err)) {
>                 DEV_STATS_INC(dev, tx_fifo_errors);
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index be7309b1e860..fb2afc716371 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -180,6 +180,11 @@ struct vring_virtqueue {
>          */
>         bool do_unmap;
>
> +       /* Has any new data been sent? */
> +       bool is_tx_newbuf_sent;
> +       /* Is the old data recently sent cleaned up? */
> +       bool is_tx_oldbuf_cleaned;
> +
>         /* Head of free buffer list. */
>         unsigned int free_head;
>         /* Number we've added since last sync. */
> @@ -2092,6 +2097,9 @@ static struct virtqueue *vring_create_virtqueue_packed(
>         vq->use_dma_api = vring_use_dma_api(vdev);
>         vq->premapped = false;
>         vq->do_unmap = vq->use_dma_api;
> +       vq->is_tx_newbuf_sent = false; /* Initially, no new buffer to send. */
> +       vq->is_tx_oldbuf_cleaned = true; /* Initially, no old buffer to clean. */
> +
>
>         vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
>                 !context;
> @@ -2375,6 +2383,38 @@ bool virtqueue_notify(struct virtqueue *_vq)
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_notify);
>
> +/**
> + * virtqueue_set_tx_newbuf_sent - set whether there is new tx buf to send.
> + * @_vq: the struct virtqueue
> + *
> + * If is_tx_newbuf_sent and is_tx_oldbuf_cleaned are both true, the
> + * spurious interrupt is caused by polling TX vq in other paths outside
> + * the tx irq callback.
> + */
> +void virtqueue_set_tx_newbuf_sent(struct virtqueue *_vq, bool val)
> +{
> +       struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +       vq->is_tx_newbuf_sent = val;
> +}
> +EXPORT_SYMBOL_GPL(virtqueue_set_tx_newbuf_sent);
> +
> +/**
> + * virtqueue_set_tx_oldbuf_cleaned - set whether there is old tx buf to clean.
> + * @_vq: the struct virtqueue
> + *
> + * If is_tx_oldbuf_cleaned and is_tx_newbuf_sent are both true, the
> + * spurious interrupt is caused by polling TX vq in other paths outside
> + * the tx irq callback.
> + */
> +void virtqueue_set_tx_oldbuf_cleaned(struct virtqueue *_vq, bool val)
> +{
> +       struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +       vq->is_tx_oldbuf_cleaned = val;
> +}
> +EXPORT_SYMBOL_GPL(virtqueue_set_tx_oldbuf_cleaned);
> +
>  /**
>   * virtqueue_kick - update after add_buf
>   * @vq: the struct virtqueue
> @@ -2572,6 +2612,16 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
>         struct vring_virtqueue *vq = to_vvq(_vq);
>
>         if (!more_used(vq)) {
> +               /* When the delayed TX interrupt arrives, the old buffers are
> +                * cleaned in other cases(start_xmit and virtnet_poll_cleantx).
> +                * We'd better not identify it as a spurious interrupt,
> +                * otherwise note_interrupt may kill the interrupt.
> +                */
> +               if (unlikely(vq->is_tx_newbuf_sent && vq->is_tx_oldbuf_cleaned)) {
> +                       vq->is_tx_newbuf_sent = false;
> +                       return IRQ_HANDLED;
> +               }

This is the general virtio code, it's better to avoid any device specific logic.

> +
>                 pr_debug("virtqueue interrupt with no work for %p\n", vq);
>                 return IRQ_NONE;
>         }
> @@ -2637,6 +2687,9 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
>         vq->use_dma_api = vring_use_dma_api(vdev);
>         vq->premapped = false;
>         vq->do_unmap = vq->use_dma_api;
> +       vq->is_tx_newbuf_sent = false; /* Initially, no new buffer to send. */
> +       vq->is_tx_oldbuf_cleaned = true; /* Initially, no old buffer to clean. */
> +
>
>         vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
>                 !context;
> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> index ecc5cb7b8c91..ba3be9276c09 100644
> --- a/include/linux/virtio.h
> +++ b/include/linux/virtio.h
> @@ -103,6 +103,9 @@ int virtqueue_resize(struct virtqueue *vq, u32 num,
>  int virtqueue_reset(struct virtqueue *vq,
>                     void (*recycle)(struct virtqueue *vq, void *buf));
>
> +void virtqueue_set_tx_newbuf_sent(struct virtqueue *vq, bool val);
> +void virtqueue_set_tx_oldbuf_cleaned(struct virtqueue *vq, bool val);
> +
>  struct virtio_admin_cmd {
>         __le16 opcode;
>         __le16 group_type;
> --
> 2.32.0.3.g01195cf9f
>
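
For concreteness, a rough sketch of the alternative suggested in the reply
above: reclaim completed tx buffers only from TX NAPI, never from
start_xmit(). The shapes below are simplified stand-ins, not the real
driver signatures (in particular, this hypothetical free_old_xmit() takes
a budget and returns a count):

static netdev_tx_t start_xmit_napi_only(struct send_queue *sq,
					struct sk_buff *skb)
{
	/* Queue the skb; no opportunistic cleaning of old buffers. */
	if (unlikely(xmit_skb(sq, skb, false))) {
		dev_kfree_skb_any(skb);	/* ring full or error: drop */
		return NETDEV_TX_OK;
	}

	/*
	 * With NAPI as the only consumer of used buffers, a delayed tx
	 * interrupt always finds work in vring_interrupt() and can no
	 * longer be misread as spurious.
	 */
	if (virtqueue_kick_prepare(sq->vq))
		virtqueue_notify(sq->vq);

	return NETDEV_TX_OK;
}

static int virtnet_poll_tx_sketch(struct napi_struct *napi, int budget)
{
	struct send_queue *sq = container_of(napi, struct send_queue, napi);
	int cleaned;

	virtqueue_disable_cb(sq->vq);
	cleaned = free_old_xmit(sq, budget);	/* sole reclaim path (hypothetical) */

	if (cleaned < budget && likely(virtqueue_enable_cb_delayed(sq->vq)))
		napi_complete_done(napi, cleaned);

	return cleaned;
}

The trade-off, as the rest of the thread debates, is an interrupt (and
possibly a vmexit) per batch of completions instead of piggybacking the
cleanup on the next transmit.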
Heng Qi Aug. 2, 2024, 7:42 a.m. UTC | #3
On Thu, 1 Aug 2024 10:04:05 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Thu, Aug 01, 2024 at 09:56:39PM +0800, Heng Qi wrote:
> > Michael has effectively reduced the number of spurious interrupts in
> > commit a7766ef18b33 ("virtio_net: disable cb aggressively") by disabling
> > irq callbacks before cleaning old buffers.
> > 
> > But it is still possible that the irq is killed by mistake:
> > 
> >   When a delayed tx interrupt arrives, old buffers has been cleaned in
> >   other paths (start_xmit and virtnet_poll_cleantx), then the interrupt is
> >   mistakenly identified as a spurious interrupt in vring_interrupt.
> > 
> >   We should refrain from labeling it as a spurious interrupt; otherwise,
> >   note_interrupt may inadvertently kill the legitimate irq.
> > 
> > Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> 
> 
> Is this a practical or theoretical issue? Do you observe an issue
> and see that this patch fixes it? Or is this from code review?


This issue was previously documented in our bugzilla, but that was in 2020.

I can't easily reproduce the issue after a7766ef18b33, but interrupt suppression
under virtio is unreliable and out of sync, which is still a potential risk for
DPUs where the VM and the device are not on the same host.

Thanks.

> > ---
> >  drivers/net/virtio_net.c     |  9 ++++++
> >  drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++
> >  include/linux/virtio.h       |  3 ++
> >  3 files changed, 65 insertions(+)
> > 
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 0383a3e136d6..6d8739418203 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -2769,6 +2769,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq, int budget)
> >  		do {
> >  			virtqueue_disable_cb(sq->vq);
> >  			free_old_xmit(sq, txq, !!budget);
> > +			virtqueue_set_tx_oldbuf_cleaned(sq->vq, true);
> >  		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
> >  
> >  		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) {
> > @@ -3035,6 +3036,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> >  
> >  		free_old_xmit(sq, txq, false);
> >  
> > +		if (use_napi)
> > +			virtqueue_set_tx_oldbuf_cleaned(sq->vq, true);
> > +
> >  	} while (use_napi && !xmit_more &&
> >  	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
> >  
> > @@ -3044,6 +3048,11 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> >  	/* Try to transmit */
> >  	err = xmit_skb(sq, skb, !use_napi);
> >  
> > +	if (use_napi) {
> > +		virtqueue_set_tx_newbuf_sent(sq->vq, true);
> > +		virtqueue_set_tx_oldbuf_cleaned(sq->vq, false);
> > +	}
> > +
> >  	/* This should not happen! */
> >  	if (unlikely(err)) {
> >  		DEV_STATS_INC(dev, tx_fifo_errors);
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index be7309b1e860..fb2afc716371 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -180,6 +180,11 @@ struct vring_virtqueue {
> >  	 */
> >  	bool do_unmap;
> >  
> > +	/* Has any new data been sent? */
> > +	bool is_tx_newbuf_sent;
> > +	/* Is the old data recently sent cleaned up? */
> > +	bool is_tx_oldbuf_cleaned;
> > +
> >  	/* Head of free buffer list. */
> >  	unsigned int free_head;
> >  	/* Number we've added since last sync. */
> > @@ -2092,6 +2097,9 @@ static struct virtqueue *vring_create_virtqueue_packed(
> >  	vq->use_dma_api = vring_use_dma_api(vdev);
> >  	vq->premapped = false;
> >  	vq->do_unmap = vq->use_dma_api;
> > +	vq->is_tx_newbuf_sent = false; /* Initially, no new buffer to send. */
> > +	vq->is_tx_oldbuf_cleaned = true; /* Initially, no old buffer to clean. */
> > +
> >  
> >  	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> >  		!context;
> > @@ -2375,6 +2383,38 @@ bool virtqueue_notify(struct virtqueue *_vq)
> >  }
> >  EXPORT_SYMBOL_GPL(virtqueue_notify);
> >  
> > +/**
> > + * virtqueue_set_tx_newbuf_sent - set whether there is new tx buf to send.
> > + * @_vq: the struct virtqueue
> > + *
> > + * If is_tx_newbuf_sent and is_tx_oldbuf_cleaned are both true, the
> > + * spurious interrupt is caused by polling TX vq in other paths outside
> > + * the tx irq callback.
> > + */
> > +void virtqueue_set_tx_newbuf_sent(struct virtqueue *_vq, bool val)
> > +{
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +
> > +	vq->is_tx_newbuf_sent = val;
> > +}
> > +EXPORT_SYMBOL_GPL(virtqueue_set_tx_newbuf_sent);
> > +
> > +/**
> > + * virtqueue_set_tx_oldbuf_cleaned - set whether there is old tx buf to clean.
> > + * @_vq: the struct virtqueue
> > + *
> > + * If is_tx_oldbuf_cleaned and is_tx_newbuf_sent are both true, the
> > + * spurious interrupt is caused by polling TX vq in other paths outside
> > + * the tx irq callback.
> > + */
> > +void virtqueue_set_tx_oldbuf_cleaned(struct virtqueue *_vq, bool val)
> > +{
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +
> > +	vq->is_tx_oldbuf_cleaned = val;
> > +}
> > +EXPORT_SYMBOL_GPL(virtqueue_set_tx_oldbuf_cleaned);
> > +
> >  /**
> >   * virtqueue_kick - update after add_buf
> >   * @vq: the struct virtqueue
> > @@ -2572,6 +2612,16 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> >  	struct vring_virtqueue *vq = to_vvq(_vq);
> >  
> >  	if (!more_used(vq)) {
> > +		/* When the delayed TX interrupt arrives, the old buffers are
> > +		 * cleaned in other cases(start_xmit and virtnet_poll_cleantx).
> > +		 * We'd better not identify it as a spurious interrupt,
> > +		 * otherwise note_interrupt may kill the interrupt.
> > +		 */
> > +		if (unlikely(vq->is_tx_newbuf_sent && vq->is_tx_oldbuf_cleaned)) {
> > +			vq->is_tx_newbuf_sent = false;
> > +			return IRQ_HANDLED;
> > +		}
> > +
> >  		pr_debug("virtqueue interrupt with no work for %p\n", vq);
> >  		return IRQ_NONE;
> >  	}
> > @@ -2637,6 +2687,9 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
> >  	vq->use_dma_api = vring_use_dma_api(vdev);
> >  	vq->premapped = false;
> >  	vq->do_unmap = vq->use_dma_api;
> > +	vq->is_tx_newbuf_sent = false; /* Initially, no new buffer to send. */
> > +	vq->is_tx_oldbuf_cleaned = true; /* Initially, no old buffer to clean. */
> > +
> >  
> >  	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> >  		!context;
> > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > index ecc5cb7b8c91..ba3be9276c09 100644
> > --- a/include/linux/virtio.h
> > +++ b/include/linux/virtio.h
> > @@ -103,6 +103,9 @@ int virtqueue_resize(struct virtqueue *vq, u32 num,
> >  int virtqueue_reset(struct virtqueue *vq,
> >  		    void (*recycle)(struct virtqueue *vq, void *buf));
> >  
> > +void virtqueue_set_tx_newbuf_sent(struct virtqueue *vq, bool val);
> > +void virtqueue_set_tx_oldbuf_cleaned(struct virtqueue *vq, bool val);
> > +
> >  struct virtio_admin_cmd {
> >  	__le16 opcode;
> >  	__le16 group_type;
> > -- 
> > 2.32.0.3.g01195cf9f
>
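
On the "unreliable and out of sync" point: with VIRTIO_RING_F_EVENT_IDX,
interrupt suppression is only a hint. The device decides whether to
interrupt using vring_need_event() (a real helper, from
include/uapi/linux/virtio_ring.h), and it may make that decision before
the driver's event-index update becomes visible, all the more so when
driver and device live on different hosts:

/*
 * Interrupt only if the new used index has crossed the driver-published
 * event index; the unsigned subtraction makes the comparison work
 * modulo 2^16.
 */
static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
{
	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
}

Because the driver's writes to the event index and the device's reads of
it are not synchronized, a re-enabled callback can still fire for buffers
the driver has meanwhile consumed elsewhere, which is exactly the
misidentified-spurious case the patch targets.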
Michael S. Tsirkin Aug. 2, 2024, 1:08 p.m. UTC | #4
On Fri, Aug 02, 2024 at 03:42:06PM +0800, Heng Qi wrote:
> On Thu, 1 Aug 2024 10:04:05 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Thu, Aug 01, 2024 at 09:56:39PM +0800, Heng Qi wrote:
> > > Michael has effectively reduced the number of spurious interrupts in
> > > commit a7766ef18b33 ("virtio_net: disable cb aggressively") by disabling
> > > irq callbacks before cleaning old buffers.
> > > 
> > > But it is still possible that the irq is killed by mistake:
> > > 
> > >   When a delayed tx interrupt arrives, old buffers has been cleaned in
> > >   other paths (start_xmit and virtnet_poll_cleantx), then the interrupt is
> > >   mistakenly identified as a spurious interrupt in vring_interrupt.
> > > 
> > >   We should refrain from labeling it as a spurious interrupt; otherwise,
> > >   note_interrupt may inadvertently kill the legitimate irq.
> > > 
> > > Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> > 
> > 
> > Is this a practical or theoretical issue? Do you observe an issue
> > and see that this patch fixes it? Or is this from code review?
> 
> 
> This issue was previously documented in our bugzilla, but that was in 2020.
> 
> I can't easily reproduce the issue after a7766ef18b33, but interrupt suppression
> under virtio is unreliable and out of sync, which is still a potential risk for
> DPUs where the VM and the device are not on the same host.
> 
> Thanks.

I find it hard to believe there's a real problem after a7766ef18b33.



> > > ---
> > >  drivers/net/virtio_net.c     |  9 ++++++
> > >  drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++
> > >  include/linux/virtio.h       |  3 ++
> > >  3 files changed, 65 insertions(+)
> > > 
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 0383a3e136d6..6d8739418203 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -2769,6 +2769,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq, int budget)
> > >  		do {
> > >  			virtqueue_disable_cb(sq->vq);
> > >  			free_old_xmit(sq, txq, !!budget);
> > > +			virtqueue_set_tx_oldbuf_cleaned(sq->vq, true);
> > >  		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
> > >  
> > >  		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) {
> > > @@ -3035,6 +3036,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > >  
> > >  		free_old_xmit(sq, txq, false);
> > >  
> > > +		if (use_napi)
> > > +			virtqueue_set_tx_oldbuf_cleaned(sq->vq, true);
> > > +
> > >  	} while (use_napi && !xmit_more &&
> > >  	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
> > >  
> > > @@ -3044,6 +3048,11 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > >  	/* Try to transmit */
> > >  	err = xmit_skb(sq, skb, !use_napi);
> > >  
> > > +	if (use_napi) {
> > > +		virtqueue_set_tx_newbuf_sent(sq->vq, true);
> > > +		virtqueue_set_tx_oldbuf_cleaned(sq->vq, false);
> > > +	}
> > > +
> > >  	/* This should not happen! */
> > >  	if (unlikely(err)) {
> > >  		DEV_STATS_INC(dev, tx_fifo_errors);
> > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > index be7309b1e860..fb2afc716371 100644
> > > --- a/drivers/virtio/virtio_ring.c
> > > +++ b/drivers/virtio/virtio_ring.c
> > > @@ -180,6 +180,11 @@ struct vring_virtqueue {
> > >  	 */
> > >  	bool do_unmap;
> > >  
> > > +	/* Has any new data been sent? */
> > > +	bool is_tx_newbuf_sent;
> > > +	/* Is the old data recently sent cleaned up? */
> > > +	bool is_tx_oldbuf_cleaned;
> > > +
> > >  	/* Head of free buffer list. */
> > >  	unsigned int free_head;
> > >  	/* Number we've added since last sync. */
> > > @@ -2092,6 +2097,9 @@ static struct virtqueue *vring_create_virtqueue_packed(
> > >  	vq->use_dma_api = vring_use_dma_api(vdev);
> > >  	vq->premapped = false;
> > >  	vq->do_unmap = vq->use_dma_api;
> > > +	vq->is_tx_newbuf_sent = false; /* Initially, no new buffer to send. */
> > > +	vq->is_tx_oldbuf_cleaned = true; /* Initially, no old buffer to clean. */
> > > +
> > >  
> > >  	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> > >  		!context;
> > > @@ -2375,6 +2383,38 @@ bool virtqueue_notify(struct virtqueue *_vq)
> > >  }
> > >  EXPORT_SYMBOL_GPL(virtqueue_notify);
> > >  
> > > +/**
> > > + * virtqueue_set_tx_newbuf_sent - set whether there is new tx buf to send.
> > > + * @_vq: the struct virtqueue
> > > + *
> > > + * If is_tx_newbuf_sent and is_tx_oldbuf_cleaned are both true, the
> > > + * spurious interrupt is caused by polling TX vq in other paths outside
> > > + * the tx irq callback.
> > > + */
> > > +void virtqueue_set_tx_newbuf_sent(struct virtqueue *_vq, bool val)
> > > +{
> > > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > > +
> > > +	vq->is_tx_newbuf_sent = val;
> > > +}
> > > +EXPORT_SYMBOL_GPL(virtqueue_set_tx_newbuf_sent);
> > > +
> > > +/**
> > > + * virtqueue_set_tx_oldbuf_cleaned - set whether there is old tx buf to clean.
> > > + * @_vq: the struct virtqueue
> > > + *
> > > + * If is_tx_oldbuf_cleaned and is_tx_newbuf_sent are both true, the
> > > + * spurious interrupt is caused by polling TX vq in other paths outside
> > > + * the tx irq callback.
> > > + */
> > > +void virtqueue_set_tx_oldbuf_cleaned(struct virtqueue *_vq, bool val)
> > > +{
> > > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > > +
> > > +	vq->is_tx_oldbuf_cleaned = val;
> > > +}
> > > +EXPORT_SYMBOL_GPL(virtqueue_set_tx_oldbuf_cleaned);
> > > +
> > >  /**
> > >   * virtqueue_kick - update after add_buf
> > >   * @vq: the struct virtqueue
> > > @@ -2572,6 +2612,16 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> > >  	struct vring_virtqueue *vq = to_vvq(_vq);
> > >  
> > >  	if (!more_used(vq)) {
> > > +		/* When the delayed TX interrupt arrives, the old buffers are
> > > +		 * cleaned in other cases(start_xmit and virtnet_poll_cleantx).
> > > +		 * We'd better not identify it as a spurious interrupt,
> > > +		 * otherwise note_interrupt may kill the interrupt.
> > > +		 */
> > > +		if (unlikely(vq->is_tx_newbuf_sent && vq->is_tx_oldbuf_cleaned)) {
> > > +			vq->is_tx_newbuf_sent = false;
> > > +			return IRQ_HANDLED;
> > > +		}
> > > +

I am not merging anything this ugly.


> > >  		pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > >  		return IRQ_NONE;
> > >  	}
> > > @@ -2637,6 +2687,9 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > >  	vq->use_dma_api = vring_use_dma_api(vdev);
> > >  	vq->premapped = false;
> > >  	vq->do_unmap = vq->use_dma_api;
> > > +	vq->is_tx_newbuf_sent = false; /* Initially, no new buffer to send. */
> > > +	vq->is_tx_oldbuf_cleaned = true; /* Initially, no old buffer to clean. */
> > > +
> > >  
> > >  	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> > >  		!context;
> > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > index ecc5cb7b8c91..ba3be9276c09 100644
> > > --- a/include/linux/virtio.h
> > > +++ b/include/linux/virtio.h
> > > @@ -103,6 +103,9 @@ int virtqueue_resize(struct virtqueue *vq, u32 num,
> > >  int virtqueue_reset(struct virtqueue *vq,
> > >  		    void (*recycle)(struct virtqueue *vq, void *buf));
> > >  
> > > +void virtqueue_set_tx_newbuf_sent(struct virtqueue *vq, bool val);
> > > +void virtqueue_set_tx_oldbuf_cleaned(struct virtqueue *vq, bool val);
> > > +
> > >  struct virtio_admin_cmd {
> > >  	__le16 opcode;
> > >  	__le16 group_type;
> > > -- 
> > > 2.32.0.3.g01195cf9f
> >
Michael S. Tsirkin Aug. 2, 2024, 1:11 p.m. UTC | #5
On Fri, Aug 02, 2024 at 11:41:57AM +0800, Jason Wang wrote:
> On Thu, Aug 1, 2024 at 9:56 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> >
> > Michael has effectively reduced the number of spurious interrupts in
> > commit a7766ef18b33 ("virtio_net: disable cb aggressively") by disabling
> > irq callbacks before cleaning old buffers.
> >
> > But it is still possible that the irq is killed by mistake:
> >
> >   When a delayed tx interrupt arrives, old buffers has been cleaned in
> >   other paths (start_xmit and virtnet_poll_cleantx), then the interrupt is
> >   mistakenly identified as a spurious interrupt in vring_interrupt.
> >
> >   We should refrain from labeling it as a spurious interrupt; otherwise,
> >   note_interrupt may inadvertently kill the legitimate irq.
> 
> I think the evil came from where we do free_old_xmit() in
> start_xmit(). I know it is for performance, but we may need to make
> the code work correctly instead of adding endless hacks. Personally, I
> think the virtio-net TX path is over-complicated. We probably pay too
> much (e.g there's netif_tx_lock in TX NAPI path) to try to "optimize"
> the performance.
> 
> How about just don't do free_old_xmit and do that solely in the TX NAPI?

Not getting interrupts is always better than getting interrupts.
This is not new code, there are no plans to erase it all and start
anew "to make it work correctly" - it's widely deployed,
you will cause performance regressions and they are hard
to debug.


> >
> > Signed-off-by: Heng Qi <hengqi@linux.alibaba.com>
> > ---
> >  drivers/net/virtio_net.c     |  9 ++++++
> >  drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++
> >  include/linux/virtio.h       |  3 ++
> >  3 files changed, 65 insertions(+)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 0383a3e136d6..6d8739418203 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -2769,6 +2769,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq, int budget)
> >                 do {
> >                         virtqueue_disable_cb(sq->vq);
> >                         free_old_xmit(sq, txq, !!budget);
> > +                       virtqueue_set_tx_oldbuf_cleaned(sq->vq, true);
> >                 } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
> >
> >                 if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) {
> > @@ -3035,6 +3036,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> >
> >                 free_old_xmit(sq, txq, false);
> >
> > +               if (use_napi)
> > +                       virtqueue_set_tx_oldbuf_cleaned(sq->vq, true);
> > +
> >         } while (use_napi && !xmit_more &&
> >                unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
> >
> > @@ -3044,6 +3048,11 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> >         /* Try to transmit */
> >         err = xmit_skb(sq, skb, !use_napi);
> >
> > +       if (use_napi) {
> > +               virtqueue_set_tx_newbuf_sent(sq->vq, true);
> > +               virtqueue_set_tx_oldbuf_cleaned(sq->vq, false);
> > +       }
> > +
> >         /* This should not happen! */
> >         if (unlikely(err)) {
> >                 DEV_STATS_INC(dev, tx_fifo_errors);
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index be7309b1e860..fb2afc716371 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -180,6 +180,11 @@ struct vring_virtqueue {
> >          */
> >         bool do_unmap;
> >
> > +       /* Has any new data been sent? */
> > +       bool is_tx_newbuf_sent;
> > +       /* Is the old data recently sent cleaned up? */
> > +       bool is_tx_oldbuf_cleaned;
> > +
> >         /* Head of free buffer list. */
> >         unsigned int free_head;
> >         /* Number we've added since last sync. */
> > @@ -2092,6 +2097,9 @@ static struct virtqueue *vring_create_virtqueue_packed(
> >         vq->use_dma_api = vring_use_dma_api(vdev);
> >         vq->premapped = false;
> >         vq->do_unmap = vq->use_dma_api;
> > +       vq->is_tx_newbuf_sent = false; /* Initially, no new buffer to send. */
> > +       vq->is_tx_oldbuf_cleaned = true; /* Initially, no old buffer to clean. */
> > +
> >
> >         vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> >                 !context;
> > @@ -2375,6 +2383,38 @@ bool virtqueue_notify(struct virtqueue *_vq)
> >  }
> >  EXPORT_SYMBOL_GPL(virtqueue_notify);
> >
> > +/**
> > + * virtqueue_set_tx_newbuf_sent - set whether there is new tx buf to send.
> > + * @_vq: the struct virtqueue
> > + *
> > + * If is_tx_newbuf_sent and is_tx_oldbuf_cleaned are both true, the
> > + * spurious interrupt is caused by polling TX vq in other paths outside
> > + * the tx irq callback.
> > + */
> > +void virtqueue_set_tx_newbuf_sent(struct virtqueue *_vq, bool val)
> > +{
> > +       struct vring_virtqueue *vq = to_vvq(_vq);
> > +
> > +       vq->is_tx_newbuf_sent = val;
> > +}
> > +EXPORT_SYMBOL_GPL(virtqueue_set_tx_newbuf_sent);
> > +
> > +/**
> > + * virtqueue_set_tx_oldbuf_cleaned - set whether there is old tx buf to clean.
> > + * @_vq: the struct virtqueue
> > + *
> > + * If is_tx_oldbuf_cleaned and is_tx_newbuf_sent are both true, the
> > + * spurious interrupt is caused by polling TX vq in other paths outside
> > + * the tx irq callback.
> > + */
> > +void virtqueue_set_tx_oldbuf_cleaned(struct virtqueue *_vq, bool val)
> > +{
> > +       struct vring_virtqueue *vq = to_vvq(_vq);
> > +
> > +       vq->is_tx_oldbuf_cleaned = val;
> > +}
> > +EXPORT_SYMBOL_GPL(virtqueue_set_tx_oldbuf_cleaned);
> > +
> >  /**
> >   * virtqueue_kick - update after add_buf
> >   * @vq: the struct virtqueue
> > @@ -2572,6 +2612,16 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> >         struct vring_virtqueue *vq = to_vvq(_vq);
> >
> >         if (!more_used(vq)) {
> > +               /* When the delayed TX interrupt arrives, the old buffers are
> > +                * cleaned in other cases(start_xmit and virtnet_poll_cleantx).
> > +                * We'd better not identify it as a spurious interrupt,
> > +                * otherwise note_interrupt may kill the interrupt.
> > +                */
> > +               if (unlikely(vq->is_tx_newbuf_sent && vq->is_tx_oldbuf_cleaned)) {
> > +                       vq->is_tx_newbuf_sent = false;
> > +                       return IRQ_HANDLED;
> > +               }
> 
> This is the general virtio code, it's better to avoid any device specific logic.
> 
> > +
> >                 pr_debug("virtqueue interrupt with no work for %p\n", vq);
> >                 return IRQ_NONE;
> >         }
> > @@ -2637,6 +2687,9 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
> >         vq->use_dma_api = vring_use_dma_api(vdev);
> >         vq->premapped = false;
> >         vq->do_unmap = vq->use_dma_api;
> > +       vq->is_tx_newbuf_sent = false; /* Initially, no new buffer to send. */
> > +       vq->is_tx_oldbuf_cleaned = true; /* Initially, no old buffer to clean. */
> > +
> >
> >         vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> >                 !context;
> > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > index ecc5cb7b8c91..ba3be9276c09 100644
> > --- a/include/linux/virtio.h
> > +++ b/include/linux/virtio.h
> > @@ -103,6 +103,9 @@ int virtqueue_resize(struct virtqueue *vq, u32 num,
> >  int virtqueue_reset(struct virtqueue *vq,
> >                     void (*recycle)(struct virtqueue *vq, void *buf));
> >
> > +void virtqueue_set_tx_newbuf_sent(struct virtqueue *vq, bool val);
> > +void virtqueue_set_tx_oldbuf_cleaned(struct virtqueue *vq, bool val);
> > +
> >  struct virtio_admin_cmd {
> >         __le16 opcode;
> >         __le16 group_type;
> > --
> > 2.32.0.3.g01195cf9f
> >
Jason Wang Aug. 5, 2024, 3:26 a.m. UTC | #6
On Fri, Aug 2, 2024 at 9:11 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Aug 02, 2024 at 11:41:57AM +0800, Jason Wang wrote:
> > On Thu, Aug 1, 2024 at 9:56 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> > >
> > > Michael has effectively reduced the number of spurious interrupts in
> > > commit a7766ef18b33 ("virtio_net: disable cb aggressively") by disabling
> > > irq callbacks before cleaning old buffers.
> > >
> > > But it is still possible that the irq is killed by mistake:
> > >
> > >   When a delayed tx interrupt arrives, old buffers has been cleaned in
> > >   other paths (start_xmit and virtnet_poll_cleantx), then the interrupt is
> > >   mistakenly identified as a spurious interrupt in vring_interrupt.
> > >
> > >   We should refrain from labeling it as a spurious interrupt; otherwise,
> > >   note_interrupt may inadvertently kill the legitimate irq.
> >
> > I think the evil came from where we do free_old_xmit() in
> > start_xmit(). I know it is for performance, but we may need to make
> > the code work correctly instead of adding endless hacks. Personally, I
> > think the virtio-net TX path is over-complicated. We probably pay too
> > much (e.g there's netif_tx_lock in TX NAPI path) to try to "optimize"
> > the performance.
> >
> > How about just don't do free_old_xmit and do that solely in the TX NAPI?
>
> Not getting interrupts is always better than getting interrupts.

Not sure. For example, letting one cpu do the transmission without
having to deal with xmit skbs should give us better performance.

> This is not new code, there are no plans to erase it all and start
> anew "to make it work correctly" - it's widely deployed,
> you will cause performance regressions and they are hard
> to debug.

I actually meant the TX NAPI mode: we tried holding the TX lock in
the TX NAPI, which turns out to slow down both the transmission and
the NAPI itself.

Thanks
Michael S. Tsirkin Aug. 5, 2024, 6:29 a.m. UTC | #7
On Mon, Aug 05, 2024 at 11:26:56AM +0800, Jason Wang wrote:
> On Fri, Aug 2, 2024 at 9:11 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Fri, Aug 02, 2024 at 11:41:57AM +0800, Jason Wang wrote:
> > > On Thu, Aug 1, 2024 at 9:56 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> > > >
> > > > Michael has effectively reduced the number of spurious interrupts in
> > > > commit a7766ef18b33 ("virtio_net: disable cb aggressively") by disabling
> > > > irq callbacks before cleaning old buffers.
> > > >
> > > > But it is still possible that the irq is killed by mistake:
> > > >
> > > >   When a delayed tx interrupt arrives, old buffers has been cleaned in
> > > >   other paths (start_xmit and virtnet_poll_cleantx), then the interrupt is
> > > >   mistakenly identified as a spurious interrupt in vring_interrupt.
> > > >
> > > >   We should refrain from labeling it as a spurious interrupt; otherwise,
> > > >   note_interrupt may inadvertently kill the legitimate irq.
> > >
> > > I think the evil came from where we do free_old_xmit() in
> > > start_xmit(). I know it is for performance, but we may need to make
> > > the code work correctly instead of adding endless hacks. Personally, I
> > > think the virtio-net TX path is over-complicated. We probably pay too
> > > much (e.g there's netif_tx_lock in TX NAPI path) to try to "optimize"
> > > the performance.
> > >
> > > How about just don't do free_old_xmit and do that solely in the TX NAPI?
> >
> > Not getting interrupts is always better than getting interrupts.
> 
> Not sure. For example letting 1 cpu to do the transmission without the
> dealing of xmit skbs should give us better performance.

Hmm. It's a subtle thing. I suspect that up to a certain limit
(e.g. a ping-pong test) free_old_xmit will win anyway.

> > This is not new code, there are no plans to erase it all and start
> > anew "to make it work correctly" - it's widely deployed,
> > you will cause performance regressions and they are hard
> > to debug.
> 
> I actually meant the TX NAPI mode, we tried to hold the TX lock in the
> TX NAPI, which turns out to slow down both the transmission and the
> NAPI itself.
> 
> Thanks

We do need to synchronize anyway though, virtio expects drivers to do
their own serialization of vq operations. You could try to instead move
skbs to some kind of array under the tx lock, then free them all up
later after unlocking tx.

Can be helpful for batching as well?


I also always wondered whether it is an issue that free_old_xmit
just polls the vq until it is empty, without a limit.
NAPI is supposed to poll until a limit is reached.
I guess not many people have very deep vqs.
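
A sketch of the "free outside the lock" idea described above, with a
bounded batch so one pass cannot drain an arbitrarily deep vq. The helper
name and batch size are illustrative, and the buffers are assumed to be
skbs:

#define TX_REAP_BATCH 64

static void reclaim_tx_batched(struct send_queue *sq,
			       struct netdev_queue *txq)
{
	struct sk_buff *batch[TX_REAP_BATCH];
	unsigned int n = 0, len;
	void *buf;

	__netif_tx_lock(txq, smp_processor_id());
	/* Under the lock, only detach used buffers from the vq. */
	while (n < TX_REAP_BATCH &&
	       (buf = virtqueue_get_buf(sq->vq, &len)))
		batch[n++] = buf;
	__netif_tx_unlock(txq);

	/*
	 * Outside the lock, do the comparatively expensive freeing; the
	 * array also hands the allocator a natural batch.
	 */
	while (n)
		napi_consume_skb(batch[--n], 1);
}

This keeps vq operations serialized, as virtio requires, while shrinking
the tx-lock hold time; the fixed batch also answers the "polls until
empty, without a limit" concern.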
Jason Wang Aug. 6, 2024, 3:18 a.m. UTC | #8
On Mon, Aug 5, 2024 at 2:29 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, Aug 05, 2024 at 11:26:56AM +0800, Jason Wang wrote:
> > On Fri, Aug 2, 2024 at 9:11 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Fri, Aug 02, 2024 at 11:41:57AM +0800, Jason Wang wrote:
> > > > On Thu, Aug 1, 2024 at 9:56 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> > > > >
> > > > > Michael has effectively reduced the number of spurious interrupts in
> > > > > commit a7766ef18b33 ("virtio_net: disable cb aggressively") by disabling
> > > > > irq callbacks before cleaning old buffers.
> > > > >
> > > > > But it is still possible that the irq is killed by mistake:
> > > > >
> > > > >   When a delayed tx interrupt arrives, old buffers has been cleaned in
> > > > >   other paths (start_xmit and virtnet_poll_cleantx), then the interrupt is
> > > > >   mistakenly identified as a spurious interrupt in vring_interrupt.
> > > > >
> > > > >   We should refrain from labeling it as a spurious interrupt; otherwise,
> > > > >   note_interrupt may inadvertently kill the legitimate irq.
> > > >
> > > > I think the evil came from where we do free_old_xmit() in
> > > > start_xmit(). I know it is for performance, but we may need to make
> > > > the code work correctly instead of adding endless hacks. Personally, I
> > > > think the virtio-net TX path is over-complicated. We probably pay too
> > > > much (e.g there's netif_tx_lock in TX NAPI path) to try to "optimize"
> > > > the performance.
> > > >
> > > > How about just don't do free_old_xmit and do that solely in the TX NAPI?
> > >
> > > Not getting interrupts is always better than getting interrupts.
> >
> > Not sure. For example letting 1 cpu to do the transmission without the
> > dealing of xmit skbs should give us better performance.
>
> Hmm. It's a subtle thing. I suspect until certain limit
> (e.g. ping pong test) free_old_xmit will win anyway.

Not sure I understand here.

>
> > > This is not new code, there are no plans to erase it all and start
> > > anew "to make it work correctly" - it's widely deployed,
> > > you will cause performance regressions and they are hard
> > > to debug.
> >
> > I actually meant the TX NAPI mode, we tried to hold the TX lock in the
> > TX NAPI, which turns out to slow down both the transmission and the
> > NAPI itself.
> >
> > Thanks
>
> We do need to synchronize anyway though, virtio expects drivers to do
> their own serialization of vq operations.

Right, but currently add and get need to be serialized, which is a
bottleneck. I don't see any issue with parallelizing that.

> You could try to instead move
> skbs to some kind of array under the tx lock, then free them all up
> later after unlocking tx.
>
> Can be helpful for batching as well?

It's worth a try and see.

>
>
> I also always wondered whether it is an issue that free_old_xmit
> just polls vq until it is empty, without a limit.

Did you mean schedule a NAPI if free_old_xmit() exceeds the NAPI quota?

> napi is supposed to poll until a limit is reached.
> I guess not many people have very deep vqs.

The current NAPI weight is 64, so I think we can hit it under a stressful workload.

Thanks

>
> --
> MST
>
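
The quota idea from the exchange above, sketched: let the opportunistic
cleanup reclaim at most a budget's worth of buffers and hand anything left
over to TX NAPI. Names are illustrative and buffers are assumed to be
skbs:

static void free_old_xmit_limited(struct send_queue *sq, int quota)
{
	unsigned int len;
	void *buf;

	while (quota-- > 0) {
		buf = virtqueue_get_buf(sq->vq, &len);
		if (!buf)
			return;			/* drained within quota */
		dev_consume_skb_any(buf);	/* skb completed by the device */
	}

	/*
	 * Quota exhausted with buffers possibly remaining: stop here and
	 * let the TX NAPI poll loop, which honors its own budget, finish
	 * the reclaim (sketch only; re-enabling callbacks is left to NAPI).
	 */
	virtqueue_disable_cb(sq->vq);
	napi_schedule(&sq->napi);
}

As Michael notes below, the tx lock already bounds one pass to at most a
vq's worth of buffers, so the quota mainly matters for very deep vqs.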
Michael S. Tsirkin Aug. 6, 2024, 1:24 p.m. UTC | #9
On Tue, Aug 06, 2024 at 11:18:14AM +0800, Jason Wang wrote:
> On Mon, Aug 5, 2024 at 2:29 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Mon, Aug 05, 2024 at 11:26:56AM +0800, Jason Wang wrote:
> > > On Fri, Aug 2, 2024 at 9:11 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Fri, Aug 02, 2024 at 11:41:57AM +0800, Jason Wang wrote:
> > > > > On Thu, Aug 1, 2024 at 9:56 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> > > > > >
> > > > > > Michael has effectively reduced the number of spurious interrupts in
> > > > > > commit a7766ef18b33 ("virtio_net: disable cb aggressively") by disabling
> > > > > > irq callbacks before cleaning old buffers.
> > > > > >
> > > > > > But it is still possible that the irq is killed by mistake:
> > > > > >
> > > > > >   When a delayed tx interrupt arrives, old buffers has been cleaned in
> > > > > >   other paths (start_xmit and virtnet_poll_cleantx), then the interrupt is
> > > > > >   mistakenly identified as a spurious interrupt in vring_interrupt.
> > > > > >
> > > > > >   We should refrain from labeling it as a spurious interrupt; otherwise,
> > > > > >   note_interrupt may inadvertently kill the legitimate irq.
> > > > >
> > > > > I think the evil came from where we do free_old_xmit() in
> > > > > start_xmit(). I know it is for performance, but we may need to make
> > > > > the code work correctly instead of adding endless hacks. Personally, I
> > > > > think the virtio-net TX path is over-complicated. We probably pay too
> > > > > much (e.g there's netif_tx_lock in TX NAPI path) to try to "optimize"
> > > > > the performance.
> > > > >
> > > > > How about just don't do free_old_xmit and do that solely in the TX NAPI?
> > > >
> > > > Not getting interrupts is always better than getting interrupts.
> > >
> > > Not sure. For example letting 1 cpu to do the transmission without the
> > > dealing of xmit skbs should give us better performance.
> >
> > Hmm. It's a subtle thing. I suspect until certain limit
> > (e.g. ping pong test) free_old_xmit will win anyway.
> 
> Not sure I understand here.

If you transmit 1 packet and then wait for another one anyway,
you are better off just handling the tx interrupt.


> >
> > > > This is not new code, there are no plans to erase it all and start
> > > > anew "to make it work correctly" - it's widely deployed,
> > > > you will cause performance regressions and they are hard
> > > > to debug.
> > >
> > > I actually meant the TX NAPI mode, we tried to hold the TX lock in the
> > > TX NAPI, which turns out to slow down both the transmission and the
> > > NAPI itself.
> > >
> > > Thanks
> >
> > We do need to synchronize anyway though, virtio expects drivers to do
> > their own serialization of vq operations.
> 
> Right, but currently add and get needs to be serialized which is a
> bottleneck. I don't see any issue to parallelize that.

Do you see this in traces?

> > You could try to instead move
> > skbs to some kind of array under the tx lock, then free them all up
> > later after unlocking tx.
> >
> > Can be helpful for batching as well?
> 
> It's worth a try and see.

Why not.

> >
> >
> > I also always wondered whether it is an issue that free_old_xmit
> > just polls vq until it is empty, without a limit.
> 
> Did you mean schedule a NAPI if free_old_xmit() exceeds the NAPI quota?

yes

> > napi is supposed to poll until a limit is reached.
> > I guess not many people have very deep vqs.
> 
> Current NAPI weight is 64, so I think we can meet it in stressful workload.
> 
> Thanks

Yes, but it's just a random number. Since we hold the tx lock,
we get at most vq-size bufs, so it's limited.

> >
> > --
> > MST
> >
Jason Wang Aug. 7, 2024, 4:06 a.m. UTC | #10
On Tue, Aug 6, 2024 at 9:25 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Aug 06, 2024 at 11:18:14AM +0800, Jason Wang wrote:
> > On Mon, Aug 5, 2024 at 2:29 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Mon, Aug 05, 2024 at 11:26:56AM +0800, Jason Wang wrote:
> > > > On Fri, Aug 2, 2024 at 9:11 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Fri, Aug 02, 2024 at 11:41:57AM +0800, Jason Wang wrote:
> > > > > > On Thu, Aug 1, 2024 at 9:56 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> > > > > > >
> > > > > > > Michael has effectively reduced the number of spurious interrupts in
> > > > > > > commit a7766ef18b33 ("virtio_net: disable cb aggressively") by disabling
> > > > > > > irq callbacks before cleaning old buffers.
> > > > > > >
> > > > > > > But it is still possible that the irq is killed by mistake:
> > > > > > >
> > > > > > >   When a delayed tx interrupt arrives, old buffers has been cleaned in
> > > > > > >   other paths (start_xmit and virtnet_poll_cleantx), then the interrupt is
> > > > > > >   mistakenly identified as a spurious interrupt in vring_interrupt.
> > > > > > >
> > > > > > >   We should refrain from labeling it as a spurious interrupt; otherwise,
> > > > > > >   note_interrupt may inadvertently kill the legitimate irq.
> > > > > >
> > > > > > I think the evil came from where we do free_old_xmit() in
> > > > > > start_xmit(). I know it is for performance, but we may need to make
> > > > > > the code work correctly instead of adding endless hacks. Personally, I
> > > > > > think the virtio-net TX path is over-complicated. We probably pay too
> > > > > > much (e.g there's netif_tx_lock in TX NAPI path) to try to "optimize"
> > > > > > the performance.
> > > > > >
> > > > > > How about just don't do free_old_xmit and do that solely in the TX NAPI?
> > > > >
> > > > > Not getting interrupts is always better than getting interrupts.
> > > >
> > > > Not sure. For example letting 1 cpu to do the transmission without the
> > > > dealing of xmit skbs should give us better performance.
> > >
> > > Hmm. It's a subtle thing. I suspect until certain limit
> > > (e.g. ping pong test) free_old_xmit will win anyway.
> >
> > Not sure I understand here.
>
> If you transmit 1 packet and then wait for another one anyway,
> you are better off just handling the tx interrupt.

Yes for light load, but probably not for heavy loads like pktgen.

>
>
> > >
> > > > > This is not new code, there are no plans to erase it all and start
> > > > > anew "to make it work correctly" - it's widely deployed,
> > > > > you will cause performance regressions and they are hard
> > > > > to debug.
> > > >
> > > > I actually meant the TX NAPI mode, we tried to hold the TX lock in the
> > > > TX NAPI, which turns out to slow down both the transmission and the
> > > > NAPI itself.
> > > >
> > > > Thanks
> > >
> > > We do need to synchronize anyway though, virtio expects drivers to do
> > > their own serialization of vq operations.
> >
> > Right, but currently add and get needs to be serialized which is a
> > bottleneck. I don't see any issue to parallelize that.
>
> Do you see this in traces?

I mean the current virtio core requires the caller to serialize add/get:

virtqueue_add() {
        START_USE()
        END_USE()
}

virtqueue_get() {
        START_USE()
        END_USE()
}

It seems to be a limitation of the current driver, not the spec itself,
which means we can find some way to allow those to be executed in
parallel.

One example is to use a ptr_ring to maintain a free id list; in the
in-order case it is not even needed.

>
> > > You could try to instead move
> > > skbs to some kind of array under the tx lock, then free them all up
> > > later after unlocking tx.
> > >
> > > Can be helpful for batching as well?
> >
> > It's worth a try and see.
>
> Why not.
>
> > >
> > >
> > > I also always wondered whether it is an issue that free_old_xmit
> > > just polls vq until it is empty, without a limit.
> >
> > Did you mean schedule a NAPI if free_old_xmit() exceeds the NAPI quota?
>
> yes
>
> > > napi is supposed to poll until a limit is reached.
> > > I guess not many people have very deep vqs.
> >
> > Current NAPI weight is 64, so I think we can meet it in stressful workload.
> >
> > Thanks
>
> yes, but it's just a random number.  since we hold the tx lock,
> we get at most vq size bufs, so it's limited.

Ok.

Thanks

>
> > >
> > > --
> > > MST
> > >
>
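
A sketch of what the ptr_ring idea above could look like: the reclaim side
feeds freed descriptor ids back to the add side through a lock-free ring,
so the add and get paths stop contending on a single vq lock. Entirely
illustrative; no such structure exists in today's vring_virtqueue:

#include <linux/ptr_ring.h>

struct vq_id_recycler {
	struct ptr_ring free_ids;	/* ids flow from the get side to the add side */
};

static int recycler_init(struct vq_id_recycler *r, int vq_size)
{
	return ptr_ring_init(&r->free_ids, vq_size, GFP_KERNEL);
}

/* Reclaim path (e.g. TX NAPI): publish each freed descriptor id. */
static int recycle_id(struct vq_id_recycler *r, unsigned int id)
{
	/* Store id + 1 so id 0 is distinguishable from the NULL "empty" marker. */
	return ptr_ring_produce(&r->free_ids, (void *)(uintptr_t)(id + 1));
}

/* Add path: grab a free descriptor id without touching the vq lock. */
static int take_id(struct vq_id_recycler *r, unsigned int *id)
{
	void *v = ptr_ring_consume(&r->free_ids);

	if (!v)
		return -ENOSPC;
	*id = (uintptr_t)v - 1;
	return 0;
}

With VIRTIO_F_IN_ORDER even the ring becomes unnecessary, since ids come
back in submission order, which is the in-order case mentioned above.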
Michael S. Tsirkin Aug. 7, 2024, 10:37 a.m. UTC | #11
On Wed, Aug 07, 2024 at 12:06:16PM +0800, Jason Wang wrote:
> On Tue, Aug 6, 2024 at 9:25 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Tue, Aug 06, 2024 at 11:18:14AM +0800, Jason Wang wrote:
> > > On Mon, Aug 5, 2024 at 2:29 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Mon, Aug 05, 2024 at 11:26:56AM +0800, Jason Wang wrote:
> > > > > On Fri, Aug 2, 2024 at 9:11 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, Aug 02, 2024 at 11:41:57AM +0800, Jason Wang wrote:
> > > > > > > On Thu, Aug 1, 2024 at 9:56 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> > > > > > > >
> > > > > > > > Michael has effectively reduced the number of spurious interrupts in
> > > > > > > > commit a7766ef18b33 ("virtio_net: disable cb aggressively") by disabling
> > > > > > > > irq callbacks before cleaning old buffers.
> > > > > > > >
> > > > > > > > But it is still possible that the irq is killed by mistake:
> > > > > > > >
> > > > > > > >   When a delayed tx interrupt arrives, old buffers has been cleaned in
> > > > > > > >   other paths (start_xmit and virtnet_poll_cleantx), then the interrupt is
> > > > > > > >   mistakenly identified as a spurious interrupt in vring_interrupt.
> > > > > > > >
> > > > > > > >   We should refrain from labeling it as a spurious interrupt; otherwise,
> > > > > > > >   note_interrupt may inadvertently kill the legitimate irq.
> > > > > > >
> > > > > > > I think the evil came from where we do free_old_xmit() in
> > > > > > > start_xmit(). I know it is for performance, but we may need to make
> > > > > > > the code work correctly instead of adding endless hacks. Personally, I
> > > > > > > think the virtio-net TX path is over-complicated. We probably pay too
> > > > > > > much (e.g there's netif_tx_lock in TX NAPI path) to try to "optimize"
> > > > > > > the performance.
> > > > > > >
> > > > > > > How about just don't do free_old_xmit and do that solely in the TX NAPI?
> > > > > >
> > > > > > Not getting interrupts is always better than getting interrupts.
> > > > >
> > > > > Not sure. For example, letting one CPU do the transmission without
> > > > > dealing with the xmit skbs should give us better performance.
> > > >
> > > > Hmm. It's a subtle thing. I suspect that up to a certain limit
> > > > (e.g. a ping-pong test) free_old_xmit will win anyway.
> > >
> > > Not sure I understand here.
> >
> > If you transmit 1 packet and then wait for another one anyway,
> > you are better off just handling the tx interrupt.
> 
> Yes for light load, but probably not for heavy loads like pktgen and others.

If you are extremely busy sending packets, and you don't really care
when they are freed, and the vq is deep,
and you have another, idle CPU, and sending an interrupt does not need
a vmexit - moving the work out at the cost of an interrupt will be a win.


> >
> >
> > > >
> > > > > > This is not new code, there are no plans to erase it all and start
> > > > > > anew "to make it work correctly" - it's widely deployed,
> > > > > > you will cause performance regressions and they are hard
> > > > > > to debug.
> > > > >
> > > > > I actually meant the TX NAPI mode: we tried to hold the TX lock in the
> > > > > TX NAPI, which turns out to slow down both the transmission and the
> > > > > NAPI itself.
> > > > >
> > > > > Thanks
> > > >
> > > > We do need to synchronize anyway, though: virtio expects drivers to do
> > > > their own serialization of vq operations.
> > >
> > > Right, but currently add and get need to be serialized, which is a
> > > bottleneck. I don't see any issue with parallelizing that.
> >
> > Do you see this in traces?
> 
> I mean the current virtio core requires the caller to serialize add/get:
> 
> virtqueue_add() {
> START_USE()
> END_USE()
> }
> 
> virtqueue_get() {
> START_USE()
> END_USE()
> }
> 
> It seems to be a limitation of the current driver, not the spec itself,
> which means we can find some way to allow those to be executed in
> parallel.
> 
> One example is to use a ptr_ring to maintain a free id list; it is not
> even needed in the in-order case.

All quite tricky.

But again - do you have traces showing contention on the tx lock?

Until we do, it's pointless to try to optimize: making changes to the
code and seeing whether performance changes is not a good way to do
this; the system is too chaotic for that.


> >
> > > > You could try to instead move
> > > > skbs to some kind of array under the tx lock, then free them all up
> > > > later after unlocking tx.
> > > >
> > > > Can be helpful for batching as well?
> > >
> > > It's worth a try to see.
> >
> > Why not.
> >
> > > >
> > > >
> > > > I also always wondered whether it is an issue that free_old_xmit
> > > > just polls vq until it is empty, without a limit.
> > >
> > > Did you mean schedule a NAPI if free_old_xmit() exceeds the NAPI quota?
> >
> > yes
> >
> > > > napi is supposed to poll until a limit is reached.
> > > > I guess not many people have very deep vqs.
> > >
> > > The current NAPI weight is 64, so I think we can hit it under stressful workloads.
> > >
> > > Thanks
> >
> > yes, but it's just a random number.  since we hold the tx lock,
> > we get at most vq size bufs, so it's limited.
> 
> Ok.
> 
> Thanks
> 
> >
> > > >
> > > > --
> > > > MST
> > > >
> >
Jason Wang Aug. 8, 2024, 2:50 a.m. UTC | #12
On Wed, Aug 7, 2024 at 6:37 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Aug 07, 2024 at 12:06:16PM +0800, Jason Wang wrote:
> > On Tue, Aug 6, 2024 at 9:25 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Tue, Aug 06, 2024 at 11:18:14AM +0800, Jason Wang wrote:
> > > > On Mon, Aug 5, 2024 at 2:29 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Mon, Aug 05, 2024 at 11:26:56AM +0800, Jason Wang wrote:
> > > > > > On Fri, Aug 2, 2024 at 9:11 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Fri, Aug 02, 2024 at 11:41:57AM +0800, Jason Wang wrote:
> > > > > > > > On Thu, Aug 1, 2024 at 9:56 PM Heng Qi <hengqi@linux.alibaba.com> wrote:
> > > > > > > > >
> > > > > > > > > Michael has effectively reduced the number of spurious interrupts in
> > > > > > > > > commit a7766ef18b33 ("virtio_net: disable cb aggressively") by disabling
> > > > > > > > > irq callbacks before cleaning old buffers.
> > > > > > > > >
> > > > > > > > > But it is still possible that the irq is killed by mistake:
> > > > > > > > >
> > > > > > > > >   When a delayed tx interrupt arrives, old buffers have been cleaned in
> > > > > > > > >   other paths (start_xmit and virtnet_poll_cleantx), then the interrupt is
> > > > > > > > >   mistakenly identified as a spurious interrupt in vring_interrupt.
> > > > > > > > >
> > > > > > > > >   We should refrain from labeling it as a spurious interrupt; otherwise,
> > > > > > > > >   note_interrupt may inadvertently kill the legitimate irq.
> > > > > > > >
> > > > > > > > I think the evil comes from doing free_old_xmit() in
> > > > > > > > start_xmit(). I know it is for performance, but we may need to make
> > > > > > > > the code work correctly instead of adding endless hacks. Personally, I
> > > > > > > > think the virtio-net TX path is over-complicated. We probably pay too
> > > > > > > > much (e.g. there's netif_tx_lock in the TX NAPI path) trying to "optimize"
> > > > > > > > the performance.
> > > > > > > >
> > > > > > > > How about just not doing free_old_xmit() and doing that solely in the TX NAPI?
> > > > > > >
> > > > > > > Not getting interrupts is always better than getting interrupts.
> > > > > >
> > > > > > Not sure. For example, letting one CPU do the transmission without
> > > > > > dealing with the xmit skbs should give us better performance.
> > > > >
> > > > > Hmm. It's a subtle thing. I suspect that up to a certain limit
> > > > > (e.g. a ping-pong test) free_old_xmit will win anyway.
> > > >
> > > > Not sure I understand here.
> > >
> > > If you transmit 1 packet and then wait for another one anyway,
> > > you are better off just handling the tx interrupt.
> >
> > Yes for light load, but probably not for heavy loads like pktgen and others.
>
> If you are extremely busy sending packets, and you don't really care
> when they are freed, and the vq is deep,
> and you have another, idle CPU, and sending an interrupt does not need
> a vmexit - moving the work out at the cost of an interrupt will be a win.

Exactly.

>
>
> > >
> > >
> > > > >
> > > > > > > This is not new code, there are no plans to erase it all and start
> > > > > > > anew "to make it work correctly" - it's widely deployed,
> > > > > > > you will cause performance regressions and they are hard
> > > > > > > to debug.
> > > > > >
> > > > > > I actually meant the TX NAPI mode: we tried to hold the TX lock in the
> > > > > > TX NAPI, which turns out to slow down both the transmission and the
> > > > > > NAPI itself.
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > We do need to synchronize anyway, though: virtio expects drivers to do
> > > > > their own serialization of vq operations.
> > > >
> > > > Right, but currently add and get need to be serialized, which is a
> > > > bottleneck. I don't see any issue with parallelizing that.
> > >
> > > Do you see this in traces?
> >
> > I mean the current virtio core requires the caller to serialize add/get:
> >
> > virtqueue_add() {
> > START_USE()
> > END_USE()
> > }
> >
> > virtqueue_get() {
> > START_USE()
> > END_USE()
> > }
> >
> > It seems to be a limitation of the current driver, not the spec itself,
> > which means we can find some way to allow those to be executed in
> > parallel.
> >
> > One example is to use a ptr_ring to maintain a free id list; it is not
> > even needed in the in-order case.
>
> All quite tricky.

Yes, but most modern NICs don't require serialization between add and get.

>
> But again - do you have traces showing contention on the tx lock?
>
> Until we do, it's pointless to try to optimize: making changes to the
> code and seeing whether performance changes is not a good way to do
> this; the system is too chaotic for that.

Yes. E.g. 2 vcpus, 1 queue, pktgen with burst 64 on vcpu0; perf top -C 1 shows:

 94.01%  [kernel]              [k] queued_spin_lock_slowpath
  0.93%  [kernel]              [k] native_apic_mem_eoi
  0.83%  [kernel]              [k] __irqentry_text_start
  0.48%  [kernel]              [k] vring_interrupt
  0.22%  perf                  [.] io__get_char
...

And the call trace shows it came from virtnet_poll_tx().
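
A condensed sketch of that path (not the verbatim upstream function;
tx buffers assumed to be skbs): the tx NAPI poller has to take the
same per-queue tx lock that the start_xmit() burst runs under, which
is where the queued_spin_lock_slowpath() cycles come from:

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/smp.h>
#include <linux/virtio.h>

static int poll_tx_shape(struct virtqueue *vq, struct netdev_queue *txq,
			 int budget)
{
	unsigned int len;
	void *buf;

	/* Same lock the start_xmit() path holds while bursting. */
	__netif_tx_lock(txq, raw_smp_processor_id());
	while ((buf = virtqueue_get_buf(vq, &len)))
		napi_consume_skb(buf, !!budget);
	if (vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);
	__netif_tx_unlock(txq);

	return 0;
}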

Reducing the burst size decreases the contention: e.g. with burst 4 I
see 15%, and with burst 1 only 4%, in queued_spin_lock_slowpath() in
the guest. Bursting does increase performance, but we pay the price:
it may hold the tx lock for quite a long time, which slows down the
NAPI.

Burst mode should be the normal case, since we have BQL and qdisc
bulk dequeuing may batch even more. Generally, it's not a good idea
to block NAPI/softirq for a long time, as it may delay other bottom
halves.

Thanks
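
For reference, the BQL accounting mentioned above is the
netdev_tx_sent_queue()/netdev_tx_completed_queue() pair; a minimal
sketch of where each call sits (the wrapper names are invented):

#include <linux/netdevice.h>

/* xmit side: account bytes as they are queued to the tx ring. */
static void tx_sent_account(struct netdev_queue *txq, unsigned int bytes)
{
	netdev_tx_sent_queue(txq, bytes);
}

/* cleaning side: credit completed work so BQL can re-open the queue. */
static void tx_clean_account(struct netdev_queue *txq,
			     unsigned int pkts, unsigned int bytes)
{
	netdev_tx_completed_queue(txq, pkts, bytes);
}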



>
>
> > >
> > > > > You could try to instead move
> > > > > skbs to some kind of array under the tx lock, then free them all up
> > > > > later after unlocking tx.
> > > > >
> > > > > Can be helpful for batching as well?
> > > >
> > > > It's worth a try to see.
> > >
> > > Why not.
> > >
> > > > >
> > > > >
> > > > > I also always wondered whether it is an issue that free_old_xmit
> > > > > just polls vq until it is empty, without a limit.
> > > >
> > > > Did you mean schedule a NAPI if free_old_xmit() exceeds the NAPI quota?
> > >
> > > yes
> > >
> > > > > napi is supposed to poll until a limit is reached.
> > > > > I guess not many people have very deep vqs.
> > > >
> > > > The current NAPI weight is 64, so I think we can hit it under stressful workloads.
> > > >
> > > > Thanks
> > >
> > > yes, but it's just a random number.  since we hold the tx lock,
> > > we get at most vq size bufs, so it's limited.
> >
> > Ok.
> >
> > Thanks
> >
> > >
> > > > >
> > > > > --
> > > > > MST
> > > > >
> > >
>

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 0383a3e136d6..6d8739418203 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2769,6 +2769,7 @@  static void virtnet_poll_cleantx(struct receive_queue *rq, int budget)
 		do {
 			virtqueue_disable_cb(sq->vq);
 			free_old_xmit(sq, txq, !!budget);
+			virtqueue_set_tx_oldbuf_cleaned(sq->vq, true);
 		} while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
 
 		if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) {
@@ -3035,6 +3036,9 @@  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 		free_old_xmit(sq, txq, false);
 
+		if (use_napi)
+			virtqueue_set_tx_oldbuf_cleaned(sq->vq, true);
+
 	} while (use_napi && !xmit_more &&
 	       unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
 
@@ -3044,6 +3048,11 @@  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Try to transmit */
 	err = xmit_skb(sq, skb, !use_napi);
 
+	if (use_napi) {
+		virtqueue_set_tx_newbuf_sent(sq->vq, true);
+		virtqueue_set_tx_oldbuf_cleaned(sq->vq, false);
+	}
+
 	/* This should not happen! */
 	if (unlikely(err)) {
 		DEV_STATS_INC(dev, tx_fifo_errors);
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index be7309b1e860..fb2afc716371 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -180,6 +180,11 @@  struct vring_virtqueue {
 	 */
 	bool do_unmap;
 
+	/* Has any new data been sent? */
+	bool is_tx_newbuf_sent;
+	/* Is the old data recently sent cleaned up? */
+	bool is_tx_oldbuf_cleaned;
+
 	/* Head of free buffer list. */
 	unsigned int free_head;
 	/* Number we've added since last sync. */
@@ -2092,6 +2097,9 @@  static struct virtqueue *vring_create_virtqueue_packed(
 	vq->use_dma_api = vring_use_dma_api(vdev);
 	vq->premapped = false;
 	vq->do_unmap = vq->use_dma_api;
+	vq->is_tx_newbuf_sent = false; /* Initially, no new buffer to send. */
+	vq->is_tx_oldbuf_cleaned = true; /* Initially, no old buffer to clean. */
+
 
 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
 		!context;
@@ -2375,6 +2383,38 @@  bool virtqueue_notify(struct virtqueue *_vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_notify);
 
+/**
+ * virtqueue_set_tx_newbuf_sent - set whether there is new tx buf to send.
+ * @_vq: the struct virtqueue
+ *
+ * If is_tx_newbuf_sent and is_tx_oldbuf_cleaned are both true, the
+ * spurious interrupt is caused by polling TX vq in other paths outside
+ * the tx irq callback.
+ */
+void virtqueue_set_tx_newbuf_sent(struct virtqueue *_vq, bool val)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	vq->is_tx_newbuf_sent = val;
+}
+EXPORT_SYMBOL_GPL(virtqueue_set_tx_newbuf_sent);
+
+/**
+ * virtqueue_set_tx_oldbuf_cleaned - set whether there is old tx buf to clean.
+ * @_vq: the struct virtqueue
+ *
+ * If is_tx_oldbuf_cleaned and is_tx_newbuf_sent are both true, the
+ * spurious interrupt is caused by polling TX vq in other paths outside
+ * the tx irq callback.
+ */
+void virtqueue_set_tx_oldbuf_cleaned(struct virtqueue *_vq, bool val)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	vq->is_tx_oldbuf_cleaned = val;
+}
+EXPORT_SYMBOL_GPL(virtqueue_set_tx_oldbuf_cleaned);
+
 /**
  * virtqueue_kick - update after add_buf
  * @vq: the struct virtqueue
@@ -2572,6 +2612,16 @@  irqreturn_t vring_interrupt(int irq, void *_vq)
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
 	if (!more_used(vq)) {
+		/* When the delayed TX interrupt arrives, the old buffers are
+		 * cleaned in other cases (start_xmit and virtnet_poll_cleantx).
+		 * We'd better not identify it as a spurious interrupt,
+		 * otherwise note_interrupt may kill the interrupt.
+		 */
+		if (unlikely(vq->is_tx_newbuf_sent && vq->is_tx_oldbuf_cleaned)) {
+			vq->is_tx_newbuf_sent = false;
+			return IRQ_HANDLED;
+		}
+
 		pr_debug("virtqueue interrupt with no work for %p\n", vq);
 		return IRQ_NONE;
 	}
@@ -2637,6 +2687,9 @@  static struct virtqueue *__vring_new_virtqueue(unsigned int index,
 	vq->use_dma_api = vring_use_dma_api(vdev);
 	vq->premapped = false;
 	vq->do_unmap = vq->use_dma_api;
+	vq->is_tx_newbuf_sent = false; /* Initially, no new buffer to send. */
+	vq->is_tx_oldbuf_cleaned = true; /* Initially, no old buffer to clean. */
+
 
 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
 		!context;
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index ecc5cb7b8c91..ba3be9276c09 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -103,6 +103,9 @@  int virtqueue_resize(struct virtqueue *vq, u32 num,
 int virtqueue_reset(struct virtqueue *vq,
 		    void (*recycle)(struct virtqueue *vq, void *buf));
 
+void virtqueue_set_tx_newbuf_sent(struct virtqueue *vq, bool val);
+void virtqueue_set_tx_oldbuf_cleaned(struct virtqueue *vq, bool val);
+
 struct virtio_admin_cmd {
 	__le16 opcode;
 	__le16 group_type;