diff mbox series

[vhost,v11,10/10] virtio_net: merge dma operation for one page

Message ID 20230710034237.12391-11-xuanzhuo@linux.alibaba.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Series virtio core prepares for AF_XDP

Checks

Context Check Description
netdev/tree_selection success Guessing tree name failed - patch did not apply

Commit Message

Xuan Zhuo July 10, 2023, 3:42 a.m. UTC
Currently, the virtio core performs a DMA operation for each buffer,
even though several buffers may sit on the same page.

Based on the premapped feature of the virtio core, the driver now does
the DMA operations itself and manages the DMA addresses.

This way, only one DMA operation is needed for buffers that share a
page. With an MTU of 1500, this removes a large number of DMA
operations.

Tested on an Aliyun g7.4large machine with the CPU at 100%, PPS
increased from 1893766 to 1901105, an increase of 0.4%.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 267 insertions(+), 16 deletions(-)
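The core of the change, condensed here for readability (the full function is
virtnet_rq_map_sg() in the diff quoted in the replies below): if the buffer
being posted still falls inside the region mapped for the previous buffer, the
existing mapping is reused and only its reference count is bumped; otherwise
the rest of the page is mapped in one go and recorded as rq->last_dma. This is
a condensed sketch, not the literal patch hunk:

/* Condensed sketch of virtnet_rq_map_sg() from the patch below. */
static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
{
	struct virtnet_rq_dma *dma = rq->last_dma;
	dma_addr_t addr;

	if (likely(dma) && buf >= dma->buf && buf + len <= dma->buf + dma->len) {
		/* Same mapped region as the previous buffer: reuse it. */
		++dma->ref;
		addr = dma->addr + (buf - dma->buf);
	} else {
		/* New region: map from buf through the rest of the page once. */
		u32 map_len = len + PAGE_SIZE - offset_in_page(buf + len - 1);

		addr = dma_map_page_attrs(virtqueue_dma_dev(rq->vq),
					  virt_to_page(buf), offset_in_page(buf),
					  map_len, DMA_FROM_DEVICE, 0);
		if (addr == DMA_MAPPING_ERROR)
			return -ENOMEM;

		dma = rq->dma_free;
		rq->dma_free = dma->next;
		dma->ref = 1;
		dma->buf = buf;
		dma->addr = addr;
		dma->len = map_len;
		rq->last_dma = dma;
	}

	sg_init_table(rq->sg, 1);
	rq->sg[0].dma_address = addr;
	rq->sg[0].length = len;

	return 0;
}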

Comments

Michael S. Tsirkin July 10, 2023, 9:40 a.m. UTC | #1
On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> Currently, the virtio core will perform a dma operation for each
> operation. Although, the same page may be operated multiple times.
> 
> The driver does the dma operation and manages the dma address based the
> feature premapped of virtio core.
> 
> This way, we can perform only one dma operation for the same page. In
> the case of mtu 1500, this can reduce a lot of dma operations.
> 
> Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> increased from 1893766 to 1901105. An increase of 0.4%.

what kind of dma was there? an IOMMU? which vendors? in which mode
of operation?

> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>

This kind of difference is likely in the noise.


> ---
>  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
>  1 file changed, 267 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 486b5849033d..4de845d35bed 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
>  #define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
>  #define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)
>  
> +/* The bufs on the same page may share this struct. */
> +struct virtnet_rq_dma {
> +	struct virtnet_rq_dma *next;
> +
> +	dma_addr_t addr;
> +
> +	void *buf;
> +	u32 len;
> +
> +	u32 ref;
> +};
> +
> +/* Record the dma and buf. */

I guess I see that. But why?
And these two comments are the extent of the available
documentation, that's not enough I feel.


> +struct virtnet_rq_data {
> +	struct virtnet_rq_data *next;

Is manually reimplementing a linked list the best
we can do?

> +
> +	void *buf;
> +
> +	struct virtnet_rq_dma *dma;
> +};
> +
>  /* Internal representation of a send virtqueue */
>  struct send_queue {
>  	/* Virtqueue associated with this send _queue */
> @@ -175,6 +196,13 @@ struct receive_queue {
>  	char name[16];
>  
>  	struct xdp_rxq_info xdp_rxq;
> +
> +	struct virtnet_rq_data *data_array;
> +	struct virtnet_rq_data *data_free;
> +
> +	struct virtnet_rq_dma *dma_array;
> +	struct virtnet_rq_dma *dma_free;
> +	struct virtnet_rq_dma *last_dma;
>  };
>  
>  /* This structure can contain rss message with maximum settings for indirection table and keysize
> @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  	return skb;
>  }
>  
> +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> +{
> +	struct device *dev;
> +
> +	--dma->ref;
> +
> +	if (dma->ref)
> +		return;
> +

If you don't unmap there is no guarantee valid data will be
there in the buffer.

> +	dev = virtqueue_dma_dev(rq->vq);
> +
> +	dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);





> +
> +	dma->next = rq->dma_free;
> +	rq->dma_free = dma;
> +}
> +
> +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> +				     struct virtnet_rq_data *data)
> +{
> +	void *buf;
> +
> +	buf = data->buf;
> +
> +	data->next = rq->data_free;
> +	rq->data_free = data;
> +
> +	return buf;
> +}
> +
> +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> +						   void *buf,
> +						   struct virtnet_rq_dma *dma)
> +{
> +	struct virtnet_rq_data *data;
> +
> +	data = rq->data_free;
> +	rq->data_free = data->next;
> +
> +	data->buf = buf;
> +	data->dma = dma;
> +
> +	return data;
> +}
> +
> +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> +{
> +	struct virtnet_rq_data *data;
> +	void *buf;
> +
> +	buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> +	if (!buf || !rq->data_array)
> +		return buf;
> +
> +	data = buf;
> +
> +	virtnet_rq_unmap(rq, data->dma);
> +
> +	return virtnet_rq_recycle_data(rq, data);
> +}
> +
> +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> +{
> +	struct virtnet_rq_data *data;
> +	void *buf;
> +
> +	buf = virtqueue_detach_unused_buf(rq->vq);
> +	if (!buf || !rq->data_array)
> +		return buf;
> +
> +	data = buf;
> +
> +	virtnet_rq_unmap(rq, data->dma);
> +
> +	return virtnet_rq_recycle_data(rq, data);
> +}
> +
> +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> +{
> +	struct virtnet_rq_dma *dma = rq->last_dma;
> +	struct device *dev;
> +	u32 off, map_len;
> +	dma_addr_t addr;
> +	void *end;
> +
> +	if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> +		++dma->ref;
> +		addr = dma->addr + (buf - dma->buf);
> +		goto ok;
> +	}

So this is the meat of the proposed optimization. I guess that
if the last buffer we allocated happens to be in the same page
as this one then they can both be mapped for DMA together.
Why last one specifically? Whether next one happens to
be close depends on luck. If you want to try optimizing this
the right thing to do is likely by using a page pool.
There's actually work upstream on page pool, look it up.
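For reference, a rough sketch of what the suggested page_pool approach could
look like. This is not part of the posted series and the helper names are
placeholders; the key point is that PP_FLAG_DMA_MAP makes the pool map a page
when it first enters the pool and keep that mapping while the page is
recycled:

#include <net/page_pool.h>

/* All names below are placeholders; this is not code from the series. */
static struct page_pool *virtnet_rq_create_page_pool(struct device *dma_dev,
						     unsigned int pool_size)
{
	struct page_pool_params pp = {
		.flags     = PP_FLAG_DMA_MAP,	/* pool maps pages for DMA itself */
		.order     = 0,
		.pool_size = pool_size,
		.nid       = NUMA_NO_NODE,
		.dev       = dma_dev,		/* e.g. virtqueue_dma_dev(rq->vq) */
		.dma_dir   = DMA_FROM_DEVICE,
	};

	return page_pool_create(&pp);		/* ERR_PTR() on failure */
}

/* Refill path: recycled pages keep their DMA mapping, so no per-buffer map. */
static struct page *virtnet_rq_pool_alloc(struct page_pool *pool, dma_addr_t *addr)
{
	struct page *page = page_pool_dev_alloc_pages(pool);

	if (page)
		*addr = page_pool_get_dma_addr(page);

	return page;
}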

> +
> +	end = buf + len - 1;
> +	off = offset_in_page(end);
> +	map_len = len + PAGE_SIZE - off;
> +
> +	dev = virtqueue_dma_dev(rq->vq);
> +
> +	addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> +				  map_len, DMA_FROM_DEVICE, 0);
> +	if (addr == DMA_MAPPING_ERROR)
> +		return -ENOMEM;
> +
> +	dma = rq->dma_free;
> +	rq->dma_free = dma->next;
> +
> +	dma->ref = 1;
> +	dma->buf = buf;
> +	dma->addr = addr;
> +	dma->len = map_len;
> +
> +	rq->last_dma = dma;
> +
> +ok:
> +	sg_init_table(rq->sg, 1);
> +	rq->sg[0].dma_address = addr;
> +	rq->sg[0].length = len;
> +
> +	return 0;
> +}
> +
> +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> +{
> +	struct receive_queue *rq;
> +	int i, err, j, num;
> +
> +	/* disable for big mode */
> +	if (!vi->mergeable_rx_bufs && vi->big_packets)
> +		return 0;
> +
> +	for (i = 0; i < vi->max_queue_pairs; i++) {
> +		err = virtqueue_set_premapped(vi->rq[i].vq);
> +		if (err)
> +			continue;
> +
> +		rq = &vi->rq[i];
> +
> +		num = virtqueue_get_vring_size(rq->vq);
> +
> +		rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> +		if (!rq->data_array)
> +			goto err;
> +
> +		rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> +		if (!rq->dma_array)
> +			goto err;
> +
> +		for (j = 0; j < num; ++j) {
> +			rq->data_array[j].next = rq->data_free;
> +			rq->data_free = &rq->data_array[j];
> +
> +			rq->dma_array[j].next = rq->dma_free;
> +			rq->dma_free = &rq->dma_array[j];
> +		}
> +	}
> +
> +	return 0;
> +
> +err:
> +	for (i = 0; i < vi->max_queue_pairs; i++) {
> +		struct receive_queue *rq;
> +
> +		rq = &vi->rq[i];
> +
> +		kfree(rq->dma_array);
> +		kfree(rq->data_array);
> +	}
> +
> +	return -ENOMEM;
> +}
> +
>  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
>  {
>  	unsigned int len;
> @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>  		void *buf;
>  		int off;
>  
> -		buf = virtqueue_get_buf(rq->vq, &buflen);
> +		buf = virtnet_rq_get_buf(rq, &buflen, NULL);
>  		if (unlikely(!buf))
>  			goto err_buf;
>  
> @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
>  		return -EINVAL;
>  
>  	while (--*num_buf > 0) {
> -		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> +		buf = virtnet_rq_get_buf(rq, &len, &ctx);
>  		if (unlikely(!buf)) {
>  			pr_debug("%s: rx error: %d buffers out of %d missing\n",
>  				 dev->name, *num_buf,
> @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  	while (--num_buf) {
>  		int num_skb_frags;
>  
> -		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> +		buf = virtnet_rq_get_buf(rq, &len, &ctx);
>  		if (unlikely(!buf)) {
>  			pr_debug("%s: rx error: %d buffers out of %d missing\n",
>  				 dev->name, num_buf,
> @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  err_skb:
>  	put_page(page);
>  	while (num_buf-- > 1) {
> -		buf = virtqueue_get_buf(rq->vq, &len);
> +		buf = virtnet_rq_get_buf(rq, &len, NULL);
>  		if (unlikely(!buf)) {
>  			pr_debug("%s: rx error: %d buffers missing\n",
>  				 dev->name, num_buf);
> @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
>  	unsigned int xdp_headroom = virtnet_get_headroom(vi);
>  	void *ctx = (void *)(unsigned long)xdp_headroom;
>  	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> +	struct virtnet_rq_data *data;
>  	int err;
>  
>  	len = SKB_DATA_ALIGN(len) +
> @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
>  	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
>  	get_page(alloc_frag->page);
>  	alloc_frag->offset += len;
> -	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> -		    vi->hdr_len + GOOD_PACKET_LEN);
> -	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> +
> +	if (rq->data_array) {
> +		err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> +					vi->hdr_len + GOOD_PACKET_LEN);
> +		if (err)
> +			goto map_err;
> +
> +		data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> +	} else {
> +		sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> +			    vi->hdr_len + GOOD_PACKET_LEN);
> +		data = (void *)buf;
> +	}
> +
> +	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
>  	if (err < 0)
> -		put_page(virt_to_head_page(buf));
> +		goto add_err;
> +
> +	return err;
> +
> +add_err:
> +	if (rq->data_array) {
> +		virtnet_rq_unmap(rq, data->dma);
> +		virtnet_rq_recycle_data(rq, data);
> +	}
> +
> +map_err:
> +	put_page(virt_to_head_page(buf));
>  	return err;
>  }
>  
> @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
>  	unsigned int headroom = virtnet_get_headroom(vi);
>  	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
>  	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> +	struct virtnet_rq_data *data;
>  	char *buf;
>  	void *ctx;
>  	int err;
> @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
>  		alloc_frag->offset += hole;
>  	}
>  
> -	sg_init_one(rq->sg, buf, len);
> +	if (rq->data_array) {
> +		err = virtnet_rq_map_sg(rq, buf, len);
> +		if (err)
> +			goto map_err;
> +
> +		data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> +	} else {
> +		sg_init_one(rq->sg, buf, len);
> +		data = (void *)buf;
> +	}
> +
>  	ctx = mergeable_len_to_ctx(len + room, headroom);
> -	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> +	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
>  	if (err < 0)
> -		put_page(virt_to_head_page(buf));
> +		goto add_err;
> +
> +	return 0;
> +
> +add_err:
> +	if (rq->data_array) {
> +		virtnet_rq_unmap(rq, data->dma);
> +		virtnet_rq_recycle_data(rq, data);
> +	}
>  
> +map_err:
> +	put_page(virt_to_head_page(buf));
>  	return err;
>  }
>  
> @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
>  		void *ctx;
>  
>  		while (stats.packets < budget &&
> -		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> +		       (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
>  			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
>  			stats.packets++;
>  		}
>  	} else {
>  		while (stats.packets < budget &&
> -		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> +		       (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
>  			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
>  			stats.packets++;
>  		}
> @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
>  	for (i = 0; i < vi->max_queue_pairs; i++) {
>  		__netif_napi_del(&vi->rq[i].napi);
>  		__netif_napi_del(&vi->sq[i].napi);
> +
> +		kfree(vi->rq[i].data_array);
> +		kfree(vi->rq[i].dma_array);
>  	}
>  
>  	/* We called __netif_napi_del(),
> @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
>  	}
>  
>  	for (i = 0; i < vi->max_queue_pairs; i++) {
> -		struct virtqueue *vq = vi->rq[i].vq;
> -		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> -			virtnet_rq_free_unused_buf(vq, buf);
> +		struct receive_queue *rq = &vi->rq[i];
> +
> +		while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> +			virtnet_rq_free_unused_buf(rq->vq, buf);
>  		cond_resched();
>  	}
>  }
> @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
>  	if (ret)
>  		goto err_free;
>  
> +	ret = virtnet_rq_merge_map_init(vi);
> +	if (ret)
> +		goto err_free;
> +
>  	cpus_read_lock();
>  	virtnet_set_affinity(vi);
>  	cpus_read_unlock();
> -- 
> 2.32.0.3.g01195cf9f
Xuan Zhuo July 10, 2023, 10:18 a.m. UTC | #2
On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > Currently, the virtio core will perform a dma operation for each
> > operation. Although, the same page may be operated multiple times.
> >
> > The driver does the dma operation and manages the dma address based the
> > feature premapped of virtio core.
> >
> > This way, we can perform only one dma operation for the same page. In
> > the case of mtu 1500, this can reduce a lot of dma operations.
> >
> > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > increased from 1893766 to 1901105. An increase of 0.4%.
>
> what kind of dma was there? an IOMMU? which vendors? in which mode
> of operation?


Do you mean this:

[    0.470816] iommu: Default domain type: Passthrough


>
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>
> This kind of difference is likely in the noise.

It's really not high, but that is because the proportion of DMA shown in
perf top is not high either. The gain is probably about that much.

>
>
> > ---
> >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 267 insertions(+), 16 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 486b5849033d..4de845d35bed 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> >  #define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
> >  #define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)
> >
> > +/* The bufs on the same page may share this struct. */
> > +struct virtnet_rq_dma {
> > +	struct virtnet_rq_dma *next;
> > +
> > +	dma_addr_t addr;
> > +
> > +	void *buf;
> > +	u32 len;
> > +
> > +	u32 ref;
> > +};
> > +
> > +/* Record the dma and buf. */
>
> I guess I see that. But why?
> And these two comments are the extent of the available
> documentation, that's not enough I feel.
>
>
> > +struct virtnet_rq_data {
> > +	struct virtnet_rq_data *next;
>
> Is manually reimplementing a linked list the best
> we can do?

Yes, we can use llist.
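A minimal sketch of how the free lists could be built on the kernel's
<linux/llist.h> API instead of open-coded next pointers. This is illustrative
only and not code from the series; the struct mirrors the patch's
virtnet_rq_dma:

#include <linux/llist.h>

/* Sketch only: the same free list built on llist instead of open coding. */
struct virtnet_rq_dma {
	struct llist_node node;
	dma_addr_t addr;
	void *buf;
	u32 len;
	u32 ref;
};

static void virtnet_rq_dma_put(struct llist_head *free, struct virtnet_rq_dma *dma)
{
	llist_add(&dma->node, free);
}

static struct virtnet_rq_dma *virtnet_rq_dma_get(struct llist_head *free)
{
	/* Single consumer per receive queue, so llist_del_first() is safe. */
	struct llist_node *node = llist_del_first(free);

	return node ? llist_entry(node, struct virtnet_rq_dma, node) : NULL;
}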

>
> > +
> > +	void *buf;
> > +
> > +	struct virtnet_rq_dma *dma;
> > +};
> > +
> >  /* Internal representation of a send virtqueue */
> >  struct send_queue {
> >  	/* Virtqueue associated with this send _queue */
> > @@ -175,6 +196,13 @@ struct receive_queue {
> >  	char name[16];
> >
> >  	struct xdp_rxq_info xdp_rxq;
> > +
> > +	struct virtnet_rq_data *data_array;
> > +	struct virtnet_rq_data *data_free;
> > +
> > +	struct virtnet_rq_dma *dma_array;
> > +	struct virtnet_rq_dma *dma_free;
> > +	struct virtnet_rq_dma *last_dma;
> >  };
> >
> >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> >  	return skb;
> >  }
> >
> > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > +{
> > +	struct device *dev;
> > +
> > +	--dma->ref;
> > +
> > +	if (dma->ref)
> > +		return;
> > +
>
> If you don't unmap there is no guarantee valid data will be
> there in the buffer.
>
> > +	dev = virtqueue_dma_dev(rq->vq);
> > +
> > +	dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
>
>
>
>
>
> > +
> > +	dma->next = rq->dma_free;
> > +	rq->dma_free = dma;
> > +}
> > +
> > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > +				     struct virtnet_rq_data *data)
> > +{
> > +	void *buf;
> > +
> > +	buf = data->buf;
> > +
> > +	data->next = rq->data_free;
> > +	rq->data_free = data;
> > +
> > +	return buf;
> > +}
> > +
> > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > +						   void *buf,
> > +						   struct virtnet_rq_dma *dma)
> > +{
> > +	struct virtnet_rq_data *data;
> > +
> > +	data = rq->data_free;
> > +	rq->data_free = data->next;
> > +
> > +	data->buf = buf;
> > +	data->dma = dma;
> > +
> > +	return data;
> > +}
> > +
> > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > +{
> > +	struct virtnet_rq_data *data;
> > +	void *buf;
> > +
> > +	buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > +	if (!buf || !rq->data_array)
> > +		return buf;
> > +
> > +	data = buf;
> > +
> > +	virtnet_rq_unmap(rq, data->dma);
> > +
> > +	return virtnet_rq_recycle_data(rq, data);
> > +}
> > +
> > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > +{
> > +	struct virtnet_rq_data *data;
> > +	void *buf;
> > +
> > +	buf = virtqueue_detach_unused_buf(rq->vq);
> > +	if (!buf || !rq->data_array)
> > +		return buf;
> > +
> > +	data = buf;
> > +
> > +	virtnet_rq_unmap(rq, data->dma);
> > +
> > +	return virtnet_rq_recycle_data(rq, data);
> > +}
> > +
> > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > +{
> > +	struct virtnet_rq_dma *dma = rq->last_dma;
> > +	struct device *dev;
> > +	u32 off, map_len;
> > +	dma_addr_t addr;
> > +	void *end;
> > +
> > +	if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > +		++dma->ref;
> > +		addr = dma->addr + (buf - dma->buf);
> > +		goto ok;
> > +	}
>
> So this is the meat of the proposed optimization. I guess that
> if the last buffer we allocated happens to be in the same page
> as this one then they can both be mapped for DMA together.

Since we use page_frag, the buffers we allocate are all contiguous.
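To illustrate that claim (this is not code from the series): consecutive
page_frag allocations simply advance the offset within the current page, so
the next buffer starts right where the previous one ended and normally falls
inside the mapping recorded in rq->last_dma:

#include <linux/mm.h>

/* Illustration only, not code from the series. */
static void page_frag_contiguity_example(struct page_frag *alloc_frag,
					 unsigned int len)
{
	char *buf, *next;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	alloc_frag->offset += len;

	next = (char *)page_address(alloc_frag->page) + alloc_frag->offset;

	/* The next buffer starts exactly where the previous one ended. */
	WARN_ON(next != buf + len);
}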

> Why last one specifically? Whether next one happens to
> be close depends on luck. If you want to try optimizing this
> the right thing to do is likely by using a page pool.
> There's actually work upstream on page pool, look it up.

As we discussed in another thread, the page pool will be used for XDP
first. Let's do the transformation step by step.

Thanks.

>
> > +
> > +	end = buf + len - 1;
> > +	off = offset_in_page(end);
> > +	map_len = len + PAGE_SIZE - off;
> > +
> > +	dev = virtqueue_dma_dev(rq->vq);
> > +
> > +	addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > +				  map_len, DMA_FROM_DEVICE, 0);
> > +	if (addr == DMA_MAPPING_ERROR)
> > +		return -ENOMEM;
> > +
> > +	dma = rq->dma_free;
> > +	rq->dma_free = dma->next;
> > +
> > +	dma->ref = 1;
> > +	dma->buf = buf;
> > +	dma->addr = addr;
> > +	dma->len = map_len;
> > +
> > +	rq->last_dma = dma;
> > +
> > +ok:
> > +	sg_init_table(rq->sg, 1);
> > +	rq->sg[0].dma_address = addr;
> > +	rq->sg[0].length = len;
> > +
> > +	return 0;
> > +}
> > +
> > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > +{
> > +	struct receive_queue *rq;
> > +	int i, err, j, num;
> > +
> > +	/* disable for big mode */
> > +	if (!vi->mergeable_rx_bufs && vi->big_packets)
> > +		return 0;
> > +
> > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > +		err = virtqueue_set_premapped(vi->rq[i].vq);
> > +		if (err)
> > +			continue;
> > +
> > +		rq = &vi->rq[i];
> > +
> > +		num = virtqueue_get_vring_size(rq->vq);
> > +
> > +		rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > +		if (!rq->data_array)
> > +			goto err;
> > +
> > +		rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > +		if (!rq->dma_array)
> > +			goto err;
> > +
> > +		for (j = 0; j < num; ++j) {
> > +			rq->data_array[j].next = rq->data_free;
> > +			rq->data_free = &rq->data_array[j];
> > +
> > +			rq->dma_array[j].next = rq->dma_free;
> > +			rq->dma_free = &rq->dma_array[j];
> > +		}
> > +	}
> > +
> > +	return 0;
> > +
> > +err:
> > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > +		struct receive_queue *rq;
> > +
> > +		rq = &vi->rq[i];
> > +
> > +		kfree(rq->dma_array);
> > +		kfree(rq->data_array);
> > +	}
> > +
> > +	return -ENOMEM;
> > +}
> > +
> >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> >  {
> >  	unsigned int len;
> > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> >  		void *buf;
> >  		int off;
> >
> > -		buf = virtqueue_get_buf(rq->vq, &buflen);
> > +		buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> >  		if (unlikely(!buf))
> >  			goto err_buf;
> >
> > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> >  		return -EINVAL;
> >
> >  	while (--*num_buf > 0) {
> > -		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > +		buf = virtnet_rq_get_buf(rq, &len, &ctx);
> >  		if (unlikely(!buf)) {
> >  			pr_debug("%s: rx error: %d buffers out of %d missing\n",
> >  				 dev->name, *num_buf,
> > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >  	while (--num_buf) {
> >  		int num_skb_frags;
> >
> > -		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > +		buf = virtnet_rq_get_buf(rq, &len, &ctx);
> >  		if (unlikely(!buf)) {
> >  			pr_debug("%s: rx error: %d buffers out of %d missing\n",
> >  				 dev->name, num_buf,
> > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >  err_skb:
> >  	put_page(page);
> >  	while (num_buf-- > 1) {
> > -		buf = virtqueue_get_buf(rq->vq, &len);
> > +		buf = virtnet_rq_get_buf(rq, &len, NULL);
> >  		if (unlikely(!buf)) {
> >  			pr_debug("%s: rx error: %d buffers missing\n",
> >  				 dev->name, num_buf);
> > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> >  	unsigned int xdp_headroom = virtnet_get_headroom(vi);
> >  	void *ctx = (void *)(unsigned long)xdp_headroom;
> >  	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > +	struct virtnet_rq_data *data;
> >  	int err;
> >
> >  	len = SKB_DATA_ALIGN(len) +
> > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> >  	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> >  	get_page(alloc_frag->page);
> >  	alloc_frag->offset += len;
> > -	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > -		    vi->hdr_len + GOOD_PACKET_LEN);
> > -	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > +
> > +	if (rq->data_array) {
> > +		err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > +					vi->hdr_len + GOOD_PACKET_LEN);
> > +		if (err)
> > +			goto map_err;
> > +
> > +		data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > +	} else {
> > +		sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > +			    vi->hdr_len + GOOD_PACKET_LEN);
> > +		data = (void *)buf;
> > +	}
> > +
> > +	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> >  	if (err < 0)
> > -		put_page(virt_to_head_page(buf));
> > +		goto add_err;
> > +
> > +	return err;
> > +
> > +add_err:
> > +	if (rq->data_array) {
> > +		virtnet_rq_unmap(rq, data->dma);
> > +		virtnet_rq_recycle_data(rq, data);
> > +	}
> > +
> > +map_err:
> > +	put_page(virt_to_head_page(buf));
> >  	return err;
> >  }
> >
> > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >  	unsigned int headroom = virtnet_get_headroom(vi);
> >  	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> >  	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > +	struct virtnet_rq_data *data;
> >  	char *buf;
> >  	void *ctx;
> >  	int err;
> > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >  		alloc_frag->offset += hole;
> >  	}
> >
> > -	sg_init_one(rq->sg, buf, len);
> > +	if (rq->data_array) {
> > +		err = virtnet_rq_map_sg(rq, buf, len);
> > +		if (err)
> > +			goto map_err;
> > +
> > +		data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > +	} else {
> > +		sg_init_one(rq->sg, buf, len);
> > +		data = (void *)buf;
> > +	}
> > +
> >  	ctx = mergeable_len_to_ctx(len + room, headroom);
> > -	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > +	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> >  	if (err < 0)
> > -		put_page(virt_to_head_page(buf));
> > +		goto add_err;
> > +
> > +	return 0;
> > +
> > +add_err:
> > +	if (rq->data_array) {
> > +		virtnet_rq_unmap(rq, data->dma);
> > +		virtnet_rq_recycle_data(rq, data);
> > +	}
> >
> > +map_err:
> > +	put_page(virt_to_head_page(buf));
> >  	return err;
> >  }
> >
> > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> >  		void *ctx;
> >
> >  		while (stats.packets < budget &&
> > -		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > +		       (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> >  			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> >  			stats.packets++;
> >  		}
> >  	} else {
> >  		while (stats.packets < budget &&
> > -		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > +		       (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> >  			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> >  			stats.packets++;
> >  		}
> > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> >  		__netif_napi_del(&vi->rq[i].napi);
> >  		__netif_napi_del(&vi->sq[i].napi);
> > +
> > +		kfree(vi->rq[i].data_array);
> > +		kfree(vi->rq[i].dma_array);
> >  	}
> >
> >  	/* We called __netif_napi_del(),
> > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> >  	}
> >
> >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> > -		struct virtqueue *vq = vi->rq[i].vq;
> > -		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > -			virtnet_rq_free_unused_buf(vq, buf);
> > +		struct receive_queue *rq = &vi->rq[i];
> > +
> > +		while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > +			virtnet_rq_free_unused_buf(rq->vq, buf);
> >  		cond_resched();
> >  	}
> >  }
> > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> >  	if (ret)
> >  		goto err_free;
> >
> > +	ret = virtnet_rq_merge_map_init(vi);
> > +	if (ret)
> > +		goto err_free;
> > +
> >  	cpus_read_lock();
> >  	virtnet_set_affinity(vi);
> >  	cpus_read_unlock();
> > --
> > 2.32.0.3.g01195cf9f
>
Michael S. Tsirkin July 10, 2023, 11:59 a.m. UTC | #3
On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > Currently, the virtio core will perform a dma operation for each
> > > operation. Although, the same page may be operated multiple times.
> > >
> > > The driver does the dma operation and manages the dma address based the
> > > feature premapped of virtio core.
> > >
> > > This way, we can perform only one dma operation for the same page. In
> > > the case of mtu 1500, this can reduce a lot of dma operations.
> > >
> > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > increased from 1893766 to 1901105. An increase of 0.4%.
> >
> > what kind of dma was there? an IOMMU? which vendors? in which mode
> > of operation?
> 
> 
> Do you mean this:
> 
> [    0.470816] iommu: Default domain type: Passthrough
> 

With passthrough, the DMA API is just some indirect function calls; they
do not affect performance much.

Try e.g. a bounce buffer. That is where you will see a problem: your
patches won't work.
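For context, with swiotlb the device writes into the bounce buffer and the
data is copied back to the driver's buffer only at unmap or sync time. If the
page-wide mapping were kept alive across completions, each returned buffer
would need an explicit per-buffer sync before the CPU reads it, roughly along
these lines (illustrative sketch only, reusing the patch's struct
virtnet_rq_dma):

#include <linux/dma-mapping.h>

/* Sketch only, reusing struct virtnet_rq_dma from the patch. */
static void virtnet_rq_sync_for_cpu(struct device *dev,
				    struct virtnet_rq_dma *dma,
				    void *buf, u32 len)
{
	/* Copy device-written data out of the bounce buffer for this buffer
	 * only; the page-wide mapping itself stays alive.
	 */
	dma_sync_single_range_for_cpu(dev, dma->addr, buf - dma->buf,
				      len, DMA_FROM_DEVICE);
}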


> >
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> >
> > This kind of difference is likely in the noise.
> 
> It's really not high, but this is because the proportion of DMA under perf top
> is not high. Probably that much.

So maybe not worth the complexity.

> >
> >
> > > ---
> > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 486b5849033d..4de845d35bed 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > >  #define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
> > >  #define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)
> > >
> > > +/* The bufs on the same page may share this struct. */
> > > +struct virtnet_rq_dma {
> > > +	struct virtnet_rq_dma *next;
> > > +
> > > +	dma_addr_t addr;
> > > +
> > > +	void *buf;
> > > +	u32 len;
> > > +
> > > +	u32 ref;
> > > +};
> > > +
> > > +/* Record the dma and buf. */
> >
> > I guess I see that. But why?
> > And these two comments are the extent of the available
> > documentation, that's not enough I feel.
> >
> >
> > > +struct virtnet_rq_data {
> > > +	struct virtnet_rq_data *next;
> >
> > Is manually reimplementing a linked list the best
> > we can do?
> 
> Yes, we can use llist.
> 
> >
> > > +
> > > +	void *buf;
> > > +
> > > +	struct virtnet_rq_dma *dma;
> > > +};
> > > +
> > >  /* Internal representation of a send virtqueue */
> > >  struct send_queue {
> > >  	/* Virtqueue associated with this send _queue */
> > > @@ -175,6 +196,13 @@ struct receive_queue {
> > >  	char name[16];
> > >
> > >  	struct xdp_rxq_info xdp_rxq;
> > > +
> > > +	struct virtnet_rq_data *data_array;
> > > +	struct virtnet_rq_data *data_free;
> > > +
> > > +	struct virtnet_rq_dma *dma_array;
> > > +	struct virtnet_rq_dma *dma_free;
> > > +	struct virtnet_rq_dma *last_dma;
> > >  };
> > >
> > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > >  	return skb;
> > >  }
> > >
> > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > +{
> > > +	struct device *dev;
> > > +
> > > +	--dma->ref;
> > > +
> > > +	if (dma->ref)
> > > +		return;
> > > +
> >
> > If you don't unmap there is no guarantee valid data will be
> > there in the buffer.
> >
> > > +	dev = virtqueue_dma_dev(rq->vq);
> > > +
> > > +	dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> >
> >
> >
> >
> >
> > > +
> > > +	dma->next = rq->dma_free;
> > > +	rq->dma_free = dma;
> > > +}
> > > +
> > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > +				     struct virtnet_rq_data *data)
> > > +{
> > > +	void *buf;
> > > +
> > > +	buf = data->buf;
> > > +
> > > +	data->next = rq->data_free;
> > > +	rq->data_free = data;
> > > +
> > > +	return buf;
> > > +}
> > > +
> > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > +						   void *buf,
> > > +						   struct virtnet_rq_dma *dma)
> > > +{
> > > +	struct virtnet_rq_data *data;
> > > +
> > > +	data = rq->data_free;
> > > +	rq->data_free = data->next;
> > > +
> > > +	data->buf = buf;
> > > +	data->dma = dma;
> > > +
> > > +	return data;
> > > +}
> > > +
> > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > +{
> > > +	struct virtnet_rq_data *data;
> > > +	void *buf;
> > > +
> > > +	buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > +	if (!buf || !rq->data_array)
> > > +		return buf;
> > > +
> > > +	data = buf;
> > > +
> > > +	virtnet_rq_unmap(rq, data->dma);
> > > +
> > > +	return virtnet_rq_recycle_data(rq, data);
> > > +}
> > > +
> > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > +{
> > > +	struct virtnet_rq_data *data;
> > > +	void *buf;
> > > +
> > > +	buf = virtqueue_detach_unused_buf(rq->vq);
> > > +	if (!buf || !rq->data_array)
> > > +		return buf;
> > > +
> > > +	data = buf;
> > > +
> > > +	virtnet_rq_unmap(rq, data->dma);
> > > +
> > > +	return virtnet_rq_recycle_data(rq, data);
> > > +}
> > > +
> > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > +{
> > > +	struct virtnet_rq_dma *dma = rq->last_dma;
> > > +	struct device *dev;
> > > +	u32 off, map_len;
> > > +	dma_addr_t addr;
> > > +	void *end;
> > > +
> > > +	if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > +		++dma->ref;
> > > +		addr = dma->addr + (buf - dma->buf);
> > > +		goto ok;
> > > +	}
> >
> > So this is the meat of the proposed optimization. I guess that
> > if the last buffer we allocated happens to be in the same page
> > as this one then they can both be mapped for DMA together.
> 
> Since we use page_frag, the buffers we allocated are all continuous.
> 
> > Why last one specifically? Whether next one happens to
> > be close depends on luck. If you want to try optimizing this
> > the right thing to do is likely by using a page pool.
> > There's actually work upstream on page pool, look it up.
> 
> As we discussed in another thread, the page pool is first used for xdp. Let's
> transform it step by step.
> 
> Thanks.

ok so this should wait then?

> >
> > > +
> > > +	end = buf + len - 1;
> > > +	off = offset_in_page(end);
> > > +	map_len = len + PAGE_SIZE - off;
> > > +
> > > +	dev = virtqueue_dma_dev(rq->vq);
> > > +
> > > +	addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > +				  map_len, DMA_FROM_DEVICE, 0);
> > > +	if (addr == DMA_MAPPING_ERROR)
> > > +		return -ENOMEM;
> > > +
> > > +	dma = rq->dma_free;
> > > +	rq->dma_free = dma->next;
> > > +
> > > +	dma->ref = 1;
> > > +	dma->buf = buf;
> > > +	dma->addr = addr;
> > > +	dma->len = map_len;
> > > +
> > > +	rq->last_dma = dma;
> > > +
> > > +ok:
> > > +	sg_init_table(rq->sg, 1);
> > > +	rq->sg[0].dma_address = addr;
> > > +	rq->sg[0].length = len;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > +{
> > > +	struct receive_queue *rq;
> > > +	int i, err, j, num;
> > > +
> > > +	/* disable for big mode */
> > > +	if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > +		return 0;
> > > +
> > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +		err = virtqueue_set_premapped(vi->rq[i].vq);
> > > +		if (err)
> > > +			continue;
> > > +
> > > +		rq = &vi->rq[i];
> > > +
> > > +		num = virtqueue_get_vring_size(rq->vq);
> > > +
> > > +		rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > +		if (!rq->data_array)
> > > +			goto err;
> > > +
> > > +		rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > +		if (!rq->dma_array)
> > > +			goto err;
> > > +
> > > +		for (j = 0; j < num; ++j) {
> > > +			rq->data_array[j].next = rq->data_free;
> > > +			rq->data_free = &rq->data_array[j];
> > > +
> > > +			rq->dma_array[j].next = rq->dma_free;
> > > +			rq->dma_free = &rq->dma_array[j];
> > > +		}
> > > +	}
> > > +
> > > +	return 0;
> > > +
> > > +err:
> > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +		struct receive_queue *rq;
> > > +
> > > +		rq = &vi->rq[i];
> > > +
> > > +		kfree(rq->dma_array);
> > > +		kfree(rq->data_array);
> > > +	}
> > > +
> > > +	return -ENOMEM;
> > > +}
> > > +
> > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > >  {
> > >  	unsigned int len;
> > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > >  		void *buf;
> > >  		int off;
> > >
> > > -		buf = virtqueue_get_buf(rq->vq, &buflen);
> > > +		buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > >  		if (unlikely(!buf))
> > >  			goto err_buf;
> > >
> > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > >  		return -EINVAL;
> > >
> > >  	while (--*num_buf > 0) {
> > > -		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > +		buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > >  		if (unlikely(!buf)) {
> > >  			pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > >  				 dev->name, *num_buf,
> > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >  	while (--num_buf) {
> > >  		int num_skb_frags;
> > >
> > > -		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > +		buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > >  		if (unlikely(!buf)) {
> > >  			pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > >  				 dev->name, num_buf,
> > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >  err_skb:
> > >  	put_page(page);
> > >  	while (num_buf-- > 1) {
> > > -		buf = virtqueue_get_buf(rq->vq, &len);
> > > +		buf = virtnet_rq_get_buf(rq, &len, NULL);
> > >  		if (unlikely(!buf)) {
> > >  			pr_debug("%s: rx error: %d buffers missing\n",
> > >  				 dev->name, num_buf);
> > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > >  	unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > >  	void *ctx = (void *)(unsigned long)xdp_headroom;
> > >  	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > +	struct virtnet_rq_data *data;
> > >  	int err;
> > >
> > >  	len = SKB_DATA_ALIGN(len) +
> > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > >  	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > >  	get_page(alloc_frag->page);
> > >  	alloc_frag->offset += len;
> > > -	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > -		    vi->hdr_len + GOOD_PACKET_LEN);
> > > -	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > +
> > > +	if (rq->data_array) {
> > > +		err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > +					vi->hdr_len + GOOD_PACKET_LEN);
> > > +		if (err)
> > > +			goto map_err;
> > > +
> > > +		data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > +	} else {
> > > +		sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > +			    vi->hdr_len + GOOD_PACKET_LEN);
> > > +		data = (void *)buf;
> > > +	}
> > > +
> > > +	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > >  	if (err < 0)
> > > -		put_page(virt_to_head_page(buf));
> > > +		goto add_err;
> > > +
> > > +	return err;
> > > +
> > > +add_err:
> > > +	if (rq->data_array) {
> > > +		virtnet_rq_unmap(rq, data->dma);
> > > +		virtnet_rq_recycle_data(rq, data);
> > > +	}
> > > +
> > > +map_err:
> > > +	put_page(virt_to_head_page(buf));
> > >  	return err;
> > >  }
> > >
> > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > >  	unsigned int headroom = virtnet_get_headroom(vi);
> > >  	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > >  	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > +	struct virtnet_rq_data *data;
> > >  	char *buf;
> > >  	void *ctx;
> > >  	int err;
> > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > >  		alloc_frag->offset += hole;
> > >  	}
> > >
> > > -	sg_init_one(rq->sg, buf, len);
> > > +	if (rq->data_array) {
> > > +		err = virtnet_rq_map_sg(rq, buf, len);
> > > +		if (err)
> > > +			goto map_err;
> > > +
> > > +		data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > +	} else {
> > > +		sg_init_one(rq->sg, buf, len);
> > > +		data = (void *)buf;
> > > +	}
> > > +
> > >  	ctx = mergeable_len_to_ctx(len + room, headroom);
> > > -	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > +	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > >  	if (err < 0)
> > > -		put_page(virt_to_head_page(buf));
> > > +		goto add_err;
> > > +
> > > +	return 0;
> > > +
> > > +add_err:
> > > +	if (rq->data_array) {
> > > +		virtnet_rq_unmap(rq, data->dma);
> > > +		virtnet_rq_recycle_data(rq, data);
> > > +	}
> > >
> > > +map_err:
> > > +	put_page(virt_to_head_page(buf));
> > >  	return err;
> > >  }
> > >
> > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > >  		void *ctx;
> > >
> > >  		while (stats.packets < budget &&
> > > -		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > +		       (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > >  			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > >  			stats.packets++;
> > >  		}
> > >  	} else {
> > >  		while (stats.packets < budget &&
> > > -		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > +		       (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > >  			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > >  			stats.packets++;
> > >  		}
> > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> > >  		__netif_napi_del(&vi->rq[i].napi);
> > >  		__netif_napi_del(&vi->sq[i].napi);
> > > +
> > > +		kfree(vi->rq[i].data_array);
> > > +		kfree(vi->rq[i].dma_array);
> > >  	}
> > >
> > >  	/* We called __netif_napi_del(),
> > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > >  	}
> > >
> > >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > -		struct virtqueue *vq = vi->rq[i].vq;
> > > -		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > -			virtnet_rq_free_unused_buf(vq, buf);
> > > +		struct receive_queue *rq = &vi->rq[i];
> > > +
> > > +		while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > +			virtnet_rq_free_unused_buf(rq->vq, buf);
> > >  		cond_resched();
> > >  	}
> > >  }
> > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > >  	if (ret)
> > >  		goto err_free;
> > >
> > > +	ret = virtnet_rq_merge_map_init(vi);
> > > +	if (ret)
> > > +		goto err_free;
> > > +
> > >  	cpus_read_lock();
> > >  	virtnet_set_affinity(vi);
> > >  	cpus_read_unlock();
> > > --
> > > 2.32.0.3.g01195cf9f
> >
Xuan Zhuo July 10, 2023, 12:38 p.m. UTC | #4
On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > Currently, the virtio core will perform a dma operation for each
> > > > operation. Although, the same page may be operated multiple times.
> > > >
> > > > The driver does the dma operation and manages the dma address based the
> > > > feature premapped of virtio core.
> > > >
> > > > This way, we can perform only one dma operation for the same page. In
> > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > >
> > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > >
> > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > of operation?
> >
> >
> > Do you mean this:
> >
> > [    0.470816] iommu: Default domain type: Passthrough
> >
>
> With passthrough, dma API is just some indirect function calls, they do
> not affect the performance a lot.


Yes, this benefit is worthless. I seem to have done a meaningless thing. The
overhead of DMA I observed is indeed not too high.

Thanks.


>
> Try e.g. bounce buffer. Which is where you will see a problem: your
> patches won't work.
>
>
> > >
> > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > >
> > > This kind of difference is likely in the noise.
> >
> > It's really not high, but this is because the proportion of DMA under perf top
> > is not high. Probably that much.
>
> So maybe not worth the complexity.
>
> > >
> > >
> > > > ---
> > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > >
> > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > index 486b5849033d..4de845d35bed 100644
> > > > --- a/drivers/net/virtio_net.c
> > > > +++ b/drivers/net/virtio_net.c
> > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > >  #define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
> > > >  #define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)
> > > >
> > > > +/* The bufs on the same page may share this struct. */
> > > > +struct virtnet_rq_dma {
> > > > +	struct virtnet_rq_dma *next;
> > > > +
> > > > +	dma_addr_t addr;
> > > > +
> > > > +	void *buf;
> > > > +	u32 len;
> > > > +
> > > > +	u32 ref;
> > > > +};
> > > > +
> > > > +/* Record the dma and buf. */
> > >
> > > I guess I see that. But why?
> > > And these two comments are the extent of the available
> > > documentation, that's not enough I feel.
> > >
> > >
> > > > +struct virtnet_rq_data {
> > > > +	struct virtnet_rq_data *next;
> > >
> > > Is manually reimplementing a linked list the best
> > > we can do?
> >
> > Yes, we can use llist.
> >
> > >
> > > > +
> > > > +	void *buf;
> > > > +
> > > > +	struct virtnet_rq_dma *dma;
> > > > +};
> > > > +
> > > >  /* Internal representation of a send virtqueue */
> > > >  struct send_queue {
> > > >  	/* Virtqueue associated with this send _queue */
> > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > >  	char name[16];
> > > >
> > > >  	struct xdp_rxq_info xdp_rxq;
> > > > +
> > > > +	struct virtnet_rq_data *data_array;
> > > > +	struct virtnet_rq_data *data_free;
> > > > +
> > > > +	struct virtnet_rq_dma *dma_array;
> > > > +	struct virtnet_rq_dma *dma_free;
> > > > +	struct virtnet_rq_dma *last_dma;
> > > >  };
> > > >
> > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > >  	return skb;
> > > >  }
> > > >
> > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > +{
> > > > +	struct device *dev;
> > > > +
> > > > +	--dma->ref;
> > > > +
> > > > +	if (dma->ref)
> > > > +		return;
> > > > +
> > >
> > > If you don't unmap there is no guarantee valid data will be
> > > there in the buffer.
> > >
> > > > +	dev = virtqueue_dma_dev(rq->vq);
> > > > +
> > > > +	dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > >
> > >
> > >
> > >
> > >
> > > > +
> > > > +	dma->next = rq->dma_free;
> > > > +	rq->dma_free = dma;
> > > > +}
> > > > +
> > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > +				     struct virtnet_rq_data *data)
> > > > +{
> > > > +	void *buf;
> > > > +
> > > > +	buf = data->buf;
> > > > +
> > > > +	data->next = rq->data_free;
> > > > +	rq->data_free = data;
> > > > +
> > > > +	return buf;
> > > > +}
> > > > +
> > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > +						   void *buf,
> > > > +						   struct virtnet_rq_dma *dma)
> > > > +{
> > > > +	struct virtnet_rq_data *data;
> > > > +
> > > > +	data = rq->data_free;
> > > > +	rq->data_free = data->next;
> > > > +
> > > > +	data->buf = buf;
> > > > +	data->dma = dma;
> > > > +
> > > > +	return data;
> > > > +}
> > > > +
> > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > +{
> > > > +	struct virtnet_rq_data *data;
> > > > +	void *buf;
> > > > +
> > > > +	buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > +	if (!buf || !rq->data_array)
> > > > +		return buf;
> > > > +
> > > > +	data = buf;
> > > > +
> > > > +	virtnet_rq_unmap(rq, data->dma);
> > > > +
> > > > +	return virtnet_rq_recycle_data(rq, data);
> > > > +}
> > > > +
> > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > +{
> > > > +	struct virtnet_rq_data *data;
> > > > +	void *buf;
> > > > +
> > > > +	buf = virtqueue_detach_unused_buf(rq->vq);
> > > > +	if (!buf || !rq->data_array)
> > > > +		return buf;
> > > > +
> > > > +	data = buf;
> > > > +
> > > > +	virtnet_rq_unmap(rq, data->dma);
> > > > +
> > > > +	return virtnet_rq_recycle_data(rq, data);
> > > > +}
> > > > +
> > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > +{
> > > > +	struct virtnet_rq_dma *dma = rq->last_dma;
> > > > +	struct device *dev;
> > > > +	u32 off, map_len;
> > > > +	dma_addr_t addr;
> > > > +	void *end;
> > > > +
> > > > +	if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > +		++dma->ref;
> > > > +		addr = dma->addr + (buf - dma->buf);
> > > > +		goto ok;
> > > > +	}
> > >
> > > So this is the meat of the proposed optimization. I guess that
> > > if the last buffer we allocated happens to be in the same page
> > > as this one then they can both be mapped for DMA together.
> >
> > Since we use page_frag, the buffers we allocated are all continuous.
> >
> > > Why last one specifically? Whether next one happens to
> > > be close depends on luck. If you want to try optimizing this
> > > the right thing to do is likely by using a page pool.
> > > There's actually work upstream on page pool, look it up.
> >
> > As we discussed in another thread, the page pool is first used for xdp. Let's
> > transform it step by step.
> >
> > Thanks.
>
> ok so this should wait then?
>
> > >
> > > > +
> > > > +	end = buf + len - 1;
> > > > +	off = offset_in_page(end);
> > > > +	map_len = len + PAGE_SIZE - off;
> > > > +
> > > > +	dev = virtqueue_dma_dev(rq->vq);
> > > > +
> > > > +	addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > +				  map_len, DMA_FROM_DEVICE, 0);
> > > > +	if (addr == DMA_MAPPING_ERROR)
> > > > +		return -ENOMEM;
> > > > +
> > > > +	dma = rq->dma_free;
> > > > +	rq->dma_free = dma->next;
> > > > +
> > > > +	dma->ref = 1;
> > > > +	dma->buf = buf;
> > > > +	dma->addr = addr;
> > > > +	dma->len = map_len;
> > > > +
> > > > +	rq->last_dma = dma;
> > > > +
> > > > +ok:
> > > > +	sg_init_table(rq->sg, 1);
> > > > +	rq->sg[0].dma_address = addr;
> > > > +	rq->sg[0].length = len;
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > +{
> > > > +	struct receive_queue *rq;
> > > > +	int i, err, j, num;
> > > > +
> > > > +	/* disable for big mode */
> > > > +	if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > +		return 0;
> > > > +
> > > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > +		err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > +		if (err)
> > > > +			continue;
> > > > +
> > > > +		rq = &vi->rq[i];
> > > > +
> > > > +		num = virtqueue_get_vring_size(rq->vq);
> > > > +
> > > > +		rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > +		if (!rq->data_array)
> > > > +			goto err;
> > > > +
> > > > +		rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > +		if (!rq->dma_array)
> > > > +			goto err;
> > > > +
> > > > +		for (j = 0; j < num; ++j) {
> > > > +			rq->data_array[j].next = rq->data_free;
> > > > +			rq->data_free = &rq->data_array[j];
> > > > +
> > > > +			rq->dma_array[j].next = rq->dma_free;
> > > > +			rq->dma_free = &rq->dma_array[j];
> > > > +		}
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +
> > > > +err:
> > > > +	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > +		struct receive_queue *rq;
> > > > +
> > > > +		rq = &vi->rq[i];
> > > > +
> > > > +		kfree(rq->dma_array);
> > > > +		kfree(rq->data_array);
> > > > +	}
> > > > +
> > > > +	return -ENOMEM;
> > > > +}
> > > > +
> > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > >  {
> > > >  	unsigned int len;
> > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > >  		void *buf;
> > > >  		int off;
> > > >
> > > > -		buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > +		buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > >  		if (unlikely(!buf))
> > > >  			goto err_buf;
> > > >
> > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > >  		return -EINVAL;
> > > >
> > > >  	while (--*num_buf > 0) {
> > > > -		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > +		buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > >  		if (unlikely(!buf)) {
> > > >  			pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > >  				 dev->name, *num_buf,
> > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > >  	while (--num_buf) {
> > > >  		int num_skb_frags;
> > > >
> > > > -		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > +		buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > >  		if (unlikely(!buf)) {
> > > >  			pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > >  				 dev->name, num_buf,
> > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > >  err_skb:
> > > >  	put_page(page);
> > > >  	while (num_buf-- > 1) {
> > > > -		buf = virtqueue_get_buf(rq->vq, &len);
> > > > +		buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > >  		if (unlikely(!buf)) {
> > > >  			pr_debug("%s: rx error: %d buffers missing\n",
> > > >  				 dev->name, num_buf);
> > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > >  	unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > >  	void *ctx = (void *)(unsigned long)xdp_headroom;
> > > >  	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > +	struct virtnet_rq_data *data;
> > > >  	int err;
> > > >
> > > >  	len = SKB_DATA_ALIGN(len) +
> > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > >  	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > >  	get_page(alloc_frag->page);
> > > >  	alloc_frag->offset += len;
> > > > -	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > -		    vi->hdr_len + GOOD_PACKET_LEN);
> > > > -	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > +
> > > > +	if (rq->data_array) {
> > > > +		err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > +					vi->hdr_len + GOOD_PACKET_LEN);
> > > > +		if (err)
> > > > +			goto map_err;
> > > > +
> > > > +		data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > +	} else {
> > > > +		sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > +			    vi->hdr_len + GOOD_PACKET_LEN);
> > > > +		data = (void *)buf;
> > > > +	}
> > > > +
> > > > +	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > >  	if (err < 0)
> > > > -		put_page(virt_to_head_page(buf));
> > > > +		goto add_err;
> > > > +
> > > > +	return err;
> > > > +
> > > > +add_err:
> > > > +	if (rq->data_array) {
> > > > +		virtnet_rq_unmap(rq, data->dma);
> > > > +		virtnet_rq_recycle_data(rq, data);
> > > > +	}
> > > > +
> > > > +map_err:
> > > > +	put_page(virt_to_head_page(buf));
> > > >  	return err;
> > > >  }
> > > >
> > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > >  	unsigned int headroom = virtnet_get_headroom(vi);
> > > >  	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > >  	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > +	struct virtnet_rq_data *data;
> > > >  	char *buf;
> > > >  	void *ctx;
> > > >  	int err;
> > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > >  		alloc_frag->offset += hole;
> > > >  	}
> > > >
> > > > -	sg_init_one(rq->sg, buf, len);
> > > > +	if (rq->data_array) {
> > > > +		err = virtnet_rq_map_sg(rq, buf, len);
> > > > +		if (err)
> > > > +			goto map_err;
> > > > +
> > > > +		data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > +	} else {
> > > > +		sg_init_one(rq->sg, buf, len);
> > > > +		data = (void *)buf;
> > > > +	}
> > > > +
> > > >  	ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > -	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > +	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > >  	if (err < 0)
> > > > -		put_page(virt_to_head_page(buf));
> > > > +		goto add_err;
> > > > +
> > > > +	return 0;
> > > > +
> > > > +add_err:
> > > > +	if (rq->data_array) {
> > > > +		virtnet_rq_unmap(rq, data->dma);
> > > > +		virtnet_rq_recycle_data(rq, data);
> > > > +	}
> > > >
> > > > +map_err:
> > > > +	put_page(virt_to_head_page(buf));
> > > >  	return err;
> > > >  }
> > > >
> > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > >  		void *ctx;
> > > >
> > > >  		while (stats.packets < budget &&
> > > > -		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > +		       (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > >  			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > >  			stats.packets++;
> > > >  		}
> > > >  	} else {
> > > >  		while (stats.packets < budget &&
> > > > -		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > +		       (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > >  			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > >  			stats.packets++;
> > > >  		}
> > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > >  		__netif_napi_del(&vi->rq[i].napi);
> > > >  		__netif_napi_del(&vi->sq[i].napi);
> > > > +
> > > > +		kfree(vi->rq[i].data_array);
> > > > +		kfree(vi->rq[i].dma_array);
> > > >  	}
> > > >
> > > >  	/* We called __netif_napi_del(),
> > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > >  	}
> > > >
> > > >  	for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > -		struct virtqueue *vq = vi->rq[i].vq;
> > > > -		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > -			virtnet_rq_free_unused_buf(vq, buf);
> > > > +		struct receive_queue *rq = &vi->rq[i];
> > > > +
> > > > +		while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > +			virtnet_rq_free_unused_buf(rq->vq, buf);
> > > >  		cond_resched();
> > > >  	}
> > > >  }
> > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > >  	if (ret)
> > > >  		goto err_free;
> > > >
> > > > +	ret = virtnet_rq_merge_map_init(vi);
> > > > +	if (ret)
> > > > +		goto err_free;
> > > > +
> > > >  	cpus_read_lock();
> > > >  	virtnet_set_affinity(vi);
> > > >  	cpus_read_unlock();
> > > > --
> > > > 2.32.0.3.g01195cf9f
> > >
>
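
A note on the page pool direction raised in this review: with PP_FLAG_DMA_MAP
the pool maps each page once and hands out ready-to-use dma addresses, which
amortizes the mapping cost in a more general way than tracking the last
mapped buffer. The sketch below is illustrative only: the helper names and
their integration points are hypothetical, and the page_pool_params fields
assume the page_pool API of this kernel generation.

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
#include <net/page_pool.h>

/* Hypothetical: let a page pool own the RX DMA mappings. */
static struct page_pool *virtnet_create_rx_pool(struct device *dma_dev,
						unsigned int ring_size)
{
	struct page_pool_params pp_params = {
		.flags		= PP_FLAG_DMA_MAP,	/* pool maps each page once */
		.order		= 0,
		.pool_size	= ring_size,
		.nid		= NUMA_NO_NODE,
		.dev		= dma_dev,
		.dma_dir	= DMA_FROM_DEVICE,
	};

	return page_pool_create(&pp_params);	/* may return ERR_PTR() */
}

/* Refill one sg entry from the pool; the dma address is already mapped. */
static int virtnet_pp_fill_sg(struct page_pool *pool, struct scatterlist *sg)
{
	struct page *page = page_pool_dev_alloc_pages(pool);

	if (!page)
		return -ENOMEM;

	sg_init_table(sg, 1);
	sg->dma_address = page_pool_get_dma_addr(page);
	sg->length = PAGE_SIZE;
	return 0;
}
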
Jason Wang July 11, 2023, 2:36 a.m. UTC | #5
On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > Currently, the virtio core will perform a dma operation for each
> > > > > operation. Although, the same page may be operated multiple times.
> > > > >
> > > > > The driver does the dma operation and manages the dma address based the
> > > > > feature premapped of virtio core.
> > > > >
> > > > > This way, we can perform only one dma operation for the same page. In
> > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > >
> > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > >
> > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > of operation?
> > >
> > >
> > > Do you mean this:
> > >
> > > [    0.470816] iommu: Default domain type: Passthrough
> > >
> >
> > With passthrough, dma API is just some indirect function calls, they do
> > not affect the performance a lot.
>
>
> Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> overhead of DMA I observed is indeed not too high.

Have you measured with iommu=strict?

Thanks

>
> Thanks.
>
>
> >
> > Try e.g. bounce buffer. Which is where you will see a problem: your
> > patches won't work.
> >
> >
> > > >
> > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > >
> > > > This kind of difference is likely in the noise.
> > >
> > > It's really not high, but this is because the proportion of DMA under perf top
> > > is not high. Probably that much.
> >
> > So maybe not worth the complexity.
> >
> > > >
> > > >
> > > > > ---
> > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > index 486b5849033d..4de845d35bed 100644
> > > > > --- a/drivers/net/virtio_net.c
> > > > > +++ b/drivers/net/virtio_net.c
> > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > >
> > > > > +/* The bufs on the same page may share this struct. */
> > > > > +struct virtnet_rq_dma {
> > > > > +       struct virtnet_rq_dma *next;
> > > > > +
> > > > > +       dma_addr_t addr;
> > > > > +
> > > > > +       void *buf;
> > > > > +       u32 len;
> > > > > +
> > > > > +       u32 ref;
> > > > > +};
> > > > > +
> > > > > +/* Record the dma and buf. */
> > > >
> > > > I guess I see that. But why?
> > > > And these two comments are the extent of the available
> > > > documentation, that's not enough I feel.
> > > >
> > > >
> > > > > +struct virtnet_rq_data {
> > > > > +       struct virtnet_rq_data *next;
> > > >
> > > > Is manually reimplementing a linked list the best
> > > > we can do?
> > >
> > > Yes, we can use llist.
> > >
> > > >
> > > > > +
> > > > > +       void *buf;
> > > > > +
> > > > > +       struct virtnet_rq_dma *dma;
> > > > > +};
> > > > > +
> > > > >  /* Internal representation of a send virtqueue */
> > > > >  struct send_queue {
> > > > >         /* Virtqueue associated with this send _queue */
> > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > >         char name[16];
> > > > >
> > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > +
> > > > > +       struct virtnet_rq_data *data_array;
> > > > > +       struct virtnet_rq_data *data_free;
> > > > > +
> > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > +       struct virtnet_rq_dma *last_dma;
> > > > >  };
> > > > >
> > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > >         return skb;
> > > > >  }
> > > > >
> > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > +{
> > > > > +       struct device *dev;
> > > > > +
> > > > > +       --dma->ref;
> > > > > +
> > > > > +       if (dma->ref)
> > > > > +               return;
> > > > > +
> > > >
> > > > If you don't unmap there is no guarantee valid data will be
> > > > there in the buffer.
> > > >
> > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > +
> > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > >
> > > >
> > > >
> > > >
> > > >
> > > > > +
> > > > > +       dma->next = rq->dma_free;
> > > > > +       rq->dma_free = dma;
> > > > > +}
> > > > > +
> > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > +                                    struct virtnet_rq_data *data)
> > > > > +{
> > > > > +       void *buf;
> > > > > +
> > > > > +       buf = data->buf;
> > > > > +
> > > > > +       data->next = rq->data_free;
> > > > > +       rq->data_free = data;
> > > > > +
> > > > > +       return buf;
> > > > > +}
> > > > > +
> > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > +                                                  void *buf,
> > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > +{
> > > > > +       struct virtnet_rq_data *data;
> > > > > +
> > > > > +       data = rq->data_free;
> > > > > +       rq->data_free = data->next;
> > > > > +
> > > > > +       data->buf = buf;
> > > > > +       data->dma = dma;
> > > > > +
> > > > > +       return data;
> > > > > +}
> > > > > +
> > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > +{
> > > > > +       struct virtnet_rq_data *data;
> > > > > +       void *buf;
> > > > > +
> > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > +       if (!buf || !rq->data_array)
> > > > > +               return buf;
> > > > > +
> > > > > +       data = buf;
> > > > > +
> > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > +
> > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > +}
> > > > > +
> > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > +{
> > > > > +       struct virtnet_rq_data *data;
> > > > > +       void *buf;
> > > > > +
> > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > +       if (!buf || !rq->data_array)
> > > > > +               return buf;
> > > > > +
> > > > > +       data = buf;
> > > > > +
> > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > +
> > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > +}
> > > > > +
> > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > +{
> > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > +       struct device *dev;
> > > > > +       u32 off, map_len;
> > > > > +       dma_addr_t addr;
> > > > > +       void *end;
> > > > > +
> > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > +               ++dma->ref;
> > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > +               goto ok;
> > > > > +       }
> > > >
> > > > So this is the meat of the proposed optimization. I guess that
> > > > if the last buffer we allocated happens to be in the same page
> > > > as this one then they can both be mapped for DMA together.
> > >
> > > Since we use page_frag, the buffers we allocated are all contiguous.
> > >
> > > > Why last one specifically? Whether next one happens to
> > > > be close depends on luck. If you want to try optimizing this
> > > > the right thing to do is likely by using a page pool.
> > > > There's actually work upstream on page pool, look it up.
> > >
> > > As we discussed in another thread, the page pool is first used for xdp. Let's
> > > transform it step by step.
> > >
> > > Thanks.
> >
> > ok so this should wait then?
> >
> > > >
> > > > > +
> > > > > +       end = buf + len - 1;
> > > > > +       off = offset_in_page(end);
> > > > > +       map_len = len + PAGE_SIZE - off;
> > > > > +
> > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > +
> > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > +               return -ENOMEM;
> > > > > +
> > > > > +       dma = rq->dma_free;
> > > > > +       rq->dma_free = dma->next;
> > > > > +
> > > > > +       dma->ref = 1;
> > > > > +       dma->buf = buf;
> > > > > +       dma->addr = addr;
> > > > > +       dma->len = map_len;
> > > > > +
> > > > > +       rq->last_dma = dma;
> > > > > +
> > > > > +ok:
> > > > > +       sg_init_table(rq->sg, 1);
> > > > > +       rq->sg[0].dma_address = addr;
> > > > > +       rq->sg[0].length = len;
> > > > > +
> > > > > +       return 0;
> > > > > +}
> > > > > +
> > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > +{
> > > > > +       struct receive_queue *rq;
> > > > > +       int i, err, j, num;
> > > > > +
> > > > > +       /* disable for big mode */
> > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > +               return 0;
> > > > > +
> > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > +               if (err)
> > > > > +                       continue;
> > > > > +
> > > > > +               rq = &vi->rq[i];
> > > > > +
> > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > +
> > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > +               if (!rq->data_array)
> > > > > +                       goto err;
> > > > > +
> > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > +               if (!rq->dma_array)
> > > > > +                       goto err;
> > > > > +
> > > > > +               for (j = 0; j < num; ++j) {
> > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > +
> > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > +               }
> > > > > +       }
> > > > > +
> > > > > +       return 0;
> > > > > +
> > > > > +err:
> > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > +               struct receive_queue *rq;
> > > > > +
> > > > > +               rq = &vi->rq[i];
> > > > > +
> > > > > +               kfree(rq->dma_array);
> > > > > +               kfree(rq->data_array);
> > > > > +       }
> > > > > +
> > > > > +       return -ENOMEM;
> > > > > +}
> > > > > +
> > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > >  {
> > > > >         unsigned int len;
> > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > >                 void *buf;
> > > > >                 int off;
> > > > >
> > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > >                 if (unlikely(!buf))
> > > > >                         goto err_buf;
> > > > >
> > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > >                 return -EINVAL;
> > > > >
> > > > >         while (--*num_buf > 0) {
> > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > >                 if (unlikely(!buf)) {
> > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > >                                  dev->name, *num_buf,
> > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > >         while (--num_buf) {
> > > > >                 int num_skb_frags;
> > > > >
> > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > >                 if (unlikely(!buf)) {
> > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > >                                  dev->name, num_buf,
> > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > >  err_skb:
> > > > >         put_page(page);
> > > > >         while (num_buf-- > 1) {
> > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > >                 if (unlikely(!buf)) {
> > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > >                                  dev->name, num_buf);
> > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > +       struct virtnet_rq_data *data;
> > > > >         int err;
> > > > >
> > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > >         get_page(alloc_frag->page);
> > > > >         alloc_frag->offset += len;
> > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > +
> > > > > +       if (rq->data_array) {
> > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > > > +               if (err)
> > > > > +                       goto map_err;
> > > > > +
> > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > +       } else {
> > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > +               data = (void *)buf;
> > > > > +       }
> > > > > +
> > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > >         if (err < 0)
> > > > > -               put_page(virt_to_head_page(buf));
> > > > > +               goto add_err;
> > > > > +
> > > > > +       return err;
> > > > > +
> > > > > +add_err:
> > > > > +       if (rq->data_array) {
> > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > +       }
> > > > > +
> > > > > +map_err:
> > > > > +       put_page(virt_to_head_page(buf));
> > > > >         return err;
> > > > >  }
> > > > >
> > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > +       struct virtnet_rq_data *data;
> > > > >         char *buf;
> > > > >         void *ctx;
> > > > >         int err;
> > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > >                 alloc_frag->offset += hole;
> > > > >         }
> > > > >
> > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > +       if (rq->data_array) {
> > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > +               if (err)
> > > > > +                       goto map_err;
> > > > > +
> > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > +       } else {
> > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > +               data = (void *)buf;
> > > > > +       }
> > > > > +
> > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > >         if (err < 0)
> > > > > -               put_page(virt_to_head_page(buf));
> > > > > +               goto add_err;
> > > > > +
> > > > > +       return 0;
> > > > > +
> > > > > +add_err:
> > > > > +       if (rq->data_array) {
> > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > +       }
> > > > >
> > > > > +map_err:
> > > > > +       put_page(virt_to_head_page(buf));
> > > > >         return err;
> > > > >  }
> > > > >
> > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > >                 void *ctx;
> > > > >
> > > > >                 while (stats.packets < budget &&
> > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > >                         stats.packets++;
> > > > >                 }
> > > > >         } else {
> > > > >                 while (stats.packets < budget &&
> > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > >                         stats.packets++;
> > > > >                 }
> > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > +
> > > > > +               kfree(vi->rq[i].data_array);
> > > > > +               kfree(vi->rq[i].dma_array);
> > > > >         }
> > > > >
> > > > >         /* We called __netif_napi_del(),
> > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > >         }
> > > > >
> > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > +
> > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > >                 cond_resched();
> > > > >         }
> > > > >  }
> > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > >         if (ret)
> > > > >                 goto err_free;
> > > > >
> > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > +       if (ret)
> > > > > +               goto err_free;
> > > > > +
> > > > >         cpus_read_lock();
> > > > >         virtnet_set_affinity(vi);
> > > > >         cpus_read_unlock();
> > > > > --
> > > > > 2.32.0.3.g01195cf9f
> > > >
> >
>
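
On the free-list question in the review above ("Is manually reimplementing a
linked list the best we can do?" / "Yes, we can use llist."), a minimal sketch
of the same dma free list built on <linux/llist.h>; the structure and helper
names here are hypothetical, not part of this series:

#include <linux/llist.h>
#include <linux/types.h>

/* Hypothetical llist-based variant of the per-queue dma free list. */
struct virtnet_rq_dma_ll {
	struct llist_node node;
	dma_addr_t addr;
	void *buf;
	u32 len;
	u32 ref;
};

static void virtnet_dma_free_put(struct llist_head *free,
				 struct virtnet_rq_dma_ll *dma)
{
	llist_add(&dma->node, free);
}

static struct virtnet_rq_dma_ll *virtnet_dma_free_get(struct llist_head *free)
{
	struct llist_node *node = llist_del_first(free);

	return node ? llist_entry(node, struct virtnet_rq_dma_ll, node) : NULL;
}

llist_del_first() still assumes a single consumer at a time, which the
per-queue receive path should already provide here.
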
Xuan Zhuo July 11, 2023, 2:40 a.m. UTC | #6
On Tue, 11 Jul 2023 10:36:17 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > > Currently, the virtio core will perform a dma operation for each
> > > > > > operation. Although, the same page may be operated multiple times.
> > > > > >
> > > > > > The driver does the dma operation and manages the dma address based the
> > > > > > feature premapped of virtio core.
> > > > > >
> > > > > > This way, we can perform only one dma operation for the same page. In
> > > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > > >
> > > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > > >
> > > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > > of operation?
> > > >
> > > >
> > > > Do you mean this:
> > > >
> > > > [    0.470816] iommu: Default domain type: Passthrough
> > > >
> > >
> > > With passthrough, dma API is just some indirect function calls, they do
> > > not affect the performance a lot.
> >
> >
> > Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> > overhead of DMA I observed is indeed not too high.
>
> Have you measured with iommu=strict?

I have not tested this way; our environment is pt (passthrough). I wonder if
strict is a common scenario. I can test it.

Thanks.


>
> Thanks
>
> >
> > Thanks.
> >
> >
> > >
> > > Try e.g. bounce buffer. Which is where you will see a problem: your
> > > patches won't work.
> > >
> > >
> > > > >
> > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > >
> > > > > This kind of difference is likely in the noise.
> > > >
> > > > It's really not high, but this is because the proportion of DMA under perf top
> > > > is not high. Probably that much.
> > >
> > > So maybe not worth the complexity.
> > >
> > > > >
> > > > >
> > > > > > ---
> > > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > index 486b5849033d..4de845d35bed 100644
> > > > > > --- a/drivers/net/virtio_net.c
> > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > > >
> > > > > > +/* The bufs on the same page may share this struct. */
> > > > > > +struct virtnet_rq_dma {
> > > > > > +       struct virtnet_rq_dma *next;
> > > > > > +
> > > > > > +       dma_addr_t addr;
> > > > > > +
> > > > > > +       void *buf;
> > > > > > +       u32 len;
> > > > > > +
> > > > > > +       u32 ref;
> > > > > > +};
> > > > > > +
> > > > > > +/* Record the dma and buf. */
> > > > >
> > > > > I guess I see that. But why?
> > > > > And these two comments are the extent of the available
> > > > > documentation, that's not enough I feel.
> > > > >
> > > > >
> > > > > > +struct virtnet_rq_data {
> > > > > > +       struct virtnet_rq_data *next;
> > > > >
> > > > > Is manually reimplementing a linked list the best
> > > > > we can do?
> > > >
> > > > Yes, we can use llist.
> > > >
> > > > >
> > > > > > +
> > > > > > +       void *buf;
> > > > > > +
> > > > > > +       struct virtnet_rq_dma *dma;
> > > > > > +};
> > > > > > +
> > > > > >  /* Internal representation of a send virtqueue */
> > > > > >  struct send_queue {
> > > > > >         /* Virtqueue associated with this send _queue */
> > > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > > >         char name[16];
> > > > > >
> > > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > > +
> > > > > > +       struct virtnet_rq_data *data_array;
> > > > > > +       struct virtnet_rq_data *data_free;
> > > > > > +
> > > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > > +       struct virtnet_rq_dma *last_dma;
> > > > > >  };
> > > > > >
> > > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > >         return skb;
> > > > > >  }
> > > > > >
> > > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > > +{
> > > > > > +       struct device *dev;
> > > > > > +
> > > > > > +       --dma->ref;
> > > > > > +
> > > > > > +       if (dma->ref)
> > > > > > +               return;
> > > > > > +
> > > > >
> > > > > If you don't unmap there is no guarantee valid data will be
> > > > > there in the buffer.
> > > > >
> > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > +
> > > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > >
> > > > >
> > > > >
> > > > >
> > > > >
> > > > > > +
> > > > > > +       dma->next = rq->dma_free;
> > > > > > +       rq->dma_free = dma;
> > > > > > +}
> > > > > > +
> > > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > > +                                    struct virtnet_rq_data *data)
> > > > > > +{
> > > > > > +       void *buf;
> > > > > > +
> > > > > > +       buf = data->buf;
> > > > > > +
> > > > > > +       data->next = rq->data_free;
> > > > > > +       rq->data_free = data;
> > > > > > +
> > > > > > +       return buf;
> > > > > > +}
> > > > > > +
> > > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > > +                                                  void *buf,
> > > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > > +{
> > > > > > +       struct virtnet_rq_data *data;
> > > > > > +
> > > > > > +       data = rq->data_free;
> > > > > > +       rq->data_free = data->next;
> > > > > > +
> > > > > > +       data->buf = buf;
> > > > > > +       data->dma = dma;
> > > > > > +
> > > > > > +       return data;
> > > > > > +}
> > > > > > +
> > > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > > +{
> > > > > > +       struct virtnet_rq_data *data;
> > > > > > +       void *buf;
> > > > > > +
> > > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > > +       if (!buf || !rq->data_array)
> > > > > > +               return buf;
> > > > > > +
> > > > > > +       data = buf;
> > > > > > +
> > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > +
> > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > +}
> > > > > > +
> > > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > > +{
> > > > > > +       struct virtnet_rq_data *data;
> > > > > > +       void *buf;
> > > > > > +
> > > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > > +       if (!buf || !rq->data_array)
> > > > > > +               return buf;
> > > > > > +
> > > > > > +       data = buf;
> > > > > > +
> > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > +
> > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > +}
> > > > > > +
> > > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > > +{
> > > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > > +       struct device *dev;
> > > > > > +       u32 off, map_len;
> > > > > > +       dma_addr_t addr;
> > > > > > +       void *end;
> > > > > > +
> > > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > > +               ++dma->ref;
> > > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > > +               goto ok;
> > > > > > +       }
> > > > >
> > > > > So this is the meat of the proposed optimization. I guess that
> > > > > if the last buffer we allocated happens to be in the same page
> > > > > as this one then they can both be mapped for DMA together.
> > > >
> > > > Since we use page_frag, the buffers we allocated are all contiguous.
> > > >
> > > > > Why last one specifically? Whether next one happens to
> > > > > be close depends on luck. If you want to try optimizing this
> > > > > the right thing to do is likely by using a page pool.
> > > > > There's actually work upstream on page pool, look it up.
> > > >
> > > > As we discussed in another thread, the page pool is first used for xdp. Let's
> > > > transform it step by step.
> > > >
> > > > Thanks.
> > >
> > > ok so this should wait then?
> > >
> > > > >
> > > > > > +
> > > > > > +       end = buf + len - 1;
> > > > > > +       off = offset_in_page(end);
> > > > > > +       map_len = len + PAGE_SIZE - off;
> > > > > > +
> > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > +
> > > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > > +               return -ENOMEM;
> > > > > > +
> > > > > > +       dma = rq->dma_free;
> > > > > > +       rq->dma_free = dma->next;
> > > > > > +
> > > > > > +       dma->ref = 1;
> > > > > > +       dma->buf = buf;
> > > > > > +       dma->addr = addr;
> > > > > > +       dma->len = map_len;
> > > > > > +
> > > > > > +       rq->last_dma = dma;
> > > > > > +
> > > > > > +ok:
> > > > > > +       sg_init_table(rq->sg, 1);
> > > > > > +       rq->sg[0].dma_address = addr;
> > > > > > +       rq->sg[0].length = len;
> > > > > > +
> > > > > > +       return 0;
> > > > > > +}
> > > > > > +
> > > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > > +{
> > > > > > +       struct receive_queue *rq;
> > > > > > +       int i, err, j, num;
> > > > > > +
> > > > > > +       /* disable for big mode */
> > > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > > +               return 0;
> > > > > > +
> > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > > +               if (err)
> > > > > > +                       continue;
> > > > > > +
> > > > > > +               rq = &vi->rq[i];
> > > > > > +
> > > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > > +
> > > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > > +               if (!rq->data_array)
> > > > > > +                       goto err;
> > > > > > +
> > > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > > +               if (!rq->dma_array)
> > > > > > +                       goto err;
> > > > > > +
> > > > > > +               for (j = 0; j < num; ++j) {
> > > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > > +
> > > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > > +               }
> > > > > > +       }
> > > > > > +
> > > > > > +       return 0;
> > > > > > +
> > > > > > +err:
> > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > +               struct receive_queue *rq;
> > > > > > +
> > > > > > +               rq = &vi->rq[i];
> > > > > > +
> > > > > > +               kfree(rq->dma_array);
> > > > > > +               kfree(rq->data_array);
> > > > > > +       }
> > > > > > +
> > > > > > +       return -ENOMEM;
> > > > > > +}
> > > > > > +
> > > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > > >  {
> > > > > >         unsigned int len;
> > > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > >                 void *buf;
> > > > > >                 int off;
> > > > > >
> > > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > > >                 if (unlikely(!buf))
> > > > > >                         goto err_buf;
> > > > > >
> > > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > >                 return -EINVAL;
> > > > > >
> > > > > >         while (--*num_buf > 0) {
> > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > >                 if (unlikely(!buf)) {
> > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > >                                  dev->name, *num_buf,
> > > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >         while (--num_buf) {
> > > > > >                 int num_skb_frags;
> > > > > >
> > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > >                 if (unlikely(!buf)) {
> > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > >                                  dev->name, num_buf,
> > > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >  err_skb:
> > > > > >         put_page(page);
> > > > > >         while (num_buf-- > 1) {
> > > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > > >                 if (unlikely(!buf)) {
> > > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > > >                                  dev->name, num_buf);
> > > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > > +       struct virtnet_rq_data *data;
> > > > > >         int err;
> > > > > >
> > > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > >         get_page(alloc_frag->page);
> > > > > >         alloc_frag->offset += len;
> > > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > +
> > > > > > +       if (rq->data_array) {
> > > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > +               if (err)
> > > > > > +                       goto map_err;
> > > > > > +
> > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > +       } else {
> > > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > +               data = (void *)buf;
> > > > > > +       }
> > > > > > +
> > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > >         if (err < 0)
> > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > +               goto add_err;
> > > > > > +
> > > > > > +       return err;
> > > > > > +
> > > > > > +add_err:
> > > > > > +       if (rq->data_array) {
> > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > +       }
> > > > > > +
> > > > > > +map_err:
> > > > > > +       put_page(virt_to_head_page(buf));
> > > > > >         return err;
> > > > > >  }
> > > > > >
> > > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > > +       struct virtnet_rq_data *data;
> > > > > >         char *buf;
> > > > > >         void *ctx;
> > > > > >         int err;
> > > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > >                 alloc_frag->offset += hole;
> > > > > >         }
> > > > > >
> > > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > > +       if (rq->data_array) {
> > > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > > +               if (err)
> > > > > > +                       goto map_err;
> > > > > > +
> > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > +       } else {
> > > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > > +               data = (void *)buf;
> > > > > > +       }
> > > > > > +
> > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > >         if (err < 0)
> > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > +               goto add_err;
> > > > > > +
> > > > > > +       return 0;
> > > > > > +
> > > > > > +add_err:
> > > > > > +       if (rq->data_array) {
> > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > +       }
> > > > > >
> > > > > > +map_err:
> > > > > > +       put_page(virt_to_head_page(buf));
> > > > > >         return err;
> > > > > >  }
> > > > > >
> > > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > > >                 void *ctx;
> > > > > >
> > > > > >                 while (stats.packets < budget &&
> > > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > > >                         stats.packets++;
> > > > > >                 }
> > > > > >         } else {
> > > > > >                 while (stats.packets < budget &&
> > > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > > >                         stats.packets++;
> > > > > >                 }
> > > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > +
> > > > > > +               kfree(vi->rq[i].data_array);
> > > > > > +               kfree(vi->rq[i].dma_array);
> > > > > >         }
> > > > > >
> > > > > >         /* We called __netif_napi_del(),
> > > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > > >         }
> > > > > >
> > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > > +
> > > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > > >                 cond_resched();
> > > > > >         }
> > > > > >  }
> > > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > > >         if (ret)
> > > > > >                 goto err_free;
> > > > > >
> > > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > > +       if (ret)
> > > > > > +               goto err_free;
> > > > > > +
> > > > > >         cpus_read_lock();
> > > > > >         virtnet_set_affinity(vi);
> > > > > >         cpus_read_unlock();
> > > > > > --
> > > > > > 2.32.0.3.g01195cf9f
> > > > >
> > >
> >
>
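
On the bounce-buffer concern quoted above (if the unmap is deferred while the
mapping is shared, there is no guarantee the CPU sees valid data, e.g. with
swiotlb): each completed buffer would still need a partial sync before the
driver reads it. A minimal sketch, reusing struct virtnet_rq_dma from the
patch; the helper name is hypothetical:

#include <linux/dma-mapping.h>

/*
 * Hypothetical helper: sync only the sub-range of the shared mapping that
 * the device just wrote, so the CPU sees valid data even when a bounce
 * buffer sits in between.
 */
static void virtnet_rq_sync_for_cpu(struct device *dev,
				    struct virtnet_rq_dma *dma,
				    void *buf, u32 used_len)
{
	dma_sync_single_range_for_cpu(dev, dma->addr, buf - dma->buf,
				      used_len, DMA_FROM_DEVICE);
}

Whether through a helper like this or through sync primitives exposed by the
virtio core, some per-buffer sync is needed once dma_unmap_page() no longer
runs for every completed buffer.
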
Jason Wang July 11, 2023, 2:58 a.m. UTC | #7
On Tue, Jul 11, 2023 at 10:42 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Tue, 11 Jul 2023 10:36:17 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > > > Currently, the virtio core will perform a dma operation for each
> > > > > > > operation. Although, the same page may be operated multiple times.
> > > > > > >
> > > > > > > The driver does the dma operation and manages the dma address based the
> > > > > > > feature premapped of virtio core.
> > > > > > >
> > > > > > > This way, we can perform only one dma operation for the same page. In
> > > > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > > > >
> > > > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > > > >
> > > > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > > > of operation?
> > > > >
> > > > >
> > > > > Do you mean this:
> > > > >
> > > > > [    0.470816] iommu: Default domain type: Passthrough
> > > > >
> > > >
> > > > With passthrough, dma API is just some indirect function calls, they do
> > > > not affect the performance a lot.
> > >
> > >
> > > Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> > > overhead of DMA I observed is indeed not too high.
> >
> > Have you measured with iommu=strict?
>
> I have not tested this way; our environment is pt (passthrough). I wonder if
> strict is a common scenario. I can test it.

It's not a common setup, but it's a way to stress the DMA layer to see the overhead.

Thanks

>
> Thanks.
>
>
> >
> > Thanks
> >
> > >
> > > Thanks.
> > >
> > >
> > > >
> > > > Try e.g. bounce buffer. Which is where you will see a problem: your
> > > > patches won't work.
> > > >
> > > >
> > > > > >
> > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > >
> > > > > > This kind of difference is likely in the noise.
> > > > >
> > > > > It's really not high, but this is because the proportion of DMA under perf top
> > > > > is not high. Probably that much.
> > > >
> > > > So maybe not worth the complexity.
> > > >
> > > > > >
> > > > > >
> > > > > > > ---
> > > > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > > > >
> > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > index 486b5849033d..4de845d35bed 100644
> > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > > > >
> > > > > > > +/* The bufs on the same page may share this struct. */
> > > > > > > +struct virtnet_rq_dma {
> > > > > > > +       struct virtnet_rq_dma *next;
> > > > > > > +
> > > > > > > +       dma_addr_t addr;
> > > > > > > +
> > > > > > > +       void *buf;
> > > > > > > +       u32 len;
> > > > > > > +
> > > > > > > +       u32 ref;
> > > > > > > +};
> > > > > > > +
> > > > > > > +/* Record the dma and buf. */
> > > > > >
> > > > > > I guess I see that. But why?
> > > > > > And these two comments are the extent of the available
> > > > > > documentation, that's not enough I feel.
> > > > > >
> > > > > >
> > > > > > > +struct virtnet_rq_data {
> > > > > > > +       struct virtnet_rq_data *next;
> > > > > >
> > > > > > Is manually reimplementing a linked list the best
> > > > > > we can do?
> > > > >
> > > > > Yes, we can use llist.
> > > > >
> > > > > >
> > > > > > > +
> > > > > > > +       void *buf;
> > > > > > > +
> > > > > > > +       struct virtnet_rq_dma *dma;
> > > > > > > +};
> > > > > > > +
> > > > > > >  /* Internal representation of a send virtqueue */
> > > > > > >  struct send_queue {
> > > > > > >         /* Virtqueue associated with this send _queue */
> > > > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > > > >         char name[16];
> > > > > > >
> > > > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > > > +
> > > > > > > +       struct virtnet_rq_data *data_array;
> > > > > > > +       struct virtnet_rq_data *data_free;
> > > > > > > +
> > > > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > > > +       struct virtnet_rq_dma *last_dma;
> > > > > > >  };
> > > > > > >
> > > > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > >         return skb;
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > > > +{
> > > > > > > +       struct device *dev;
> > > > > > > +
> > > > > > > +       --dma->ref;
> > > > > > > +
> > > > > > > +       if (dma->ref)
> > > > > > > +               return;
> > > > > > > +
> > > > > >
> > > > > > If you don't unmap there is no guarantee valid data will be
> > > > > > there in the buffer.
> > > > > >
> > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > +
> > > > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > > >
> > > > > >
> > > > > >
> > > > > >
> > > > > >
> > > > > > > +
> > > > > > > +       dma->next = rq->dma_free;
> > > > > > > +       rq->dma_free = dma;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > > > +                                    struct virtnet_rq_data *data)
> > > > > > > +{
> > > > > > > +       void *buf;
> > > > > > > +
> > > > > > > +       buf = data->buf;
> > > > > > > +
> > > > > > > +       data->next = rq->data_free;
> > > > > > > +       rq->data_free = data;
> > > > > > > +
> > > > > > > +       return buf;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > > > +                                                  void *buf,
> > > > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > > > +{
> > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > +
> > > > > > > +       data = rq->data_free;
> > > > > > > +       rq->data_free = data->next;
> > > > > > > +
> > > > > > > +       data->buf = buf;
> > > > > > > +       data->dma = dma;
> > > > > > > +
> > > > > > > +       return data;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > > > +{
> > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > +       void *buf;
> > > > > > > +
> > > > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > +               return buf;
> > > > > > > +
> > > > > > > +       data = buf;
> > > > > > > +
> > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > +
> > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > > > +{
> > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > +       void *buf;
> > > > > > > +
> > > > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > +               return buf;
> > > > > > > +
> > > > > > > +       data = buf;
> > > > > > > +
> > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > +
> > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > > > +{
> > > > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > > > +       struct device *dev;
> > > > > > > +       u32 off, map_len;
> > > > > > > +       dma_addr_t addr;
> > > > > > > +       void *end;
> > > > > > > +
> > > > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > > > +               ++dma->ref;
> > > > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > > > +               goto ok;
> > > > > > > +       }
> > > > > >
> > > > > > So this is the meat of the proposed optimization. I guess that
> > > > > > if the last buffer we allocated happens to be in the same page
> > > > > > as this one then they can both be mapped for DMA together.
> > > > >
> > > > > Since we use page_frag, the buffers we allocated are all contiguous.
> > > > >
> > > > > > Why last one specifically? Whether next one happens to
> > > > > > be close depends on luck. If you want to try optimizing this
> > > > > > the right thing to do is likely by using a page pool.
> > > > > > There's actually work upstream on page pool, look it up.
> > > > >
> > > > > As we discussed in another thread, the page pool is first used for xdp. Let's
> > > > > transform it step by step.
> > > > >
> > > > > Thanks.
> > > >
> > > > ok so this should wait then?
> > > >
> > > > > >
> > > > > > > +
> > > > > > > +       end = buf + len - 1;
> > > > > > > +       off = offset_in_page(end);
> > > > > > > +       map_len = len + PAGE_SIZE - off;
> > > > > > > +
> > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > +
> > > > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > > > +               return -ENOMEM;
> > > > > > > +
> > > > > > > +       dma = rq->dma_free;
> > > > > > > +       rq->dma_free = dma->next;
> > > > > > > +
> > > > > > > +       dma->ref = 1;
> > > > > > > +       dma->buf = buf;
> > > > > > > +       dma->addr = addr;
> > > > > > > +       dma->len = map_len;
> > > > > > > +
> > > > > > > +       rq->last_dma = dma;
> > > > > > > +
> > > > > > > +ok:
> > > > > > > +       sg_init_table(rq->sg, 1);
> > > > > > > +       rq->sg[0].dma_address = addr;
> > > > > > > +       rq->sg[0].length = len;
> > > > > > > +
> > > > > > > +       return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > > > +{
> > > > > > > +       struct receive_queue *rq;
> > > > > > > +       int i, err, j, num;
> > > > > > > +
> > > > > > > +       /* disable for big mode */
> > > > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > > > +               return 0;
> > > > > > > +
> > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > > > +               if (err)
> > > > > > > +                       continue;
> > > > > > > +
> > > > > > > +               rq = &vi->rq[i];
> > > > > > > +
> > > > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > > > +
> > > > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > > > +               if (!rq->data_array)
> > > > > > > +                       goto err;
> > > > > > > +
> > > > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > > > +               if (!rq->dma_array)
> > > > > > > +                       goto err;
> > > > > > > +
> > > > > > > +               for (j = 0; j < num; ++j) {
> > > > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > > > +
> > > > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > > > +               }
> > > > > > > +       }
> > > > > > > +
> > > > > > > +       return 0;
> > > > > > > +
> > > > > > > +err:
> > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > +               struct receive_queue *rq;
> > > > > > > +
> > > > > > > +               rq = &vi->rq[i];
> > > > > > > +
> > > > > > > +               kfree(rq->dma_array);
> > > > > > > +               kfree(rq->data_array);
> > > > > > > +       }
> > > > > > > +
> > > > > > > +       return -ENOMEM;
> > > > > > > +}
> > > > > > > +
> > > > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > > > >  {
> > > > > > >         unsigned int len;
> > > > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > >                 void *buf;
> > > > > > >                 int off;
> > > > > > >
> > > > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > > > >                 if (unlikely(!buf))
> > > > > > >                         goto err_buf;
> > > > > > >
> > > > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > >                 return -EINVAL;
> > > > > > >
> > > > > > >         while (--*num_buf > 0) {
> > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > >                 if (unlikely(!buf)) {
> > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > >                                  dev->name, *num_buf,
> > > > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >         while (--num_buf) {
> > > > > > >                 int num_skb_frags;
> > > > > > >
> > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > >                 if (unlikely(!buf)) {
> > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > >                                  dev->name, num_buf,
> > > > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >  err_skb:
> > > > > > >         put_page(page);
> > > > > > >         while (num_buf-- > 1) {
> > > > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > > > >                 if (unlikely(!buf)) {
> > > > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > > > >                                  dev->name, num_buf);
> > > > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > > > +       struct virtnet_rq_data *data;
> > > > > > >         int err;
> > > > > > >
> > > > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > >         get_page(alloc_frag->page);
> > > > > > >         alloc_frag->offset += len;
> > > > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > +
> > > > > > > +       if (rq->data_array) {
> > > > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > +               if (err)
> > > > > > > +                       goto map_err;
> > > > > > > +
> > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > +       } else {
> > > > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > +               data = (void *)buf;
> > > > > > > +       }
> > > > > > > +
> > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > >         if (err < 0)
> > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > +               goto add_err;
> > > > > > > +
> > > > > > > +       return err;
> > > > > > > +
> > > > > > > +add_err:
> > > > > > > +       if (rq->data_array) {
> > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > +       }
> > > > > > > +
> > > > > > > +map_err:
> > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > >         return err;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > > > +       struct virtnet_rq_data *data;
> > > > > > >         char *buf;
> > > > > > >         void *ctx;
> > > > > > >         int err;
> > > > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > >                 alloc_frag->offset += hole;
> > > > > > >         }
> > > > > > >
> > > > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > > > +       if (rq->data_array) {
> > > > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > > > +               if (err)
> > > > > > > +                       goto map_err;
> > > > > > > +
> > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > +       } else {
> > > > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > > > +               data = (void *)buf;
> > > > > > > +       }
> > > > > > > +
> > > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > >         if (err < 0)
> > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > +               goto add_err;
> > > > > > > +
> > > > > > > +       return 0;
> > > > > > > +
> > > > > > > +add_err:
> > > > > > > +       if (rq->data_array) {
> > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > +       }
> > > > > > >
> > > > > > > +map_err:
> > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > >         return err;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > > > >                 void *ctx;
> > > > > > >
> > > > > > >                 while (stats.packets < budget &&
> > > > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > > > >                         stats.packets++;
> > > > > > >                 }
> > > > > > >         } else {
> > > > > > >                 while (stats.packets < budget &&
> > > > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > > > >                         stats.packets++;
> > > > > > >                 }
> > > > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > > +
> > > > > > > +               kfree(vi->rq[i].data_array);
> > > > > > > +               kfree(vi->rq[i].dma_array);
> > > > > > >         }
> > > > > > >
> > > > > > >         /* We called __netif_napi_del(),
> > > > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > >         }
> > > > > > >
> > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > > > +
> > > > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > > > >                 cond_resched();
> > > > > > >         }
> > > > > > >  }
> > > > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > > > >         if (ret)
> > > > > > >                 goto err_free;
> > > > > > >
> > > > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > > > +       if (ret)
> > > > > > > +               goto err_free;
> > > > > > > +
> > > > > > >         cpus_read_lock();
> > > > > > >         virtnet_set_affinity(vi);
> > > > > > >         cpus_read_unlock();
> > > > > > > --
> > > > > > > 2.32.0.3.g01195cf9f
> > > > > >
> > > >
> > >
> >
>
Xuan Zhuo July 12, 2023, 7:54 a.m. UTC | #8
On Tue, 11 Jul 2023 10:58:51 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Tue, Jul 11, 2023 at 10:42 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > On Tue, 11 Jul 2023 10:36:17 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > >
> > > > On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > > > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > > > > Currently, the virtio core will perform a dma operation for each
> > > > > > > > operation. Although, the same page may be operated multiple times.
> > > > > > > >
> > > > > > > > The driver does the dma operation and manages the dma address based the
> > > > > > > > feature premapped of virtio core.
> > > > > > > >
> > > > > > > > This way, we can perform only one dma operation for the same page. In
> > > > > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > > > > >
> > > > > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > > > > >
> > > > > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > > > > of operation?
> > > > > >
> > > > > >
> > > > > > Do you mean this:
> > > > > >
> > > > > > [    0.470816] iommu: Default domain type: Passthrough
> > > > > >
> > > > >
> > > > > With passthrough, dma API is just some indirect function calls, they do
> > > > > not affect the performance a lot.
> > > >
> > > >
> > > > Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> > > > overhead of DMA I observed is indeed not too high.
> > >
> > > Have you measured with iommu=strict?
> >
> > I have not tested it this way; our environment uses passthrough (pt). I
> > wonder whether strict mode is a common scenario. I can test it.
>
> It's not a common setup, but it's a way to stress the DMA layer to see the overhead.

kernel command line: intel_iommu=on iommu.strict=1 iommu.passthrough=0

virtio-net without merge dma 428614.00 pps

virtio-net with merge dma    742853.00 pps


Thanks.




>
> Thanks
>
> >
> > Thanks.
> >
> >
> > >
> > > Thanks
> > >
> > > >
> > > > Thanks.
> > > >
> > > >
> > > > >
> > > > > Try e.g. bounce buffer. Which is where you will see a problem: your
> > > > > patches won't work.
> > > > >
> > > > >
> > > > > > >
> > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > >
> > > > > > > This kind of difference is likely in the noise.
> > > > > >
> > > > > > It's really not high, but this is because the proportion of DMA under perf top
> > > > > > is not high. Probably that much.
> > > > >
> > > > > So maybe not worth the complexity.
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > ---
> > > > > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > index 486b5849033d..4de845d35bed 100644
> > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > > > > >
> > > > > > > > +/* The bufs on the same page may share this struct. */
> > > > > > > > +struct virtnet_rq_dma {
> > > > > > > > +       struct virtnet_rq_dma *next;
> > > > > > > > +
> > > > > > > > +       dma_addr_t addr;
> > > > > > > > +
> > > > > > > > +       void *buf;
> > > > > > > > +       u32 len;
> > > > > > > > +
> > > > > > > > +       u32 ref;
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > +/* Record the dma and buf. */
> > > > > > >
> > > > > > > I guess I see that. But why?
> > > > > > > And these two comments are the extent of the available
> > > > > > > documentation, that's not enough I feel.
> > > > > > >
> > > > > > >
> > > > > > > > +struct virtnet_rq_data {
> > > > > > > > +       struct virtnet_rq_data *next;
> > > > > > >
> > > > > > > Is manually reimplementing a linked list the best
> > > > > > > we can do?
> > > > > >
> > > > > > Yes, we can use llist.
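Something like this, maybe; a sketch only, with the list fields renamed for
illustration. The rq free lists are already serialized by napi, so the
lock-free properties of llist are not really needed here, it just avoids
open-coding the next pointers:

#include <linux/llist.h>

struct virtnet_rq_dma {
	struct llist_node node;

	dma_addr_t addr;

	void *buf;
	u32 len;

	u32 ref;
};

/* in struct receive_queue:
 *	struct llist_head dma_free;
 */

static void virtnet_rq_put_dma(struct receive_queue *rq,
			       struct virtnet_rq_dma *dma)
{
	llist_add(&dma->node, &rq->dma_free);
}

static struct virtnet_rq_dma *virtnet_rq_take_dma(struct receive_queue *rq)
{
	struct llist_node *node = llist_del_first(&rq->dma_free);

	return node ? llist_entry(node, struct virtnet_rq_dma, node) : NULL;
}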
> > > > > >
> > > > > > >
> > > > > > > > +
> > > > > > > > +       void *buf;
> > > > > > > > +
> > > > > > > > +       struct virtnet_rq_dma *dma;
> > > > > > > > +};
> > > > > > > > +
> > > > > > > >  /* Internal representation of a send virtqueue */
> > > > > > > >  struct send_queue {
> > > > > > > >         /* Virtqueue associated with this send _queue */
> > > > > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > > > > >         char name[16];
> > > > > > > >
> > > > > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > > > > +
> > > > > > > > +       struct virtnet_rq_data *data_array;
> > > > > > > > +       struct virtnet_rq_data *data_free;
> > > > > > > > +
> > > > > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > > > > +       struct virtnet_rq_dma *last_dma;
> > > > > > > >  };
> > > > > > > >
> > > > > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > >         return skb;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > > > > +{
> > > > > > > > +       struct device *dev;
> > > > > > > > +
> > > > > > > > +       --dma->ref;
> > > > > > > > +
> > > > > > > > +       if (dma->ref)
> > > > > > > > +               return;
> > > > > > > > +
> > > > > > >
> > > > > > > If you don't unmap there is no guarantee valid data will be
> > > > > > > there in the buffer.
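One way that could be handled without dropping the shared mapping (a
sketch; the helper name is made up here): sync just the completed buffer's
range back for the CPU and keep the unmap for when the last reference goes
away. With swiotlb bounce buffers this sync (or the unmap) is what actually
copies the received data back:

static void virtnet_rq_sync_for_cpu(struct receive_queue *rq,
				    struct virtnet_rq_dma *dma,
				    void *buf, u32 len)
{
	struct device *dev = virtqueue_dma_dev(rq->vq);

	/* make the device-written data visible to the CPU even though
	 * other buffers still hold references on this mapping
	 */
	dma_sync_single_range_for_cpu(dev, dma->addr, buf - dma->buf,
				      len, DMA_FROM_DEVICE);
}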
> > > > > > >
> > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > +
> > > > > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > +
> > > > > > > > +       dma->next = rq->dma_free;
> > > > > > > > +       rq->dma_free = dma;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > > > > +                                    struct virtnet_rq_data *data)
> > > > > > > > +{
> > > > > > > > +       void *buf;
> > > > > > > > +
> > > > > > > > +       buf = data->buf;
> > > > > > > > +
> > > > > > > > +       data->next = rq->data_free;
> > > > > > > > +       rq->data_free = data;
> > > > > > > > +
> > > > > > > > +       return buf;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > > > > +                                                  void *buf,
> > > > > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > > > > +{
> > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > +
> > > > > > > > +       data = rq->data_free;
> > > > > > > > +       rq->data_free = data->next;
> > > > > > > > +
> > > > > > > > +       data->buf = buf;
> > > > > > > > +       data->dma = dma;
> > > > > > > > +
> > > > > > > > +       return data;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > > > > +{
> > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > +       void *buf;
> > > > > > > > +
> > > > > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > +               return buf;
> > > > > > > > +
> > > > > > > > +       data = buf;
> > > > > > > > +
> > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > +
> > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > > > > +{
> > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > +       void *buf;
> > > > > > > > +
> > > > > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > +               return buf;
> > > > > > > > +
> > > > > > > > +       data = buf;
> > > > > > > > +
> > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > +
> > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > > > > +{
> > > > > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > > > > +       struct device *dev;
> > > > > > > > +       u32 off, map_len;
> > > > > > > > +       dma_addr_t addr;
> > > > > > > > +       void *end;
> > > > > > > > +
> > > > > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > > > > +               ++dma->ref;
> > > > > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > > > > +               goto ok;
> > > > > > > > +       }
> > > > > > >
> > > > > > > So this is the meat of the proposed optimization. I guess that
> > > > > > > if the last buffer we allocated happens to be in the same page
> > > > > > > as this one then they can both be mapped for DMA together.
> > > > > >
> > > > > > Since we use page_frag, the buffers we allocated are all continuous.
> > > > > >
> > > > > > > Why last one specifically? Whether next one happens to
> > > > > > > be close depends on luck. If you want to try optimizing this
> > > > > > > the right thing to do is likely by using a page pool.
> > > > > > > There's actually work upstream on page pool, look it up.
> > > > > >
> > > > > > As we discussed in another thread, the page pool is first used for xdp. Let's
> > > > > > transform it step by step.
> > > > > >
> > > > > > Thanks.
> > > > >
> > > > > ok so this should wait then?
> > > > >
> > > > > > >
> > > > > > > > +
> > > > > > > > +       end = buf + len - 1;
> > > > > > > > +       off = offset_in_page(end);
> > > > > > > > +       map_len = len + PAGE_SIZE - off;
> > > > > > > > +
> > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > +
> > > > > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > > > > +               return -ENOMEM;
> > > > > > > > +
> > > > > > > > +       dma = rq->dma_free;
> > > > > > > > +       rq->dma_free = dma->next;
> > > > > > > > +
> > > > > > > > +       dma->ref = 1;
> > > > > > > > +       dma->buf = buf;
> > > > > > > > +       dma->addr = addr;
> > > > > > > > +       dma->len = map_len;
> > > > > > > > +
> > > > > > > > +       rq->last_dma = dma;
> > > > > > > > +
> > > > > > > > +ok:
> > > > > > > > +       sg_init_table(rq->sg, 1);
> > > > > > > > +       rq->sg[0].dma_address = addr;
> > > > > > > > +       rq->sg[0].length = len;
> > > > > > > > +
> > > > > > > > +       return 0;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > > > > +{
> > > > > > > > +       struct receive_queue *rq;
> > > > > > > > +       int i, err, j, num;
> > > > > > > > +
> > > > > > > > +       /* disable for big mode */
> > > > > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > > > > +               return 0;
> > > > > > > > +
> > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > > > > +               if (err)
> > > > > > > > +                       continue;
> > > > > > > > +
> > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > +
> > > > > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > > > > +
> > > > > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > > > > +               if (!rq->data_array)
> > > > > > > > +                       goto err;
> > > > > > > > +
> > > > > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > > > > +               if (!rq->dma_array)
> > > > > > > > +                       goto err;
> > > > > > > > +
> > > > > > > > +               for (j = 0; j < num; ++j) {
> > > > > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > > > > +
> > > > > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > > > > +               }
> > > > > > > > +       }
> > > > > > > > +
> > > > > > > > +       return 0;
> > > > > > > > +
> > > > > > > > +err:
> > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > +               struct receive_queue *rq;
> > > > > > > > +
> > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > +
> > > > > > > > +               kfree(rq->dma_array);
> > > > > > > > +               kfree(rq->data_array);
> > > > > > > > +       }
> > > > > > > > +
> > > > > > > > +       return -ENOMEM;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > > > > >  {
> > > > > > > >         unsigned int len;
> > > > > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > >                 void *buf;
> > > > > > > >                 int off;
> > > > > > > >
> > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > > > > >                 if (unlikely(!buf))
> > > > > > > >                         goto err_buf;
> > > > > > > >
> > > > > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > >                 return -EINVAL;
> > > > > > > >
> > > > > > > >         while (--*num_buf > 0) {
> > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > >                                  dev->name, *num_buf,
> > > > > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > >         while (--num_buf) {
> > > > > > > >                 int num_skb_frags;
> > > > > > > >
> > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > >                                  dev->name, num_buf,
> > > > > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > >  err_skb:
> > > > > > > >         put_page(page);
> > > > > > > >         while (num_buf-- > 1) {
> > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > > > > >                                  dev->name, num_buf);
> > > > > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > >         int err;
> > > > > > > >
> > > > > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > >         get_page(alloc_frag->page);
> > > > > > > >         alloc_frag->offset += len;
> > > > > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > +
> > > > > > > > +       if (rq->data_array) {
> > > > > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > +               if (err)
> > > > > > > > +                       goto map_err;
> > > > > > > > +
> > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > +       } else {
> > > > > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > +               data = (void *)buf;
> > > > > > > > +       }
> > > > > > > > +
> > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > >         if (err < 0)
> > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > +               goto add_err;
> > > > > > > > +
> > > > > > > > +       return err;
> > > > > > > > +
> > > > > > > > +add_err:
> > > > > > > > +       if (rq->data_array) {
> > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > +       }
> > > > > > > > +
> > > > > > > > +map_err:
> > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > >         return err;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > >         char *buf;
> > > > > > > >         void *ctx;
> > > > > > > >         int err;
> > > > > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > >                 alloc_frag->offset += hole;
> > > > > > > >         }
> > > > > > > >
> > > > > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > > > > +       if (rq->data_array) {
> > > > > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > > > > +               if (err)
> > > > > > > > +                       goto map_err;
> > > > > > > > +
> > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > +       } else {
> > > > > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > > > > +               data = (void *)buf;
> > > > > > > > +       }
> > > > > > > > +
> > > > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > >         if (err < 0)
> > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > +               goto add_err;
> > > > > > > > +
> > > > > > > > +       return 0;
> > > > > > > > +
> > > > > > > > +add_err:
> > > > > > > > +       if (rq->data_array) {
> > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > +       }
> > > > > > > >
> > > > > > > > +map_err:
> > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > >         return err;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > > > > >                 void *ctx;
> > > > > > > >
> > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > > > > >                         stats.packets++;
> > > > > > > >                 }
> > > > > > > >         } else {
> > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > > > > >                         stats.packets++;
> > > > > > > >                 }
> > > > > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > +
> > > > > > > > +               kfree(vi->rq[i].data_array);
> > > > > > > > +               kfree(vi->rq[i].dma_array);
> > > > > > > >         }
> > > > > > > >
> > > > > > > >         /* We called __netif_napi_del(),
> > > > > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > >         }
> > > > > > > >
> > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > > > > +
> > > > > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > > > > >                 cond_resched();
> > > > > > > >         }
> > > > > > > >  }
> > > > > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > > > > >         if (ret)
> > > > > > > >                 goto err_free;
> > > > > > > >
> > > > > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > > > > +       if (ret)
> > > > > > > > +               goto err_free;
> > > > > > > > +
> > > > > > > >         cpus_read_lock();
> > > > > > > >         virtnet_set_affinity(vi);
> > > > > > > >         cpus_read_unlock();
> > > > > > > > --
> > > > > > > > 2.32.0.3.g01195cf9f
> > > > > > >
> > > > >
> > > >
> > >
> >
>
Xuan Zhuo July 12, 2023, 8:32 a.m. UTC | #9
On Wed, 12 Jul 2023 15:54:58 +0800, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> On Tue, 11 Jul 2023 10:58:51 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Tue, Jul 11, 2023 at 10:42 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > On Tue, 11 Jul 2023 10:36:17 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > >
> > > > > On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > > > > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > > > > > Currently, the virtio core will perform a dma operation for each
> > > > > > > > > operation. Although, the same page may be operated multiple times.
> > > > > > > > >
> > > > > > > > > The driver does the dma operation and manages the dma address based the
> > > > > > > > > feature premapped of virtio core.
> > > > > > > > >
> > > > > > > > > This way, we can perform only one dma operation for the same page. In
> > > > > > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > > > > > >
> > > > > > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > > > > > >
> > > > > > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > > > > > of operation?
> > > > > > >
> > > > > > >
> > > > > > > Do you mean this:
> > > > > > >
> > > > > > > [    0.470816] iommu: Default domain type: Passthrough
> > > > > > >
> > > > > >
> > > > > > With passthrough, dma API is just some indirect function calls, they do
> > > > > > not affect the performance a lot.
> > > > >
> > > > >
> > > > > Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> > > > > overhead of DMA I observed is indeed not too high.
> > > >
> > > > Have you measured with iommu=strict?
> > >
> > > I have not tested it this way; our environment uses passthrough (pt). I
> > > wonder whether strict mode is a common scenario. I can test it.
> >
> > It's not a common setup, but it's a way to stress the DMA layer to see the overhead.
>
> kernel command line: intel_iommu=on iommu.strict=1 iommu.passthrough=0
>
> virtio-net without merge dma 428614.00 pps
>
> virtio-net with merge dma    742853.00 pps


kernel command line: intel_iommu=on iommu.strict=0 iommu.passthrough=0

virtio-net without merge dma 775496.00 pps

virtio-net with merge dma    1010514.00 pps


Thanks.

>
>
> Thanks.
>
>
>
>
> >
> > Thanks
> >
> > >
> > > Thanks.
> > >
> > >
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > Thanks.
> > > > >
> > > > >
> > > > > >
> > > > > > Try e.g. bounce buffer. Which is where you will see a problem: your
> > > > > > patches won't work.
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > >
> > > > > > > > This kind of difference is likely in the noise.
> > > > > > >
> > > > > > > It's really not high, but this is because the proportion of DMA under perf top
> > > > > > > is not high. Probably that much.
> > > > > >
> > > > > > So maybe not worth the complexity.
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > ---
> > > > > > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > > > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > > index 486b5849033d..4de845d35bed 100644
> > > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > > > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > > > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > > > > > >
> > > > > > > > > +/* The bufs on the same page may share this struct. */
> > > > > > > > > +struct virtnet_rq_dma {
> > > > > > > > > +       struct virtnet_rq_dma *next;
> > > > > > > > > +
> > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > +
> > > > > > > > > +       void *buf;
> > > > > > > > > +       u32 len;
> > > > > > > > > +
> > > > > > > > > +       u32 ref;
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > > +/* Record the dma and buf. */
> > > > > > > >
> > > > > > > > I guess I see that. But why?
> > > > > > > > And these two comments are the extent of the available
> > > > > > > > documentation, that's not enough I feel.
> > > > > > > >
> > > > > > > >
> > > > > > > > > +struct virtnet_rq_data {
> > > > > > > > > +       struct virtnet_rq_data *next;
> > > > > > > >
> > > > > > > > Is manually reimplementing a linked list the best
> > > > > > > > we can do?
> > > > > > >
> > > > > > > Yes, we can use llist.
> > > > > > >
> > > > > > > >
> > > > > > > > > +
> > > > > > > > > +       void *buf;
> > > > > > > > > +
> > > > > > > > > +       struct virtnet_rq_dma *dma;
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > >  /* Internal representation of a send virtqueue */
> > > > > > > > >  struct send_queue {
> > > > > > > > >         /* Virtqueue associated with this send _queue */
> > > > > > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > > > > > >         char name[16];
> > > > > > > > >
> > > > > > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > > > > > +
> > > > > > > > > +       struct virtnet_rq_data *data_array;
> > > > > > > > > +       struct virtnet_rq_data *data_free;
> > > > > > > > > +
> > > > > > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > > > > > +       struct virtnet_rq_dma *last_dma;
> > > > > > > > >  };
> > > > > > > > >
> > > > > > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > >         return skb;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > > > > > +{
> > > > > > > > > +       struct device *dev;
> > > > > > > > > +
> > > > > > > > > +       --dma->ref;
> > > > > > > > > +
> > > > > > > > > +       if (dma->ref)
> > > > > > > > > +               return;
> > > > > > > > > +
> > > > > > > >
> > > > > > > > If you don't unmap there is no guarantee valid data will be
> > > > > > > > there in the buffer.
> > > > > > > >
> > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > +
> > > > > > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > +
> > > > > > > > > +       dma->next = rq->dma_free;
> > > > > > > > > +       rq->dma_free = dma;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > > > > > +                                    struct virtnet_rq_data *data)
> > > > > > > > > +{
> > > > > > > > > +       void *buf;
> > > > > > > > > +
> > > > > > > > > +       buf = data->buf;
> > > > > > > > > +
> > > > > > > > > +       data->next = rq->data_free;
> > > > > > > > > +       rq->data_free = data;
> > > > > > > > > +
> > > > > > > > > +       return buf;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > > > > > +                                                  void *buf,
> > > > > > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > > > > > +{
> > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > +
> > > > > > > > > +       data = rq->data_free;
> > > > > > > > > +       rq->data_free = data->next;
> > > > > > > > > +
> > > > > > > > > +       data->buf = buf;
> > > > > > > > > +       data->dma = dma;
> > > > > > > > > +
> > > > > > > > > +       return data;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > > > > > +{
> > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > +       void *buf;
> > > > > > > > > +
> > > > > > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > +               return buf;
> > > > > > > > > +
> > > > > > > > > +       data = buf;
> > > > > > > > > +
> > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > +
> > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > > > > > +{
> > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > +       void *buf;
> > > > > > > > > +
> > > > > > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > +               return buf;
> > > > > > > > > +
> > > > > > > > > +       data = buf;
> > > > > > > > > +
> > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > +
> > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > > > > > +{
> > > > > > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > > > > > +       struct device *dev;
> > > > > > > > > +       u32 off, map_len;
> > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > +       void *end;
> > > > > > > > > +
> > > > > > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > > > > > +               ++dma->ref;
> > > > > > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > > > > > +               goto ok;
> > > > > > > > > +       }
> > > > > > > >
> > > > > > > > So this is the meat of the proposed optimization. I guess that
> > > > > > > > if the last buffer we allocated happens to be in the same page
> > > > > > > > as this one then they can both be mapped for DMA together.
> > > > > > >
> > > > > > > Since we use page_frag, the buffers we allocated are all continuous.
> > > > > > >
> > > > > > > > Why last one specifically? Whether next one happens to
> > > > > > > > be close depends on luck. If you want to try optimizing this
> > > > > > > > the right thing to do is likely by using a page pool.
> > > > > > > > There's actually work upstream on page pool, look it up.
> > > > > > >
> > > > > > > As we discussed in another thread, the page pool is first used for xdp. Let's
> > > > > > > transform it step by step.
> > > > > > >
> > > > > > > Thanks.
> > > > > >
> > > > > > ok so this should wait then?
> > > > > >
> > > > > > > >
> > > > > > > > > +
> > > > > > > > > +       end = buf + len - 1;
> > > > > > > > > +       off = offset_in_page(end);
> > > > > > > > > +       map_len = len + PAGE_SIZE - off;
> > > > > > > > > +
> > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > +
> > > > > > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > > > > > +               return -ENOMEM;
> > > > > > > > > +
> > > > > > > > > +       dma = rq->dma_free;
> > > > > > > > > +       rq->dma_free = dma->next;
> > > > > > > > > +
> > > > > > > > > +       dma->ref = 1;
> > > > > > > > > +       dma->buf = buf;
> > > > > > > > > +       dma->addr = addr;
> > > > > > > > > +       dma->len = map_len;
> > > > > > > > > +
> > > > > > > > > +       rq->last_dma = dma;
> > > > > > > > > +
> > > > > > > > > +ok:
> > > > > > > > > +       sg_init_table(rq->sg, 1);
> > > > > > > > > +       rq->sg[0].dma_address = addr;
> > > > > > > > > +       rq->sg[0].length = len;
> > > > > > > > > +
> > > > > > > > > +       return 0;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > > > > > +{
> > > > > > > > > +       struct receive_queue *rq;
> > > > > > > > > +       int i, err, j, num;
> > > > > > > > > +
> > > > > > > > > +       /* disable for big mode */
> > > > > > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > > > > > +               return 0;
> > > > > > > > > +
> > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > > > > > +               if (err)
> > > > > > > > > +                       continue;
> > > > > > > > > +
> > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > +
> > > > > > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > > > > > +
> > > > > > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > > > > > +               if (!rq->data_array)
> > > > > > > > > +                       goto err;
> > > > > > > > > +
> > > > > > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > > > > > +               if (!rq->dma_array)
> > > > > > > > > +                       goto err;
> > > > > > > > > +
> > > > > > > > > +               for (j = 0; j < num; ++j) {
> > > > > > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > > > > > +
> > > > > > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > > > > > +               }
> > > > > > > > > +       }
> > > > > > > > > +
> > > > > > > > > +       return 0;
> > > > > > > > > +
> > > > > > > > > +err:
> > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > +               struct receive_queue *rq;
> > > > > > > > > +
> > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > +
> > > > > > > > > +               kfree(rq->dma_array);
> > > > > > > > > +               kfree(rq->data_array);
> > > > > > > > > +       }
> > > > > > > > > +
> > > > > > > > > +       return -ENOMEM;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > > > > > >  {
> > > > > > > > >         unsigned int len;
> > > > > > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > >                 void *buf;
> > > > > > > > >                 int off;
> > > > > > > > >
> > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > > > > > >                 if (unlikely(!buf))
> > > > > > > > >                         goto err_buf;
> > > > > > > > >
> > > > > > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > >                 return -EINVAL;
> > > > > > > > >
> > > > > > > > >         while (--*num_buf > 0) {
> > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > >                                  dev->name, *num_buf,
> > > > > > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > >         while (--num_buf) {
> > > > > > > > >                 int num_skb_frags;
> > > > > > > > >
> > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > >                                  dev->name, num_buf,
> > > > > > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > >  err_skb:
> > > > > > > > >         put_page(page);
> > > > > > > > >         while (num_buf-- > 1) {
> > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > > > > > >                                  dev->name, num_buf);
> > > > > > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > > > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > > > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > >         int err;
> > > > > > > > >
> > > > > > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > >         get_page(alloc_frag->page);
> > > > > > > > >         alloc_frag->offset += len;
> > > > > > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > +
> > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > +               if (err)
> > > > > > > > > +                       goto map_err;
> > > > > > > > > +
> > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > +       } else {
> > > > > > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > +               data = (void *)buf;
> > > > > > > > > +       }
> > > > > > > > > +
> > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > >         if (err < 0)
> > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > +               goto add_err;
> > > > > > > > > +
> > > > > > > > > +       return err;
> > > > > > > > > +
> > > > > > > > > +add_err:
> > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > +       }
> > > > > > > > > +
> > > > > > > > > +map_err:
> > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > >         return err;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > > > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > > > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > >         char *buf;
> > > > > > > > >         void *ctx;
> > > > > > > > >         int err;
> > > > > > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > >                 alloc_frag->offset += hole;
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > > > > > +               if (err)
> > > > > > > > > +                       goto map_err;
> > > > > > > > > +
> > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > +       } else {
> > > > > > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > > > > > +               data = (void *)buf;
> > > > > > > > > +       }
> > > > > > > > > +
> > > > > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > >         if (err < 0)
> > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > +               goto add_err;
> > > > > > > > > +
> > > > > > > > > +       return 0;
> > > > > > > > > +
> > > > > > > > > +add_err:
> > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > +       }
> > > > > > > > >
> > > > > > > > > +map_err:
> > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > >         return err;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > > > > > >                 void *ctx;
> > > > > > > > >
> > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > > > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > > > > > >                         stats.packets++;
> > > > > > > > >                 }
> > > > > > > > >         } else {
> > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > > > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > > > > > >                         stats.packets++;
> > > > > > > > >                 }
> > > > > > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > > +
> > > > > > > > > +               kfree(vi->rq[i].data_array);
> > > > > > > > > +               kfree(vi->rq[i].dma_array);
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > >         /* We called __netif_napi_del(),
> > > > > > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > >         }
> > > > > > > > >
> > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > > > > > +
> > > > > > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > > > > > >                 cond_resched();
> > > > > > > > >         }
> > > > > > > > >  }
> > > > > > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > > > > > >         if (ret)
> > > > > > > > >                 goto err_free;
> > > > > > > > >
> > > > > > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > > > > > +       if (ret)
> > > > > > > > > +               goto err_free;
> > > > > > > > > +
> > > > > > > > >         cpus_read_lock();
> > > > > > > > >         virtnet_set_affinity(vi);
> > > > > > > > >         cpus_read_unlock();
> > > > > > > > > --
> > > > > > > > > 2.32.0.3.g01195cf9f
> > > > > > > >
> > > > > >
> > > > >
> > > >
> > >
> >
>
Jason Wang July 12, 2023, 8:37 a.m. UTC | #10
On Wed, Jul 12, 2023 at 4:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Wed, 12 Jul 2023 15:54:58 +0800, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > On Tue, 11 Jul 2023 10:58:51 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > On Tue, Jul 11, 2023 at 10:42 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > >
> > > > On Tue, 11 Jul 2023 10:36:17 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > >
> > > > > > On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > > > > > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > > > > > > Currently, the virtio core will perform a dma operation for each
> > > > > > > > > > operation. Although, the same page may be operated multiple times.
> > > > > > > > > >
> > > > > > > > > > The driver does the dma operation and manages the dma address based the
> > > > > > > > > > feature premapped of virtio core.
> > > > > > > > > >
> > > > > > > > > > This way, we can perform only one dma operation for the same page. In
> > > > > > > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > > > > > > >
> > > > > > > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > > > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > > > > > > >
> > > > > > > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > > > > > > of operation?
> > > > > > > >
> > > > > > > >
> > > > > > > > Do you mean this:
> > > > > > > >
> > > > > > > > [    0.470816] iommu: Default domain type: Passthrough
> > > > > > > >
> > > > > > >
> > > > > > > With passthrough, the dma API is just some indirect function calls; they do
> > > > > > > not affect the performance a lot.
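
A rough sketch of why that is: with the direct mapping (no IOMMU translation,
and ignoring swiotlb bouncing), dma_map_page_attrs() boils down to address
arithmetic plus an optional cache sync. Simplified, not the exact
kernel/dma/direct.c code:

static dma_addr_t direct_map_page_sketch(struct device *dev, struct page *page,
                                         unsigned long offset, size_t size,
                                         enum dma_data_direction dir)
{
        /* no translation: the dma address is computed, not looked up */
        phys_addr_t phys = page_to_phys(page) + offset;
        dma_addr_t dma_addr = phys_to_dma(dev, phys);

        /* only non-coherent devices pay for cache maintenance here */
        if (!dev_is_dma_coherent(dev))
                arch_sync_dma_for_device(phys, size, dir);

        return dma_addr;
}

So under passthrough the per-buffer mapping cost is tiny and the merge barely
shows up; under a translating IOMMU each map/unmap touches the IOVA allocator
and the page tables, which is where merging mappings can pay off.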
> > > > > >
> > > > > >
> > > > > > Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> > > > > > overhead of DMA I observed is indeed not too high.
> > > > >
> > > > > Have you measured with iommu=strict?
> > > >
> > > > I have not tested this way; our environment is pt. I wonder if strict is a
> > > > common scenario. I can test it.
> > >
> > > It's not a common setup, but it's a way to stress DMA layer to see the overhead.
> >
> > kernel command line: intel_iommu=on iommu.strict=1 iommu.passthrough=0
> >
> > virtio-net without merge dma 428614.00 pps
> >
> > virtio-net with merge dma    742853.00 pps
>
>
> kernel command line: intel_iommu=on iommu.strict=0 iommu.passthrough=0
>
> virtio-net without merge dma 775496.00 pps
>
> virtio-net with merge dma    1010514.00 pps
>
>

Great, let's add those numbers to the changelog.

Thanks

> Thanks.
>
> >
> >
> > Thanks.
> >
> >
> >
> >
> > >
> > > Thanks
> > >
> > > >
> > > > Thanks.
> > > >
> > > >
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > Thanks.
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > Try e.g. bounce buffer. Which is where you will see a problem: your
> > > > > > > patches won't work.
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > >
> > > > > > > > > This kind of difference is likely in the noise.
> > > > > > > >
> > > > > > > > It's really not high, but this is because the proportion of DMA under perf top
> > > > > > > > is not high. Probably that much.
> > > > > > >
> > > > > > > So maybe not worth the complexity.
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > ---
> > > > > > > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > > > > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > > > > > > >
> > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > > > index 486b5849033d..4de845d35bed 100644
> > > > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > > > > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > > > > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > > > > > > >
> > > > > > > > > > +/* The bufs on the same page may share this struct. */
> > > > > > > > > > +struct virtnet_rq_dma {
> > > > > > > > > > +       struct virtnet_rq_dma *next;
> > > > > > > > > > +
> > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > +
> > > > > > > > > > +       void *buf;
> > > > > > > > > > +       u32 len;
> > > > > > > > > > +
> > > > > > > > > > +       u32 ref;
> > > > > > > > > > +};
> > > > > > > > > > +
> > > > > > > > > > +/* Record the dma and buf. */
> > > > > > > > >
> > > > > > > > > I guess I see that. But why?
> > > > > > > > > And these two comments are the extent of the available
> > > > > > > > > documentation, that's not enough I feel.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > +struct virtnet_rq_data {
> > > > > > > > > > +       struct virtnet_rq_data *next;
> > > > > > > > >
> > > > > > > > > Is manually reimplementing a linked list the best
> > > > > > > > > we can do?
> > > > > > > >
> > > > > > > > Yes, we can use llist.
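
For reference, a rough sketch of the same free list kept with <linux/llist.h>
instead of the open-coded ->next pointer (illustrative only, not part of this
patch; the helper names are made up):

struct virtnet_rq_dma {
        struct llist_node node;

        dma_addr_t addr;
        void *buf;
        u32 len;
        u32 ref;
};

/* one free list per receive queue, initialized with init_llist_head() */
static void rq_dma_put(struct llist_head *free, struct virtnet_rq_dma *dma)
{
        llist_add(&dma->node, free);
}

static struct virtnet_rq_dma *rq_dma_get(struct llist_head *free)
{
        struct llist_node *node = llist_del_first(free);

        return node ? llist_entry(node, struct virtnet_rq_dma, node) : NULL;
}

llist also keeps lockless add/del_first semantics, although for a list only
touched from NAPI context that is not strictly needed.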
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > +
> > > > > > > > > > +       void *buf;
> > > > > > > > > > +
> > > > > > > > > > +       struct virtnet_rq_dma *dma;
> > > > > > > > > > +};
> > > > > > > > > > +
> > > > > > > > > >  /* Internal representation of a send virtqueue */
> > > > > > > > > >  struct send_queue {
> > > > > > > > > >         /* Virtqueue associated with this send _queue */
> > > > > > > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > > > > > > >         char name[16];
> > > > > > > > > >
> > > > > > > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > > > > > > +
> > > > > > > > > > +       struct virtnet_rq_data *data_array;
> > > > > > > > > > +       struct virtnet_rq_data *data_free;
> > > > > > > > > > +
> > > > > > > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > > > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > > > > > > +       struct virtnet_rq_dma *last_dma;
> > > > > > > > > >  };
> > > > > > > > > >
> > > > > > > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > > > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > > >         return skb;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > > > > > > +{
> > > > > > > > > > +       struct device *dev;
> > > > > > > > > > +
> > > > > > > > > > +       --dma->ref;
> > > > > > > > > > +
> > > > > > > > > > +       if (dma->ref)
> > > > > > > > > > +               return;
> > > > > > > > > > +
> > > > > > > > >
> > > > > > > > > If you don't unmap there is no guarantee valid data will be
> > > > > > > > > there in the buffer.
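
If the mapping is kept around, the usual way to make the data visible to the
CPU is a per-buffer sync instead of a full unmap, along the lines of (sketch
only, reusing the names from the patch; not what this version does):

        /* sync just this buffer's slice of the shared mapping */
        dma_sync_single_for_cpu(dev, dma->addr + (buf - dma->buf), len,
                                DMA_FROM_DEVICE);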
> > > > > > > > >
> > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > +
> > > > > > > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > +
> > > > > > > > > > +       dma->next = rq->dma_free;
> > > > > > > > > > +       rq->dma_free = dma;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > > > > > > +                                    struct virtnet_rq_data *data)
> > > > > > > > > > +{
> > > > > > > > > > +       void *buf;
> > > > > > > > > > +
> > > > > > > > > > +       buf = data->buf;
> > > > > > > > > > +
> > > > > > > > > > +       data->next = rq->data_free;
> > > > > > > > > > +       rq->data_free = data;
> > > > > > > > > > +
> > > > > > > > > > +       return buf;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > > > > > > +                                                  void *buf,
> > > > > > > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > > > > > > +{
> > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > +
> > > > > > > > > > +       data = rq->data_free;
> > > > > > > > > > +       rq->data_free = data->next;
> > > > > > > > > > +
> > > > > > > > > > +       data->buf = buf;
> > > > > > > > > > +       data->dma = dma;
> > > > > > > > > > +
> > > > > > > > > > +       return data;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > > > > > > +{
> > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > +       void *buf;
> > > > > > > > > > +
> > > > > > > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > +               return buf;
> > > > > > > > > > +
> > > > > > > > > > +       data = buf;
> > > > > > > > > > +
> > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > +
> > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > > > > > > +{
> > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > +       void *buf;
> > > > > > > > > > +
> > > > > > > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > +               return buf;
> > > > > > > > > > +
> > > > > > > > > > +       data = buf;
> > > > > > > > > > +
> > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > +
> > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > > > > > > +{
> > > > > > > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > > > > > > +       struct device *dev;
> > > > > > > > > > +       u32 off, map_len;
> > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > +       void *end;
> > > > > > > > > > +
> > > > > > > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > > > > > > +               ++dma->ref;
> > > > > > > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > > > > > > +               goto ok;
> > > > > > > > > > +       }
> > > > > > > > >
> > > > > > > > > So this is the meat of the proposed optimization. I guess that
> > > > > > > > > if the last buffer we allocated happens to be in the same page
> > > > > > > > > as this one then they can both be mapped for DMA together.
> > > > > > > >
> > > > > > > > Since we use page_frag, the buffers we allocated are all contiguous.
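
That comes from how the page_frag allocator hands out buffers: it carves them
out of one page back to back and typically only moves on when the current page
no longer fits the request. Roughly what the add_recvbuf_*() paths already do
(sketch):

static void *frag_alloc(struct page_frag *alloc_frag, unsigned int len,
                        gfp_t gfp)
{
        void *buf;

        /* may switch to a fresh page (offset reset to 0) when len no longer fits */
        if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
                return NULL;

        buf = page_address(alloc_frag->page) + alloc_frag->offset;
        get_page(alloc_frag->page);
        alloc_frag->offset += len;      /* the next buffer starts right here */

        return buf;
}

So a buffer misses the rq->last_dma check only right after the allocator has
switched to a fresh page.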
> > > > > > > >
> > > > > > > > > Why last one specifically? Whether next one happens to
> > > > > > > > > be close depends on luck. If you want to try optimizing this
> > > > > > > > > the right thing to do is likely by using a page pool.
> > > > > > > > > There's actually work upstream on page pool, look it up.
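
For reference, the page pool route would hand back pages the pool has already
dma-mapped once, so the per-buffer cost becomes a field lookup rather than a
map call. A minimal sketch with the current page_pool API (assumptions: one
pool per rq, order-0 pages, error handling omitted):

        struct page_pool_params pp_params = {
                .flags     = PP_FLAG_DMA_MAP,   /* pool maps each page once */
                .order     = 0,
                .pool_size = 256,
                .nid       = NUMA_NO_NODE,
                .dev       = virtqueue_dma_dev(rq->vq),
                .dma_dir   = DMA_FROM_DEVICE,
        };
        struct page_pool *pool = page_pool_create(&pp_params);
        struct page *page = page_pool_dev_alloc_pages(pool);
        dma_addr_t dma = page_pool_get_dma_addr(page); /* valid for the page's lifetime */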
> > > > > > > >
> > > > > > > > As we discussed in another thread, the page pool is first used for xdp. Let's
> > > > > > > > transform it step by step.
> > > > > > > >
> > > > > > > > Thanks.
> > > > > > >
> > > > > > > ok so this should wait then?
> > > > > > >
> > > > > > > > >
> > > > > > > > > > +
> > > > > > > > > > +       end = buf + len - 1;
> > > > > > > > > > +       off = offset_in_page(end);
> > > > > > > > > > +       map_len = len + PAGE_SIZE - off;
> > > > > > > > > > +
> > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > +
> > > > > > > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > > > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > > > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > > > > > > +               return -ENOMEM;
> > > > > > > > > > +
> > > > > > > > > > +       dma = rq->dma_free;
> > > > > > > > > > +       rq->dma_free = dma->next;
> > > > > > > > > > +
> > > > > > > > > > +       dma->ref = 1;
> > > > > > > > > > +       dma->buf = buf;
> > > > > > > > > > +       dma->addr = addr;
> > > > > > > > > > +       dma->len = map_len;
> > > > > > > > > > +
> > > > > > > > > > +       rq->last_dma = dma;
> > > > > > > > > > +
> > > > > > > > > > +ok:
> > > > > > > > > > +       sg_init_table(rq->sg, 1);
> > > > > > > > > > +       rq->sg[0].dma_address = addr;
> > > > > > > > > > +       rq->sg[0].length = len;
> > > > > > > > > > +
> > > > > > > > > > +       return 0;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > > > > > > +{
> > > > > > > > > > +       struct receive_queue *rq;
> > > > > > > > > > +       int i, err, j, num;
> > > > > > > > > > +
> > > > > > > > > > +       /* disable for big mode */
> > > > > > > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > > > > > > +               return 0;
> > > > > > > > > > +
> > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > > > > > > +               if (err)
> > > > > > > > > > +                       continue;
> > > > > > > > > > +
> > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > +
> > > > > > > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > > > > > > +
> > > > > > > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > > > > > > +               if (!rq->data_array)
> > > > > > > > > > +                       goto err;
> > > > > > > > > > +
> > > > > > > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > > > > > > +               if (!rq->dma_array)
> > > > > > > > > > +                       goto err;
> > > > > > > > > > +
> > > > > > > > > > +               for (j = 0; j < num; ++j) {
> > > > > > > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > > > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > > > > > > +
> > > > > > > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > > > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > > > > > > +               }
> > > > > > > > > > +       }
> > > > > > > > > > +
> > > > > > > > > > +       return 0;
> > > > > > > > > > +
> > > > > > > > > > +err:
> > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > +               struct receive_queue *rq;
> > > > > > > > > > +
> > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > +
> > > > > > > > > > +               kfree(rq->dma_array);
> > > > > > > > > > +               kfree(rq->data_array);
> > > > > > > > > > +       }
> > > > > > > > > > +
> > > > > > > > > > +       return -ENOMEM;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > > > > > > >  {
> > > > > > > > > >         unsigned int len;
> > > > > > > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > > >                 void *buf;
> > > > > > > > > >                 int off;
> > > > > > > > > >
> > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > > > > > > >                 if (unlikely(!buf))
> > > > > > > > > >                         goto err_buf;
> > > > > > > > > >
> > > > > > > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > > >                 return -EINVAL;
> > > > > > > > > >
> > > > > > > > > >         while (--*num_buf > 0) {
> > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > >                                  dev->name, *num_buf,
> > > > > > > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > >         while (--num_buf) {
> > > > > > > > > >                 int num_skb_frags;
> > > > > > > > > >
> > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > >                                  dev->name, num_buf,
> > > > > > > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > >  err_skb:
> > > > > > > > > >         put_page(page);
> > > > > > > > > >         while (num_buf-- > 1) {
> > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > > > > > > >                                  dev->name, num_buf);
> > > > > > > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > > > > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > > > > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > >         int err;
> > > > > > > > > >
> > > > > > > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > > > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > >         get_page(alloc_frag->page);
> > > > > > > > > >         alloc_frag->offset += len;
> > > > > > > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > +
> > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > +               if (err)
> > > > > > > > > > +                       goto map_err;
> > > > > > > > > > +
> > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > +       } else {
> > > > > > > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > +       }
> > > > > > > > > > +
> > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > >         if (err < 0)
> > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > +               goto add_err;
> > > > > > > > > > +
> > > > > > > > > > +       return err;
> > > > > > > > > > +
> > > > > > > > > > +add_err:
> > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > +       }
> > > > > > > > > > +
> > > > > > > > > > +map_err:
> > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > >         return err;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > > > > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > > > > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > >         char *buf;
> > > > > > > > > >         void *ctx;
> > > > > > > > > >         int err;
> > > > > > > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > >                 alloc_frag->offset += hole;
> > > > > > > > > >         }
> > > > > > > > > >
> > > > > > > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > > > > > > +               if (err)
> > > > > > > > > > +                       goto map_err;
> > > > > > > > > > +
> > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > +       } else {
> > > > > > > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > +       }
> > > > > > > > > > +
> > > > > > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > >         if (err < 0)
> > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > +               goto add_err;
> > > > > > > > > > +
> > > > > > > > > > +       return 0;
> > > > > > > > > > +
> > > > > > > > > > +add_err:
> > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > +       }
> > > > > > > > > >
> > > > > > > > > > +map_err:
> > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > >         return err;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > > > > > > >                 void *ctx;
> > > > > > > > > >
> > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > > > > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > > > > > > >                         stats.packets++;
> > > > > > > > > >                 }
> > > > > > > > > >         } else {
> > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > > > > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > > > > > > >                         stats.packets++;
> > > > > > > > > >                 }
> > > > > > > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > > > +
> > > > > > > > > > +               kfree(vi->rq[i].data_array);
> > > > > > > > > > +               kfree(vi->rq[i].dma_array);
> > > > > > > > > >         }
> > > > > > > > > >
> > > > > > > > > >         /* We called __netif_napi_del(),
> > > > > > > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > > >         }
> > > > > > > > > >
> > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > > > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > > > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > > > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > > > > > > +
> > > > > > > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > > > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > > > > > > >                 cond_resched();
> > > > > > > > > >         }
> > > > > > > > > >  }
> > > > > > > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > > > > > > >         if (ret)
> > > > > > > > > >                 goto err_free;
> > > > > > > > > >
> > > > > > > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > > > > > > +       if (ret)
> > > > > > > > > > +               goto err_free;
> > > > > > > > > > +
> > > > > > > > > >         cpus_read_lock();
> > > > > > > > > >         virtnet_set_affinity(vi);
> > > > > > > > > >         cpus_read_unlock();
> > > > > > > > > > --
> > > > > > > > > > 2.32.0.3.g01195cf9f
> > > > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > > >
> > >
> >
>
Xuan Zhuo July 12, 2023, 8:38 a.m. UTC | #11
On Wed, 12 Jul 2023 16:37:43 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Wed, Jul 12, 2023 at 4:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > On Wed, 12 Jul 2023 15:54:58 +0800, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > On Tue, 11 Jul 2023 10:58:51 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > On Tue, Jul 11, 2023 at 10:42 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > >
> > > > > On Tue, 11 Jul 2023 10:36:17 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > >
> > > > > > > On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > > > > > > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > Currently, the virtio core will perform a dma operation for each
> > > > > > > > > > > operation. Although, the same page may be operated multiple times.
> > > > > > > > > > >
> > > > > > > > > > > The driver does the dma operation and manages the dma address based the
> > > > > > > > > > > feature premapped of virtio core.
> > > > > > > > > > >
> > > > > > > > > > > This way, we can perform only one dma operation for the same page. In
> > > > > > > > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > > > > > > > >
> > > > > > > > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > > > > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > > > > > > > >
> > > > > > > > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > > > > > > > of operation?
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Do you mean this:
> > > > > > > > >
> > > > > > > > > [    0.470816] iommu: Default domain type: Passthrough
> > > > > > > > >
> > > > > > > >
> > > > > > > > With passthrough, the dma API is just some indirect function calls; they do
> > > > > > > > not affect the performance a lot.
> > > > > > >
> > > > > > >
> > > > > > > Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> > > > > > > overhead of DMA I observed is indeed not too high.
> > > > > >
> > > > > > Have you measured with iommu=strict?
> > > > >
> > > > > I have not tested this way; our environment is pt. I wonder if strict is a
> > > > > common scenario. I can test it.
> > > >
> > > > It's not a common setup, but it's a way to stress DMA layer to see the overhead.
> > >
> > > kernel command line: intel_iommu=on iommu.strict=1 iommu.passthrough=0
> > >
> > > virtio-net without merge dma 428614.00 pps
> > >
> > > virtio-net with merge dma    742853.00 pps
> >
> >
> > kernel command line: intel_iommu=on iommu.strict=0 iommu.passthrough=0
> >
> > virtio-net without merge dma 775496.00 pps
> >
> > virtio-net with merge dma    1010514.00 pps
> >
> >
>
> Great, let's add those numbers to the changelog.


Yes, I will do it in the next version.


Thanks.


>
> Thanks
>
> > Thanks.
> >
> > >
> > >
> > > Thanks.
> > >
> > >
> > >
> > >
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > Thanks.
> > > > >
> > > > >
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > > Thanks.
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > Try e.g. bounce buffer. Which is where you will see a problem: your
> > > > > > > > patches won't work.
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > >
> > > > > > > > > > This kind of difference is likely in the noise.
> > > > > > > > >
> > > > > > > > > It's really not high, but this is because the proportion of DMA under perf top
> > > > > > > > > is not high. Probably that much.
> > > > > > > >
> > > > > > > > So maybe not worth the complexity.
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > ---
> > > > > > > > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > > > > > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > > > > index 486b5849033d..4de845d35bed 100644
> > > > > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > > > > > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > > > > > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > > > > > > > >
> > > > > > > > > > > +/* The bufs on the same page may share this struct. */
> > > > > > > > > > > +struct virtnet_rq_dma {
> > > > > > > > > > > +       struct virtnet_rq_dma *next;
> > > > > > > > > > > +
> > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > +
> > > > > > > > > > > +       void *buf;
> > > > > > > > > > > +       u32 len;
> > > > > > > > > > > +
> > > > > > > > > > > +       u32 ref;
> > > > > > > > > > > +};
> > > > > > > > > > > +
> > > > > > > > > > > +/* Record the dma and buf. */
> > > > > > > > > >
> > > > > > > > > > I guess I see that. But why?
> > > > > > > > > > And these two comments are the extent of the available
> > > > > > > > > > documentation, that's not enough I feel.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > +struct virtnet_rq_data {
> > > > > > > > > > > +       struct virtnet_rq_data *next;
> > > > > > > > > >
> > > > > > > > > > Is manually reimplementing a linked list the best
> > > > > > > > > > we can do?
> > > > > > > > >
> > > > > > > > > Yes, we can use llist.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > +
> > > > > > > > > > > +       void *buf;
> > > > > > > > > > > +
> > > > > > > > > > > +       struct virtnet_rq_dma *dma;
> > > > > > > > > > > +};
> > > > > > > > > > > +
> > > > > > > > > > >  /* Internal representation of a send virtqueue */
> > > > > > > > > > >  struct send_queue {
> > > > > > > > > > >         /* Virtqueue associated with this send _queue */
> > > > > > > > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > > > > > > > >         char name[16];
> > > > > > > > > > >
> > > > > > > > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > > > > > > > +
> > > > > > > > > > > +       struct virtnet_rq_data *data_array;
> > > > > > > > > > > +       struct virtnet_rq_data *data_free;
> > > > > > > > > > > +
> > > > > > > > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > > > > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > > > > > > > +       struct virtnet_rq_dma *last_dma;
> > > > > > > > > > >  };
> > > > > > > > > > >
> > > > > > > > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > > > > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > > > >         return skb;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > > > > > > > +{
> > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > +
> > > > > > > > > > > +       --dma->ref;
> > > > > > > > > > > +
> > > > > > > > > > > +       if (dma->ref)
> > > > > > > > > > > +               return;
> > > > > > > > > > > +
> > > > > > > > > >
> > > > > > > > > > If you don't unmap there is no guarantee valid data will be
> > > > > > > > > > there in the buffer.
> > > > > > > > > >
> > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > +
> > > > > > > > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > +
> > > > > > > > > > > +       dma->next = rq->dma_free;
> > > > > > > > > > > +       rq->dma_free = dma;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > > > > > > > +                                    struct virtnet_rq_data *data)
> > > > > > > > > > > +{
> > > > > > > > > > > +       void *buf;
> > > > > > > > > > > +
> > > > > > > > > > > +       buf = data->buf;
> > > > > > > > > > > +
> > > > > > > > > > > +       data->next = rq->data_free;
> > > > > > > > > > > +       rq->data_free = data;
> > > > > > > > > > > +
> > > > > > > > > > > +       return buf;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > > > > > > > +                                                  void *buf,
> > > > > > > > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > > > > > > > +{
> > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > +
> > > > > > > > > > > +       data = rq->data_free;
> > > > > > > > > > > +       rq->data_free = data->next;
> > > > > > > > > > > +
> > > > > > > > > > > +       data->buf = buf;
> > > > > > > > > > > +       data->dma = dma;
> > > > > > > > > > > +
> > > > > > > > > > > +       return data;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > > > > > > > +{
> > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > +       void *buf;
> > > > > > > > > > > +
> > > > > > > > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > +               return buf;
> > > > > > > > > > > +
> > > > > > > > > > > +       data = buf;
> > > > > > > > > > > +
> > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > +
> > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > > > > > > > +{
> > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > +       void *buf;
> > > > > > > > > > > +
> > > > > > > > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > +               return buf;
> > > > > > > > > > > +
> > > > > > > > > > > +       data = buf;
> > > > > > > > > > > +
> > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > +
> > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > > > > > > > +{
> > > > > > > > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > +       u32 off, map_len;
> > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > +       void *end;
> > > > > > > > > > > +
> > > > > > > > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > > > > > > > +               ++dma->ref;
> > > > > > > > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > > > > > > > +               goto ok;
> > > > > > > > > > > +       }
> > > > > > > > > >
> > > > > > > > > > So this is the meat of the proposed optimization. I guess that
> > > > > > > > > > if the last buffer we allocated happens to be in the same page
> > > > > > > > > > as this one then they can both be mapped for DMA together.
> > > > > > > > >
> > > > > > > > > Since we use page_frag, the buffers we allocated are all contiguous.
> > > > > > > > >
> > > > > > > > > > Why last one specifically? Whether next one happens to
> > > > > > > > > > be close depends on luck. If you want to try optimizing this
> > > > > > > > > > the right thing to do is likely by using a page pool.
> > > > > > > > > > There's actually work upstream on page pool, look it up.
> > > > > > > > >
> > > > > > > > > As we discussed in another thread, the page pool is first used for xdp. Let's
> > > > > > > > > transform it step by step.
> > > > > > > > >
> > > > > > > > > Thanks.
> > > > > > > >
> > > > > > > > ok so this should wait then?
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > +
> > > > > > > > > > > +       end = buf + len - 1;
> > > > > > > > > > > +       off = offset_in_page(end);
> > > > > > > > > > > +       map_len = len + PAGE_SIZE - off;
> > > > > > > > > > > +
> > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > +
> > > > > > > > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > > > > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > > > > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > > > > > > > +               return -ENOMEM;
> > > > > > > > > > > +
> > > > > > > > > > > +       dma = rq->dma_free;
> > > > > > > > > > > +       rq->dma_free = dma->next;
> > > > > > > > > > > +
> > > > > > > > > > > +       dma->ref = 1;
> > > > > > > > > > > +       dma->buf = buf;
> > > > > > > > > > > +       dma->addr = addr;
> > > > > > > > > > > +       dma->len = map_len;
> > > > > > > > > > > +
> > > > > > > > > > > +       rq->last_dma = dma;
> > > > > > > > > > > +
> > > > > > > > > > > +ok:
> > > > > > > > > > > +       sg_init_table(rq->sg, 1);
> > > > > > > > > > > +       rq->sg[0].dma_address = addr;
> > > > > > > > > > > +       rq->sg[0].length = len;
> > > > > > > > > > > +
> > > > > > > > > > > +       return 0;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > > > > > > > +{
> > > > > > > > > > > +       struct receive_queue *rq;
> > > > > > > > > > > +       int i, err, j, num;
> > > > > > > > > > > +
> > > > > > > > > > > +       /* disable for big mode */
> > > > > > > > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > > > > > > > +               return 0;
> > > > > > > > > > > +
> > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > > > > > > > +               if (err)
> > > > > > > > > > > +                       continue;
> > > > > > > > > > > +
> > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > +
> > > > > > > > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > > > > > > > +
> > > > > > > > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > > > > > > > +               if (!rq->data_array)
> > > > > > > > > > > +                       goto err;
> > > > > > > > > > > +
> > > > > > > > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > > > > > > > +               if (!rq->dma_array)
> > > > > > > > > > > +                       goto err;
> > > > > > > > > > > +
> > > > > > > > > > > +               for (j = 0; j < num; ++j) {
> > > > > > > > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > > > > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > > > > > > > +
> > > > > > > > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > > > > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > > > > > > > +               }
> > > > > > > > > > > +       }
> > > > > > > > > > > +
> > > > > > > > > > > +       return 0;
> > > > > > > > > > > +
> > > > > > > > > > > +err:
> > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > +               struct receive_queue *rq;
> > > > > > > > > > > +
> > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > +
> > > > > > > > > > > +               kfree(rq->dma_array);
> > > > > > > > > > > +               kfree(rq->data_array);
> > > > > > > > > > > +       }
> > > > > > > > > > > +
> > > > > > > > > > > +       return -ENOMEM;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > > > > > > > >  {
> > > > > > > > > > >         unsigned int len;
> > > > > > > > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > > > >                 void *buf;
> > > > > > > > > > >                 int off;
> > > > > > > > > > >
> > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > > > > > > > >                 if (unlikely(!buf))
> > > > > > > > > > >                         goto err_buf;
> > > > > > > > > > >
> > > > > > > > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > > > >                 return -EINVAL;
> > > > > > > > > > >
> > > > > > > > > > >         while (--*num_buf > 0) {
> > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > >                                  dev->name, *num_buf,
> > > > > > > > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > >         while (--num_buf) {
> > > > > > > > > > >                 int num_skb_frags;
> > > > > > > > > > >
> > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > >                                  dev->name, num_buf,
> > > > > > > > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > >  err_skb:
> > > > > > > > > > >         put_page(page);
> > > > > > > > > > >         while (num_buf-- > 1) {
> > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > > > > > > > >                                  dev->name, num_buf);
> > > > > > > > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > > > > > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > > > > > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > >         int err;
> > > > > > > > > > >
> > > > > > > > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > > > > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > > >         get_page(alloc_frag->page);
> > > > > > > > > > >         alloc_frag->offset += len;
> > > > > > > > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > +
> > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > +               if (err)
> > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > +
> > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > +       } else {
> > > > > > > > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > +       }
> > > > > > > > > > > +
> > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > +
> > > > > > > > > > > +       return err;
> > > > > > > > > > > +
> > > > > > > > > > > +add_err:
> > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > +       }
> > > > > > > > > > > +
> > > > > > > > > > > +map_err:
> > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > >         return err;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > > > > > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > > > > > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > >         char *buf;
> > > > > > > > > > >         void *ctx;
> > > > > > > > > > >         int err;
> > > > > > > > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > >                 alloc_frag->offset += hole;
> > > > > > > > > > >         }
> > > > > > > > > > >
> > > > > > > > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > > > > > > > +               if (err)
> > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > +
> > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > +       } else {
> > > > > > > > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > +       }
> > > > > > > > > > > +
> > > > > > > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > +
> > > > > > > > > > > +       return 0;
> > > > > > > > > > > +
> > > > > > > > > > > +add_err:
> > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > +       }
> > > > > > > > > > >
> > > > > > > > > > > +map_err:
> > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > >         return err;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > > > > > > > >                 void *ctx;
> > > > > > > > > > >
> > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > > > > > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > >                 }
> > > > > > > > > > >         } else {
> > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > > > > > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > >                 }
> > > > > > > > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > > > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > > > > +
> > > > > > > > > > > +               kfree(vi->rq[i].data_array);
> > > > > > > > > > > +               kfree(vi->rq[i].dma_array);
> > > > > > > > > > >         }
> > > > > > > > > > >
> > > > > > > > > > >         /* We called __netif_napi_del(),
> > > > > > > > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > > > >         }
> > > > > > > > > > >
> > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > > > > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > > > > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > > > > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > > > > > > > +
> > > > > > > > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > > > > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > > > > > > > >                 cond_resched();
> > > > > > > > > > >         }
> > > > > > > > > > >  }
> > > > > > > > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > > > > > > > >         if (ret)
> > > > > > > > > > >                 goto err_free;
> > > > > > > > > > >
> > > > > > > > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > > > > > > > +       if (ret)
> > > > > > > > > > > +               goto err_free;
> > > > > > > > > > > +
> > > > > > > > > > >         cpus_read_lock();
> > > > > > > > > > >         virtnet_set_affinity(vi);
> > > > > > > > > > >         cpus_read_unlock();
> > > > > > > > > > > --
> > > > > > > > > > > 2.32.0.3.g01195cf9f
> > > > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > > >
> > >
> >
>
Jason Wang July 13, 2023, 4:20 a.m. UTC | #12
On Mon, Jul 10, 2023 at 11:43 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>

I'd suggest to tweak the title like:

"merge dma operations when refilling mergeable buffers"

> Currently, the virtio core will perform a dma operation for each
> operation.

"for each buffer"?

> Although, the same page may be operated multiple times.
>
> The driver does the dma operation and manages the dma address based the
> feature premapped of virtio core.
>
> This way, we can perform only one dma operation for the same page. In
> the case of mtu 1500, this can reduce a lot of dma operations.
>
> Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> increased from 1893766 to 1901105. An increase of 0.4%.

Btw, it looks to me the code to deal with XDP_TX/REDIRECT for
linearized pages was missed.

>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
>  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
>  1 file changed, 267 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 486b5849033d..4de845d35bed 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
>  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
>  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
>
> +/* The bufs on the same page may share this struct. */
> +struct virtnet_rq_dma {
> +       struct virtnet_rq_dma *next;
> +
> +       dma_addr_t addr;
> +
> +       void *buf;
> +       u32 len;
> +
> +       u32 ref;
> +};
> +
> +/* Record the dma and buf. */
> +struct virtnet_rq_data {
> +       struct virtnet_rq_data *next;
> +
> +       void *buf;
> +
> +       struct virtnet_rq_dma *dma;
> +};
> +
>  /* Internal representation of a send virtqueue */
>  struct send_queue {
>         /* Virtqueue associated with this send _queue */
> @@ -175,6 +196,13 @@ struct receive_queue {
>         char name[16];
>
>         struct xdp_rxq_info xdp_rxq;
> +
> +       struct virtnet_rq_data *data_array;
> +       struct virtnet_rq_data *data_free;
> +
> +       struct virtnet_rq_dma *dma_array;
> +       struct virtnet_rq_dma *dma_free;
> +       struct virtnet_rq_dma *last_dma;
>  };
>
>  /* This structure can contain rss message with maximum settings for indirection table and keysize
> @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>         return skb;
>  }
>
> +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> +{
> +       struct device *dev;
> +
> +       --dma->ref;
> +
> +       if (dma->ref)
> +               return;
> +
> +       dev = virtqueue_dma_dev(rq->vq);
> +
> +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> +
> +       dma->next = rq->dma_free;
> +       rq->dma_free = dma;
> +}
> +
> +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> +                                    struct virtnet_rq_data *data)
> +{
> +       void *buf;
> +
> +       buf = data->buf;
> +
> +       data->next = rq->data_free;
> +       rq->data_free = data;
> +
> +       return buf;
> +}
> +
> +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> +                                                  void *buf,
> +                                                  struct virtnet_rq_dma *dma)
> +{
> +       struct virtnet_rq_data *data;
> +
> +       data = rq->data_free;
> +       rq->data_free = data->next;
> +
> +       data->buf = buf;
> +       data->dma = dma;
> +
> +       return data;
> +}
> +
> +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> +{
> +       struct virtnet_rq_data *data;
> +       void *buf;
> +
> +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> +       if (!buf || !rq->data_array)
> +               return buf;
> +
> +       data = buf;
> +
> +       virtnet_rq_unmap(rq, data->dma);
> +
> +       return virtnet_rq_recycle_data(rq, data);
> +}
> +
> +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> +{
> +       struct virtnet_rq_data *data;
> +       void *buf;
> +
> +       buf = virtqueue_detach_unused_buf(rq->vq);
> +       if (!buf || !rq->data_array)
> +               return buf;
> +
> +       data = buf;
> +
> +       virtnet_rq_unmap(rq, data->dma);
> +
> +       return virtnet_rq_recycle_data(rq, data);
> +}
> +
> +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> +{
> +       struct virtnet_rq_dma *dma = rq->last_dma;
> +       struct device *dev;
> +       u32 off, map_len;
> +       dma_addr_t addr;
> +       void *end;
> +
> +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> +               ++dma->ref;
> +               addr = dma->addr + (buf - dma->buf);
> +               goto ok;
> +       }
> +
> +       end = buf + len - 1;
> +       off = offset_in_page(end);
> +       map_len = len + PAGE_SIZE - off;

This assumes a PAGE_SIZE which seems sub-optimal as page frag could be
larger than this.

> +
> +       dev = virtqueue_dma_dev(rq->vq);
> +
> +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> +                                 map_len, DMA_FROM_DEVICE, 0);
> +       if (addr == DMA_MAPPING_ERROR)
> +               return -ENOMEM;
> +
> +       dma = rq->dma_free;
> +       rq->dma_free = dma->next;
> +
> +       dma->ref = 1;
> +       dma->buf = buf;
> +       dma->addr = addr;
> +       dma->len = map_len;
> +
> +       rq->last_dma = dma;
> +
> +ok:
> +       sg_init_table(rq->sg, 1);
> +       rq->sg[0].dma_address = addr;
> +       rq->sg[0].length = len;
> +
> +       return 0;
> +}
> +
> +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> +{
> +       struct receive_queue *rq;
> +       int i, err, j, num;
> +
> +       /* disable for big mode */
> +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> +               return 0;
> +
> +       for (i = 0; i < vi->max_queue_pairs; i++) {
> +               err = virtqueue_set_premapped(vi->rq[i].vq);
> +               if (err)
> +                       continue;
> +
> +               rq = &vi->rq[i];
> +
> +               num = virtqueue_get_vring_size(rq->vq);
> +
> +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> +               if (!rq->data_array)

Can we avoid those allocations when we don't use the DMA API?

> +                       goto err;
> +
> +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> +               if (!rq->dma_array)
> +                       goto err;
> +
> +               for (j = 0; j < num; ++j) {
> +                       rq->data_array[j].next = rq->data_free;
> +                       rq->data_free = &rq->data_array[j];
> +
> +                       rq->dma_array[j].next = rq->dma_free;
> +                       rq->dma_free = &rq->dma_array[j];
> +               }
> +       }
> +
> +       return 0;
> +
> +err:
> +       for (i = 0; i < vi->max_queue_pairs; i++) {
> +               struct receive_queue *rq;
> +
> +               rq = &vi->rq[i];
> +
> +               kfree(rq->dma_array);
> +               kfree(rq->data_array);
> +       }
> +
> +       return -ENOMEM;
> +}
> +
>  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
>  {
>         unsigned int len;
> @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>                 void *buf;
>                 int off;
>
> -               buf = virtqueue_get_buf(rq->vq, &buflen);
> +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
>                 if (unlikely(!buf))
>                         goto err_buf;
>
> @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
>                 return -EINVAL;
>
>         while (--*num_buf > 0) {
> -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
>                 if (unlikely(!buf)) {
>                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
>                                  dev->name, *num_buf,
> @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>         while (--num_buf) {
>                 int num_skb_frags;
>
> -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
>                 if (unlikely(!buf)) {
>                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
>                                  dev->name, num_buf,
> @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  err_skb:
>         put_page(page);
>         while (num_buf-- > 1) {
> -               buf = virtqueue_get_buf(rq->vq, &len);
> +               buf = virtnet_rq_get_buf(rq, &len, NULL);
>                 if (unlikely(!buf)) {
>                         pr_debug("%s: rx error: %d buffers missing\n",
>                                  dev->name, num_buf);
> @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
>         unsigned int xdp_headroom = virtnet_get_headroom(vi);
>         void *ctx = (void *)(unsigned long)xdp_headroom;
>         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> +       struct virtnet_rq_data *data;
>         int err;
>
>         len = SKB_DATA_ALIGN(len) +
> @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
>         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
>         get_page(alloc_frag->page);
>         alloc_frag->offset += len;
> -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> -                   vi->hdr_len + GOOD_PACKET_LEN);
> -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> +
> +       if (rq->data_array) {
> +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> +                                       vi->hdr_len + GOOD_PACKET_LEN);

Thanks to the compound page. I wonder if everything could be
simplified if we just reuse page->private for storing metadata like
dma address and refcnt. Then we don't need extra stuff for tracking
any other thing?
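
A rough sketch of the idea (the struct and helper names here are made
up, nothing from this patch), keyed off the head page of the frag:

struct virtnet_page_dma {
	dma_addr_t addr;
	u32 len;
	u32 ref;
};

/* attach the shared DMA metadata to the (compound) head page */
static void virtnet_page_set_dma(struct page *page,
				 struct virtnet_page_dma *pd)
{
	set_page_private(compound_head(page), (unsigned long)pd);
}

static struct virtnet_page_dma *virtnet_page_get_dma(struct page *page)
{
	return (void *)page_private(compound_head(page));
}

Buffers carved from the same page could then look up addr/ref via the
page itself, without a separate data_array/dma_array.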

Thanks



> +               if (err)
> +                       goto map_err;
> +
> +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> +       } else {
> +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> +                           vi->hdr_len + GOOD_PACKET_LEN);
> +               data = (void *)buf;
> +       }
> +
> +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
>         if (err < 0)
> -               put_page(virt_to_head_page(buf));
> +               goto add_err;
> +
> +       return err;
> +
> +add_err:
> +       if (rq->data_array) {
> +               virtnet_rq_unmap(rq, data->dma);
> +               virtnet_rq_recycle_data(rq, data);
> +       }
> +
> +map_err:
> +       put_page(virt_to_head_page(buf));
>         return err;
>  }
>
> @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
>         unsigned int headroom = virtnet_get_headroom(vi);
>         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
>         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> +       struct virtnet_rq_data *data;
>         char *buf;
>         void *ctx;
>         int err;
> @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
>                 alloc_frag->offset += hole;
>         }
>
> -       sg_init_one(rq->sg, buf, len);
> +       if (rq->data_array) {
> +               err = virtnet_rq_map_sg(rq, buf, len);
> +               if (err)
> +                       goto map_err;
> +
> +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> +       } else {
> +               sg_init_one(rq->sg, buf, len);
> +               data = (void *)buf;
> +       }
> +
>         ctx = mergeable_len_to_ctx(len + room, headroom);
> -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
>         if (err < 0)
> -               put_page(virt_to_head_page(buf));
> +               goto add_err;
> +
> +       return 0;
> +
> +add_err:
> +       if (rq->data_array) {
> +               virtnet_rq_unmap(rq, data->dma);
> +               virtnet_rq_recycle_data(rq, data);
> +       }
>
> +map_err:
> +       put_page(virt_to_head_page(buf));
>         return err;
>  }
>
> @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
>                 void *ctx;
>
>                 while (stats.packets < budget &&
> -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
>                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
>                         stats.packets++;
>                 }
>         } else {
>                 while (stats.packets < budget &&
> -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
>                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
>                         stats.packets++;
>                 }
> @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
>         for (i = 0; i < vi->max_queue_pairs; i++) {
>                 __netif_napi_del(&vi->rq[i].napi);
>                 __netif_napi_del(&vi->sq[i].napi);
> +
> +               kfree(vi->rq[i].data_array);
> +               kfree(vi->rq[i].dma_array);
>         }
>
>         /* We called __netif_napi_del(),
> @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
>         }
>
>         for (i = 0; i < vi->max_queue_pairs; i++) {
> -               struct virtqueue *vq = vi->rq[i].vq;
> -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> -                       virtnet_rq_free_unused_buf(vq, buf);
> +               struct receive_queue *rq = &vi->rq[i];
> +
> +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> +                       virtnet_rq_free_unused_buf(rq->vq, buf);
>                 cond_resched();
>         }
>  }
> @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
>         if (ret)
>                 goto err_free;
>
> +       ret = virtnet_rq_merge_map_init(vi);
> +       if (ret)
> +               goto err_free;
> +
>         cpus_read_lock();
>         virtnet_set_affinity(vi);
>         cpus_read_unlock();
> --
> 2.32.0.3.g01195cf9f
>
Xuan Zhuo July 13, 2023, 5:53 a.m. UTC | #13
On Thu, 13 Jul 2023 12:20:01 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Mon, Jul 10, 2023 at 11:43 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
>
> I'd suggest to tweak the title like:
>
> "merge dma operations when refilling mergeable buffers"
>
> > Currently, the virtio core will perform a dma operation for each
> > operation.
>
> "for each buffer"?
>
> > Although, the same page may be operated multiple times.
> >
> > The driver does the dma operation and manages the dma address based the
> > feature premapped of virtio core.
> >
> > This way, we can perform only one dma operation for the same page. In
> > the case of mtu 1500, this can reduce a lot of dma operations.
> >
> > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > increased from 1893766 to 1901105. An increase of 0.4%.
>
> Btw, it looks to me the code to deal with XDP_TX/REDIRECT for
> linearized pages was missed.

This patch only affects filling buffers and getting buffers, so I guess you
mean getting buffers from xdp_linearize_page().

I actually handled that case. Maybe you missed it.



>
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 267 insertions(+), 16 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 486b5849033d..4de845d35bed 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> >
> > +/* The bufs on the same page may share this struct. */
> > +struct virtnet_rq_dma {
> > +       struct virtnet_rq_dma *next;
> > +
> > +       dma_addr_t addr;
> > +
> > +       void *buf;
> > +       u32 len;
> > +
> > +       u32 ref;
> > +};
> > +
> > +/* Record the dma and buf. */
> > +struct virtnet_rq_data {
> > +       struct virtnet_rq_data *next;
> > +
> > +       void *buf;
> > +
> > +       struct virtnet_rq_dma *dma;
> > +};
> > +
> >  /* Internal representation of a send virtqueue */
> >  struct send_queue {
> >         /* Virtqueue associated with this send _queue */
> > @@ -175,6 +196,13 @@ struct receive_queue {
> >         char name[16];
> >
> >         struct xdp_rxq_info xdp_rxq;
> > +
> > +       struct virtnet_rq_data *data_array;
> > +       struct virtnet_rq_data *data_free;
> > +
> > +       struct virtnet_rq_dma *dma_array;
> > +       struct virtnet_rq_dma *dma_free;
> > +       struct virtnet_rq_dma *last_dma;
> >  };
> >
> >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> >         return skb;
> >  }
> >
> > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > +{
> > +       struct device *dev;
> > +
> > +       --dma->ref;
> > +
> > +       if (dma->ref)
> > +               return;
> > +
> > +       dev = virtqueue_dma_dev(rq->vq);
> > +
> > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > +
> > +       dma->next = rq->dma_free;
> > +       rq->dma_free = dma;
> > +}
> > +
> > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > +                                    struct virtnet_rq_data *data)
> > +{
> > +       void *buf;
> > +
> > +       buf = data->buf;
> > +
> > +       data->next = rq->data_free;
> > +       rq->data_free = data;
> > +
> > +       return buf;
> > +}
> > +
> > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > +                                                  void *buf,
> > +                                                  struct virtnet_rq_dma *dma)
> > +{
> > +       struct virtnet_rq_data *data;
> > +
> > +       data = rq->data_free;
> > +       rq->data_free = data->next;
> > +
> > +       data->buf = buf;
> > +       data->dma = dma;
> > +
> > +       return data;
> > +}
> > +
> > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > +{
> > +       struct virtnet_rq_data *data;
> > +       void *buf;
> > +
> > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > +       if (!buf || !rq->data_array)
> > +               return buf;
> > +
> > +       data = buf;
> > +
> > +       virtnet_rq_unmap(rq, data->dma);
> > +
> > +       return virtnet_rq_recycle_data(rq, data);
> > +}
> > +
> > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > +{
> > +       struct virtnet_rq_data *data;
> > +       void *buf;
> > +
> > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > +       if (!buf || !rq->data_array)
> > +               return buf;
> > +
> > +       data = buf;
> > +
> > +       virtnet_rq_unmap(rq, data->dma);
> > +
> > +       return virtnet_rq_recycle_data(rq, data);
> > +}
> > +
> > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > +{
> > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > +       struct device *dev;
> > +       u32 off, map_len;
> > +       dma_addr_t addr;
> > +       void *end;
> > +
> > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > +               ++dma->ref;
> > +               addr = dma->addr + (buf - dma->buf);
> > +               goto ok;
> > +       }
> > +
> > +       end = buf + len - 1;
> > +       off = offset_in_page(end);
> > +       map_len = len + PAGE_SIZE - off;
>
> This assumes a PAGE_SIZE which seems sub-optimal as page frag could be
> larger than this.

Actually, each time I only handle one or two pages; I do not map the whole
page frag. I want to avoid the case where just one buffer (1500 bytes) is
still in use while the entire page frag (32k) stays mapped.

Between mapping the entire page frag and mapping one page at a time, I am
not sure which is better.
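
As a toy illustration only (a userspace-style sketch, not the driver
code) of that per-page, ref-counted reuse decision:

#include <stddef.h>

struct toy_dma {
	void *base;		/* start of the currently mapped region */
	size_t len;		/* mapped length (roughly one page) */
	unsigned int ref;	/* buffers still using this mapping */
};

/* Returns 1 when a fresh mapping is needed, 0 when the previous one is
 * simply reused with a bumped refcount.  The real code keeps old
 * mappings alive through their own refcounts; this toy only models the
 * decision for the most recent one. */
static int toy_map_buf(struct toy_dma *last, void *buf, size_t len)
{
	if (last->ref && buf >= last->base &&
	    (char *)buf + len <= (char *)last->base + last->len) {
		last->ref++;	/* reuse, no new dma_map_page() */
		return 0;
	}

	/* the real code calls dma_map_page_attrs() here */
	last->base = buf;
	last->len = 4096;
	last->ref = 1;
	return 1;
}

So only the page a buffer actually sits in stays mapped, and the
mapping is dropped once the last buffer carved from it is returned.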

>
> > +
> > +       dev = virtqueue_dma_dev(rq->vq);
> > +
> > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > +                                 map_len, DMA_FROM_DEVICE, 0);
> > +       if (addr == DMA_MAPPING_ERROR)
> > +               return -ENOMEM;
> > +
> > +       dma = rq->dma_free;
> > +       rq->dma_free = dma->next;
> > +
> > +       dma->ref = 1;
> > +       dma->buf = buf;
> > +       dma->addr = addr;
> > +       dma->len = map_len;
> > +
> > +       rq->last_dma = dma;
> > +
> > +ok:
> > +       sg_init_table(rq->sg, 1);
> > +       rq->sg[0].dma_address = addr;
> > +       rq->sg[0].length = len;
> > +
> > +       return 0;
> > +}
> > +
> > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > +{
> > +       struct receive_queue *rq;
> > +       int i, err, j, num;
> > +
> > +       /* disable for big mode */
> > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > +               return 0;
> > +
> > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > +               if (err)
> > +                       continue;
> > +
> > +               rq = &vi->rq[i];
> > +
> > +               num = virtqueue_get_vring_size(rq->vq);
> > +
> > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > +               if (!rq->data_array)
>
> Can we avoid those allocations when we don't use the DMA API?

Yes.

If virtqueue_set_premapped() succeeds, that means we are using the DMA API;
when it fails we skip the allocations for that queue.


>
> > +                       goto err;
> > +
> > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > +               if (!rq->dma_array)
> > +                       goto err;
> > +
> > +               for (j = 0; j < num; ++j) {
> > +                       rq->data_array[j].next = rq->data_free;
> > +                       rq->data_free = &rq->data_array[j];
> > +
> > +                       rq->dma_array[j].next = rq->dma_free;
> > +                       rq->dma_free = &rq->dma_array[j];
> > +               }
> > +       }
> > +
> > +       return 0;
> > +
> > +err:
> > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > +               struct receive_queue *rq;
> > +
> > +               rq = &vi->rq[i];
> > +
> > +               kfree(rq->dma_array);
> > +               kfree(rq->data_array);
> > +       }
> > +
> > +       return -ENOMEM;
> > +}
> > +
> >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> >  {
> >         unsigned int len;
> > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> >                 void *buf;
> >                 int off;
> >
> > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> >                 if (unlikely(!buf))
> >                         goto err_buf;
> >
> > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> >                 return -EINVAL;
> >
> >         while (--*num_buf > 0) {
> > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> >                 if (unlikely(!buf)) {
> >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> >                                  dev->name, *num_buf,
> > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >         while (--num_buf) {
> >                 int num_skb_frags;
> >
> > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> >                 if (unlikely(!buf)) {
> >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> >                                  dev->name, num_buf,
> > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >  err_skb:
> >         put_page(page);
> >         while (num_buf-- > 1) {
> > -               buf = virtqueue_get_buf(rq->vq, &len);
> > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> >                 if (unlikely(!buf)) {
> >                         pr_debug("%s: rx error: %d buffers missing\n",
> >                                  dev->name, num_buf);
> > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> >         void *ctx = (void *)(unsigned long)xdp_headroom;
> >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > +       struct virtnet_rq_data *data;
> >         int err;
> >
> >         len = SKB_DATA_ALIGN(len) +
> > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> >         get_page(alloc_frag->page);
> >         alloc_frag->offset += len;
> > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > +
> > +       if (rq->data_array) {
> > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > +                                       vi->hdr_len + GOOD_PACKET_LEN);
>
> Thanks to the compound page. I wonder if everything could be
> simplified if we just reuse page->private for storing metadata like
> dma address and refcnt. Then we don't need extra stuff for tracking
> any other thing?


I will try.

Thanks.


>
> Thanks
>
>
>
> > +               if (err)
> > +                       goto map_err;
> > +
> > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > +       } else {
> > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > +               data = (void *)buf;
> > +       }
> > +
> > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> >         if (err < 0)
> > -               put_page(virt_to_head_page(buf));
> > +               goto add_err;
> > +
> > +       return err;
> > +
> > +add_err:
> > +       if (rq->data_array) {
> > +               virtnet_rq_unmap(rq, data->dma);
> > +               virtnet_rq_recycle_data(rq, data);
> > +       }
> > +
> > +map_err:
> > +       put_page(virt_to_head_page(buf));
> >         return err;
> >  }
> >
> > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >         unsigned int headroom = virtnet_get_headroom(vi);
> >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > +       struct virtnet_rq_data *data;
> >         char *buf;
> >         void *ctx;
> >         int err;
> > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >                 alloc_frag->offset += hole;
> >         }
> >
> > -       sg_init_one(rq->sg, buf, len);
> > +       if (rq->data_array) {
> > +               err = virtnet_rq_map_sg(rq, buf, len);
> > +               if (err)
> > +                       goto map_err;
> > +
> > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > +       } else {
> > +               sg_init_one(rq->sg, buf, len);
> > +               data = (void *)buf;
> > +       }
> > +
> >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> >         if (err < 0)
> > -               put_page(virt_to_head_page(buf));
> > +               goto add_err;
> > +
> > +       return 0;
> > +
> > +add_err:
> > +       if (rq->data_array) {
> > +               virtnet_rq_unmap(rq, data->dma);
> > +               virtnet_rq_recycle_data(rq, data);
> > +       }
> >
> > +map_err:
> > +       put_page(virt_to_head_page(buf));
> >         return err;
> >  }
> >
> > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> >                 void *ctx;
> >
> >                 while (stats.packets < budget &&
> > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> >                         stats.packets++;
> >                 }
> >         } else {
> >                 while (stats.packets < budget &&
> > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> >                         stats.packets++;
> >                 }
> > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> >         for (i = 0; i < vi->max_queue_pairs; i++) {
> >                 __netif_napi_del(&vi->rq[i].napi);
> >                 __netif_napi_del(&vi->sq[i].napi);
> > +
> > +               kfree(vi->rq[i].data_array);
> > +               kfree(vi->rq[i].dma_array);
> >         }
> >
> >         /* We called __netif_napi_del(),
> > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> >         }
> >
> >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > -               struct virtqueue *vq = vi->rq[i].vq;
> > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > -                       virtnet_rq_free_unused_buf(vq, buf);
> > +               struct receive_queue *rq = &vi->rq[i];
> > +
> > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> >                 cond_resched();
> >         }
> >  }
> > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> >         if (ret)
> >                 goto err_free;
> >
> > +       ret = virtnet_rq_merge_map_init(vi);
> > +       if (ret)
> > +               goto err_free;
> > +
> >         cpus_read_lock();
> >         virtnet_set_affinity(vi);
> >         cpus_read_unlock();
> > --
> > 2.32.0.3.g01195cf9f
> >
>
Xuan Zhuo July 13, 2023, 6:51 a.m. UTC | #14
On Thu, 13 Jul 2023 12:20:01 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Mon, Jul 10, 2023 at 11:43 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
>
> I'd suggest to tweak the title like:
>
> "merge dma operations when refilling mergeable buffers"
>
> > Currently, the virtio core will perform a dma operation for each
> > operation.
>
> "for each buffer"?
>
> > Although, the same page may be operated multiple times.
> >
> > The driver does the dma operation and manages the dma address based the
> > feature premapped of virtio core.
> >
> > This way, we can perform only one dma operation for the same page. In
> > the case of mtu 1500, this can reduce a lot of dma operations.
> >
> > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > increased from 1893766 to 1901105. An increase of 0.4%.
>
> Btw, it looks to me the code to deal with XDP_TX/REDIRECT for
> linearized pages was missed.
>
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 267 insertions(+), 16 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 486b5849033d..4de845d35bed 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> >
> > +/* The bufs on the same page may share this struct. */
> > +struct virtnet_rq_dma {
> > +       struct virtnet_rq_dma *next;
> > +
> > +       dma_addr_t addr;
> > +
> > +       void *buf;
> > +       u32 len;
> > +
> > +       u32 ref;
> > +};
> > +
> > +/* Record the dma and buf. */
> > +struct virtnet_rq_data {
> > +       struct virtnet_rq_data *next;
> > +
> > +       void *buf;
> > +
> > +       struct virtnet_rq_dma *dma;
> > +};
> > +
> >  /* Internal representation of a send virtqueue */
> >  struct send_queue {
> >         /* Virtqueue associated with this send _queue */
> > @@ -175,6 +196,13 @@ struct receive_queue {
> >         char name[16];
> >
> >         struct xdp_rxq_info xdp_rxq;
> > +
> > +       struct virtnet_rq_data *data_array;
> > +       struct virtnet_rq_data *data_free;
> > +
> > +       struct virtnet_rq_dma *dma_array;
> > +       struct virtnet_rq_dma *dma_free;
> > +       struct virtnet_rq_dma *last_dma;
> >  };
> >
> >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> >         return skb;
> >  }
> >
> > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > +{
> > +       struct device *dev;
> > +
> > +       --dma->ref;
> > +
> > +       if (dma->ref)
> > +               return;
> > +
> > +       dev = virtqueue_dma_dev(rq->vq);
> > +
> > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > +
> > +       dma->next = rq->dma_free;
> > +       rq->dma_free = dma;
> > +}
> > +
> > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > +                                    struct virtnet_rq_data *data)
> > +{
> > +       void *buf;
> > +
> > +       buf = data->buf;
> > +
> > +       data->next = rq->data_free;
> > +       rq->data_free = data;
> > +
> > +       return buf;
> > +}
> > +
> > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > +                                                  void *buf,
> > +                                                  struct virtnet_rq_dma *dma)
> > +{
> > +       struct virtnet_rq_data *data;
> > +
> > +       data = rq->data_free;
> > +       rq->data_free = data->next;
> > +
> > +       data->buf = buf;
> > +       data->dma = dma;
> > +
> > +       return data;
> > +}
> > +
> > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > +{
> > +       struct virtnet_rq_data *data;
> > +       void *buf;
> > +
> > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > +       if (!buf || !rq->data_array)
> > +               return buf;
> > +
> > +       data = buf;
> > +
> > +       virtnet_rq_unmap(rq, data->dma);
> > +
> > +       return virtnet_rq_recycle_data(rq, data);
> > +}
> > +
> > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > +{
> > +       struct virtnet_rq_data *data;
> > +       void *buf;
> > +
> > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > +       if (!buf || !rq->data_array)
> > +               return buf;
> > +
> > +       data = buf;
> > +
> > +       virtnet_rq_unmap(rq, data->dma);
> > +
> > +       return virtnet_rq_recycle_data(rq, data);
> > +}
> > +
> > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > +{
> > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > +       struct device *dev;
> > +       u32 off, map_len;
> > +       dma_addr_t addr;
> > +       void *end;
> > +
> > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > +               ++dma->ref;
> > +               addr = dma->addr + (buf - dma->buf);
> > +               goto ok;
> > +       }
> > +
> > +       end = buf + len - 1;
> > +       off = offset_in_page(end);
> > +       map_len = len + PAGE_SIZE - off;
>
> This assumes a PAGE_SIZE which seems sub-optimal as page frag could be
> larger than this.
>
> > +
> > +       dev = virtqueue_dma_dev(rq->vq);
> > +
> > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > +                                 map_len, DMA_FROM_DEVICE, 0);
> > +       if (addr == DMA_MAPPING_ERROR)
> > +               return -ENOMEM;
> > +
> > +       dma = rq->dma_free;
> > +       rq->dma_free = dma->next;
> > +
> > +       dma->ref = 1;
> > +       dma->buf = buf;
> > +       dma->addr = addr;
> > +       dma->len = map_len;
> > +
> > +       rq->last_dma = dma;
> > +
> > +ok:
> > +       sg_init_table(rq->sg, 1);
> > +       rq->sg[0].dma_address = addr;
> > +       rq->sg[0].length = len;
> > +
> > +       return 0;
> > +}
> > +
> > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > +{
> > +       struct receive_queue *rq;
> > +       int i, err, j, num;
> > +
> > +       /* disable for big mode */
> > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > +               return 0;
> > +
> > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > +               if (err)
> > +                       continue;
> > +
> > +               rq = &vi->rq[i];
> > +
> > +               num = virtqueue_get_vring_size(rq->vq);
> > +
> > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > +               if (!rq->data_array)
>
> Can we avoid those allocations when we don't use the DMA API?
>
> > +                       goto err;
> > +
> > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > +               if (!rq->dma_array)
> > +                       goto err;
> > +
> > +               for (j = 0; j < num; ++j) {
> > +                       rq->data_array[j].next = rq->data_free;
> > +                       rq->data_free = &rq->data_array[j];
> > +
> > +                       rq->dma_array[j].next = rq->dma_free;
> > +                       rq->dma_free = &rq->dma_array[j];
> > +               }
> > +       }
> > +
> > +       return 0;
> > +
> > +err:
> > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > +               struct receive_queue *rq;
> > +
> > +               rq = &vi->rq[i];
> > +
> > +               kfree(rq->dma_array);
> > +               kfree(rq->data_array);
> > +       }
> > +
> > +       return -ENOMEM;
> > +}
> > +
> >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> >  {
> >         unsigned int len;
> > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> >                 void *buf;
> >                 int off;
> >
> > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> >                 if (unlikely(!buf))
> >                         goto err_buf;
> >
> > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> >                 return -EINVAL;
> >
> >         while (--*num_buf > 0) {
> > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> >                 if (unlikely(!buf)) {
> >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> >                                  dev->name, *num_buf,
> > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >         while (--num_buf) {
> >                 int num_skb_frags;
> >
> > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> >                 if (unlikely(!buf)) {
> >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> >                                  dev->name, num_buf,
> > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >  err_skb:
> >         put_page(page);
> >         while (num_buf-- > 1) {
> > -               buf = virtqueue_get_buf(rq->vq, &len);
> > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> >                 if (unlikely(!buf)) {
> >                         pr_debug("%s: rx error: %d buffers missing\n",
> >                                  dev->name, num_buf);
> > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> >         void *ctx = (void *)(unsigned long)xdp_headroom;
> >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > +       struct virtnet_rq_data *data;
> >         int err;
> >
> >         len = SKB_DATA_ALIGN(len) +
> > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> >         get_page(alloc_frag->page);
> >         alloc_frag->offset += len;
> > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > +
> > +       if (rq->data_array) {
> > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > +                                       vi->hdr_len + GOOD_PACKET_LEN);
>
> Thanks to the compound page. I wonder if everything could be
> simplified if we just reuse page->private for storing metadata like
> dma address and refcnt. Then we don't need extra stuff for tracking
> any other thing?

I didn't use page->private because once part of the page is handed to an skb,
the driver is no longer the only owner of the page. Can we still use
page->private in that case?

Thanks.




>
> Thanks
>
>
>
> > +               if (err)
> > +                       goto map_err;
> > +
> > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > +       } else {
> > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > +               data = (void *)buf;
> > +       }
> > +
> > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> >         if (err < 0)
> > -               put_page(virt_to_head_page(buf));
> > +               goto add_err;
> > +
> > +       return err;
> > +
> > +add_err:
> > +       if (rq->data_array) {
> > +               virtnet_rq_unmap(rq, data->dma);
> > +               virtnet_rq_recycle_data(rq, data);
> > +       }
> > +
> > +map_err:
> > +       put_page(virt_to_head_page(buf));
> >         return err;
> >  }
> >
> > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >         unsigned int headroom = virtnet_get_headroom(vi);
> >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > +       struct virtnet_rq_data *data;
> >         char *buf;
> >         void *ctx;
> >         int err;
> > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >                 alloc_frag->offset += hole;
> >         }
> >
> > -       sg_init_one(rq->sg, buf, len);
> > +       if (rq->data_array) {
> > +               err = virtnet_rq_map_sg(rq, buf, len);
> > +               if (err)
> > +                       goto map_err;
> > +
> > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > +       } else {
> > +               sg_init_one(rq->sg, buf, len);
> > +               data = (void *)buf;
> > +       }
> > +
> >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> >         if (err < 0)
> > -               put_page(virt_to_head_page(buf));
> > +               goto add_err;
> > +
> > +       return 0;
> > +
> > +add_err:
> > +       if (rq->data_array) {
> > +               virtnet_rq_unmap(rq, data->dma);
> > +               virtnet_rq_recycle_data(rq, data);
> > +       }
> >
> > +map_err:
> > +       put_page(virt_to_head_page(buf));
> >         return err;
> >  }
> >
> > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> >                 void *ctx;
> >
> >                 while (stats.packets < budget &&
> > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> >                         stats.packets++;
> >                 }
> >         } else {
> >                 while (stats.packets < budget &&
> > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> >                         stats.packets++;
> >                 }
> > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> >         for (i = 0; i < vi->max_queue_pairs; i++) {
> >                 __netif_napi_del(&vi->rq[i].napi);
> >                 __netif_napi_del(&vi->sq[i].napi);
> > +
> > +               kfree(vi->rq[i].data_array);
> > +               kfree(vi->rq[i].dma_array);
> >         }
> >
> >         /* We called __netif_napi_del(),
> > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> >         }
> >
> >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > -               struct virtqueue *vq = vi->rq[i].vq;
> > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > -                       virtnet_rq_free_unused_buf(vq, buf);
> > +               struct receive_queue *rq = &vi->rq[i];
> > +
> > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> >                 cond_resched();
> >         }
> >  }
> > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> >         if (ret)
> >                 goto err_free;
> >
> > +       ret = virtnet_rq_merge_map_init(vi);
> > +       if (ret)
> > +               goto err_free;
> > +
> >         cpus_read_lock();
> >         virtnet_set_affinity(vi);
> >         cpus_read_unlock();
> > --
> > 2.32.0.3.g01195cf9f
> >
>
Xuan Zhuo July 13, 2023, 7 a.m. UTC | #15
On Thu, 13 Jul 2023 12:20:01 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Mon, Jul 10, 2023 at 11:43 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
>
> I'd suggest to tweak the title like:
>
> "merge dma operations when refilling mergeable buffers"
>
> > Currently, the virtio core will perform a dma operation for each
> > operation.
>
> "for each buffer"?
>
> > Although, the same page may be operated multiple times.
> >
> > The driver does the dma operation and manages the dma address based the
> > feature premapped of virtio core.
> >
> > This way, we can perform only one dma operation for the same page. In
> > the case of mtu 1500, this can reduce a lot of dma operations.
> >
> > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > increased from 1893766 to 1901105. An increase of 0.4%.
>
> Btw, it looks to me the code to deal with XDP_TX/REDIRECT for
> linearized pages was missed.
>
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 267 insertions(+), 16 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 486b5849033d..4de845d35bed 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> >
> > +/* The bufs on the same page may share this struct. */
> > +struct virtnet_rq_dma {
> > +       struct virtnet_rq_dma *next;
> > +
> > +       dma_addr_t addr;
> > +
> > +       void *buf;
> > +       u32 len;
> > +
> > +       u32 ref;
> > +};
> > +
> > +/* Record the dma and buf. */
> > +struct virtnet_rq_data {
> > +       struct virtnet_rq_data *next;
> > +
> > +       void *buf;
> > +
> > +       struct virtnet_rq_dma *dma;
> > +};
> > +
> >  /* Internal representation of a send virtqueue */
> >  struct send_queue {
> >         /* Virtqueue associated with this send _queue */
> > @@ -175,6 +196,13 @@ struct receive_queue {
> >         char name[16];
> >
> >         struct xdp_rxq_info xdp_rxq;
> > +
> > +       struct virtnet_rq_data *data_array;
> > +       struct virtnet_rq_data *data_free;
> > +
> > +       struct virtnet_rq_dma *dma_array;
> > +       struct virtnet_rq_dma *dma_free;
> > +       struct virtnet_rq_dma *last_dma;
> >  };
> >
> >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> >         return skb;
> >  }
> >
> > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > +{
> > +       struct device *dev;
> > +
> > +       --dma->ref;
> > +
> > +       if (dma->ref)
> > +               return;
> > +
> > +       dev = virtqueue_dma_dev(rq->vq);
> > +
> > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > +
> > +       dma->next = rq->dma_free;
> > +       rq->dma_free = dma;
> > +}
> > +
> > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > +                                    struct virtnet_rq_data *data)
> > +{
> > +       void *buf;
> > +
> > +       buf = data->buf;
> > +
> > +       data->next = rq->data_free;
> > +       rq->data_free = data;
> > +
> > +       return buf;
> > +}
> > +
> > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > +                                                  void *buf,
> > +                                                  struct virtnet_rq_dma *dma)
> > +{
> > +       struct virtnet_rq_data *data;
> > +
> > +       data = rq->data_free;
> > +       rq->data_free = data->next;
> > +
> > +       data->buf = buf;
> > +       data->dma = dma;
> > +
> > +       return data;
> > +}
> > +
> > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > +{
> > +       struct virtnet_rq_data *data;
> > +       void *buf;
> > +
> > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > +       if (!buf || !rq->data_array)
> > +               return buf;
> > +
> > +       data = buf;
> > +
> > +       virtnet_rq_unmap(rq, data->dma);
> > +
> > +       return virtnet_rq_recycle_data(rq, data);
> > +}
> > +
> > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > +{
> > +       struct virtnet_rq_data *data;
> > +       void *buf;
> > +
> > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > +       if (!buf || !rq->data_array)
> > +               return buf;
> > +
> > +       data = buf;
> > +
> > +       virtnet_rq_unmap(rq, data->dma);
> > +
> > +       return virtnet_rq_recycle_data(rq, data);
> > +}
> > +
> > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > +{
> > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > +       struct device *dev;
> > +       u32 off, map_len;
> > +       dma_addr_t addr;
> > +       void *end;
> > +
> > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > +               ++dma->ref;
> > +               addr = dma->addr + (buf - dma->buf);
> > +               goto ok;
> > +       }
> > +
> > +       end = buf + len - 1;
> > +       off = offset_in_page(end);
> > +       map_len = len + PAGE_SIZE - off;
>
> This assumes a PAGE_SIZE which seems sub-optimal as page frag could be
> larger than this.
>
> > +
> > +       dev = virtqueue_dma_dev(rq->vq);
> > +
> > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > +                                 map_len, DMA_FROM_DEVICE, 0);
> > +       if (addr == DMA_MAPPING_ERROR)
> > +               return -ENOMEM;
> > +
> > +       dma = rq->dma_free;
> > +       rq->dma_free = dma->next;
> > +
> > +       dma->ref = 1;
> > +       dma->buf = buf;
> > +       dma->addr = addr;
> > +       dma->len = map_len;
> > +
> > +       rq->last_dma = dma;
> > +
> > +ok:
> > +       sg_init_table(rq->sg, 1);
> > +       rq->sg[0].dma_address = addr;
> > +       rq->sg[0].length = len;
> > +
> > +       return 0;
> > +}
> > +
> > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > +{
> > +       struct receive_queue *rq;
> > +       int i, err, j, num;
> > +
> > +       /* disable for big mode */
> > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > +               return 0;
> > +
> > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > +               if (err)
> > +                       continue;
> > +
> > +               rq = &vi->rq[i];
> > +
> > +               num = virtqueue_get_vring_size(rq->vq);
> > +
> > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > +               if (!rq->data_array)
>
> Can we avoid those allocations when we don't use the DMA API?
>
> > +                       goto err;
> > +
> > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > +               if (!rq->dma_array)
> > +                       goto err;
> > +
> > +               for (j = 0; j < num; ++j) {
> > +                       rq->data_array[j].next = rq->data_free;
> > +                       rq->data_free = &rq->data_array[j];
> > +
> > +                       rq->dma_array[j].next = rq->dma_free;
> > +                       rq->dma_free = &rq->dma_array[j];
> > +               }
> > +       }
> > +
> > +       return 0;
> > +
> > +err:
> > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > +               struct receive_queue *rq;
> > +
> > +               rq = &vi->rq[i];
> > +
> > +               kfree(rq->dma_array);
> > +               kfree(rq->data_array);
> > +       }
> > +
> > +       return -ENOMEM;
> > +}
> > +
> >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> >  {
> >         unsigned int len;
> > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> >                 void *buf;
> >                 int off;
> >
> > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> >                 if (unlikely(!buf))
> >                         goto err_buf;
> >
> > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> >                 return -EINVAL;
> >
> >         while (--*num_buf > 0) {
> > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> >                 if (unlikely(!buf)) {
> >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> >                                  dev->name, *num_buf,
> > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >         while (--num_buf) {
> >                 int num_skb_frags;
> >
> > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> >                 if (unlikely(!buf)) {
> >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> >                                  dev->name, num_buf,
> > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >  err_skb:
> >         put_page(page);
> >         while (num_buf-- > 1) {
> > -               buf = virtqueue_get_buf(rq->vq, &len);
> > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> >                 if (unlikely(!buf)) {
> >                         pr_debug("%s: rx error: %d buffers missing\n",
> >                                  dev->name, num_buf);
> > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> >         void *ctx = (void *)(unsigned long)xdp_headroom;
> >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > +       struct virtnet_rq_data *data;
> >         int err;
> >
> >         len = SKB_DATA_ALIGN(len) +
> > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> >         get_page(alloc_frag->page);
> >         alloc_frag->offset += len;
> > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > +
> > +       if (rq->data_array) {
> > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > +                                       vi->hdr_len + GOOD_PACKET_LEN);
>
> Thanks to the compound page. I wonder if everything could be
> simplified if we just reuse page->private for storing metadata like
> dma address and refcnt. Then we don't need extra stuff for tracking
> any other thing?

Maybe we can try to allocate one small buffer from the page_frag to store the dma
info when page_frag.offset == 0.
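
As a rough sketch of that idea (the field layout is illustrative, not code
from this series): when a brand-new page_frag page is started, carve a small
header off its front for the shared dma record, so every buffer later taken
from the same frag can find it at the head of the page.

	if (!alloc_frag->offset) {
		struct virtnet_rq_dma *dma;

		/* reserve space for the shared dma record at the head
		 * of the fresh frag page
		 */
		dma = page_address(alloc_frag->page);
		alloc_frag->offset = sizeof(*dma);

		dma->ref = 0;
		dma->len = 0;
		dma->addr = DMA_MAPPING_ERROR;
	}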

Thanks.


>
> Thanks
>
>
>
> > +               if (err)
> > +                       goto map_err;
> > +
> > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > +       } else {
> > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > +               data = (void *)buf;
> > +       }
> > +
> > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> >         if (err < 0)
> > -               put_page(virt_to_head_page(buf));
> > +               goto add_err;
> > +
> > +       return err;
> > +
> > +add_err:
> > +       if (rq->data_array) {
> > +               virtnet_rq_unmap(rq, data->dma);
> > +               virtnet_rq_recycle_data(rq, data);
> > +       }
> > +
> > +map_err:
> > +       put_page(virt_to_head_page(buf));
> >         return err;
> >  }
> >
> > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >         unsigned int headroom = virtnet_get_headroom(vi);
> >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > +       struct virtnet_rq_data *data;
> >         char *buf;
> >         void *ctx;
> >         int err;
> > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >                 alloc_frag->offset += hole;
> >         }
> >
> > -       sg_init_one(rq->sg, buf, len);
> > +       if (rq->data_array) {
> > +               err = virtnet_rq_map_sg(rq, buf, len);
> > +               if (err)
> > +                       goto map_err;
> > +
> > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > +       } else {
> > +               sg_init_one(rq->sg, buf, len);
> > +               data = (void *)buf;
> > +       }
> > +
> >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> >         if (err < 0)
> > -               put_page(virt_to_head_page(buf));
> > +               goto add_err;
> > +
> > +       return 0;
> > +
> > +add_err:
> > +       if (rq->data_array) {
> > +               virtnet_rq_unmap(rq, data->dma);
> > +               virtnet_rq_recycle_data(rq, data);
> > +       }
> >
> > +map_err:
> > +       put_page(virt_to_head_page(buf));
> >         return err;
> >  }
> >
> > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> >                 void *ctx;
> >
> >                 while (stats.packets < budget &&
> > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> >                         stats.packets++;
> >                 }
> >         } else {
> >                 while (stats.packets < budget &&
> > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> >                         stats.packets++;
> >                 }
> > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> >         for (i = 0; i < vi->max_queue_pairs; i++) {
> >                 __netif_napi_del(&vi->rq[i].napi);
> >                 __netif_napi_del(&vi->sq[i].napi);
> > +
> > +               kfree(vi->rq[i].data_array);
> > +               kfree(vi->rq[i].dma_array);
> >         }
> >
> >         /* We called __netif_napi_del(),
> > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> >         }
> >
> >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > -               struct virtqueue *vq = vi->rq[i].vq;
> > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > -                       virtnet_rq_free_unused_buf(vq, buf);
> > +               struct receive_queue *rq = &vi->rq[i];
> > +
> > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> >                 cond_resched();
> >         }
> >  }
> > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> >         if (ret)
> >                 goto err_free;
> >
> > +       ret = virtnet_rq_merge_map_init(vi);
> > +       if (ret)
> > +               goto err_free;
> > +
> >         cpus_read_lock();
> >         virtnet_set_affinity(vi);
> >         cpus_read_unlock();
> > --
> > 2.32.0.3.g01195cf9f
> >
>
Jason Wang July 14, 2023, 3:56 a.m. UTC | #16
On Thu, Jul 13, 2023 at 2:54 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Thu, 13 Jul 2023 12:20:01 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Mon, Jul 10, 2023 at 11:43 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> >
> > I'd suggest to tweak the title like:
> >
> > "merge dma operations when refilling mergeable buffers"
> >
> > > Currently, the virtio core will perform a dma operation for each
> > > operation.
> >
> > "for each buffer"?
> >
> > > Although, the same page may be operated multiple times.
> > >
> > > The driver does the dma operation and manages the dma address based the
> > > feature premapped of virtio core.
> > >
> > > This way, we can perform only one dma operation for the same page. In
> > > the case of mtu 1500, this can reduce a lot of dma operations.
> > >
> > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > increased from 1893766 to 1901105. An increase of 0.4%.
> >
> > Btw, it looks to me the code to deal with XDP_TX/REDIRECT for
> > linearized pages was missed.
> >
> > >
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > ---
> > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 486b5849033d..4de845d35bed 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > >
> > > +/* The bufs on the same page may share this struct. */
> > > +struct virtnet_rq_dma {
> > > +       struct virtnet_rq_dma *next;
> > > +
> > > +       dma_addr_t addr;
> > > +
> > > +       void *buf;
> > > +       u32 len;
> > > +
> > > +       u32 ref;
> > > +};
> > > +
> > > +/* Record the dma and buf. */
> > > +struct virtnet_rq_data {
> > > +       struct virtnet_rq_data *next;
> > > +
> > > +       void *buf;
> > > +
> > > +       struct virtnet_rq_dma *dma;
> > > +};
> > > +
> > >  /* Internal representation of a send virtqueue */
> > >  struct send_queue {
> > >         /* Virtqueue associated with this send _queue */
> > > @@ -175,6 +196,13 @@ struct receive_queue {
> > >         char name[16];
> > >
> > >         struct xdp_rxq_info xdp_rxq;
> > > +
> > > +       struct virtnet_rq_data *data_array;
> > > +       struct virtnet_rq_data *data_free;
> > > +
> > > +       struct virtnet_rq_dma *dma_array;
> > > +       struct virtnet_rq_dma *dma_free;
> > > +       struct virtnet_rq_dma *last_dma;
> > >  };
> > >
> > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > >         return skb;
> > >  }
> > >
> > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > +{
> > > +       struct device *dev;
> > > +
> > > +       --dma->ref;
> > > +
> > > +       if (dma->ref)
> > > +               return;
> > > +
> > > +       dev = virtqueue_dma_dev(rq->vq);
> > > +
> > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > +
> > > +       dma->next = rq->dma_free;
> > > +       rq->dma_free = dma;
> > > +}
> > > +
> > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > +                                    struct virtnet_rq_data *data)
> > > +{
> > > +       void *buf;
> > > +
> > > +       buf = data->buf;
> > > +
> > > +       data->next = rq->data_free;
> > > +       rq->data_free = data;
> > > +
> > > +       return buf;
> > > +}
> > > +
> > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > +                                                  void *buf,
> > > +                                                  struct virtnet_rq_dma *dma)
> > > +{
> > > +       struct virtnet_rq_data *data;
> > > +
> > > +       data = rq->data_free;
> > > +       rq->data_free = data->next;
> > > +
> > > +       data->buf = buf;
> > > +       data->dma = dma;
> > > +
> > > +       return data;
> > > +}
> > > +
> > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > +{
> > > +       struct virtnet_rq_data *data;
> > > +       void *buf;
> > > +
> > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > +       if (!buf || !rq->data_array)
> > > +               return buf;
> > > +
> > > +       data = buf;
> > > +
> > > +       virtnet_rq_unmap(rq, data->dma);
> > > +
> > > +       return virtnet_rq_recycle_data(rq, data);
> > > +}
> > > +
> > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > +{
> > > +       struct virtnet_rq_data *data;
> > > +       void *buf;
> > > +
> > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > +       if (!buf || !rq->data_array)
> > > +               return buf;
> > > +
> > > +       data = buf;
> > > +
> > > +       virtnet_rq_unmap(rq, data->dma);
> > > +
> > > +       return virtnet_rq_recycle_data(rq, data);
> > > +}
> > > +
> > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > +{
> > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > +       struct device *dev;
> > > +       u32 off, map_len;
> > > +       dma_addr_t addr;
> > > +       void *end;
> > > +
> > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > +               ++dma->ref;
> > > +               addr = dma->addr + (buf - dma->buf);
> > > +               goto ok;
> > > +       }
> > > +
> > > +       end = buf + len - 1;
> > > +       off = offset_in_page(end);
> > > +       map_len = len + PAGE_SIZE - off;
> >
> > This assumes a PAGE_SIZE which seems sub-optimal as page frag could be
> > larger than this.
> >
> > > +
> > > +       dev = virtqueue_dma_dev(rq->vq);
> > > +
> > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > +       if (addr == DMA_MAPPING_ERROR)
> > > +               return -ENOMEM;
> > > +
> > > +       dma = rq->dma_free;
> > > +       rq->dma_free = dma->next;
> > > +
> > > +       dma->ref = 1;
> > > +       dma->buf = buf;
> > > +       dma->addr = addr;
> > > +       dma->len = map_len;
> > > +
> > > +       rq->last_dma = dma;
> > > +
> > > +ok:
> > > +       sg_init_table(rq->sg, 1);
> > > +       rq->sg[0].dma_address = addr;
> > > +       rq->sg[0].length = len;
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > +{
> > > +       struct receive_queue *rq;
> > > +       int i, err, j, num;
> > > +
> > > +       /* disable for big mode */
> > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > +               return 0;
> > > +
> > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > +               if (err)
> > > +                       continue;
> > > +
> > > +               rq = &vi->rq[i];
> > > +
> > > +               num = virtqueue_get_vring_size(rq->vq);
> > > +
> > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > +               if (!rq->data_array)
> >
> > Can we avoid those allocations when we don't use the DMA API?
> >
> > > +                       goto err;
> > > +
> > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > +               if (!rq->dma_array)
> > > +                       goto err;
> > > +
> > > +               for (j = 0; j < num; ++j) {
> > > +                       rq->data_array[j].next = rq->data_free;
> > > +                       rq->data_free = &rq->data_array[j];
> > > +
> > > +                       rq->dma_array[j].next = rq->dma_free;
> > > +                       rq->dma_free = &rq->dma_array[j];
> > > +               }
> > > +       }
> > > +
> > > +       return 0;
> > > +
> > > +err:
> > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +               struct receive_queue *rq;
> > > +
> > > +               rq = &vi->rq[i];
> > > +
> > > +               kfree(rq->dma_array);
> > > +               kfree(rq->data_array);
> > > +       }
> > > +
> > > +       return -ENOMEM;
> > > +}
> > > +
> > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > >  {
> > >         unsigned int len;
> > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > >                 void *buf;
> > >                 int off;
> > >
> > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > >                 if (unlikely(!buf))
> > >                         goto err_buf;
> > >
> > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > >                 return -EINVAL;
> > >
> > >         while (--*num_buf > 0) {
> > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > >                 if (unlikely(!buf)) {
> > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > >                                  dev->name, *num_buf,
> > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >         while (--num_buf) {
> > >                 int num_skb_frags;
> > >
> > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > >                 if (unlikely(!buf)) {
> > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > >                                  dev->name, num_buf,
> > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >  err_skb:
> > >         put_page(page);
> > >         while (num_buf-- > 1) {
> > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > >                 if (unlikely(!buf)) {
> > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > >                                  dev->name, num_buf);
> > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > +       struct virtnet_rq_data *data;
> > >         int err;
> > >
> > >         len = SKB_DATA_ALIGN(len) +
> > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > >         get_page(alloc_frag->page);
> > >         alloc_frag->offset += len;
> > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > +
> > > +       if (rq->data_array) {
> > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> >
> > Thanks to the compound page. I wonder if everything could be
> > simplified if we just reuse page->private for storing metadata like
> > dma address and refcnt. Then we don't need extra stuff for tracking
> > any other thing?
>
> I didn't use page->private because if part of the page is used by one skb then
> the driver is not the only owner. Can we still use page->private?

You are right, we can't since there's no guarantee that a skb will
occupy a full page.

Thanks

>
> Thanks.
>
>
>
>
> >
> > Thanks
> >
> >
> >
> > > +               if (err)
> > > +                       goto map_err;
> > > +
> > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > +       } else {
> > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > +               data = (void *)buf;
> > > +       }
> > > +
> > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > >         if (err < 0)
> > > -               put_page(virt_to_head_page(buf));
> > > +               goto add_err;
> > > +
> > > +       return err;
> > > +
> > > +add_err:
> > > +       if (rq->data_array) {
> > > +               virtnet_rq_unmap(rq, data->dma);
> > > +               virtnet_rq_recycle_data(rq, data);
> > > +       }
> > > +
> > > +map_err:
> > > +       put_page(virt_to_head_page(buf));
> > >         return err;
> > >  }
> > >
> > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > >         unsigned int headroom = virtnet_get_headroom(vi);
> > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > +       struct virtnet_rq_data *data;
> > >         char *buf;
> > >         void *ctx;
> > >         int err;
> > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > >                 alloc_frag->offset += hole;
> > >         }
> > >
> > > -       sg_init_one(rq->sg, buf, len);
> > > +       if (rq->data_array) {
> > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > +               if (err)
> > > +                       goto map_err;
> > > +
> > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > +       } else {
> > > +               sg_init_one(rq->sg, buf, len);
> > > +               data = (void *)buf;
> > > +       }
> > > +
> > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > >         if (err < 0)
> > > -               put_page(virt_to_head_page(buf));
> > > +               goto add_err;
> > > +
> > > +       return 0;
> > > +
> > > +add_err:
> > > +       if (rq->data_array) {
> > > +               virtnet_rq_unmap(rq, data->dma);
> > > +               virtnet_rq_recycle_data(rq, data);
> > > +       }
> > >
> > > +map_err:
> > > +       put_page(virt_to_head_page(buf));
> > >         return err;
> > >  }
> > >
> > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > >                 void *ctx;
> > >
> > >                 while (stats.packets < budget &&
> > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > >                         stats.packets++;
> > >                 }
> > >         } else {
> > >                 while (stats.packets < budget &&
> > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > >                         stats.packets++;
> > >                 }
> > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > >                 __netif_napi_del(&vi->rq[i].napi);
> > >                 __netif_napi_del(&vi->sq[i].napi);
> > > +
> > > +               kfree(vi->rq[i].data_array);
> > > +               kfree(vi->rq[i].dma_array);
> > >         }
> > >
> > >         /* We called __netif_napi_del(),
> > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > >         }
> > >
> > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > +               struct receive_queue *rq = &vi->rq[i];
> > > +
> > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > >                 cond_resched();
> > >         }
> > >  }
> > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > >         if (ret)
> > >                 goto err_free;
> > >
> > > +       ret = virtnet_rq_merge_map_init(vi);
> > > +       if (ret)
> > > +               goto err_free;
> > > +
> > >         cpus_read_lock();
> > >         virtnet_set_affinity(vi);
> > >         cpus_read_unlock();
> > > --
> > > 2.32.0.3.g01195cf9f
> > >
> >
>
Jason Wang July 14, 2023, 3:57 a.m. UTC | #17
On Thu, Jul 13, 2023 at 3:02 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Thu, 13 Jul 2023 12:20:01 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Mon, Jul 10, 2023 at 11:43 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> >
> > I'd suggest to tweak the title like:
> >
> > "merge dma operations when refilling mergeable buffers"
> >
> > > Currently, the virtio core will perform a dma operation for each
> > > operation.
> >
> > "for each buffer"?
> >
> > > Although, the same page may be operated multiple times.
> > >
> > > The driver does the dma operation and manages the dma address based the
> > > feature premapped of virtio core.
> > >
> > > This way, we can perform only one dma operation for the same page. In
> > > the case of mtu 1500, this can reduce a lot of dma operations.
> > >
> > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > increased from 1893766 to 1901105. An increase of 0.4%.
> >
> > Btw, it looks to me the code to deal with XDP_TX/REDIRECT for
> > linearized pages was missed.
> >
> > >
> > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > ---
> > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 486b5849033d..4de845d35bed 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > >
> > > +/* The bufs on the same page may share this struct. */
> > > +struct virtnet_rq_dma {
> > > +       struct virtnet_rq_dma *next;
> > > +
> > > +       dma_addr_t addr;
> > > +
> > > +       void *buf;
> > > +       u32 len;
> > > +
> > > +       u32 ref;
> > > +};
> > > +
> > > +/* Record the dma and buf. */
> > > +struct virtnet_rq_data {
> > > +       struct virtnet_rq_data *next;
> > > +
> > > +       void *buf;
> > > +
> > > +       struct virtnet_rq_dma *dma;
> > > +};
> > > +
> > >  /* Internal representation of a send virtqueue */
> > >  struct send_queue {
> > >         /* Virtqueue associated with this send _queue */
> > > @@ -175,6 +196,13 @@ struct receive_queue {
> > >         char name[16];
> > >
> > >         struct xdp_rxq_info xdp_rxq;
> > > +
> > > +       struct virtnet_rq_data *data_array;
> > > +       struct virtnet_rq_data *data_free;
> > > +
> > > +       struct virtnet_rq_dma *dma_array;
> > > +       struct virtnet_rq_dma *dma_free;
> > > +       struct virtnet_rq_dma *last_dma;
> > >  };
> > >
> > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > >         return skb;
> > >  }
> > >
> > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > +{
> > > +       struct device *dev;
> > > +
> > > +       --dma->ref;
> > > +
> > > +       if (dma->ref)
> > > +               return;
> > > +
> > > +       dev = virtqueue_dma_dev(rq->vq);
> > > +
> > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > +
> > > +       dma->next = rq->dma_free;
> > > +       rq->dma_free = dma;
> > > +}
> > > +
> > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > +                                    struct virtnet_rq_data *data)
> > > +{
> > > +       void *buf;
> > > +
> > > +       buf = data->buf;
> > > +
> > > +       data->next = rq->data_free;
> > > +       rq->data_free = data;
> > > +
> > > +       return buf;
> > > +}
> > > +
> > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > +                                                  void *buf,
> > > +                                                  struct virtnet_rq_dma *dma)
> > > +{
> > > +       struct virtnet_rq_data *data;
> > > +
> > > +       data = rq->data_free;
> > > +       rq->data_free = data->next;
> > > +
> > > +       data->buf = buf;
> > > +       data->dma = dma;
> > > +
> > > +       return data;
> > > +}
> > > +
> > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > +{
> > > +       struct virtnet_rq_data *data;
> > > +       void *buf;
> > > +
> > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > +       if (!buf || !rq->data_array)
> > > +               return buf;
> > > +
> > > +       data = buf;
> > > +
> > > +       virtnet_rq_unmap(rq, data->dma);
> > > +
> > > +       return virtnet_rq_recycle_data(rq, data);
> > > +}
> > > +
> > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > +{
> > > +       struct virtnet_rq_data *data;
> > > +       void *buf;
> > > +
> > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > +       if (!buf || !rq->data_array)
> > > +               return buf;
> > > +
> > > +       data = buf;
> > > +
> > > +       virtnet_rq_unmap(rq, data->dma);
> > > +
> > > +       return virtnet_rq_recycle_data(rq, data);
> > > +}
> > > +
> > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > +{
> > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > +       struct device *dev;
> > > +       u32 off, map_len;
> > > +       dma_addr_t addr;
> > > +       void *end;
> > > +
> > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > +               ++dma->ref;
> > > +               addr = dma->addr + (buf - dma->buf);
> > > +               goto ok;
> > > +       }
> > > +
> > > +       end = buf + len - 1;
> > > +       off = offset_in_page(end);
> > > +       map_len = len + PAGE_SIZE - off;
> >
> > This assumes a PAGE_SIZE which seems sub-optimal as page frag could be
> > larger than this.
> >
> > > +
> > > +       dev = virtqueue_dma_dev(rq->vq);
> > > +
> > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > +       if (addr == DMA_MAPPING_ERROR)
> > > +               return -ENOMEM;
> > > +
> > > +       dma = rq->dma_free;
> > > +       rq->dma_free = dma->next;
> > > +
> > > +       dma->ref = 1;
> > > +       dma->buf = buf;
> > > +       dma->addr = addr;
> > > +       dma->len = map_len;
> > > +
> > > +       rq->last_dma = dma;
> > > +
> > > +ok:
> > > +       sg_init_table(rq->sg, 1);
> > > +       rq->sg[0].dma_address = addr;
> > > +       rq->sg[0].length = len;
> > > +
> > > +       return 0;
> > > +}
> > > +
> > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > +{
> > > +       struct receive_queue *rq;
> > > +       int i, err, j, num;
> > > +
> > > +       /* disable for big mode */
> > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > +               return 0;
> > > +
> > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > +               if (err)
> > > +                       continue;
> > > +
> > > +               rq = &vi->rq[i];
> > > +
> > > +               num = virtqueue_get_vring_size(rq->vq);
> > > +
> > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > +               if (!rq->data_array)
> >
> > Can we avoid those allocations when we don't use the DMA API?
> >
> > > +                       goto err;
> > > +
> > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > +               if (!rq->dma_array)
> > > +                       goto err;
> > > +
> > > +               for (j = 0; j < num; ++j) {
> > > +                       rq->data_array[j].next = rq->data_free;
> > > +                       rq->data_free = &rq->data_array[j];
> > > +
> > > +                       rq->dma_array[j].next = rq->dma_free;
> > > +                       rq->dma_free = &rq->dma_array[j];
> > > +               }
> > > +       }
> > > +
> > > +       return 0;
> > > +
> > > +err:
> > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > +               struct receive_queue *rq;
> > > +
> > > +               rq = &vi->rq[i];
> > > +
> > > +               kfree(rq->dma_array);
> > > +               kfree(rq->data_array);
> > > +       }
> > > +
> > > +       return -ENOMEM;
> > > +}
> > > +
> > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > >  {
> > >         unsigned int len;
> > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > >                 void *buf;
> > >                 int off;
> > >
> > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > >                 if (unlikely(!buf))
> > >                         goto err_buf;
> > >
> > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > >                 return -EINVAL;
> > >
> > >         while (--*num_buf > 0) {
> > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > >                 if (unlikely(!buf)) {
> > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > >                                  dev->name, *num_buf,
> > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >         while (--num_buf) {
> > >                 int num_skb_frags;
> > >
> > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > >                 if (unlikely(!buf)) {
> > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > >                                  dev->name, num_buf,
> > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >  err_skb:
> > >         put_page(page);
> > >         while (num_buf-- > 1) {
> > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > >                 if (unlikely(!buf)) {
> > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > >                                  dev->name, num_buf);
> > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > +       struct virtnet_rq_data *data;
> > >         int err;
> > >
> > >         len = SKB_DATA_ALIGN(len) +
> > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > >         get_page(alloc_frag->page);
> > >         alloc_frag->offset += len;
> > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > +
> > > +       if (rq->data_array) {
> > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> >
> > Thanks to the compound page. I wonder if everything could be
> > simplified if we just reuse page->private for storing metadata like
> > dma address and refcnt. Then we don't need extra stuff for tracking
> > any other thing?
>
> Maybe we can try alloc one small buffer from the page_frag to store the dma info
> when page_frag.offset == 0.

And store it in the ctx? I think it should work.

Thanks

>
> Thanks.
>
>
> >
> > Thanks
> >
> >
> >
> > > +               if (err)
> > > +                       goto map_err;
> > > +
> > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > +       } else {
> > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > +               data = (void *)buf;
> > > +       }
> > > +
> > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > >         if (err < 0)
> > > -               put_page(virt_to_head_page(buf));
> > > +               goto add_err;
> > > +
> > > +       return err;
> > > +
> > > +add_err:
> > > +       if (rq->data_array) {
> > > +               virtnet_rq_unmap(rq, data->dma);
> > > +               virtnet_rq_recycle_data(rq, data);
> > > +       }
> > > +
> > > +map_err:
> > > +       put_page(virt_to_head_page(buf));
> > >         return err;
> > >  }
> > >
> > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > >         unsigned int headroom = virtnet_get_headroom(vi);
> > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > +       struct virtnet_rq_data *data;
> > >         char *buf;
> > >         void *ctx;
> > >         int err;
> > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > >                 alloc_frag->offset += hole;
> > >         }
> > >
> > > -       sg_init_one(rq->sg, buf, len);
> > > +       if (rq->data_array) {
> > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > +               if (err)
> > > +                       goto map_err;
> > > +
> > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > +       } else {
> > > +               sg_init_one(rq->sg, buf, len);
> > > +               data = (void *)buf;
> > > +       }
> > > +
> > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > >         if (err < 0)
> > > -               put_page(virt_to_head_page(buf));
> > > +               goto add_err;
> > > +
> > > +       return 0;
> > > +
> > > +add_err:
> > > +       if (rq->data_array) {
> > > +               virtnet_rq_unmap(rq, data->dma);
> > > +               virtnet_rq_recycle_data(rq, data);
> > > +       }
> > >
> > > +map_err:
> > > +       put_page(virt_to_head_page(buf));
> > >         return err;
> > >  }
> > >
> > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > >                 void *ctx;
> > >
> > >                 while (stats.packets < budget &&
> > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > >                         stats.packets++;
> > >                 }
> > >         } else {
> > >                 while (stats.packets < budget &&
> > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > >                         stats.packets++;
> > >                 }
> > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > >                 __netif_napi_del(&vi->rq[i].napi);
> > >                 __netif_napi_del(&vi->sq[i].napi);
> > > +
> > > +               kfree(vi->rq[i].data_array);
> > > +               kfree(vi->rq[i].dma_array);
> > >         }
> > >
> > >         /* We called __netif_napi_del(),
> > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > >         }
> > >
> > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > +               struct receive_queue *rq = &vi->rq[i];
> > > +
> > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > >                 cond_resched();
> > >         }
> > >  }
> > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > >         if (ret)
> > >                 goto err_free;
> > >
> > > +       ret = virtnet_rq_merge_map_init(vi);
> > > +       if (ret)
> > > +               goto err_free;
> > > +
> > >         cpus_read_lock();
> > >         virtnet_set_affinity(vi);
> > >         cpus_read_unlock();
> > > --
> > > 2.32.0.3.g01195cf9f
> > >
> >
>
Xuan Zhuo July 14, 2023, 3:58 a.m. UTC | #18
On Fri, 14 Jul 2023 11:57:05 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Thu, Jul 13, 2023 at 3:02 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > On Thu, 13 Jul 2023 12:20:01 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > On Mon, Jul 10, 2023 at 11:43 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > >
> > >
> > > I'd suggest to tweak the title like:
> > >
> > > "merge dma operations when refilling mergeable buffers"
> > >
> > > > Currently, the virtio core will perform a dma operation for each
> > > > operation.
> > >
> > > "for each buffer"?
> > >
> > > > Although, the same page may be operated multiple times.
> > > >
> > > > The driver does the dma operation and manages the dma address based the
> > > > feature premapped of virtio core.
> > > >
> > > > This way, we can perform only one dma operation for the same page. In
> > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > >
> > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > >
> > > Btw, it looks to me the code to deal with XDP_TX/REDIRECT for
> > > linearized pages was missed.
> > >
> > > >
> > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > ---
> > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > >
> > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > index 486b5849033d..4de845d35bed 100644
> > > > --- a/drivers/net/virtio_net.c
> > > > +++ b/drivers/net/virtio_net.c
> > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > >
> > > > +/* The bufs on the same page may share this struct. */
> > > > +struct virtnet_rq_dma {
> > > > +       struct virtnet_rq_dma *next;
> > > > +
> > > > +       dma_addr_t addr;
> > > > +
> > > > +       void *buf;
> > > > +       u32 len;
> > > > +
> > > > +       u32 ref;
> > > > +};
> > > > +
> > > > +/* Record the dma and buf. */
> > > > +struct virtnet_rq_data {
> > > > +       struct virtnet_rq_data *next;
> > > > +
> > > > +       void *buf;
> > > > +
> > > > +       struct virtnet_rq_dma *dma;
> > > > +};
> > > > +
> > > >  /* Internal representation of a send virtqueue */
> > > >  struct send_queue {
> > > >         /* Virtqueue associated with this send _queue */
> > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > >         char name[16];
> > > >
> > > >         struct xdp_rxq_info xdp_rxq;
> > > > +
> > > > +       struct virtnet_rq_data *data_array;
> > > > +       struct virtnet_rq_data *data_free;
> > > > +
> > > > +       struct virtnet_rq_dma *dma_array;
> > > > +       struct virtnet_rq_dma *dma_free;
> > > > +       struct virtnet_rq_dma *last_dma;
> > > >  };
> > > >
> > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > >         return skb;
> > > >  }
> > > >
> > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > +{
> > > > +       struct device *dev;
> > > > +
> > > > +       --dma->ref;
> > > > +
> > > > +       if (dma->ref)
> > > > +               return;
> > > > +
> > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > +
> > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > +
> > > > +       dma->next = rq->dma_free;
> > > > +       rq->dma_free = dma;
> > > > +}
> > > > +
> > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > +                                    struct virtnet_rq_data *data)
> > > > +{
> > > > +       void *buf;
> > > > +
> > > > +       buf = data->buf;
> > > > +
> > > > +       data->next = rq->data_free;
> > > > +       rq->data_free = data;
> > > > +
> > > > +       return buf;
> > > > +}
> > > > +
> > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > +                                                  void *buf,
> > > > +                                                  struct virtnet_rq_dma *dma)
> > > > +{
> > > > +       struct virtnet_rq_data *data;
> > > > +
> > > > +       data = rq->data_free;
> > > > +       rq->data_free = data->next;
> > > > +
> > > > +       data->buf = buf;
> > > > +       data->dma = dma;
> > > > +
> > > > +       return data;
> > > > +}
> > > > +
> > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > +{
> > > > +       struct virtnet_rq_data *data;
> > > > +       void *buf;
> > > > +
> > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > +       if (!buf || !rq->data_array)
> > > > +               return buf;
> > > > +
> > > > +       data = buf;
> > > > +
> > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > +
> > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > +}
> > > > +
> > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > +{
> > > > +       struct virtnet_rq_data *data;
> > > > +       void *buf;
> > > > +
> > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > +       if (!buf || !rq->data_array)
> > > > +               return buf;
> > > > +
> > > > +       data = buf;
> > > > +
> > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > +
> > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > +}
> > > > +
> > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > +{
> > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > +       struct device *dev;
> > > > +       u32 off, map_len;
> > > > +       dma_addr_t addr;
> > > > +       void *end;
> > > > +
> > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > +               ++dma->ref;
> > > > +               addr = dma->addr + (buf - dma->buf);
> > > > +               goto ok;
> > > > +       }
> > > > +
> > > > +       end = buf + len - 1;
> > > > +       off = offset_in_page(end);
> > > > +       map_len = len + PAGE_SIZE - off;
> > >
> > > This assumes a PAGE_SIZE which seems sub-optimal as page frag could be
> > > larger than this.
> > >
> > > > +
> > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > +
> > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > +               return -ENOMEM;
> > > > +
> > > > +       dma = rq->dma_free;
> > > > +       rq->dma_free = dma->next;
> > > > +
> > > > +       dma->ref = 1;
> > > > +       dma->buf = buf;
> > > > +       dma->addr = addr;
> > > > +       dma->len = map_len;
> > > > +
> > > > +       rq->last_dma = dma;
> > > > +
> > > > +ok:
> > > > +       sg_init_table(rq->sg, 1);
> > > > +       rq->sg[0].dma_address = addr;
> > > > +       rq->sg[0].length = len;
> > > > +
> > > > +       return 0;
> > > > +}
> > > > +
> > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > +{
> > > > +       struct receive_queue *rq;
> > > > +       int i, err, j, num;
> > > > +
> > > > +       /* disable for big mode */
> > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > +               return 0;
> > > > +
> > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > +               if (err)
> > > > +                       continue;
> > > > +
> > > > +               rq = &vi->rq[i];
> > > > +
> > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > +
> > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > +               if (!rq->data_array)
> > >
> > > Can we avoid those allocations when we don't use the DMA API?
> > >
> > > > +                       goto err;
> > > > +
> > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > +               if (!rq->dma_array)
> > > > +                       goto err;
> > > > +
> > > > +               for (j = 0; j < num; ++j) {
> > > > +                       rq->data_array[j].next = rq->data_free;
> > > > +                       rq->data_free = &rq->data_array[j];
> > > > +
> > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > +               }
> > > > +       }
> > > > +
> > > > +       return 0;
> > > > +
> > > > +err:
> > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > +               struct receive_queue *rq;
> > > > +
> > > > +               rq = &vi->rq[i];
> > > > +
> > > > +               kfree(rq->dma_array);
> > > > +               kfree(rq->data_array);
> > > > +       }
> > > > +
> > > > +       return -ENOMEM;
> > > > +}
> > > > +
> > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > >  {
> > > >         unsigned int len;
> > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > >                 void *buf;
> > > >                 int off;
> > > >
> > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > >                 if (unlikely(!buf))
> > > >                         goto err_buf;
> > > >
> > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > >                 return -EINVAL;
> > > >
> > > >         while (--*num_buf > 0) {
> > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > >                 if (unlikely(!buf)) {
> > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > >                                  dev->name, *num_buf,
> > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > >         while (--num_buf) {
> > > >                 int num_skb_frags;
> > > >
> > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > >                 if (unlikely(!buf)) {
> > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > >                                  dev->name, num_buf,
> > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > >  err_skb:
> > > >         put_page(page);
> > > >         while (num_buf-- > 1) {
> > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > >                 if (unlikely(!buf)) {
> > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > >                                  dev->name, num_buf);
> > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > +       struct virtnet_rq_data *data;
> > > >         int err;
> > > >
> > > >         len = SKB_DATA_ALIGN(len) +
> > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > >         get_page(alloc_frag->page);
> > > >         alloc_frag->offset += len;
> > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > +
> > > > +       if (rq->data_array) {
> > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > >
> > > Thanks to the compound page. I wonder if everything could be
> > > simplified if we just reuse page->private for storing metadata like
> > > dma address and refcnt. Then we don't need extra stuff for tracking
> > > any other thing?
> >
> > Maybe we can try alloc one small buffer from the page_frag to store the dma info
> > when page_frag.offset == 0.
>
> And store it in the ctx? I think it should work.


Since the dma information is located on the first page of the compound page, we
can get the dma information through buf.

No need to modify ctx.
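
A minimal sketch of that lookup (assuming the record is written at the very
start of the head page when the frag is first used; the helper name is
illustrative):

	static struct virtnet_rq_dma *virtnet_rq_get_dma_info(void *buf)
	{
		struct page *page = virt_to_head_page(buf);

		/* the shared dma record sits at the start of the
		 * (compound) page backing the whole frag
		 */
		return page_address(page);
	}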

Thanks.


>
> Thanks
>
> >
> > Thanks.
> >
> >
> > >
> > > Thanks
> > >
> > >
> > >
> > > > +               if (err)
> > > > +                       goto map_err;
> > > > +
> > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > +       } else {
> > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > +               data = (void *)buf;
> > > > +       }
> > > > +
> > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > >         if (err < 0)
> > > > -               put_page(virt_to_head_page(buf));
> > > > +               goto add_err;
> > > > +
> > > > +       return err;
> > > > +
> > > > +add_err:
> > > > +       if (rq->data_array) {
> > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > +               virtnet_rq_recycle_data(rq, data);
> > > > +       }
> > > > +
> > > > +map_err:
> > > > +       put_page(virt_to_head_page(buf));
> > > >         return err;
> > > >  }
> > > >
> > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > +       struct virtnet_rq_data *data;
> > > >         char *buf;
> > > >         void *ctx;
> > > >         int err;
> > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > >                 alloc_frag->offset += hole;
> > > >         }
> > > >
> > > > -       sg_init_one(rq->sg, buf, len);
> > > > +       if (rq->data_array) {
> > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > +               if (err)
> > > > +                       goto map_err;
> > > > +
> > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > +       } else {
> > > > +               sg_init_one(rq->sg, buf, len);
> > > > +               data = (void *)buf;
> > > > +       }
> > > > +
> > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > >         if (err < 0)
> > > > -               put_page(virt_to_head_page(buf));
> > > > +               goto add_err;
> > > > +
> > > > +       return 0;
> > > > +
> > > > +add_err:
> > > > +       if (rq->data_array) {
> > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > +               virtnet_rq_recycle_data(rq, data);
> > > > +       }
> > > >
> > > > +map_err:
> > > > +       put_page(virt_to_head_page(buf));
> > > >         return err;
> > > >  }
> > > >
> > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > >                 void *ctx;
> > > >
> > > >                 while (stats.packets < budget &&
> > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > >                         stats.packets++;
> > > >                 }
> > > >         } else {
> > > >                 while (stats.packets < budget &&
> > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > >                         stats.packets++;
> > > >                 }
> > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > +
> > > > +               kfree(vi->rq[i].data_array);
> > > > +               kfree(vi->rq[i].dma_array);
> > > >         }
> > > >
> > > >         /* We called __netif_napi_del(),
> > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > >         }
> > > >
> > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > +
> > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > >                 cond_resched();
> > > >         }
> > > >  }
> > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > >         if (ret)
> > > >                 goto err_free;
> > > >
> > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > +       if (ret)
> > > > +               goto err_free;
> > > > +
> > > >         cpus_read_lock();
> > > >         virtnet_set_affinity(vi);
> > > >         cpus_read_unlock();
> > > > --
> > > > 2.32.0.3.g01195cf9f
> > > >
> > >
> >
>
Jason Wang July 14, 2023, 5:45 a.m. UTC | #19
On Fri, Jul 14, 2023 at 12:37 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Fri, 14 Jul 2023 11:57:05 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Thu, Jul 13, 2023 at 3:02 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > On Thu, 13 Jul 2023 12:20:01 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > On Mon, Jul 10, 2023 at 11:43 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > >
> > > >
> > > > I'd suggest to tweak the title like:
> > > >
> > > > "merge dma operations when refilling mergeable buffers"
> > > >
> > > > > Currently, the virtio core will perform a dma operation for each
> > > > > operation.
> > > >
> > > > "for each buffer"?
> > > >
> > > > > Although, the same page may be operated multiple times.
> > > > >
> > > > > The driver does the dma operation and manages the dma address based the
> > > > > feature premapped of virtio core.
> > > > >
> > > > > This way, we can perform only one dma operation for the same page. In
> > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > >
> > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > >
> > > > Btw, it looks to me the code to deal with XDP_TX/REDIRECT for
> > > > linearized pages was missed.
> > > >
> > > > >
> > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > ---
> > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > index 486b5849033d..4de845d35bed 100644
> > > > > --- a/drivers/net/virtio_net.c
> > > > > +++ b/drivers/net/virtio_net.c
> > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > >
> > > > > +/* The bufs on the same page may share this struct. */
> > > > > +struct virtnet_rq_dma {
> > > > > +       struct virtnet_rq_dma *next;
> > > > > +
> > > > > +       dma_addr_t addr;
> > > > > +
> > > > > +       void *buf;
> > > > > +       u32 len;
> > > > > +
> > > > > +       u32 ref;
> > > > > +};
> > > > > +
> > > > > +/* Record the dma and buf. */
> > > > > +struct virtnet_rq_data {
> > > > > +       struct virtnet_rq_data *next;
> > > > > +
> > > > > +       void *buf;
> > > > > +
> > > > > +       struct virtnet_rq_dma *dma;
> > > > > +};
> > > > > +
> > > > >  /* Internal representation of a send virtqueue */
> > > > >  struct send_queue {
> > > > >         /* Virtqueue associated with this send _queue */
> > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > >         char name[16];
> > > > >
> > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > +
> > > > > +       struct virtnet_rq_data *data_array;
> > > > > +       struct virtnet_rq_data *data_free;
> > > > > +
> > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > +       struct virtnet_rq_dma *last_dma;
> > > > >  };
> > > > >
> > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > >         return skb;
> > > > >  }
> > > > >
> > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > +{
> > > > > +       struct device *dev;
> > > > > +
> > > > > +       --dma->ref;
> > > > > +
> > > > > +       if (dma->ref)
> > > > > +               return;
> > > > > +
> > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > +
> > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > > +
> > > > > +       dma->next = rq->dma_free;
> > > > > +       rq->dma_free = dma;
> > > > > +}
> > > > > +
> > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > +                                    struct virtnet_rq_data *data)
> > > > > +{
> > > > > +       void *buf;
> > > > > +
> > > > > +       buf = data->buf;
> > > > > +
> > > > > +       data->next = rq->data_free;
> > > > > +       rq->data_free = data;
> > > > > +
> > > > > +       return buf;
> > > > > +}
> > > > > +
> > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > +                                                  void *buf,
> > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > +{
> > > > > +       struct virtnet_rq_data *data;
> > > > > +
> > > > > +       data = rq->data_free;
> > > > > +       rq->data_free = data->next;
> > > > > +
> > > > > +       data->buf = buf;
> > > > > +       data->dma = dma;
> > > > > +
> > > > > +       return data;
> > > > > +}
> > > > > +
> > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > +{
> > > > > +       struct virtnet_rq_data *data;
> > > > > +       void *buf;
> > > > > +
> > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > +       if (!buf || !rq->data_array)
> > > > > +               return buf;
> > > > > +
> > > > > +       data = buf;
> > > > > +
> > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > +
> > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > +}
> > > > > +
> > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > +{
> > > > > +       struct virtnet_rq_data *data;
> > > > > +       void *buf;
> > > > > +
> > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > +       if (!buf || !rq->data_array)
> > > > > +               return buf;
> > > > > +
> > > > > +       data = buf;
> > > > > +
> > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > +
> > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > +}
> > > > > +
> > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > +{
> > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > +       struct device *dev;
> > > > > +       u32 off, map_len;
> > > > > +       dma_addr_t addr;
> > > > > +       void *end;
> > > > > +
> > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > +               ++dma->ref;
> > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > +               goto ok;
> > > > > +       }
> > > > > +
> > > > > +       end = buf + len - 1;
> > > > > +       off = offset_in_page(end);
> > > > > +       map_len = len + PAGE_SIZE - off;
> > > >
> > > > This assumes a PAGE_SIZE which seems sub-optimal as page frag could be
> > > > larger than this.
> > > >
> > > > > +
> > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > +
> > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > +               return -ENOMEM;
> > > > > +
> > > > > +       dma = rq->dma_free;
> > > > > +       rq->dma_free = dma->next;
> > > > > +
> > > > > +       dma->ref = 1;
> > > > > +       dma->buf = buf;
> > > > > +       dma->addr = addr;
> > > > > +       dma->len = map_len;
> > > > > +
> > > > > +       rq->last_dma = dma;
> > > > > +
> > > > > +ok:
> > > > > +       sg_init_table(rq->sg, 1);
> > > > > +       rq->sg[0].dma_address = addr;
> > > > > +       rq->sg[0].length = len;
> > > > > +
> > > > > +       return 0;
> > > > > +}
> > > > > +
> > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > +{
> > > > > +       struct receive_queue *rq;
> > > > > +       int i, err, j, num;
> > > > > +
> > > > > +       /* disable for big mode */
> > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > +               return 0;
> > > > > +
> > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > +               if (err)
> > > > > +                       continue;
> > > > > +
> > > > > +               rq = &vi->rq[i];
> > > > > +
> > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > +
> > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > +               if (!rq->data_array)
> > > >
> > > > Can we avoid those allocations when we don't use the DMA API?
> > > >
> > > > > +                       goto err;
> > > > > +
> > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > +               if (!rq->dma_array)
> > > > > +                       goto err;
> > > > > +
> > > > > +               for (j = 0; j < num; ++j) {
> > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > +
> > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > +               }
> > > > > +       }
> > > > > +
> > > > > +       return 0;
> > > > > +
> > > > > +err:
> > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > +               struct receive_queue *rq;
> > > > > +
> > > > > +               rq = &vi->rq[i];
> > > > > +
> > > > > +               kfree(rq->dma_array);
> > > > > +               kfree(rq->data_array);
> > > > > +       }
> > > > > +
> > > > > +       return -ENOMEM;
> > > > > +}
> > > > > +
> > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > >  {
> > > > >         unsigned int len;
> > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > >                 void *buf;
> > > > >                 int off;
> > > > >
> > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > >                 if (unlikely(!buf))
> > > > >                         goto err_buf;
> > > > >
> > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > >                 return -EINVAL;
> > > > >
> > > > >         while (--*num_buf > 0) {
> > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > >                 if (unlikely(!buf)) {
> > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > >                                  dev->name, *num_buf,
> > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > >         while (--num_buf) {
> > > > >                 int num_skb_frags;
> > > > >
> > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > >                 if (unlikely(!buf)) {
> > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > >                                  dev->name, num_buf,
> > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > >  err_skb:
> > > > >         put_page(page);
> > > > >         while (num_buf-- > 1) {
> > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > >                 if (unlikely(!buf)) {
> > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > >                                  dev->name, num_buf);
> > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > +       struct virtnet_rq_data *data;
> > > > >         int err;
> > > > >
> > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > >         get_page(alloc_frag->page);
> > > > >         alloc_frag->offset += len;
> > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > +
> > > > > +       if (rq->data_array) {
> > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > >
> > > > Thanks to the compound page. I wonder if everything could be
> > > > simplified if we just reuse page->private for storing metadata like
> > > > dma address and refcnt. Then we don't need extra stuff for tracking
> > > > any other thing?
> > >
> > > Maybe we can try alloc one small buffer from the page_frag to store the dma info
> > > when page_frag.offset == 0.
> >
> > And store it in the ctx? I think it should work.
>
>
> Since the dma information is located on the first page of the composite page, we
> can get the dma information through buf.
>
> No need to modify ctx.

Ok, I'm not sure I fully get this; maybe you can post another version so we can see.

Thanks

>
> Thanks.
>
>
> >
> > Thanks
> >
> > >
> > > Thanks.
> > >
> > >
> > > >
> > > > Thanks
> > > >
> > > >
> > > >
> > > > > +               if (err)
> > > > > +                       goto map_err;
> > > > > +
> > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > +       } else {
> > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > +               data = (void *)buf;
> > > > > +       }
> > > > > +
> > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > >         if (err < 0)
> > > > > -               put_page(virt_to_head_page(buf));
> > > > > +               goto add_err;
> > > > > +
> > > > > +       return err;
> > > > > +
> > > > > +add_err:
> > > > > +       if (rq->data_array) {
> > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > +       }
> > > > > +
> > > > > +map_err:
> > > > > +       put_page(virt_to_head_page(buf));
> > > > >         return err;
> > > > >  }
> > > > >
> > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > +       struct virtnet_rq_data *data;
> > > > >         char *buf;
> > > > >         void *ctx;
> > > > >         int err;
> > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > >                 alloc_frag->offset += hole;
> > > > >         }
> > > > >
> > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > +       if (rq->data_array) {
> > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > +               if (err)
> > > > > +                       goto map_err;
> > > > > +
> > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > +       } else {
> > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > +               data = (void *)buf;
> > > > > +       }
> > > > > +
> > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > >         if (err < 0)
> > > > > -               put_page(virt_to_head_page(buf));
> > > > > +               goto add_err;
> > > > > +
> > > > > +       return 0;
> > > > > +
> > > > > +add_err:
> > > > > +       if (rq->data_array) {
> > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > +       }
> > > > >
> > > > > +map_err:
> > > > > +       put_page(virt_to_head_page(buf));
> > > > >         return err;
> > > > >  }
> > > > >
> > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > >                 void *ctx;
> > > > >
> > > > >                 while (stats.packets < budget &&
> > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > >                         stats.packets++;
> > > > >                 }
> > > > >         } else {
> > > > >                 while (stats.packets < budget &&
> > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > >                         stats.packets++;
> > > > >                 }
> > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > +
> > > > > +               kfree(vi->rq[i].data_array);
> > > > > +               kfree(vi->rq[i].dma_array);
> > > > >         }
> > > > >
> > > > >         /* We called __netif_napi_del(),
> > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > >         }
> > > > >
> > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > +
> > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > >                 cond_resched();
> > > > >         }
> > > > >  }
> > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > >         if (ret)
> > > > >                 goto err_free;
> > > > >
> > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > +       if (ret)
> > > > > +               goto err_free;
> > > > > +
> > > > >         cpus_read_lock();
> > > > >         virtnet_set_affinity(vi);
> > > > >         cpus_read_unlock();
> > > > > --
> > > > > 2.32.0.3.g01195cf9f
> > > > >
> > > >
> > >
> >
>
Michael S. Tsirkin July 14, 2023, 10:37 a.m. UTC | #20
On Wed, Jul 12, 2023 at 04:38:24PM +0800, Xuan Zhuo wrote:
> On Wed, 12 Jul 2023 16:37:43 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Wed, Jul 12, 2023 at 4:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > >
> > > On Wed, 12 Jul 2023 15:54:58 +0800, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > On Tue, 11 Jul 2023 10:58:51 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > On Tue, Jul 11, 2023 at 10:42 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > >
> > > > > > On Tue, 11 Jul 2023 10:36:17 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > > > > > > > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > > Currently, the virtio core will perform a dma operation for each
> > > > > > > > > > > > operation. Although, the same page may be operated multiple times.
> > > > > > > > > > > >
> > > > > > > > > > > > The driver does the dma operation and manages the dma address based the
> > > > > > > > > > > > feature premapped of virtio core.
> > > > > > > > > > > >
> > > > > > > > > > > > This way, we can perform only one dma operation for the same page. In
> > > > > > > > > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > > > > > > > > >
> > > > > > > > > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > > > > > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > > > > > > > > >
> > > > > > > > > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > > > > > > > > of operation?
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Do you mean this:
> > > > > > > > > >
> > > > > > > > > > [    0.470816] iommu: Default domain type: Passthrough
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > With passthrough, dma API is just some indirect function calls, they do
> > > > > > > > > not affect the performance a lot.
> > > > > > > >
> > > > > > > >
> > > > > > > > Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> > > > > > > > overhead of DMA I observed is indeed not too high.
> > > > > > >
> > > > > > > Have you measured with iommu=strict?
> > > > > >
> > > > > > I have not tested this way, our environment is pt, I wonder if strict is a
> > > > > > common scenario. I can test it.
> > > > >
> > > > > It's not a common setup, but it's a way to stress DMA layer to see the overhead.
> > > >
> > > > kernel command line: intel_iommu=on iommu.strict=1 iommu.passthrough=0
> > > >
> > > > virtio-net without merge dma 428614.00 pps
> > > >
> > > > virtio-net with merge dma    742853.00 pps
> > >
> > >
> > > kernel command line: intel_iommu=on iommu.strict=0 iommu.passthrough=0
> > >
> > > virtio-net without merge dma 775496.00 pps
> > >
> > > virtio-net with merge dma    1010514.00 pps
> > >
> > >
> >
> > Great, let's add those numbers to the changelog.
> 
> 
> Yes, I will do it in next version.
> 
> 
> Thanks.
> 

You should also test without an IOMMU but with swiotlb=force.

But first fix the use of DMA API to actually be correct,
otherwise you are cheating by avoiding synchronization.
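
Roughly what I mean (an untested sketch; the need_sync field is something
you would have to add, e.g. filled from dma_need_sync(), the rest uses the
fields already in your patch): if the page stays mapped across buffers,
each returned buffer still has to be synced for the CPU before anyone reads
it, e.g. in virtnet_rq_get_buf():

	struct virtnet_rq_dma *dma = data->dma;

	if (dma->need_sync && *len)
		dma_sync_single_range_for_cpu(virtqueue_dma_dev(rq->vq),
					      dma->addr,
					      data->buf - dma->buf, /* offset into the mapping */
					      *len, DMA_FROM_DEVICE);

Otherwise a bounce-buffer or non-coherent setup will hand stale data to
the stack.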



> >
> > Thanks
> >
> > > Thanks.
> > >
> > > >
> > > >
> > > > Thanks.
> > > >
> > > >
> > > >
> > > >
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > Thanks.
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > >
> > > > > > > > Thanks.
> > > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Try e.g. bounce buffer. Which is where you will see a problem: your
> > > > > > > > > patches won't work.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > > >
> > > > > > > > > > > This kind of difference is likely in the noise.
> > > > > > > > > >
> > > > > > > > > > It's really not high, but this is because the proportion of DMA under perf top
> > > > > > > > > > is not high. Probably that much.
> > > > > > > > >
> > > > > > > > > So maybe not worth the complexity.
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > ---
> > > > > > > > > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > > > > > > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > > > > > > > > >
> > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > > > > > index 486b5849033d..4de845d35bed 100644
> > > > > > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > > > > > > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > > > > > > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > > > > > > > > >
> > > > > > > > > > > > +/* The bufs on the same page may share this struct. */
> > > > > > > > > > > > +struct virtnet_rq_dma {
> > > > > > > > > > > > +       struct virtnet_rq_dma *next;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > +       u32 len;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       u32 ref;
> > > > > > > > > > > > +};
> > > > > > > > > > > > +
> > > > > > > > > > > > +/* Record the dma and buf. */
> > > > > > > > > > >
> > > > > > > > > > > I guess I see that. But why?
> > > > > > > > > > > And these two comments are the extent of the available
> > > > > > > > > > > documentation, that's not enough I feel.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > +struct virtnet_rq_data {
> > > > > > > > > > > > +       struct virtnet_rq_data *next;
> > > > > > > > > > >
> > > > > > > > > > > Is manually reimplementing a linked list the best
> > > > > > > > > > > we can do?
> > > > > > > > > >
> > > > > > > > > > Yes, we can use llist.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > +
> > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       struct virtnet_rq_dma *dma;
> > > > > > > > > > > > +};
> > > > > > > > > > > > +
> > > > > > > > > > > >  /* Internal representation of a send virtqueue */
> > > > > > > > > > > >  struct send_queue {
> > > > > > > > > > > >         /* Virtqueue associated with this send _queue */
> > > > > > > > > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > > > > > > > > >         char name[16];
> > > > > > > > > > > >
> > > > > > > > > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       struct virtnet_rq_data *data_array;
> > > > > > > > > > > > +       struct virtnet_rq_data *data_free;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > > > > > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > > > > > > > > +       struct virtnet_rq_dma *last_dma;
> > > > > > > > > > > >  };
> > > > > > > > > > > >
> > > > > > > > > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > > > > > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > > > > >         return skb;
> > > > > > > > > > > >  }
> > > > > > > > > > > >
> > > > > > > > > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       --dma->ref;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       if (dma->ref)
> > > > > > > > > > > > +               return;
> > > > > > > > > > > > +
> > > > > > > > > > >
> > > > > > > > > > > If you don't unmap there is no guarantee valid data will be
> > > > > > > > > > > there in the buffer.
> > > > > > > > > > >
> > > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > > +
> > > > > > > > > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > +
> > > > > > > > > > > > +       dma->next = rq->dma_free;
> > > > > > > > > > > > +       rq->dma_free = dma;
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > > > > > > > > +                                    struct virtnet_rq_data *data)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       buf = data->buf;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       data->next = rq->data_free;
> > > > > > > > > > > > +       rq->data_free = data;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       return buf;
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > > > > > > > > +                                                  void *buf,
> > > > > > > > > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       data = rq->data_free;
> > > > > > > > > > > > +       rq->data_free = data->next;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       data->buf = buf;
> > > > > > > > > > > > +       data->dma = dma;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       return data;
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > +
> > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > +
> > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > +       u32 off, map_len;
> > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > +       void *end;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > > > > > > > > +               ++dma->ref;
> > > > > > > > > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > > > > > > > > +               goto ok;
> > > > > > > > > > > > +       }
> > > > > > > > > > >
> > > > > > > > > > > So this is the meat of the proposed optimization. I guess that
> > > > > > > > > > > if the last buffer we allocated happens to be in the same page
> > > > > > > > > > > as this one then they can both be mapped for DMA together.
> > > > > > > > > >
> > > > > > > > > > Since we use page_frag, the buffers we allocated are all continuous.
> > > > > > > > > >
> > > > > > > > > > > Why last one specifically? Whether next one happens to
> > > > > > > > > > > be close depends on luck. If you want to try optimizing this
> > > > > > > > > > > the right thing to do is likely by using a page pool.
> > > > > > > > > > > There's actually work upstream on page pool, look it up.
> > > > > > > > > >
> > > > > > > > > > As we discussed in another thread, the page pool is first used for xdp. Let's
> > > > > > > > > > transform it step by step.
> > > > > > > > > >
> > > > > > > > > > Thanks.
> > > > > > > > >
> > > > > > > > > ok so this should wait then?
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > +
> > > > > > > > > > > > +       end = buf + len - 1;
> > > > > > > > > > > > +       off = offset_in_page(end);
> > > > > > > > > > > > +       map_len = len + PAGE_SIZE - off;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > > +
> > > > > > > > > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > > > > > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > > > > > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > > > > > > > > +               return -ENOMEM;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       dma = rq->dma_free;
> > > > > > > > > > > > +       rq->dma_free = dma->next;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       dma->ref = 1;
> > > > > > > > > > > > +       dma->buf = buf;
> > > > > > > > > > > > +       dma->addr = addr;
> > > > > > > > > > > > +       dma->len = map_len;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       rq->last_dma = dma;
> > > > > > > > > > > > +
> > > > > > > > > > > > +ok:
> > > > > > > > > > > > +       sg_init_table(rq->sg, 1);
> > > > > > > > > > > > +       rq->sg[0].dma_address = addr;
> > > > > > > > > > > > +       rq->sg[0].length = len;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +       struct receive_queue *rq;
> > > > > > > > > > > > +       int i, err, j, num;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       /* disable for big mode */
> > > > > > > > > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > > > > > > > > +               return 0;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > +                       continue;
> > > > > > > > > > > > +
> > > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > > +
> > > > > > > > > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > > > > > > > > +
> > > > > > > > > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > > > > > > > > +               if (!rq->data_array)
> > > > > > > > > > > > +                       goto err;
> > > > > > > > > > > > +
> > > > > > > > > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > > > > > > > > +               if (!rq->dma_array)
> > > > > > > > > > > > +                       goto err;
> > > > > > > > > > > > +
> > > > > > > > > > > > +               for (j = 0; j < num; ++j) {
> > > > > > > > > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > > > > > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > > > > > > > > +
> > > > > > > > > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > > > > > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > > > > > > > > +               }
> > > > > > > > > > > > +       }
> > > > > > > > > > > > +
> > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > +
> > > > > > > > > > > > +err:
> > > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > +               struct receive_queue *rq;
> > > > > > > > > > > > +
> > > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > > +
> > > > > > > > > > > > +               kfree(rq->dma_array);
> > > > > > > > > > > > +               kfree(rq->data_array);
> > > > > > > > > > > > +       }
> > > > > > > > > > > > +
> > > > > > > > > > > > +       return -ENOMEM;
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > > > > > > > > >  {
> > > > > > > > > > > >         unsigned int len;
> > > > > > > > > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > > > > >                 void *buf;
> > > > > > > > > > > >                 int off;
> > > > > > > > > > > >
> > > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > > > > > > > > >                 if (unlikely(!buf))
> > > > > > > > > > > >                         goto err_buf;
> > > > > > > > > > > >
> > > > > > > > > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > > > > >                 return -EINVAL;
> > > > > > > > > > > >
> > > > > > > > > > > >         while (--*num_buf > 0) {
> > > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > > >                                  dev->name, *num_buf,
> > > > > > > > > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > > >         while (--num_buf) {
> > > > > > > > > > > >                 int num_skb_frags;
> > > > > > > > > > > >
> > > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > > >                                  dev->name, num_buf,
> > > > > > > > > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > > >  err_skb:
> > > > > > > > > > > >         put_page(page);
> > > > > > > > > > > >         while (num_buf-- > 1) {
> > > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > > > > > > > > >                                  dev->name, num_buf);
> > > > > > > > > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > > > > > > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > > > > > > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > >         int err;
> > > > > > > > > > > >
> > > > > > > > > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > > > > > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > > > >         get_page(alloc_frag->page);
> > > > > > > > > > > >         alloc_frag->offset += len;
> > > > > > > > > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > > +
> > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > > +
> > > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > > +       } else {
> > > > > > > > > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > > +       }
> > > > > > > > > > > > +
> > > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       return err;
> > > > > > > > > > > > +
> > > > > > > > > > > > +add_err:
> > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > +       }
> > > > > > > > > > > > +
> > > > > > > > > > > > +map_err:
> > > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > > >         return err;
> > > > > > > > > > > >  }
> > > > > > > > > > > >
> > > > > > > > > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > > > > > > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > > > > > > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > >         char *buf;
> > > > > > > > > > > >         void *ctx;
> > > > > > > > > > > >         int err;
> > > > > > > > > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > > >                 alloc_frag->offset += hole;
> > > > > > > > > > > >         }
> > > > > > > > > > > >
> > > > > > > > > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > > +
> > > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > > +       } else {
> > > > > > > > > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > > +       }
> > > > > > > > > > > > +
> > > > > > > > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > > +
> > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > +
> > > > > > > > > > > > +add_err:
> > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > +       }
> > > > > > > > > > > >
> > > > > > > > > > > > +map_err:
> > > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > > >         return err;
> > > > > > > > > > > >  }
> > > > > > > > > > > >
> > > > > > > > > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > > > > > > > > >                 void *ctx;
> > > > > > > > > > > >
> > > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > > > > > > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > > >                 }
> > > > > > > > > > > >         } else {
> > > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > > > > > > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > > >                 }
> > > > > > > > > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > > > > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > > > > > +
> > > > > > > > > > > > +               kfree(vi->rq[i].data_array);
> > > > > > > > > > > > +               kfree(vi->rq[i].dma_array);
> > > > > > > > > > > >         }
> > > > > > > > > > > >
> > > > > > > > > > > >         /* We called __netif_napi_del(),
> > > > > > > > > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > > > > >         }
> > > > > > > > > > > >
> > > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > > > > > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > > > > > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > > > > > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > > > > > > > > +
> > > > > > > > > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > > > > > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > > > > > > > > >                 cond_resched();
> > > > > > > > > > > >         }
> > > > > > > > > > > >  }
> > > > > > > > > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > > > > > > > > >         if (ret)
> > > > > > > > > > > >                 goto err_free;
> > > > > > > > > > > >
> > > > > > > > > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > > > > > > > > +       if (ret)
> > > > > > > > > > > > +               goto err_free;
> > > > > > > > > > > > +
> > > > > > > > > > > >         cpus_read_lock();
> > > > > > > > > > > >         virtnet_set_affinity(vi);
> > > > > > > > > > > >         cpus_read_unlock();
> > > > > > > > > > > > --
> > > > > > > > > > > > 2.32.0.3.g01195cf9f
> > > > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > > >
> > >
> >
Xuan Zhuo July 19, 2023, 3:21 a.m. UTC | #21
On Fri, 14 Jul 2023 06:37:10 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Wed, Jul 12, 2023 at 04:38:24PM +0800, Xuan Zhuo wrote:
> > On Wed, 12 Jul 2023 16:37:43 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > On Wed, Jul 12, 2023 at 4:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > >
> > > > On Wed, 12 Jul 2023 15:54:58 +0800, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > On Tue, 11 Jul 2023 10:58:51 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > On Tue, Jul 11, 2023 at 10:42 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > >
> > > > > > > On Tue, 11 Jul 2023 10:36:17 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > >
> > > > > > > > > On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > > > Currently, the virtio core will perform a dma operation for each
> > > > > > > > > > > > > operation. Although, the same page may be operated multiple times.
> > > > > > > > > > > > >
> > > > > > > > > > > > > The driver does the dma operation and manages the dma address based the
> > > > > > > > > > > > > feature premapped of virtio core.
> > > > > > > > > > > > >
> > > > > > > > > > > > > This way, we can perform only one dma operation for the same page. In
> > > > > > > > > > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > > > > > > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > > > > > > > > > >
> > > > > > > > > > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > > > > > > > > > of operation?
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Do you mean this:
> > > > > > > > > > >
> > > > > > > > > > > [    0.470816] iommu: Default domain type: Passthrough
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > With passthrough, dma API is just some indirect function calls, they do
> > > > > > > > > > not affect the performance a lot.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> > > > > > > > > overhead of DMA I observed is indeed not too high.
> > > > > > > >
> > > > > > > > Have you measured with iommu=strict?
> > > > > > >
> > > > > > > I have not tested this way, our environment is pt, I wonder if strict is a
> > > > > > > common scenario. I can test it.
> > > > > >
> > > > > > It's not a common setup, but it's a way to stress DMA layer to see the overhead.
> > > > >
> > > > > kernel command line: intel_iommu=on iommu.strict=1 iommu.passthrough=0
> > > > >
> > > > > virtio-net without merge dma 428614.00 pps
> > > > >
> > > > > virtio-net with merge dma    742853.00 pps
> > > >
> > > >
> > > > kernel command line: intel_iommu=on iommu.strict=0 iommu.passthrough=0
> > > >
> > > > virtio-net without merge dma 775496.00 pps
> > > >
> > > > virtio-net with merge dma    1010514.00 pps
> > > >
> > > >
> > >
> > > Great, let's add those numbers to the changelog.
> >
> >
> > Yes, I will do it in next version.
> >
> >
> > Thanks.
> >
>
> You should also test without iommu but with swiotlb=force


For swiotlb, merging DMA has no benefit, because we still need to copy data from
the swiotlb bounce buffer to the original buffer.

The benefit of merging DMA is to reduce the number of operations on the IOMMU device.

I did some tests for this. The results are the same.
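
(Illustrative setup for such a run, assuming the suggestion above: boot
without an IOMMU and force all DMA through the bounce buffers, e.g. on the
kernel command line:

	intel_iommu=off swiotlb=force

The exact command line I used may differ; this is only to show the shape of
the test.)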

Thanks.


>
> But first fix the use of DMA API to actually be correct,
> otherwise you are cheating by avoiding synchronization.
>
>
>
> > >
> > > Thanks
> > >
> > > > Thanks.
> > > >
> > > > >
> > > > >
> > > > > Thanks.
> > > > >
> > > > >
> > > > >
> > > > >
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > > Thanks.
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Thanks.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Try e.g. bounce buffer. Which is where you will see a problem: your
> > > > > > > > > > patches won't work.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > > > >
> > > > > > > > > > > > This kind of difference is likely in the noise.
> > > > > > > > > > >
> > > > > > > > > > > It's really not high, but this is because the proportion of DMA under perf top
> > > > > > > > > > > is not high. Probably that much.
> > > > > > > > > >
> > > > > > > > > > So maybe not worth the complexity.
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > ---
> > > > > > > > > > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > > > > > > > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > > > > > > > > > >
> > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > > > > > > index 486b5849033d..4de845d35bed 100644
> > > > > > > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > > > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > > > > > > > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > > > > > > > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > > > > > > > > > >
> > > > > > > > > > > > > +/* The bufs on the same page may share this struct. */
> > > > > > > > > > > > > +struct virtnet_rq_dma {
> > > > > > > > > > > > > +       struct virtnet_rq_dma *next;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > +       u32 len;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       u32 ref;
> > > > > > > > > > > > > +};
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +/* Record the dma and buf. */
> > > > > > > > > > > >
> > > > > > > > > > > > I guess I see that. But why?
> > > > > > > > > > > > And these two comments are the extent of the available
> > > > > > > > > > > > documentation, that's not enough I feel.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > +struct virtnet_rq_data {
> > > > > > > > > > > > > +       struct virtnet_rq_data *next;
> > > > > > > > > > > >
> > > > > > > > > > > > Is manually reimplementing a linked list the best
> > > > > > > > > > > > we can do?
> > > > > > > > > > >
> > > > > > > > > > > Yes, we can use llist.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       struct virtnet_rq_dma *dma;
> > > > > > > > > > > > > +};
> > > > > > > > > > > > > +
> > > > > > > > > > > > >  /* Internal representation of a send virtqueue */
> > > > > > > > > > > > >  struct send_queue {
> > > > > > > > > > > > >         /* Virtqueue associated with this send _queue */
> > > > > > > > > > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > > > > > > > > > >         char name[16];
> > > > > > > > > > > > >
> > > > > > > > > > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       struct virtnet_rq_data *data_array;
> > > > > > > > > > > > > +       struct virtnet_rq_data *data_free;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > > > > > > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > > > > > > > > > +       struct virtnet_rq_dma *last_dma;
> > > > > > > > > > > > >  };
> > > > > > > > > > > > >
> > > > > > > > > > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > > > > > > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > > > > > >         return skb;
> > > > > > > > > > > > >  }
> > > > > > > > > > > > >
> > > > > > > > > > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       --dma->ref;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       if (dma->ref)
> > > > > > > > > > > > > +               return;
> > > > > > > > > > > > > +
> > > > > > > > > > > >
> > > > > > > > > > > > If you don't unmap there is no guarantee valid data will be
> > > > > > > > > > > > there in the buffer.
> > > > > > > > > > > >
> > > > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       dma->next = rq->dma_free;
> > > > > > > > > > > > > +       rq->dma_free = dma;
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > > > > > > > > > +                                    struct virtnet_rq_data *data)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       buf = data->buf;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       data->next = rq->data_free;
> > > > > > > > > > > > > +       rq->data_free = data;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       return buf;
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > > > > > > > > > +                                                  void *buf,
> > > > > > > > > > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       data = rq->data_free;
> > > > > > > > > > > > > +       rq->data_free = data->next;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       data->buf = buf;
> > > > > > > > > > > > > +       data->dma = dma;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       return data;
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > > +       u32 off, map_len;
> > > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > > +       void *end;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > > > > > > > > > +               ++dma->ref;
> > > > > > > > > > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > > > > > > > > > +               goto ok;
> > > > > > > > > > > > > +       }
> > > > > > > > > > > >
> > > > > > > > > > > > So this is the meat of the proposed optimization. I guess that
> > > > > > > > > > > > if the last buffer we allocated happens to be in the same page
> > > > > > > > > > > > as this one then they can both be mapped for DMA together.
> > > > > > > > > > >
> > > > > > > > > > > Since we use page_frag, the buffers we allocated are all continuous.
> > > > > > > > > > >
> > > > > > > > > > > > Why last one specifically? Whether next one happens to
> > > > > > > > > > > > be close depends on luck. If you want to try optimizing this
> > > > > > > > > > > > the right thing to do is likely by using a page pool.
> > > > > > > > > > > > There's actually work upstream on page pool, look it up.
> > > > > > > > > > >
> > > > > > > > > > > As we discussed in another thread, the page pool is first used for xdp. Let's
> > > > > > > > > > > transform it step by step.
> > > > > > > > > > >
> > > > > > > > > > > Thanks.
> > > > > > > > > >
> > > > > > > > > > ok so this should wait then?
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       end = buf + len - 1;
> > > > > > > > > > > > > +       off = offset_in_page(end);
> > > > > > > > > > > > > +       map_len = len + PAGE_SIZE - off;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > > > > > > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > > > > > > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > > > > > > > > > +               return -ENOMEM;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       dma = rq->dma_free;
> > > > > > > > > > > > > +       rq->dma_free = dma->next;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       dma->ref = 1;
> > > > > > > > > > > > > +       dma->buf = buf;
> > > > > > > > > > > > > +       dma->addr = addr;
> > > > > > > > > > > > > +       dma->len = map_len;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       rq->last_dma = dma;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +ok:
> > > > > > > > > > > > > +       sg_init_table(rq->sg, 1);
> > > > > > > > > > > > > +       rq->sg[0].dma_address = addr;
> > > > > > > > > > > > > +       rq->sg[0].length = len;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > +       struct receive_queue *rq;
> > > > > > > > > > > > > +       int i, err, j, num;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       /* disable for big mode */
> > > > > > > > > > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > > > > > > > > > +               return 0;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > +                       continue;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > > > > > > > > > +               if (!rq->data_array)
> > > > > > > > > > > > > +                       goto err;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > > > > > > > > > +               if (!rq->dma_array)
> > > > > > > > > > > > > +                       goto err;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +               for (j = 0; j < num; ++j) {
> > > > > > > > > > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > > > > > > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > > > > > > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > > > > > > > > > +               }
> > > > > > > > > > > > > +       }
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +err:
> > > > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > +               struct receive_queue *rq;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +               kfree(rq->dma_array);
> > > > > > > > > > > > > +               kfree(rq->data_array);
> > > > > > > > > > > > > +       }
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       return -ENOMEM;
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > > > > > > > > > >  {
> > > > > > > > > > > > >         unsigned int len;
> > > > > > > > > > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > > > > > >                 void *buf;
> > > > > > > > > > > > >                 int off;
> > > > > > > > > > > > >
> > > > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > > > > > > > > > >                 if (unlikely(!buf))
> > > > > > > > > > > > >                         goto err_buf;
> > > > > > > > > > > > >
> > > > > > > > > > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > > > > > >                 return -EINVAL;
> > > > > > > > > > > > >
> > > > > > > > > > > > >         while (--*num_buf > 0) {
> > > > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > > > >                                  dev->name, *num_buf,
> > > > > > > > > > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > > > >         while (--num_buf) {
> > > > > > > > > > > > >                 int num_skb_frags;
> > > > > > > > > > > > >
> > > > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > > > >                                  dev->name, num_buf,
> > > > > > > > > > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > > > >  err_skb:
> > > > > > > > > > > > >         put_page(page);
> > > > > > > > > > > > >         while (num_buf-- > 1) {
> > > > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > > > > > > > > > >                                  dev->name, num_buf);
> > > > > > > > > > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > > > > > > > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > > > > > > > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > >         int err;
> > > > > > > > > > > > >
> > > > > > > > > > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > > > > > > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > > > > >         get_page(alloc_frag->page);
> > > > > > > > > > > > >         alloc_frag->offset += len;
> > > > > > > > > > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > > > +       } else {
> > > > > > > > > > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > > > +       }
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       return err;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +add_err:
> > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > +       }
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +map_err:
> > > > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > > > >         return err;
> > > > > > > > > > > > >  }
> > > > > > > > > > > > >
> > > > > > > > > > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > > > > > > > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > > > > > > > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > >         char *buf;
> > > > > > > > > > > > >         void *ctx;
> > > > > > > > > > > > >         int err;
> > > > > > > > > > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > > > >                 alloc_frag->offset += hole;
> > > > > > > > > > > > >         }
> > > > > > > > > > > > >
> > > > > > > > > > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > > > +       } else {
> > > > > > > > > > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > > > +       }
> > > > > > > > > > > > > +
> > > > > > > > > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +add_err:
> > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > +       }
> > > > > > > > > > > > >
> > > > > > > > > > > > > +map_err:
> > > > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > > > >         return err;
> > > > > > > > > > > > >  }
> > > > > > > > > > > > >
> > > > > > > > > > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > > > > > > > > > >                 void *ctx;
> > > > > > > > > > > > >
> > > > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > > > > > > > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > > > >                 }
> > > > > > > > > > > > >         } else {
> > > > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > > > > > > > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > > > >                 }
> > > > > > > > > > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > > > > > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +               kfree(vi->rq[i].data_array);
> > > > > > > > > > > > > +               kfree(vi->rq[i].dma_array);
> > > > > > > > > > > > >         }
> > > > > > > > > > > > >
> > > > > > > > > > > > >         /* We called __netif_napi_del(),
> > > > > > > > > > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > > > > > >         }
> > > > > > > > > > > > >
> > > > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > > > > > > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > > > > > > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > > > > > > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > > > > > > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > > > > > > > > > >                 cond_resched();
> > > > > > > > > > > > >         }
> > > > > > > > > > > > >  }
> > > > > > > > > > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > > > > > > > > > >         if (ret)
> > > > > > > > > > > > >                 goto err_free;
> > > > > > > > > > > > >
> > > > > > > > > > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > > > > > > > > > +       if (ret)
> > > > > > > > > > > > > +               goto err_free;
> > > > > > > > > > > > > +
> > > > > > > > > > > > >         cpus_read_lock();
> > > > > > > > > > > > >         virtnet_set_affinity(vi);
> > > > > > > > > > > > >         cpus_read_unlock();
> > > > > > > > > > > > > --
> > > > > > > > > > > > > 2.32.0.3.g01195cf9f
> > > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > > >
> > >
>
Michael S. Tsirkin July 19, 2023, 8:55 a.m. UTC | #22
On Wed, Jul 19, 2023 at 11:21:07AM +0800, Xuan Zhuo wrote:
> On Fri, 14 Jul 2023 06:37:10 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > On Wed, Jul 12, 2023 at 04:38:24PM +0800, Xuan Zhuo wrote:
> > > On Wed, 12 Jul 2023 16:37:43 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > On Wed, Jul 12, 2023 at 4:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > >
> > > > > On Wed, 12 Jul 2023 15:54:58 +0800, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > On Tue, 11 Jul 2023 10:58:51 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > On Tue, Jul 11, 2023 at 10:42 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > >
> > > > > > > > On Tue, 11 Jul 2023 10:36:17 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > > > > Currently, the virtio core will perform a dma operation for each
> > > > > > > > > > > > > > operation. Although, the same page may be operated multiple times.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > The driver does the dma operation and manages the dma address based the
> > > > > > > > > > > > > > feature premapped of virtio core.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > This way, we can perform only one dma operation for the same page. In
> > > > > > > > > > > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > > > > > > > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > > > > > > > > > > >
> > > > > > > > > > > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > > > > > > > > > > of operation?
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Do you mean this:
> > > > > > > > > > > >
> > > > > > > > > > > > [    0.470816] iommu: Default domain type: Passthrough
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > With passthrough, dma API is just some indirect function calls, they do
> > > > > > > > > > > not affect the performance a lot.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> > > > > > > > > > overhead of DMA I observed is indeed not too high.
> > > > > > > > >
> > > > > > > > > Have you measured with iommu=strict?
> > > > > > > >
> > > > > > > > I have not tested this way, our environment is pt, I wonder if strict is a
> > > > > > > > common scenario. I can test it.
> > > > > > >
> > > > > > > It's not a common setup, but it's a way to stress DMA layer to see the overhead.
> > > > > >
> > > > > > kernel command line: intel_iommu=on iommu.strict=1 iommu.passthrough=0
> > > > > >
> > > > > > virtio-net without merge dma 428614.00 pps
> > > > > >
> > > > > > virtio-net with merge dma    742853.00 pps
> > > > >
> > > > >
> > > > > kernel command line: intel_iommu=on iommu.strict=0 iommu.passthrough=0
> > > > >
> > > > > virtio-net without merge dma 775496.00 pps
> > > > >
> > > > > virtio-net with merge dma    1010514.00 pps
> > > > >
> > > > >
> > > >
> > > > Great, let's add those numbers to the changelog.
> > >
> > >
> > > Yes, I will do it in next version.
> > >
> > >
> > > Thanks.
> > >
> >
> > You should also test without iommu but with swiotlb=force
> 
> 
> For swiotlb, merging DMA mappings has no benefit, because we still need to copy data from
> the swiotlb bounce buffer to the original buffer.
> The benefit of merging DMA mappings is to reduce the number of map/unmap operations issued to the IOMMU.
> 
> I did some tests for this. The results are the same.
> 
> Thanks.
> 

Did you actually check that it works, though?
Looks like with swiotlb you need a sync to trigger the copy
before the unmap, and I don't see where that is done in the current
patch.
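
For reference, a minimal sketch of the kind of per-buffer sync that seems to be
missing (the helper below is hypothetical, not part of the posted patch): since
the page mapping is shared and can outlive an individual buffer, each completed
buffer would need to be synced for CPU access before its data is read, and with
swiotlb that sync is what performs the bounce-buffer copy.

static void virtnet_rq_sync_for_cpu(struct receive_queue *rq,
				    struct virtnet_rq_dma *dma,
				    void *buf, u32 len)
{
	struct device *dev = virtqueue_dma_dev(rq->vq);

	/* The shared mapping may stay alive (dma->ref > 0) long after this
	 * buffer completes, so dma_unmap_page() can run much later. Sync the
	 * buffer for CPU access here; under swiotlb this copies the data out
	 * of the bounce buffer into the original buffer.
	 */
	dma_sync_single_range_for_cpu(dev, dma->addr, buf - dma->buf, len,
				      DMA_FROM_DEVICE);
}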


> 
> >
> > But first fix the use of DMA API to actually be correct,
> > otherwise you are cheating by avoiding synchronization.
> >
> >
> >
> > > >
> > > > Thanks
> > > >
> > > > > Thanks.
> > > > >
> > > > > >
> > > > > >
> > > > > > Thanks.
> > > > > >
> > > > > >
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > >
> > > > > > > > Thanks.
> > > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Thanks.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Try e.g. bounce buffer. Which is where you will see a problem: your
> > > > > > > > > > > patches won't work.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > > > > >
> > > > > > > > > > > > > This kind of difference is likely in the noise.
> > > > > > > > > > > >
> > > > > > > > > > > > It's really not high, but this is because the proportion of DMA under perf top
> > > > > > > > > > > > is not high. Probably that much.
> > > > > > > > > > >
> > > > > > > > > > > So maybe not worth the complexity.
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > ---
> > > > > > > > > > > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > > > > > > > > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > > > > > > > index 486b5849033d..4de845d35bed 100644
> > > > > > > > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > > > > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > > > > > > > > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > > > > > > > > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > +/* The bufs on the same page may share this struct. */
> > > > > > > > > > > > > > +struct virtnet_rq_dma {
> > > > > > > > > > > > > > +       struct virtnet_rq_dma *next;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > +       u32 len;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       u32 ref;
> > > > > > > > > > > > > > +};
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +/* Record the dma and buf. */
> > > > > > > > > > > > >
> > > > > > > > > > > > > I guess I see that. But why?
> > > > > > > > > > > > > And these two comments are the extent of the available
> > > > > > > > > > > > > documentation, that's not enough I feel.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > +struct virtnet_rq_data {
> > > > > > > > > > > > > > +       struct virtnet_rq_data *next;
> > > > > > > > > > > > >
> > > > > > > > > > > > > Is manually reimplementing a linked list the best
> > > > > > > > > > > > > we can do?
> > > > > > > > > > > >
> > > > > > > > > > > > Yes, we can use llist.
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma;
> > > > > > > > > > > > > > +};
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > >  /* Internal representation of a send virtqueue */
> > > > > > > > > > > > > >  struct send_queue {
> > > > > > > > > > > > > >         /* Virtqueue associated with this send _queue */
> > > > > > > > > > > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > > > > > > > > > > >         char name[16];
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       struct virtnet_rq_data *data_array;
> > > > > > > > > > > > > > +       struct virtnet_rq_data *data_free;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > > > > > > > > > > +       struct virtnet_rq_dma *last_dma;
> > > > > > > > > > > > > >  };
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > > > > > > > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > > > > > > >         return skb;
> > > > > > > > > > > > > >  }
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       --dma->ref;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       if (dma->ref)
> > > > > > > > > > > > > > +               return;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > >
> > > > > > > > > > > > > If you don't unmap there is no guarantee valid data will be
> > > > > > > > > > > > > there in the buffer.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       dma->next = rq->dma_free;
> > > > > > > > > > > > > > +       rq->dma_free = dma;
> > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > > > > > > > > > > +                                    struct virtnet_rq_data *data)
> > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       buf = data->buf;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       data->next = rq->data_free;
> > > > > > > > > > > > > > +       rq->data_free = data;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       return buf;
> > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > > > > > > > > > > +                                                  void *buf,
> > > > > > > > > > > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       data = rq->data_free;
> > > > > > > > > > > > > > +       rq->data_free = data->next;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       data->buf = buf;
> > > > > > > > > > > > > > +       data->dma = dma;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       return data;
> > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > > > +       u32 off, map_len;
> > > > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > > > +       void *end;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > > > > > > > > > > +               ++dma->ref;
> > > > > > > > > > > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > > > > > > > > > > +               goto ok;
> > > > > > > > > > > > > > +       }
> > > > > > > > > > > > >
> > > > > > > > > > > > > So this is the meat of the proposed optimization. I guess that
> > > > > > > > > > > > > if the last buffer we allocated happens to be in the same page
> > > > > > > > > > > > > as this one then they can both be mapped for DMA together.
> > > > > > > > > > > >
> > > > > > > > > > > > Since we use page_frag, the buffers we allocated are all continuous.
> > > > > > > > > > > >
> > > > > > > > > > > > > Why last one specifically? Whether next one happens to
> > > > > > > > > > > > > be close depends on luck. If you want to try optimizing this
> > > > > > > > > > > > > the right thing to do is likely by using a page pool.
> > > > > > > > > > > > > There's actually work upstream on page pool, look it up.
> > > > > > > > > > > >
> > > > > > > > > > > > As we discussed in another thread, the page pool is first used for xdp. Let's
> > > > > > > > > > > > transform it step by step.
> > > > > > > > > > > >
> > > > > > > > > > > > Thanks.
> > > > > > > > > > >
> > > > > > > > > > > ok so this should wait then?
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       end = buf + len - 1;
> > > > > > > > > > > > > > +       off = offset_in_page(end);
> > > > > > > > > > > > > > +       map_len = len + PAGE_SIZE - off;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > > > > > > > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > > > > > > > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > > > > > > > > > > +               return -ENOMEM;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       dma = rq->dma_free;
> > > > > > > > > > > > > > +       rq->dma_free = dma->next;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       dma->ref = 1;
> > > > > > > > > > > > > > +       dma->buf = buf;
> > > > > > > > > > > > > > +       dma->addr = addr;
> > > > > > > > > > > > > > +       dma->len = map_len;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       rq->last_dma = dma;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +ok:
> > > > > > > > > > > > > > +       sg_init_table(rq->sg, 1);
> > > > > > > > > > > > > > +       rq->sg[0].dma_address = addr;
> > > > > > > > > > > > > > +       rq->sg[0].length = len;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > +       struct receive_queue *rq;
> > > > > > > > > > > > > > +       int i, err, j, num;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       /* disable for big mode */
> > > > > > > > > > > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > > > > > > > > > > +               return 0;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > > +                       continue;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > > > > > > > > > > +               if (!rq->data_array)
> > > > > > > > > > > > > > +                       goto err;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > > > > > > > > > > +               if (!rq->dma_array)
> > > > > > > > > > > > > > +                       goto err;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +               for (j = 0; j < num; ++j) {
> > > > > > > > > > > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > > > > > > > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > > > > > > > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > > > > > > > > > > +               }
> > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +err:
> > > > > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > +               struct receive_queue *rq;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +               kfree(rq->dma_array);
> > > > > > > > > > > > > > +               kfree(rq->data_array);
> > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       return -ENOMEM;
> > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > > > > > > > > > > >  {
> > > > > > > > > > > > > >         unsigned int len;
> > > > > > > > > > > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > > > > > > >                 void *buf;
> > > > > > > > > > > > > >                 int off;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > > > > > > > > > > >                 if (unlikely(!buf))
> > > > > > > > > > > > > >                         goto err_buf;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > > > > > > >                 return -EINVAL;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >         while (--*num_buf > 0) {
> > > > > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > > > > >                                  dev->name, *num_buf,
> > > > > > > > > > > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > > > > >         while (--num_buf) {
> > > > > > > > > > > > > >                 int num_skb_frags;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > > > > >                                  dev->name, num_buf,
> > > > > > > > > > > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > > > > >  err_skb:
> > > > > > > > > > > > > >         put_page(page);
> > > > > > > > > > > > > >         while (num_buf-- > 1) {
> > > > > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > > > > > > > > > > >                                  dev->name, num_buf);
> > > > > > > > > > > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > > > > > > > > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > > > > > > > > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > >         int err;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > > > > > > > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > > > > > >         get_page(alloc_frag->page);
> > > > > > > > > > > > > >         alloc_frag->offset += len;
> > > > > > > > > > > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > > > > +       } else {
> > > > > > > > > > > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       return err;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +add_err:
> > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +map_err:
> > > > > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > >         return err;
> > > > > > > > > > > > > >  }
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > > > > > > > > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > > > > > > > > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > >         char *buf;
> > > > > > > > > > > > > >         void *ctx;
> > > > > > > > > > > > > >         int err;
> > > > > > > > > > > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > > > > >                 alloc_frag->offset += hole;
> > > > > > > > > > > > > >         }
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > > > > +       } else {
> > > > > > > > > > > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +add_err:
> > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > +map_err:
> > > > > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > >         return err;
> > > > > > > > > > > > > >  }
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > > > > > > > > > > >                 void *ctx;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > > > > > > > > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > > > > >                 }
> > > > > > > > > > > > > >         } else {
> > > > > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > > > > > > > > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > > > > >                 }
> > > > > > > > > > > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > > > > > > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +               kfree(vi->rq[i].data_array);
> > > > > > > > > > > > > > +               kfree(vi->rq[i].dma_array);
> > > > > > > > > > > > > >         }
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >         /* We called __netif_napi_del(),
> > > > > > > > > > > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > > > > > > >         }
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > > > > > > > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > > > > > > > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > > > > > > > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > > > > > > > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > > > > > > > > > > >                 cond_resched();
> > > > > > > > > > > > > >         }
> > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > > > > > > > > > > >         if (ret)
> > > > > > > > > > > > > >                 goto err_free;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > > > > > > > > > > +       if (ret)
> > > > > > > > > > > > > > +               goto err_free;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > >         cpus_read_lock();
> > > > > > > > > > > > > >         virtnet_set_affinity(vi);
> > > > > > > > > > > > > >         cpus_read_unlock();
> > > > > > > > > > > > > > --
> > > > > > > > > > > > > > 2.32.0.3.g01195cf9f
> > > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > > >
> >
Jason Wang July 19, 2023, 9:38 a.m. UTC | #23
On Wed, Jul 19, 2023 at 4:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Jul 19, 2023 at 11:21:07AM +0800, Xuan Zhuo wrote:
> > On Fri, 14 Jul 2023 06:37:10 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > On Wed, Jul 12, 2023 at 04:38:24PM +0800, Xuan Zhuo wrote:
> > > > On Wed, 12 Jul 2023 16:37:43 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > On Wed, Jul 12, 2023 at 4:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > >
> > > > > > On Wed, 12 Jul 2023 15:54:58 +0800, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > On Tue, 11 Jul 2023 10:58:51 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > On Tue, Jul 11, 2023 at 10:42 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > >
> > > > > > > > > On Tue, 11 Jul 2023 10:36:17 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > > On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > > > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > > > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > > > > > Currently, the virtio core will perform a dma operation for each
> > > > > > > > > > > > > > > operation. Although, the same page may be operated multiple times.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > The driver does the dma operation and manages the dma address based the
> > > > > > > > > > > > > > > feature premapped of virtio core.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > This way, we can perform only one dma operation for the same page. In
> > > > > > > > > > > > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > > > > > > > > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > > > > > > > > > > > of operation?
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Do you mean this:
> > > > > > > > > > > > >
> > > > > > > > > > > > > [    0.470816] iommu: Default domain type: Passthrough
> > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > With passthrough, the dma API is just some indirect function calls; they do
> > > > > > > > > > > > not affect the performance a lot.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> > > > > > > > > > > overhead of DMA I observed is indeed not too high.
> > > > > > > > > >
> > > > > > > > > > Have you measured with iommu=strict?
> > > > > > > > >
> > > > > > > > > I have not tested this way, our environment is pt, I wonder if strict is a
> > > > > > > > > common scenario. I can test it.
> > > > > > > >
> > > > > > > > It's not a common setup, but it's a way to stress DMA layer to see the overhead.
> > > > > > >
> > > > > > > kernel command line: intel_iommu=on iommu.strict=1 iommu.passthrough=0
> > > > > > >
> > > > > > > virtio-net without merge dma 428614.00 pps
> > > > > > >
> > > > > > > virtio-net with merge dma    742853.00 pps
> > > > > >
> > > > > >
> > > > > > kernel command line: intel_iommu=on iommu.strict=0 iommu.passthrough=0
> > > > > >
> > > > > > virtio-net without merge dma 775496.00 pps
> > > > > >
> > > > > > virtio-net with merge dma    1010514.00 pps
> > > > > >
> > > > > >
> > > > >
> > > > > Great, let's add those numbers to the changelog.
> > > >
> > > >
> > > > Yes, I will do it in next version.
> > > >
> > > >
> > > > Thanks.
> > > >
> > >
> > > You should also test without iommu but with swiotlb=force
> >
> >
> > For swiotlb, merge DMA has no benefit, because we still need to copy data from
> > the swiotlb buffer to the original buffer.
> > The benefit of the merge DMA is to reduce the number of operations on the iommu device.
> >
> > I did some tests for this. The result is the same.
> >
> > Thanks.
> >
>
> Did you actually check that it works though?
> Looks like with swiotlb you need to synch to trigger a copy
> before unmap, and I don't see where it's done in the current
> patch.

And this is needed for XDP_REDIRECT as well.
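
A minimal sketch of the kind of sync that would be needed before the data is
read or handed to an XDP program (untested; the helper name and the idea of
looking the dma address up from the virtnet_rq_dma entry recorded at fill
time are assumptions on top of this patch):

/* Untested sketch: with premapped RX buffers the CPU must not read the
 * data (including passing it to an XDP prog that may REDIRECT it) before
 * a sync, since the buffer stays mapped for the device.
 */
static void virtnet_rq_sync_for_cpu(struct receive_queue *rq,
                                    struct virtnet_rq_dma *dma,
                                    void *buf, unsigned int len)
{
        struct device *dev = virtqueue_dma_dev(rq->vq);

        dma_sync_single_for_cpu(dev, dma->addr + (buf - dma->buf), len,
                                DMA_FROM_DEVICE);
}

Something like this would have to be called from receive_small()/
receive_mergeable() before the buffer is parsed or passed to XDP.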

Thanks

>
>
> >
> > >
> > > But first fix the use of DMA API to actually be correct,
> > > otherwise you are cheating by avoiding synchronization.
> > >
> > >
> > >
> > > > >
> > > > > Thanks
> > > > >
> > > > > > Thanks.
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > Thanks.
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Thanks.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Thanks
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Thanks.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Try e.g. bounce buffer. Which is where you will see a problem: your
> > > > > > > > > > > > patches won't work.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > This kind of difference is likely in the noise.
> > > > > > > > > > > > >
> > > > > > > > > > > > > It's really not high, but this is because the proportion of DMA under perf top
> > > > > > > > > > > > > is not high. Probably that much.
> > > > > > > > > > > >
> > > > > > > > > > > > So maybe not worth the complexity.
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > > > > > > > > > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > > > > > > > > index 486b5849033d..4de845d35bed 100644
> > > > > > > > > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > > > > > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > > > > > > > > > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > > > > > > > > > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +/* The bufs on the same page may share this struct. */
> > > > > > > > > > > > > > > +struct virtnet_rq_dma {
> > > > > > > > > > > > > > > +       struct virtnet_rq_dma *next;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > +       u32 len;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       u32 ref;
> > > > > > > > > > > > > > > +};
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +/* Record the dma and buf. */
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I guess I see that. But why?
> > > > > > > > > > > > > > And these two comments are the extent of the available
> > > > > > > > > > > > > > documentation, that's not enough I feel.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +struct virtnet_rq_data {
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *next;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Is manually reimplementing a linked list the best
> > > > > > > > > > > > > > we can do?
> > > > > > > > > > > > >
> > > > > > > > > > > > > Yes, we can use llist.
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma;
> > > > > > > > > > > > > > > +};
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >  /* Internal representation of a send virtqueue */
> > > > > > > > > > > > > > >  struct send_queue {
> > > > > > > > > > > > > > >         /* Virtqueue associated with this send _queue */
> > > > > > > > > > > > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > > > > > > > > > > > >         char name[16];
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data_array;
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data_free;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > > > > > > > > > > > +       struct virtnet_rq_dma *last_dma;
> > > > > > > > > > > > > > >  };
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > > > > > > > > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > > > > > > > >         return skb;
> > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       --dma->ref;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       if (dma->ref)
> > > > > > > > > > > > > > > +               return;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > If you don't unmap there is no guarantee valid data will be
> > > > > > > > > > > > > > there in the buffer.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       dma->next = rq->dma_free;
> > > > > > > > > > > > > > > +       rq->dma_free = dma;
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > > > > > > > > > > > +                                    struct virtnet_rq_data *data)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       buf = data->buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       data->next = rq->data_free;
> > > > > > > > > > > > > > > +       rq->data_free = data;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return buf;
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > > > > > > > > > > > +                                                  void *buf,
> > > > > > > > > > > > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       data = rq->data_free;
> > > > > > > > > > > > > > > +       rq->data_free = data->next;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       data->buf = buf;
> > > > > > > > > > > > > > > +       data->dma = dma;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return data;
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > > > > +       u32 off, map_len;
> > > > > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > > > > +       void *end;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > > > > > > > > > > > +               ++dma->ref;
> > > > > > > > > > > > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > > > > > > > > > > > +               goto ok;
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > So this is the meat of the proposed optimization. I guess that
> > > > > > > > > > > > > > if the last buffer we allocated happens to be in the same page
> > > > > > > > > > > > > > as this one then they can both be mapped for DMA together.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Since we use page_frag, the buffers we allocated are all contiguous.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > Why last one specifically? Whether next one happens to
> > > > > > > > > > > > > > be close depends on luck. If you want to try optimizing this
> > > > > > > > > > > > > > the right thing to do is likely by using a page pool.
> > > > > > > > > > > > > > There's actually work upstream on page pool, look it up.
> > > > > > > > > > > > >
> > > > > > > > > > > > > As we discussed in another thread, the page pool is first used for xdp. Let's
> > > > > > > > > > > > > transform it step by step.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Thanks.
> > > > > > > > > > > >
> > > > > > > > > > > > ok so this should wait then?
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       end = buf + len - 1;
> > > > > > > > > > > > > > > +       off = offset_in_page(end);
> > > > > > > > > > > > > > > +       map_len = len + PAGE_SIZE - off;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > > > > > > > > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > > > > > > > > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > > > > > > > > > > > +               return -ENOMEM;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       dma = rq->dma_free;
> > > > > > > > > > > > > > > +       rq->dma_free = dma->next;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       dma->ref = 1;
> > > > > > > > > > > > > > > +       dma->buf = buf;
> > > > > > > > > > > > > > > +       dma->addr = addr;
> > > > > > > > > > > > > > > +       dma->len = map_len;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       rq->last_dma = dma;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +ok:
> > > > > > > > > > > > > > > +       sg_init_table(rq->sg, 1);
> > > > > > > > > > > > > > > +       rq->sg[0].dma_address = addr;
> > > > > > > > > > > > > > > +       rq->sg[0].length = len;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       struct receive_queue *rq;
> > > > > > > > > > > > > > > +       int i, err, j, num;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       /* disable for big mode */
> > > > > > > > > > > > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > > > > > > > > > > > +               return 0;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > > > +                       continue;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > > > > > > > > > > > +               if (!rq->data_array)
> > > > > > > > > > > > > > > +                       goto err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > > > > > > > > > > > +               if (!rq->dma_array)
> > > > > > > > > > > > > > > +                       goto err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               for (j = 0; j < num; ++j) {
> > > > > > > > > > > > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > > > > > > > > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > > > > > > > > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > > > > > > > > > > > +               }
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +err:
> > > > > > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > > +               struct receive_queue *rq;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               kfree(rq->dma_array);
> > > > > > > > > > > > > > > +               kfree(rq->data_array);
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return -ENOMEM;
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > > > > > > > > > > > >  {
> > > > > > > > > > > > > > >         unsigned int len;
> > > > > > > > > > > > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > > > > > > > >                 void *buf;
> > > > > > > > > > > > > > >                 int off;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > > > > > > > > > > > >                 if (unlikely(!buf))
> > > > > > > > > > > > > > >                         goto err_buf;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > > > > > > > >                 return -EINVAL;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >         while (--*num_buf > 0) {
> > > > > > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > > > > > >                                  dev->name, *num_buf,
> > > > > > > > > > > > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > > > > > >         while (--num_buf) {
> > > > > > > > > > > > > > >                 int num_skb_frags;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > > > > > >                                  dev->name, num_buf,
> > > > > > > > > > > > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > > > > > >  err_skb:
> > > > > > > > > > > > > > >         put_page(page);
> > > > > > > > > > > > > > >         while (num_buf-- > 1) {
> > > > > > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > > > > > > > > > > > >                                  dev->name, num_buf);
> > > > > > > > > > > > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > > > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > > > > > > > > > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > > > > > > > > > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > >         int err;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > > > > > > > > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > > > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > > > > > > >         get_page(alloc_frag->page);
> > > > > > > > > > > > > > >         alloc_frag->offset += len;
> > > > > > > > > > > > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > > > > > +       } else {
> > > > > > > > > > > > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +add_err:
> > > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +map_err:
> > > > > > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > >         return err;
> > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > > > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > > > > > > > > > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > > > > > > > > > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > >         char *buf;
> > > > > > > > > > > > > > >         void *ctx;
> > > > > > > > > > > > > > >         int err;
> > > > > > > > > > > > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > > > > > >                 alloc_frag->offset += hole;
> > > > > > > > > > > > > > >         }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > > > > > +       } else {
> > > > > > > > > > > > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +add_err:
> > > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +map_err:
> > > > > > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > >         return err;
> > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > > > > > > > > > > > >                 void *ctx;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > > > > > > > > > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > > > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > > > > > >                 }
> > > > > > > > > > > > > > >         } else {
> > > > > > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > > > > > > > > > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > > > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > > > > > >                 }
> > > > > > > > > > > > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > > > > > > > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               kfree(vi->rq[i].data_array);
> > > > > > > > > > > > > > > +               kfree(vi->rq[i].dma_array);
> > > > > > > > > > > > > > >         }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >         /* We called __netif_napi_del(),
> > > > > > > > > > > > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > > > > > > > >         }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > > > > > > > > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > > > > > > > > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > > > > > > > > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > > > > > > > > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > > > > > > > > > > > >                 cond_resched();
> > > > > > > > > > > > > > >         }
> > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > > > > > > > > > > > >         if (ret)
> > > > > > > > > > > > > > >                 goto err_free;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > > > > > > > > > > > +       if (ret)
> > > > > > > > > > > > > > > +               goto err_free;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >         cpus_read_lock();
> > > > > > > > > > > > > > >         virtnet_set_affinity(vi);
> > > > > > > > > > > > > > >         cpus_read_unlock();
> > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > 2.32.0.3.g01195cf9f
> > > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > >
>
Michael S. Tsirkin July 19, 2023, 9:51 a.m. UTC | #24
On Wed, Jul 19, 2023 at 05:38:56PM +0800, Jason Wang wrote:
> On Wed, Jul 19, 2023 at 4:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Jul 19, 2023 at 11:21:07AM +0800, Xuan Zhuo wrote:
> > > On Fri, 14 Jul 2023 06:37:10 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > On Wed, Jul 12, 2023 at 04:38:24PM +0800, Xuan Zhuo wrote:
> > > > > On Wed, 12 Jul 2023 16:37:43 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > On Wed, Jul 12, 2023 at 4:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > >
> > > > > > > On Wed, 12 Jul 2023 15:54:58 +0800, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > On Tue, 11 Jul 2023 10:58:51 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > On Tue, Jul 11, 2023 at 10:42 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Tue, 11 Jul 2023 10:36:17 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > > > On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > > > > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > > > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > > > > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > > > > > > Currently, the virtio core will perform a dma operation for each
> > > > > > > > > > > > > > > > operation. Although, the same page may be operated multiple times.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > The driver does the dma operation and manages the dma address based the
> > > > > > > > > > > > > > > > feature premapped of virtio core.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > This way, we can perform only one dma operation for the same page. In
> > > > > > > > > > > > > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > > > > > > > > > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > > > > > > > > > > > > of operation?
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Do you mean this:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > [    0.470816] iommu: Default domain type: Passthrough
> > > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > With passthrough, the dma API is just some indirect function calls; they do
> > > > > > > > > > > > > not affect the performance a lot.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> > > > > > > > > > > > overhead of DMA I observed is indeed not too high.
> > > > > > > > > > >
> > > > > > > > > > > Have you measured with iommu=strict?
> > > > > > > > > >
> > > > > > > > > > I have not tested this way, our environment is pt, I wonder if strict is a
> > > > > > > > > > common scenario. I can test it.
> > > > > > > > >
> > > > > > > > > It's not a common setup, but it's a way to stress DMA layer to see the overhead.
> > > > > > > >
> > > > > > > > kernel command line: intel_iommu=on iommu.strict=1 iommu.passthrough=0
> > > > > > > >
> > > > > > > > virtio-net without merge dma 428614.00 pps
> > > > > > > >
> > > > > > > > virtio-net with merge dma    742853.00 pps
> > > > > > >
> > > > > > >
> > > > > > > kernel command line: intel_iommu=on iommu.strict=0 iommu.passthrough=0
> > > > > > >
> > > > > > > virtio-net without merge dma 775496.00 pps
> > > > > > >
> > > > > > > virtio-net with merge dma    1010514.00 pps
> > > > > > >
> > > > > > >
> > > > > >
> > > > > > Great, let's add those numbers to the changelog.
> > > > >
> > > > >
> > > > > Yes, I will do it in next version.
> > > > >
> > > > >
> > > > > Thanks.
> > > > >
> > > >
> > > > You should also test without iommu but with swiotlb=force
> > >
> > >
> > > For swiotlb, merge DMA has no benefit, because we still need to copy data from
> > > the swiotlb buffer to the original buffer.
> > > The benefit of the merge DMA is to reduce the number of operations on the iommu device.
> > >
> > > I did some tests for this. The result is the same.
> > >
> > > Thanks.
> > >
> >
> > Did you actually check that it works though?
> > Looks like with swiotlb you need to synch to trigger a copy
> > before unmap, and I don't see where it's done in the current
> > patch.
> 
> And this is needed for XDP_REDIRECT as well.
> 
> Thanks

And once you do, you'll do the copy twice so it will
actually be slower.

I suspect you need to sync manually then unmap with DMA_ATTR_SKIP_CPU_SYNC.
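
Something along these lines, maybe (untested sketch; it assumes
virtnet_rq_unmap() also gets the buffer pointer and the used length passed
down, and keeps the ref counting and free list from this patch):

static void virtnet_rq_unmap(struct receive_queue *rq,
                             struct virtnet_rq_dma *dma,
                             void *buf, u32 used_len)
{
        struct device *dev = virtqueue_dma_dev(rq->vq);

        /* per-buffer sync: with swiotlb this is the bounce buffer copy */
        dma_sync_single_for_cpu(dev, dma->addr + (buf - dma->buf),
                                used_len, DMA_FROM_DEVICE);

        if (--dma->ref)
                return;

        /* every buffer in this mapping was already synced above */
        dma_unmap_page_attrs(dev, dma->addr, dma->len, DMA_FROM_DEVICE,
                             DMA_ATTR_SKIP_CPU_SYNC);

        dma->next = rq->dma_free;
        rq->dma_free = dma;
}

That way the copy happens once per buffer and the final unmap does not
bounce the whole mapping again.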

> >
> >
> > >
> > > >
> > > > But first fix the use of DMA API to actually be correct,
> > > > otherwise you are cheating by avoiding synchronization.
> > > >
> > > >
> > > >
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > > Thanks.
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > Thanks.
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Thanks.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Thanks
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Thanks.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Try e.g. bounce buffer. Which is where you will see a problem: your
> > > > > > > > > > > > > patches won't work.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > This kind of difference is likely in the noise.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > It's really not high, but this is because the proportion of DMA under perf top
> > > > > > > > > > > > > > is not high. Probably that much.
> > > > > > > > > > > > >
> > > > > > > > > > > > > So maybe not worth the complexity.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > > > > > > > > > > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > > > > > > > > > index 486b5849033d..4de845d35bed 100644
> > > > > > > > > > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > > > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > > > > > > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > > > > > > > > > > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > > > > > > > > > > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +/* The bufs on the same page may share this struct. */
> > > > > > > > > > > > > > > > +struct virtnet_rq_dma {
> > > > > > > > > > > > > > > > +       struct virtnet_rq_dma *next;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > > +       u32 len;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       u32 ref;
> > > > > > > > > > > > > > > > +};
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +/* Record the dma and buf. */
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > I guess I see that. But why?
> > > > > > > > > > > > > > > And these two comments are the extent of the available
> > > > > > > > > > > > > > > documentation, that's not enough I feel.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +struct virtnet_rq_data {
> > > > > > > > > > > > > > > > +       struct virtnet_rq_data *next;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Is manually reimplementing a linked list the best
> > > > > > > > > > > > > > > we can do?
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Yes, we can use llist.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma;
> > > > > > > > > > > > > > > > +};
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > >  /* Internal representation of a send virtqueue */
> > > > > > > > > > > > > > > >  struct send_queue {
> > > > > > > > > > > > > > > >         /* Virtqueue associated with this send _queue */
> > > > > > > > > > > > > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > > > > > > > > > > > > >         char name[16];
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       struct virtnet_rq_data *data_array;
> > > > > > > > > > > > > > > > +       struct virtnet_rq_data *data_free;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > > > > > > > > > > > > +       struct virtnet_rq_dma *last_dma;
> > > > > > > > > > > > > > > >  };
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > > > > > > > > > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > > > > > > > > >         return skb;
> > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       --dma->ref;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       if (dma->ref)
> > > > > > > > > > > > > > > > +               return;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > If you don't unmap there is no guarantee valid data will be
> > > > > > > > > > > > > > > there in the buffer.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       dma->next = rq->dma_free;
> > > > > > > > > > > > > > > > +       rq->dma_free = dma;
> > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > > > > > > > > > > > > +                                    struct virtnet_rq_data *data)
> > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       buf = data->buf;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       data->next = rq->data_free;
> > > > > > > > > > > > > > > > +       rq->data_free = data;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       return buf;
> > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > > > > > > > > > > > > +                                                  void *buf,
> > > > > > > > > > > > > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       data = rq->data_free;
> > > > > > > > > > > > > > > > +       rq->data_free = data->next;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       data->buf = buf;
> > > > > > > > > > > > > > > > +       data->dma = dma;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       return data;
> > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > > > > > +       u32 off, map_len;
> > > > > > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > > > > > +       void *end;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > > > > > > > > > > > > +               ++dma->ref;
> > > > > > > > > > > > > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > > > > > > > > > > > > +               goto ok;
> > > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > So this is the meat of the proposed optimization. I guess that
> > > > > > > > > > > > > > > if the last buffer we allocated happens to be in the same page
> > > > > > > > > > > > > > > as this one then they can both be mapped for DMA together.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Since we use page_frag, the buffers we allocated are all contiguous.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Why last one specifically? Whether next one happens to
> > > > > > > > > > > > > > > be close depends on luck. If you want to try optimizing this
> > > > > > > > > > > > > > > the right thing to do is likely by using a page pool.
> > > > > > > > > > > > > > > There's actually work upstream on page pool, look it up.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > As we discussed in another thread, the page pool is first used for xdp. Let's
> > > > > > > > > > > > > > transform it step by step.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Thanks.
> > > > > > > > > > > > >
> > > > > > > > > > > > > ok so this should wait then?
> > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       end = buf + len - 1;
> > > > > > > > > > > > > > > > +       off = offset_in_page(end);
> > > > > > > > > > > > > > > > +       map_len = len + PAGE_SIZE - off;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > > > > > > > > > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > > > > > > > > > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > > > > > > > > > > > > +               return -ENOMEM;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       dma = rq->dma_free;
> > > > > > > > > > > > > > > > +       rq->dma_free = dma->next;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       dma->ref = 1;
> > > > > > > > > > > > > > > > +       dma->buf = buf;
> > > > > > > > > > > > > > > > +       dma->addr = addr;
> > > > > > > > > > > > > > > > +       dma->len = map_len;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       rq->last_dma = dma;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +ok:
> > > > > > > > > > > > > > > > +       sg_init_table(rq->sg, 1);
> > > > > > > > > > > > > > > > +       rq->sg[0].dma_address = addr;
> > > > > > > > > > > > > > > > +       rq->sg[0].length = len;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > +       struct receive_queue *rq;
> > > > > > > > > > > > > > > > +       int i, err, j, num;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       /* disable for big mode */
> > > > > > > > > > > > > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > > > > > > > > > > > > +               return 0;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > > > > +                       continue;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > > > > > > > > > > > > +               if (!rq->data_array)
> > > > > > > > > > > > > > > > +                       goto err;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > > > > > > > > > > > > +               if (!rq->dma_array)
> > > > > > > > > > > > > > > > +                       goto err;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +               for (j = 0; j < num; ++j) {
> > > > > > > > > > > > > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > > > > > > > > > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > > > > > > > > > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > > > > > > > > > > > > +               }
> > > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +err:
> > > > > > > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > > > +               struct receive_queue *rq;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +               kfree(rq->dma_array);
> > > > > > > > > > > > > > > > +               kfree(rq->data_array);
> > > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       return -ENOMEM;
> > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > > > > > > > > > > > > >  {
> > > > > > > > > > > > > > > >         unsigned int len;
> > > > > > > > > > > > > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > > > > > > > > >                 void *buf;
> > > > > > > > > > > > > > > >                 int off;
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > > > > > > > > > > > > >                 if (unlikely(!buf))
> > > > > > > > > > > > > > > >                         goto err_buf;
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > > > > > > > > >                 return -EINVAL;
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >         while (--*num_buf > 0) {
> > > > > > > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > > > > > > >                                  dev->name, *num_buf,
> > > > > > > > > > > > > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > > > > > > >         while (--num_buf) {
> > > > > > > > > > > > > > > >                 int num_skb_frags;
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > > > > > > >                                  dev->name, num_buf,
> > > > > > > > > > > > > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > > > > > > >  err_skb:
> > > > > > > > > > > > > > > >         put_page(page);
> > > > > > > > > > > > > > > >         while (num_buf-- > 1) {
> > > > > > > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > > > > > > > > > > > > >                                  dev->name, num_buf);
> > > > > > > > > > > > > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > > > > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > > > > > > > > > > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > > > > > > > > > > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > >         int err;
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > > > > > > > > > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > > > > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > > > > > > > >         get_page(alloc_frag->page);
> > > > > > > > > > > > > > > >         alloc_frag->offset += len;
> > > > > > > > > > > > > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > > > > > > +       } else {
> > > > > > > > > > > > > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       return err;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +add_err:
> > > > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +map_err:
> > > > > > > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > > >         return err;
> > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > > > > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > > > > > > > > > > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > > > > > > > > > > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > >         char *buf;
> > > > > > > > > > > > > > > >         void *ctx;
> > > > > > > > > > > > > > > >         int err;
> > > > > > > > > > > > > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > > > > > > >                 alloc_frag->offset += hole;
> > > > > > > > > > > > > > > >         }
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > > > > > > +       } else {
> > > > > > > > > > > > > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +add_err:
> > > > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +map_err:
> > > > > > > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > > >         return err;
> > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > > > > > > > > > > > > >                 void *ctx;
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > > > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > > > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > > > > > > > > > > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > > > > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > > > > > > >                 }
> > > > > > > > > > > > > > > >         } else {
> > > > > > > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > > > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > > > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > > > > > > > > > > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > > > > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > > > > > > >                 }
> > > > > > > > > > > > > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > > > > > > > > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +               kfree(vi->rq[i].data_array);
> > > > > > > > > > > > > > > > +               kfree(vi->rq[i].dma_array);
> > > > > > > > > > > > > > > >         }
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >         /* We called __netif_napi_del(),
> > > > > > > > > > > > > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > > > > > > > > >         }
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > > > > > > > > > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > > > > > > > > > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > > > > > > > > > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > > > > > > > > > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > > > > > > > > > > > > >                 cond_resched();
> > > > > > > > > > > > > > > >         }
> > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > > > > > > > > > > > > >         if (ret)
> > > > > > > > > > > > > > > >                 goto err_free;
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > > > > > > > > > > > > +       if (ret)
> > > > > > > > > > > > > > > > +               goto err_free;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > >         cpus_read_lock();
> > > > > > > > > > > > > > > >         virtnet_set_affinity(vi);
> > > > > > > > > > > > > > > >         cpus_read_unlock();
> > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > 2.32.0.3.g01195cf9f
> > > > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > >
> > > >
> >
Xuan Zhuo July 20, 2023, 2:24 a.m. UTC | #25
On Wed, 19 Jul 2023 04:55:04 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Wed, Jul 19, 2023 at 11:21:07AM +0800, Xuan Zhuo wrote:
> > On Fri, 14 Jul 2023 06:37:10 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > On Wed, Jul 12, 2023 at 04:38:24PM +0800, Xuan Zhuo wrote:
> > > > On Wed, 12 Jul 2023 16:37:43 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > On Wed, Jul 12, 2023 at 4:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > >
> > > > > > On Wed, 12 Jul 2023 15:54:58 +0800, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > On Tue, 11 Jul 2023 10:58:51 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > On Tue, Jul 11, 2023 at 10:42 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > >
> > > > > > > > > On Tue, 11 Jul 2023 10:36:17 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > > On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > > > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > > > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > > > > > Currently, the virtio core will perform a dma operation for each
> > > > > > > > > > > > > > > operation. Although, the same page may be operated multiple times.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > The driver does the dma operation and manages the dma address based the
> > > > > > > > > > > > > > > feature premapped of virtio core.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > This way, we can perform only one dma operation for the same page. In
> > > > > > > > > > > > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > > > > > > > > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > > > > > > > > > > > of operation?
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Do you mean this:
> > > > > > > > > > > > >
> > > > > > > > > > > > > [    0.470816] iommu: Default domain type: Passthrough
> > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > With passthrough, dma API is just some indirect function calls, they do
> > > > > > > > > > > > not affect the performance a lot.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> > > > > > > > > > > overhead of DMA I observed is indeed not too high.
> > > > > > > > > >
> > > > > > > > > > Have you measured with iommu=strict?
> > > > > > > > >
> > > > > > > > > I have not tested this way, our environment is pt, I wonder if strict is a
> > > > > > > > > common scenario. I can test it.
> > > > > > > >
> > > > > > > > It's not a common setup, but it's a way to stress DMA layer to see the overhead.
> > > > > > >
> > > > > > > kernel command line: intel_iommu=on iommu.strict=1 iommu.passthrough=0
> > > > > > >
> > > > > > > virtio-net without merge dma 428614.00 pps
> > > > > > >
> > > > > > > virtio-net with merge dma    742853.00 pps
> > > > > >
> > > > > >
> > > > > > kernel command line: intel_iommu=on iommu.strict=0 iommu.passthrough=0
> > > > > >
> > > > > > virtio-net without merge dma 775496.00 pps
> > > > > >
> > > > > > virtio-net with merge dma    1010514.00 pps
> > > > > >
> > > > > >
> > > > >
> > > > > Great, let's add those numbers to the changelog.
> > > >
> > > >
> > > > Yes, I will do it in next version.
> > > >
> > > >
> > > > Thanks.
> > > >
> > >
> > > You should also test without iommu but with swiotlb=force
> >
> >
> > For swiotlb, merge DMA has no benefit, because we still need to copy data from
> > the swiotlb buffer to the original buffer.
> > The benefit of merge DMA is to reduce the number of operations on the IOMMU.
> >
> > I did some test for this. The result is same.
> >
> > Thanks.
> >
>
> Did you actually check that it works though?
> Looks like with swiotlb you need to synch to trigger a copy
> before unmap, and I don't see where it's done in the current
> patch.

Yes, you are right, I missed the sync in this patch.
But when I tested with swiotlb, I fixed this.
You can see it in the next version.
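
For reference, a minimal sketch of the kind of sync that is missing, assuming
the shared-mapping layout from this patch (virtnet_rq_dma with addr/buf/len and
virtqueue_dma_dev()); the helper name and exact call site are made up for
illustration, not taken from v13:

static void virtnet_rq_sync_for_cpu(struct receive_queue *rq,
                                    struct virtnet_rq_dma *dma,
                                    void *buf, u32 len)
{
        /* Let the CPU see what the device wrote into the premapped region
         * before the driver touches it.  With swiotlb this sync is what
         * copies the received data back from the bounce buffer into the
         * original buffer; without it the skb reads stale bytes.
         */
        struct device *dev = virtqueue_dma_dev(rq->vq);

        dma_sync_single_for_cpu(dev, dma->addr + (buf - dma->buf), len,
                                DMA_FROM_DEVICE);
}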

Thanks.


>
>
> >
> > >
> > > But first fix the use of DMA API to actually be correct,
> > > otherwise you are cheating by avoiding synchronization.
> > >
> > >
> > >
> > > > >
> > > > > Thanks
> > > > >
> > > > > > Thanks.
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > Thanks.
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Thanks.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Thanks
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Thanks.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Try e.g. bounce buffer. Which is where you will see a problem: your
> > > > > > > > > > > > patches won't work.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > This kind of difference is likely in the noise.
> > > > > > > > > > > > >
> > > > > > > > > > > > > It's really not high, but this is because the proportion of DMA under perf top
> > > > > > > > > > > > > is not high. Probably that much.
> > > > > > > > > > > >
> > > > > > > > > > > > So maybe not worth the complexity.
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > > > > > > > > > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > > > > > > > > index 486b5849033d..4de845d35bed 100644
> > > > > > > > > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > > > > > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > > > > > > > > > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > > > > > > > > > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +/* The bufs on the same page may share this struct. */
> > > > > > > > > > > > > > > +struct virtnet_rq_dma {
> > > > > > > > > > > > > > > +       struct virtnet_rq_dma *next;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > +       u32 len;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       u32 ref;
> > > > > > > > > > > > > > > +};
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +/* Record the dma and buf. */
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I guess I see that. But why?
> > > > > > > > > > > > > > And these two comments are the extent of the available
> > > > > > > > > > > > > > documentation, that's not enough I feel.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +struct virtnet_rq_data {
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *next;
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Is manually reimplementing a linked list the best
> > > > > > > > > > > > > > we can do?
> > > > > > > > > > > > >
> > > > > > > > > > > > > Yes, we can use llist.
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma;
> > > > > > > > > > > > > > > +};
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >  /* Internal representation of a send virtqueue */
> > > > > > > > > > > > > > >  struct send_queue {
> > > > > > > > > > > > > > >         /* Virtqueue associated with this send _queue */
> > > > > > > > > > > > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > > > > > > > > > > > >         char name[16];
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data_array;
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data_free;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > > > > > > > > > > > +       struct virtnet_rq_dma *last_dma;
> > > > > > > > > > > > > > >  };
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > > > > > > > > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > > > > > > > >         return skb;
> > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       --dma->ref;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       if (dma->ref)
> > > > > > > > > > > > > > > +               return;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > If you don't unmap there is no guarantee valid data will be
> > > > > > > > > > > > > > there in the buffer.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       dma->next = rq->dma_free;
> > > > > > > > > > > > > > > +       rq->dma_free = dma;
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > > > > > > > > > > > +                                    struct virtnet_rq_data *data)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       buf = data->buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       data->next = rq->data_free;
> > > > > > > > > > > > > > > +       rq->data_free = data;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return buf;
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > > > > > > > > > > > +                                                  void *buf,
> > > > > > > > > > > > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       data = rq->data_free;
> > > > > > > > > > > > > > > +       rq->data_free = data->next;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       data->buf = buf;
> > > > > > > > > > > > > > > +       data->dma = dma;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return data;
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > > > > +       u32 off, map_len;
> > > > > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > > > > +       void *end;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > > > > > > > > > > > +               ++dma->ref;
> > > > > > > > > > > > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > > > > > > > > > > > +               goto ok;
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > So this is the meat of the proposed optimization. I guess that
> > > > > > > > > > > > > > if the last buffer we allocated happens to be in the same page
> > > > > > > > > > > > > > as this one then they can both be mapped for DMA together.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Since we use page_frag, the buffers we allocated are all continuous.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > Why last one specifically? Whether next one happens to
> > > > > > > > > > > > > > be close depends on luck. If you want to try optimizing this
> > > > > > > > > > > > > > the right thing to do is likely by using a page pool.
> > > > > > > > > > > > > > There's actually work upstream on page pool, look it up.
> > > > > > > > > > > > >
> > > > > > > > > > > > > As we discussed in another thread, the page pool is first used for xdp. Let's
> > > > > > > > > > > > > transform it step by step.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Thanks.
> > > > > > > > > > > >
> > > > > > > > > > > > ok so this should wait then?
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       end = buf + len - 1;
> > > > > > > > > > > > > > > +       off = offset_in_page(end);
> > > > > > > > > > > > > > > +       map_len = len + PAGE_SIZE - off;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
> > > > > > > > > > > > > > > +                                 map_len, DMA_FROM_DEVICE, 0);
> > > > > > > > > > > > > > > +       if (addr == DMA_MAPPING_ERROR)
> > > > > > > > > > > > > > > +               return -ENOMEM;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       dma = rq->dma_free;
> > > > > > > > > > > > > > > +       rq->dma_free = dma->next;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       dma->ref = 1;
> > > > > > > > > > > > > > > +       dma->buf = buf;
> > > > > > > > > > > > > > > +       dma->addr = addr;
> > > > > > > > > > > > > > > +       dma->len = map_len;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       rq->last_dma = dma;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +ok:
> > > > > > > > > > > > > > > +       sg_init_table(rq->sg, 1);
> > > > > > > > > > > > > > > +       rq->sg[0].dma_address = addr;
> > > > > > > > > > > > > > > +       rq->sg[0].length = len;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +       struct receive_queue *rq;
> > > > > > > > > > > > > > > +       int i, err, j, num;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       /* disable for big mode */
> > > > > > > > > > > > > > > +       if (!vi->mergeable_rx_bufs && vi->big_packets)
> > > > > > > > > > > > > > > +               return 0;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > > +               err = virtqueue_set_premapped(vi->rq[i].vq);
> > > > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > > > +                       continue;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               num = virtqueue_get_vring_size(rq->vq);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
> > > > > > > > > > > > > > > +               if (!rq->data_array)
> > > > > > > > > > > > > > > +                       goto err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
> > > > > > > > > > > > > > > +               if (!rq->dma_array)
> > > > > > > > > > > > > > > +                       goto err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               for (j = 0; j < num; ++j) {
> > > > > > > > > > > > > > > +                       rq->data_array[j].next = rq->data_free;
> > > > > > > > > > > > > > > +                       rq->data_free = &rq->data_array[j];
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +                       rq->dma_array[j].next = rq->dma_free;
> > > > > > > > > > > > > > > +                       rq->dma_free = &rq->dma_array[j];
> > > > > > > > > > > > > > > +               }
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +err:
> > > > > > > > > > > > > > > +       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > > +               struct receive_queue *rq;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               rq = &vi->rq[i];
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               kfree(rq->dma_array);
> > > > > > > > > > > > > > > +               kfree(rq->data_array);
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return -ENOMEM;
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >  static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
> > > > > > > > > > > > > > >  {
> > > > > > > > > > > > > > >         unsigned int len;
> > > > > > > > > > > > > > > @@ -835,7 +1033,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > > > > > > > >                 void *buf;
> > > > > > > > > > > > > > >                 int off;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &buflen);
> > > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &buflen, NULL);
> > > > > > > > > > > > > > >                 if (unlikely(!buf))
> > > > > > > > > > > > > > >                         goto err_buf;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > @@ -1126,7 +1324,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > > > > > > > >                 return -EINVAL;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >         while (--*num_buf > 0) {
> > > > > > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > > > > > >                                  dev->name, *num_buf,
> > > > > > > > > > > > > > > @@ -1351,7 +1549,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > > > > > >         while (--num_buf) {
> > > > > > > > > > > > > > >                 int num_skb_frags;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > -               buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
> > > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, &ctx);
> > > > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
> > > > > > > > > > > > > > >                                  dev->name, num_buf,
> > > > > > > > > > > > > > > @@ -1414,7 +1612,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > > > > > >  err_skb:
> > > > > > > > > > > > > > >         put_page(page);
> > > > > > > > > > > > > > >         while (num_buf-- > 1) {
> > > > > > > > > > > > > > > -               buf = virtqueue_get_buf(rq->vq, &len);
> > > > > > > > > > > > > > > +               buf = virtnet_rq_get_buf(rq, &len, NULL);
> > > > > > > > > > > > > > >                 if (unlikely(!buf)) {
> > > > > > > > > > > > > > >                         pr_debug("%s: rx error: %d buffers missing\n",
> > > > > > > > > > > > > > >                                  dev->name, num_buf);
> > > > > > > > > > > > > > > @@ -1529,6 +1727,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > > > > > >         unsigned int xdp_headroom = virtnet_get_headroom(vi);
> > > > > > > > > > > > > > >         void *ctx = (void *)(unsigned long)xdp_headroom;
> > > > > > > > > > > > > > >         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > >         int err;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >         len = SKB_DATA_ALIGN(len) +
> > > > > > > > > > > > > > > @@ -1539,11 +1738,34 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> > > > > > > > > > > > > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > > > > > > >         get_page(alloc_frag->page);
> > > > > > > > > > > > > > >         alloc_frag->offset += len;
> > > > > > > > > > > > > > > -       sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > > > -                   vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > > > +                                       vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > > > > > +       } else {
> > > > > > > > > > > > > > > +               sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> > > > > > > > > > > > > > > +                           vi->hdr_len + GOOD_PACKET_LEN);
> > > > > > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +add_err:
> > > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +map_err:
> > > > > > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > >         return err;
> > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > @@ -1620,6 +1842,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > > > > > >         unsigned int headroom = virtnet_get_headroom(vi);
> > > > > > > > > > > > > > >         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
> > > > > > > > > > > > > > >         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
> > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > >         char *buf;
> > > > > > > > > > > > > > >         void *ctx;
> > > > > > > > > > > > > > >         int err;
> > > > > > > > > > > > > > > @@ -1650,12 +1873,32 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > > > > > >                 alloc_frag->offset += hole;
> > > > > > > > > > > > > > >         }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > -       sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > > +               err = virtnet_rq_map_sg(rq, buf, len);
> > > > > > > > > > > > > > > +               if (err)
> > > > > > > > > > > > > > > +                       goto map_err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               data = virtnet_rq_get_data(rq, buf, rq->last_dma);
> > > > > > > > > > > > > > > +       } else {
> > > > > > > > > > > > > > > +               sg_init_one(rq->sg, buf, len);
> > > > > > > > > > > > > > > +               data = (void *)buf;
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > > > > > > > > > -       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > > > > > > +       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
> > > > > > > > > > > > > > >         if (err < 0)
> > > > > > > > > > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > > +               goto add_err;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +       return 0;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +add_err:
> > > > > > > > > > > > > > > +       if (rq->data_array) {
> > > > > > > > > > > > > > > +               virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > +               virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +map_err:
> > > > > > > > > > > > > > > +       put_page(virt_to_head_page(buf));
> > > > > > > > > > > > > > >         return err;
> > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > @@ -1775,13 +2018,13 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> > > > > > > > > > > > > > >                 void *ctx;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > > > > > -                      (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
> > > > > > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
> > > > > > > > > > > > > > >                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
> > > > > > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > > > > > >                 }
> > > > > > > > > > > > > > >         } else {
> > > > > > > > > > > > > > >                 while (stats.packets < budget &&
> > > > > > > > > > > > > > > -                      (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
> > > > > > > > > > > > > > > +                      (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
> > > > > > > > > > > > > > >                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
> > > > > > > > > > > > > > >                         stats.packets++;
> > > > > > > > > > > > > > >                 }
> > > > > > > > > > > > > > > @@ -3514,6 +3757,9 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > > > > > > > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               kfree(vi->rq[i].data_array);
> > > > > > > > > > > > > > > +               kfree(vi->rq[i].dma_array);
> > > > > > > > > > > > > > >         }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >         /* We called __netif_napi_del(),
> > > > > > > > > > > > > > > @@ -3591,9 +3837,10 @@ static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > > > > > > > >         }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > > > > > > -               struct virtqueue *vq = vi->rq[i].vq;
> > > > > > > > > > > > > > > -               while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
> > > > > > > > > > > > > > > -                       virtnet_rq_free_unused_buf(vq, buf);
> > > > > > > > > > > > > > > +               struct receive_queue *rq = &vi->rq[i];
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +               while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
> > > > > > > > > > > > > > > +                       virtnet_rq_free_unused_buf(rq->vq, buf);
> > > > > > > > > > > > > > >                 cond_resched();
> > > > > > > > > > > > > > >         }
> > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > @@ -3767,6 +4014,10 @@ static int init_vqs(struct virtnet_info *vi)
> > > > > > > > > > > > > > >         if (ret)
> > > > > > > > > > > > > > >                 goto err_free;
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +       ret = virtnet_rq_merge_map_init(vi);
> > > > > > > > > > > > > > > +       if (ret)
> > > > > > > > > > > > > > > +               goto err_free;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >         cpus_read_lock();
> > > > > > > > > > > > > > >         virtnet_set_affinity(vi);
> > > > > > > > > > > > > > >         cpus_read_unlock();
> > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > 2.32.0.3.g01195cf9f
> > > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > >
>
Xuan Zhuo July 20, 2023, 2:26 a.m. UTC | #26
On Wed, 19 Jul 2023 05:51:50 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Wed, Jul 19, 2023 at 05:38:56PM +0800, Jason Wang wrote:
> > On Wed, Jul 19, 2023 at 4:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Jul 19, 2023 at 11:21:07AM +0800, Xuan Zhuo wrote:
> > > > On Fri, 14 Jul 2023 06:37:10 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > On Wed, Jul 12, 2023 at 04:38:24PM +0800, Xuan Zhuo wrote:
> > > > > > On Wed, 12 Jul 2023 16:37:43 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > On Wed, Jul 12, 2023 at 4:33 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > >
> > > > > > > > On Wed, 12 Jul 2023 15:54:58 +0800, Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > On Tue, 11 Jul 2023 10:58:51 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > > On Tue, Jul 11, 2023 at 10:42 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Tue, 11 Jul 2023 10:36:17 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > > > > On Mon, Jul 10, 2023 at 8:41 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > On Mon, 10 Jul 2023 07:59:03 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > > > > > On Mon, Jul 10, 2023 at 06:18:30PM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > > > > > On Mon, 10 Jul 2023 05:40:21 -0400, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > > > > > > > > > > > > > > On Mon, Jul 10, 2023 at 11:42:37AM +0800, Xuan Zhuo wrote:
> > > > > > > > > > > > > > > > > Currently, the virtio core will perform a dma operation for each
> > > > > > > > > > > > > > > > > operation. Although, the same page may be operated multiple times.
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > The driver does the dma operation and manages the dma address based the
> > > > > > > > > > > > > > > > > feature premapped of virtio core.
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > This way, we can perform only one dma operation for the same page. In
> > > > > > > > > > > > > > > > > the case of mtu 1500, this can reduce a lot of dma operations.
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > Tested on Aliyun g7.4large machine, in the case of a cpu 100%, pps
> > > > > > > > > > > > > > > > > increased from 1893766 to 1901105. An increase of 0.4%.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > what kind of dma was there? an IOMMU? which vendors? in which mode
> > > > > > > > > > > > > > > > of operation?
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Do you mean this:
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > [    0.470816] iommu: Default domain type: Passthrough
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > With passthrough, dma API is just some indirect function calls, they do
> > > > > > > > > > > > > > not affect the performance a lot.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Yes, this benefit is worthless. I seem to have done a meaningless thing. The
> > > > > > > > > > > > > overhead of DMA I observed is indeed not too high.
> > > > > > > > > > > >
> > > > > > > > > > > > Have you measured with iommu=strict?
> > > > > > > > > > >
> > > > > > > > > > > I have not tested this way, our environment is pt, I wonder if strict is a
> > > > > > > > > > > common scenario. I can test it.
> > > > > > > > > >
> > > > > > > > > > It's not a common setup, but it's a way to stress DMA layer to see the overhead.
> > > > > > > > >
> > > > > > > > > kernel command line: intel_iommu=on iommu.strict=1 iommu.passthrough=0
> > > > > > > > >
> > > > > > > > > virtio-net without merge dma 428614.00 pps
> > > > > > > > >
> > > > > > > > > virtio-net with merge dma    742853.00 pps
> > > > > > > >
> > > > > > > >
> > > > > > > > kernel command line: intel_iommu=on iommu.strict=0 iommu.passthrough=0
> > > > > > > >
> > > > > > > > virtio-net without merge dma 775496.00 pps
> > > > > > > >
> > > > > > > > virtio-net with merge dma    1010514.00 pps
> > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > > > Great, let's add those numbers to the changelog.
> > > > > >
> > > > > >
> > > > > > Yes, I will do it in next version.
> > > > > >
> > > > > >
> > > > > > Thanks.
> > > > > >
> > > > >
> > > > > You should also test without iommu but with swiotlb=force
> > > >
> > > >
> > > > For swiotlb, merge DMA has no benefit, because we still need to copy data from
> > > > the swiotlb buffer to the original buffer.
> > > > The benefit of merge DMA is to reduce the number of operations on the IOMMU.
> > > >
> > > > I did some test for this. The result is same.
> > > >
> > > > Thanks.
> > > >
> > >
> > > Did you actually check that it works though?
> > > Looks like with swiotlb you need to synch to trigger a copy
> > > before unmap, and I don't see where it's done in the current
> > > patch.
> >
> > And this is needed for XDP_REDIRECT as well.
> >
> > Thanks
>
> And once you do, you'll do the copy twice so it will
> actually be slower.

Yes, I think so too. But I did not see much of a decline.
It may just be run-to-run fluctuation.

>
> I suspect you need to sync manually then unmap with DMA_ATTR_SKIP_CPU_SYNC.

DMA_ATTR_SKIP_CPU_SYNC is great!!

I will include this in v13.
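
Roughly, the sync-then-unmap path could look like the sketch below; this is
only an illustration under the shared-mapping scheme from this patch (the
helper name virtnet_rq_unmap_sync() is hypothetical), not the actual v13 code:

static void virtnet_rq_unmap_sync(struct receive_queue *rq,
                                  struct virtnet_rq_dma *dma,
                                  void *buf, u32 len)
{
        struct device *dev = virtqueue_dma_dev(rq->vq);

        /* Sync only the portion this buffer occupies; with swiotlb this
         * triggers the copy out of the bounce buffer.
         */
        dma_sync_single_for_cpu(dev, dma->addr + (buf - dma->buf), len,
                                DMA_FROM_DEVICE);

        if (--dma->ref)
                return;

        /* Every buffer carved from this mapping is synced individually as
         * it is consumed, so skip the redundant sync of the whole range on
         * the final unmap of the shared mapping.
         */
        dma_unmap_page_attrs(dev, dma->addr, dma->len, DMA_FROM_DEVICE,
                             DMA_ATTR_SKIP_CPU_SYNC);

        dma->next = rq->dma_free;
        rq->dma_free = dma;
}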

Thanks.


>
> > >
> > >
> > > >
> > > > >
> > > > > But first fix the use of DMA API to actually be correct,
> > > > > otherwise you are cheating by avoiding synchronization.
> > > > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > > Thanks.
> > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Thanks.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Thanks
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Thanks.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Thanks
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Thanks.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Try e.g. bounce buffer. Which is where you will see a problem: your
> > > > > > > > > > > > > > patches won't work.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > This kind of difference is likely in the noise.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > It's really not high, but this is because the proportion of DMA under perf top
> > > > > > > > > > > > > > > is not high. Probably that much.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > So maybe not worth the complexity.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > > > >  drivers/net/virtio_net.c | 283 ++++++++++++++++++++++++++++++++++++---
> > > > > > > > > > > > > > > > >  1 file changed, 267 insertions(+), 16 deletions(-)
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > > > > > > > > > > index 486b5849033d..4de845d35bed 100644
> > > > > > > > > > > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > > > > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > > > > > > > > > > @@ -126,6 +126,27 @@ static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
> > > > > > > > > > > > > > > > >  #define VIRTNET_SQ_STATS_LEN   ARRAY_SIZE(virtnet_sq_stats_desc)
> > > > > > > > > > > > > > > > >  #define VIRTNET_RQ_STATS_LEN   ARRAY_SIZE(virtnet_rq_stats_desc)
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > +/* The bufs on the same page may share this struct. */
> > > > > > > > > > > > > > > > > +struct virtnet_rq_dma {
> > > > > > > > > > > > > > > > > +       struct virtnet_rq_dma *next;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > > > +       u32 len;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       u32 ref;
> > > > > > > > > > > > > > > > > +};
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +/* Record the dma and buf. */
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > I guess I see that. But why?
> > > > > > > > > > > > > > > > And these two comments are the extent of the available
> > > > > > > > > > > > > > > > documentation, that's not enough I feel.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > +struct virtnet_rq_data {
> > > > > > > > > > > > > > > > > +       struct virtnet_rq_data *next;
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Is manually reimplementing a linked list the best
> > > > > > > > > > > > > > > > we can do?
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Yes, we can use llist.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma;
> > > > > > > > > > > > > > > > > +};
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > >  /* Internal representation of a send virtqueue */
> > > > > > > > > > > > > > > > >  struct send_queue {
> > > > > > > > > > > > > > > > >         /* Virtqueue associated with this send _queue */
> > > > > > > > > > > > > > > > > @@ -175,6 +196,13 @@ struct receive_queue {
> > > > > > > > > > > > > > > > >         char name[16];
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > >         struct xdp_rxq_info xdp_rxq;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       struct virtnet_rq_data *data_array;
> > > > > > > > > > > > > > > > > +       struct virtnet_rq_data *data_free;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma_array;
> > > > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma_free;
> > > > > > > > > > > > > > > > > +       struct virtnet_rq_dma *last_dma;
> > > > > > > > > > > > > > > > >  };
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > >  /* This structure can contain rss message with maximum settings for indirection table and keysize
> > > > > > > > > > > > > > > > > @@ -549,6 +577,176 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > > > > > > > > > >         return skb;
> > > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > +static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
> > > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       --dma->ref;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       if (dma->ref)
> > > > > > > > > > > > > > > > > +               return;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > If you don't unmap there is no guarantee valid data will be
> > > > > > > > > > > > > > > > there in the buffer.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > +       dev = virtqueue_dma_dev(rq->vq);
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       dma->next = rq->dma_free;
> > > > > > > > > > > > > > > > > +       rq->dma_free = dma;
> > > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +static void *virtnet_rq_recycle_data(struct receive_queue *rq,
> > > > > > > > > > > > > > > > > +                                    struct virtnet_rq_data *data)
> > > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       buf = data->buf;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       data->next = rq->data_free;
> > > > > > > > > > > > > > > > > +       rq->data_free = data;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       return buf;
> > > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
> > > > > > > > > > > > > > > > > +                                                  void *buf,
> > > > > > > > > > > > > > > > > +                                                  struct virtnet_rq_dma *dma)
> > > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       data = rq->data_free;
> > > > > > > > > > > > > > > > > +       rq->data_free = data->next;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       data->buf = buf;
> > > > > > > > > > > > > > > > > +       data->dma = dma;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       return data;
> > > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
> > > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
> > > > > > > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
> > > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > > +       struct virtnet_rq_data *data;
> > > > > > > > > > > > > > > > > +       void *buf;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       buf = virtqueue_detach_unused_buf(rq->vq);
> > > > > > > > > > > > > > > > > +       if (!buf || !rq->data_array)
> > > > > > > > > > > > > > > > > +               return buf;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       data = buf;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       virtnet_rq_unmap(rq, data->dma);
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       return virtnet_rq_recycle_data(rq, data);
> > > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
> > > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > > +       struct virtnet_rq_dma *dma = rq->last_dma;
> > > > > > > > > > > > > > > > > +       struct device *dev;
> > > > > > > > > > > > > > > > > +       u32 off, map_len;
> > > > > > > > > > > > > > > > > +       dma_addr_t addr;
> > > > > > > > > > > > > > > > > +       void *end;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +       if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
> > > > > > > > > > > > > > > > > +               ++dma->ref;
> > > > > > > > > > > > > > > > > +               addr = dma->addr + (buf - dma->buf);
> > > > > > > > > > > > > > > > > +               goto ok;
> > > > > > > > > > > > > > > > > +       }
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > So this is the meat of the proposed optimization. I guess that
> > > > > > > > > > > > > > > > if the last buffer we allocated happens to be in the same page
> > > > > > > > > > > > > > > > as this one then they can both be mapped for DMA together.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Since we use page_frag, the buffers we allocate are all contiguous.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Why the last one specifically? Whether the next one happens to
> > > > > > > > > > > > > > > > be close depends on luck. If you want to try optimizing this,
> > > > > > > > > > > > > > > > the right thing to do is likely to use a page pool.
> > > > > > > > > > > > > > > > There's actually work upstream on page pool, look it up.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > As we discussed in another thread, page pool will be adopted for XDP first. Let's
> > > > > > > > > > > > > > > make the transition step by step.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Thanks.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > ok so this should wait then?
> > > > > > > > > > > > > >
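A rough illustration of the saving being discussed (back-of-envelope numbers,
assuming the usual 32KB page_frag refill and roughly 2KB per mergeable rx
buffer at mtu 1500, both of which are assumptions rather than figures from the
patch): 32768 / 2048 = 16 consecutive buffers can then share a single
dma_map_page_attrs() call, i.e. about one mapping operation where there used
to be sixteen.  With plain 4KB pages the sharing would drop to roughly two
buffers per mapping.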
Christoph Hellwig July 20, 2023, 6:23 a.m. UTC | #27
Hi Jason,

can you please resend your reply with proper quoting?  I had to give
up after multiple pages of scrolling without finding anything that
you added to the full quote.
Jason Wang July 20, 2023, 7:41 a.m. UTC | #28
On Thu, Jul 20, 2023 at 2:23 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> Hi Jason,
>
> can you please resend your reply with proper quoting?  I had to give
> up after multiple pages of scrolling without finding anything that
> you added to the full quote.

I guess it's this part?

> > > You should also test without iommu but with swiotlb=force
> >
> >
> > For swiotlb, merge DMA has no benefit, because we still need to copy data from
> > the swiotlb buffer to the original buffer.
> > The benefit of merging DMA is to reduce the number of operations on the iommu device.
> >
> > I did some tests for this. The result is the same.
> >
> > Thanks.
> >
>
> Did you actually check that it works though?
> Looks like with swiotlb you need to synch to trigger a copy
> before unmap, and I don't see where it's done in the current
> patch.

And this is needed for XDP_REDIRECT as well.

Thanks
Christoph Hellwig July 20, 2023, 8:21 a.m. UTC | #29
On Thu, Jul 20, 2023 at 03:41:56PM +0800, Jason Wang wrote:
> > Did you actually check that it works though?
> > Looks like with swiotlb you need to synch to trigger a copy
> > before unmap, and I don't see where it's done in the current
> > patch.
> 
> And this is needed for XDP_REDIRECT as well.

DMA always needs proper syncs, be that for swiotlb or for cache
maintenance, yes.
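
A minimal sketch of the kind of sync being asked for here, building on the
virtqueue_dma_dev() helper added earlier in this series (the function name and
placement below are illustrative, not part of the patch): when the shared
mapping stays alive because other buffers still hold a reference, the
completed buffer's range still has to be synced for the CPU before the stack
or an XDP program reads it.

	static void virtnet_rq_sync_for_cpu(struct receive_queue *rq,
					    struct virtnet_rq_dma *dma,
					    void *buf, u32 len)
	{
		struct device *dev = virtqueue_dma_dev(rq->vq);
		dma_addr_t addr = dma->addr + (buf - dma->buf);

		/* copy back from the swiotlb bounce buffer / invalidate
		 * caches for just the range the device wrote, without
		 * tearing down the shared mapping
		 */
		dma_sync_single_for_cpu(dev, addr, len, DMA_FROM_DEVICE);
	}

Something along these lines would have to run on the paths where
virtnet_rq_unmap() decrements dma->ref without it reaching zero.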
diff mbox series

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 486b5849033d..4de845d35bed 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -126,6 +126,27 @@  static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
 #define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
 #define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)
 
+/* The bufs on the same page may share this struct. */
+struct virtnet_rq_dma {
+	struct virtnet_rq_dma *next;
+
+	dma_addr_t addr;
+
+	void *buf;
+	u32 len;
+
+	u32 ref;
+};
+
+/* Record the dma and buf. */
+struct virtnet_rq_data {
+	struct virtnet_rq_data *next;
+
+	void *buf;
+
+	struct virtnet_rq_dma *dma;
+};
+
 /* Internal representation of a send virtqueue */
 struct send_queue {
 	/* Virtqueue associated with this send _queue */
@@ -175,6 +196,13 @@  struct receive_queue {
 	char name[16];
 
 	struct xdp_rxq_info xdp_rxq;
+
+	struct virtnet_rq_data *data_array;
+	struct virtnet_rq_data *data_free;
+
+	struct virtnet_rq_dma *dma_array;
+	struct virtnet_rq_dma *dma_free;
+	struct virtnet_rq_dma *last_dma;
 };
 
 /* This structure can contain rss message with maximum settings for indirection table and keysize
@@ -549,6 +577,176 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	return skb;
 }
 
+static void virtnet_rq_unmap(struct receive_queue *rq, struct virtnet_rq_dma *dma)
+{
+	struct device *dev;
+
+	--dma->ref;
+
+	if (dma->ref)
+		return;
+
+	dev = virtqueue_dma_dev(rq->vq);
+
+	dma_unmap_page(dev, dma->addr, dma->len, DMA_FROM_DEVICE);
+
+	dma->next = rq->dma_free;
+	rq->dma_free = dma;
+}
+
+static void *virtnet_rq_recycle_data(struct receive_queue *rq,
+				     struct virtnet_rq_data *data)
+{
+	void *buf;
+
+	buf = data->buf;
+
+	data->next = rq->data_free;
+	rq->data_free = data;
+
+	return buf;
+}
+
+static struct virtnet_rq_data *virtnet_rq_get_data(struct receive_queue *rq,
+						   void *buf,
+						   struct virtnet_rq_dma *dma)
+{
+	struct virtnet_rq_data *data;
+
+	data = rq->data_free;
+	rq->data_free = data->next;
+
+	data->buf = buf;
+	data->dma = dma;
+
+	return data;
+}
+
+static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
+{
+	struct virtnet_rq_data *data;
+	void *buf;
+
+	buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
+	if (!buf || !rq->data_array)
+		return buf;
+
+	data = buf;
+
+	virtnet_rq_unmap(rq, data->dma);
+
+	return virtnet_rq_recycle_data(rq, data);
+}
+
+static void *virtnet_rq_detach_unused_buf(struct receive_queue *rq)
+{
+	struct virtnet_rq_data *data;
+	void *buf;
+
+	buf = virtqueue_detach_unused_buf(rq->vq);
+	if (!buf || !rq->data_array)
+		return buf;
+
+	data = buf;
+
+	virtnet_rq_unmap(rq, data->dma);
+
+	return virtnet_rq_recycle_data(rq, data);
+}
+
+static int virtnet_rq_map_sg(struct receive_queue *rq, void *buf, u32 len)
+{
+	struct virtnet_rq_dma *dma = rq->last_dma;
+	struct device *dev;
+	u32 off, map_len;
+	dma_addr_t addr;
+	void *end;
+
+	if (likely(dma) && buf >= dma->buf && (buf + len <= dma->buf + dma->len)) {
+		++dma->ref;
+		addr = dma->addr + (buf - dma->buf);
+		goto ok;
+	}
+
+	end = buf + len - 1;
+	off = offset_in_page(end);
+	map_len = len + PAGE_SIZE - off;
+
+	dev = virtqueue_dma_dev(rq->vq);
+
+	addr = dma_map_page_attrs(dev, virt_to_page(buf), offset_in_page(buf),
+				  map_len, DMA_FROM_DEVICE, 0);
+	if (addr == DMA_MAPPING_ERROR)
+		return -ENOMEM;
+
+	dma = rq->dma_free;
+	rq->dma_free = dma->next;
+
+	dma->ref = 1;
+	dma->buf = buf;
+	dma->addr = addr;
+	dma->len = map_len;
+
+	rq->last_dma = dma;
+
+ok:
+	sg_init_table(rq->sg, 1);
+	rq->sg[0].dma_address = addr;
+	rq->sg[0].length = len;
+
+	return 0;
+}
+
+static int virtnet_rq_merge_map_init(struct virtnet_info *vi)
+{
+	struct receive_queue *rq;
+	int i, err, j, num;
+
+	/* disable for big mode */
+	if (!vi->mergeable_rx_bufs && vi->big_packets)
+		return 0;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		err = virtqueue_set_premapped(vi->rq[i].vq);
+		if (err)
+			continue;
+
+		rq = &vi->rq[i];
+
+		num = virtqueue_get_vring_size(rq->vq);
+
+		rq->data_array = kmalloc_array(num, sizeof(*rq->data_array), GFP_KERNEL);
+		if (!rq->data_array)
+			goto err;
+
+		rq->dma_array = kmalloc_array(num, sizeof(*rq->dma_array), GFP_KERNEL);
+		if (!rq->dma_array)
+			goto err;
+
+		for (j = 0; j < num; ++j) {
+			rq->data_array[j].next = rq->data_free;
+			rq->data_free = &rq->data_array[j];
+
+			rq->dma_array[j].next = rq->dma_free;
+			rq->dma_free = &rq->dma_array[j];
+		}
+	}
+
+	return 0;
+
+err:
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		struct receive_queue *rq;
+
+		rq = &vi->rq[i];
+
+		kfree(rq->dma_array);
+		kfree(rq->data_array);
+	}
+
+	return -ENOMEM;
+}
+
 static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
 {
 	unsigned int len;
@@ -835,7 +1033,7 @@  static struct page *xdp_linearize_page(struct receive_queue *rq,
 		void *buf;
 		int off;
 
-		buf = virtqueue_get_buf(rq->vq, &buflen);
+		buf = virtnet_rq_get_buf(rq, &buflen, NULL);
 		if (unlikely(!buf))
 			goto err_buf;
 
@@ -1126,7 +1324,7 @@  static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
 		return -EINVAL;
 
 	while (--*num_buf > 0) {
-		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
+		buf = virtnet_rq_get_buf(rq, &len, &ctx);
 		if (unlikely(!buf)) {
 			pr_debug("%s: rx error: %d buffers out of %d missing\n",
 				 dev->name, *num_buf,
@@ -1351,7 +1549,7 @@  static struct sk_buff *receive_mergeable(struct net_device *dev,
 	while (--num_buf) {
 		int num_skb_frags;
 
-		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
+		buf = virtnet_rq_get_buf(rq, &len, &ctx);
 		if (unlikely(!buf)) {
 			pr_debug("%s: rx error: %d buffers out of %d missing\n",
 				 dev->name, num_buf,
@@ -1414,7 +1612,7 @@  static struct sk_buff *receive_mergeable(struct net_device *dev,
 err_skb:
 	put_page(page);
 	while (num_buf-- > 1) {
-		buf = virtqueue_get_buf(rq->vq, &len);
+		buf = virtnet_rq_get_buf(rq, &len, NULL);
 		if (unlikely(!buf)) {
 			pr_debug("%s: rx error: %d buffers missing\n",
 				 dev->name, num_buf);
@@ -1529,6 +1727,7 @@  static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
 	unsigned int xdp_headroom = virtnet_get_headroom(vi);
 	void *ctx = (void *)(unsigned long)xdp_headroom;
 	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
+	struct virtnet_rq_data *data;
 	int err;
 
 	len = SKB_DATA_ALIGN(len) +
@@ -1539,11 +1738,34 @@  static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
 	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
 	get_page(alloc_frag->page);
 	alloc_frag->offset += len;
-	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
-		    vi->hdr_len + GOOD_PACKET_LEN);
-	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
+
+	if (rq->data_array) {
+		err = virtnet_rq_map_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom,
+					vi->hdr_len + GOOD_PACKET_LEN);
+		if (err)
+			goto map_err;
+
+		data = virtnet_rq_get_data(rq, buf, rq->last_dma);
+	} else {
+		sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
+			    vi->hdr_len + GOOD_PACKET_LEN);
+		data = (void *)buf;
+	}
+
+	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
 	if (err < 0)
-		put_page(virt_to_head_page(buf));
+		goto add_err;
+
+	return err;
+
+add_err:
+	if (rq->data_array) {
+		virtnet_rq_unmap(rq, data->dma);
+		virtnet_rq_recycle_data(rq, data);
+	}
+
+map_err:
+	put_page(virt_to_head_page(buf));
 	return err;
 }
 
@@ -1620,6 +1842,7 @@  static int add_recvbuf_mergeable(struct virtnet_info *vi,
 	unsigned int headroom = virtnet_get_headroom(vi);
 	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
 	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
+	struct virtnet_rq_data *data;
 	char *buf;
 	void *ctx;
 	int err;
@@ -1650,12 +1873,32 @@  static int add_recvbuf_mergeable(struct virtnet_info *vi,
 		alloc_frag->offset += hole;
 	}
 
-	sg_init_one(rq->sg, buf, len);
+	if (rq->data_array) {
+		err = virtnet_rq_map_sg(rq, buf, len);
+		if (err)
+			goto map_err;
+
+		data = virtnet_rq_get_data(rq, buf, rq->last_dma);
+	} else {
+		sg_init_one(rq->sg, buf, len);
+		data = (void *)buf;
+	}
+
 	ctx = mergeable_len_to_ctx(len + room, headroom);
-	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
+	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, data, ctx, gfp);
 	if (err < 0)
-		put_page(virt_to_head_page(buf));
+		goto add_err;
+
+	return 0;
+
+add_err:
+	if (rq->data_array) {
+		virtnet_rq_unmap(rq, data->dma);
+		virtnet_rq_recycle_data(rq, data);
+	}
 
+map_err:
+	put_page(virt_to_head_page(buf));
 	return err;
 }
 
@@ -1775,13 +2018,13 @@  static int virtnet_receive(struct receive_queue *rq, int budget,
 		void *ctx;
 
 		while (stats.packets < budget &&
-		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
+		       (buf = virtnet_rq_get_buf(rq, &len, &ctx))) {
 			receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
 			stats.packets++;
 		}
 	} else {
 		while (stats.packets < budget &&
-		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
+		       (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
 			receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
 			stats.packets++;
 		}
@@ -3514,6 +3757,9 @@  static void virtnet_free_queues(struct virtnet_info *vi)
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		__netif_napi_del(&vi->rq[i].napi);
 		__netif_napi_del(&vi->sq[i].napi);
+
+		kfree(vi->rq[i].data_array);
+		kfree(vi->rq[i].dma_array);
 	}
 
 	/* We called __netif_napi_del(),
@@ -3591,9 +3837,10 @@  static void free_unused_bufs(struct virtnet_info *vi)
 	}
 
 	for (i = 0; i < vi->max_queue_pairs; i++) {
-		struct virtqueue *vq = vi->rq[i].vq;
-		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
-			virtnet_rq_free_unused_buf(vq, buf);
+		struct receive_queue *rq = &vi->rq[i];
+
+		while ((buf = virtnet_rq_detach_unused_buf(rq)) != NULL)
+			virtnet_rq_free_unused_buf(rq->vq, buf);
 		cond_resched();
 	}
 }
@@ -3767,6 +4014,10 @@  static int init_vqs(struct virtnet_info *vi)
 	if (ret)
 		goto err_free;
 
+	ret = virtnet_rq_merge_map_init(vi);
+	if (ret)
+		goto err_free;
+
 	cpus_read_lock();
 	virtnet_set_affinity(vi);
 	cpus_read_unlock();
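
For readers following the series, the rx buffer lifecycle after this patch can
be summarized as follows (a reading aid distilled from the helpers above, not
extra code in the patch):

	/*
	 * refill (add_recvbuf_small / add_recvbuf_mergeable):
	 *   buf  = page_frag allocation
	 *   virtnet_rq_map_sg()        - maps a fresh page range, or just
	 *                                bumps last_dma->ref when buf falls
	 *                                inside the range mapped last time
	 *   data = virtnet_rq_get_data()
	 *   virtqueue_add_inbuf_ctx(..., data, ...)
	 *
	 * completion (virtnet_receive) / teardown (free_unused_bufs):
	 *   data = virtnet_rq_get_buf() or virtnet_rq_detach_unused_buf()
	 *   virtnet_rq_unmap()         - ref--, dma_unmap_page() only once
	 *                                the last buffer in the range is back
	 *   buf  = virtnet_rq_recycle_data()
	 */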