diff mbox series

[1/3] virtio: cache indirect desc for split

Message ID 20211027061913.76276-2-xuanzhuo@linux.alibaba.com (mailing list archive)
State Superseded
Headers show
Series virtio support cache indirect desc | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Xuan Zhuo Oct. 27, 2021, 6:19 a.m. UTC
In the case of using indirect, indirect desc must be allocated and
released each time, which increases a lot of cpu overhead.

Here, a cache is added for indirect. If the number of indirect desc to be
applied for is less than VIRT_QUEUE_CACHE_DESC_NUM, the desc array with
the size of VIRT_QUEUE_CACHE_DESC_NUM is fixed and cached for reuse.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/virtio/virtio.c      |  6 ++++
 drivers/virtio/virtio_ring.c | 63 ++++++++++++++++++++++++++++++------
 include/linux/virtio.h       | 10 ++++++
 3 files changed, 70 insertions(+), 9 deletions(-)

Comments

Michael S. Tsirkin Oct. 27, 2021, 8:55 a.m. UTC | #1
On Wed, Oct 27, 2021 at 02:19:11PM +0800, Xuan Zhuo wrote:
> In the case of using indirect, indirect desc must be allocated and
> released each time, which increases a lot of cpu overhead.
> 
> Here, a cache is added for indirect. If the number of indirect desc to be
> applied for is less than VIRT_QUEUE_CACHE_DESC_NUM, the desc array with
> the size of VIRT_QUEUE_CACHE_DESC_NUM is fixed and cached for reuse.
> 
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
>  drivers/virtio/virtio.c      |  6 ++++
>  drivers/virtio/virtio_ring.c | 63 ++++++++++++++++++++++++++++++------
>  include/linux/virtio.h       | 10 ++++++
>  3 files changed, 70 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> index 0a5b54034d4b..04bcb74e5b9a 100644
> --- a/drivers/virtio/virtio.c
> +++ b/drivers/virtio/virtio.c
> @@ -431,6 +431,12 @@ bool is_virtio_device(struct device *dev)
>  }
>  EXPORT_SYMBOL_GPL(is_virtio_device);
>  
> +void virtio_use_desc_cache(struct virtio_device *dev, bool val)
> +{
> +	dev->desc_cache = val;
> +}
> +EXPORT_SYMBOL_GPL(virtio_use_desc_cache);
> +
>  void unregister_virtio_device(struct virtio_device *dev)
>  {
>  	int index = dev->index; /* save for after device release */
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index dd95dfd85e98..0b9a8544b0e8 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -117,6 +117,10 @@ struct vring_virtqueue {
>  	/* Hint for event idx: already triggered no need to disable. */
>  	bool event_triggered;
>  
> +	/* Is indirect cache used? */
> +	bool use_desc_cache;
> +	void *desc_cache_chain;
> +
>  	union {
>  		/* Available for split ring */
>  		struct {

Let's use llist_head and friends please (I am guessing we want
single-linked to avoid writing into indirect buffer after use,
invalidating the cache, but please document why in a comment).  Do not
open-code it.

Also hide all casts in inline wrappers.


> @@ -423,12 +427,47 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
>  	return extra[i].next;
>  }
>  
> -static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> +#define VIRT_QUEUE_CACHE_DESC_NUM 4

Add an inline wrapper for checking sg versus VIRT_QUEUE_CACHE_DESC_NUM.

> +
> +static void desc_cache_chain_free_split(void *chain)
> +{
> +	struct vring_desc *desc;
> +
> +	while (chain) {
> +		desc = chain;
> +		chain = (void *)desc->addr;
> +		kfree(desc);
> +	}
> +}
> +
> +static void desc_cache_put_split(struct vring_virtqueue *vq,
> +				 struct vring_desc *desc, int n)
> +{
> +	if (vq->use_desc_cache && n <= VIRT_QUEUE_CACHE_DESC_NUM) {
> +		desc->addr = (u64)vq->desc_cache_chain;
> +		vq->desc_cache_chain = desc;
> +	} else {
> +		kfree(desc);
> +	}
> +}


> +
> +static struct vring_desc *alloc_indirect_split(struct vring_virtqueue *vq,
>  					       unsigned int total_sg,
>  					       gfp_t gfp)
>  {
>  	struct vring_desc *desc;
> -	unsigned int i;
> +	unsigned int i, n;
> +
> +	if (vq->use_desc_cache && total_sg <= VIRT_QUEUE_CACHE_DESC_NUM) {
> +		if (vq->desc_cache_chain) {
> +			desc = vq->desc_cache_chain;
> +			vq->desc_cache_chain = (void *)desc->addr;
> +			goto got;
> +		}
> +		n = VIRT_QUEUE_CACHE_DESC_NUM;

Hmm. This will allocate more entries than actually used. Why do it?

> +	} else {
> +		n = total_sg;
> +	}
>  
>  	/*
>  	 * We require lowmem mappings for the descriptors because
> @@ -437,12 +476,13 @@ static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
>  	 */
>  	gfp &= ~__GFP_HIGHMEM;
>  
> -	desc = kmalloc_array(total_sg, sizeof(struct vring_desc), gfp);
> +	desc = kmalloc_array(n, sizeof(struct vring_desc), gfp);
>  	if (!desc)
>  		return NULL;
>  
> +got:
>  	for (i = 0; i < total_sg; i++)
> -		desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
> +		desc[i].next = cpu_to_virtio16(vq->vq.vdev, i + 1);
>  	return desc;
>  }
>  
> @@ -508,7 +548,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
>  	head = vq->free_head;
>  
>  	if (virtqueue_use_indirect(_vq, total_sg))
> -		desc = alloc_indirect_split(_vq, total_sg, gfp);
> +		desc = alloc_indirect_split(vq, total_sg, gfp);
>  	else {
>  		desc = NULL;
>  		WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect);
> @@ -652,7 +692,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
>  	}
>  
>  	if (indirect)
> -		kfree(desc);
> +		desc_cache_put_split(vq, desc, total_sg);
>  
>  	END_USE(vq);
>  	return -ENOMEM;
> @@ -717,7 +757,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
>  	if (vq->indirect) {
>  		struct vring_desc *indir_desc =
>  				vq->split.desc_state[head].indir_desc;
> -		u32 len;
> +		u32 len, n;
>  
>  		/* Free the indirect table, if any, now that it's unmapped. */
>  		if (!indir_desc)
> @@ -729,10 +769,12 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
>  				VRING_DESC_F_INDIRECT));
>  		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
>  
> -		for (j = 0; j < len / sizeof(struct vring_desc); j++)
> +		n = len / sizeof(struct vring_desc);
> +
> +		for (j = 0; j < n; j++)
>  			vring_unmap_one_split_indirect(vq, &indir_desc[j]);
>  
> -		kfree(indir_desc);
> +		desc_cache_put_split(vq, indir_desc, n);
>  		vq->split.desc_state[head].indir_desc = NULL;
>  	} else if (ctx) {
>  		*ctx = vq->split.desc_state[head].indir_desc;
> @@ -2199,6 +2241,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>  	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
>  		!context;
>  	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
> +	vq->desc_cache_chain = NULL;
> +	vq->use_desc_cache = vdev->desc_cache;
>  
>  	if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
>  		vq->weak_barriers = false;
> @@ -2329,6 +2373,7 @@ void vring_del_virtqueue(struct virtqueue *_vq)
>  	if (!vq->packed_ring) {
>  		kfree(vq->split.desc_state);
>  		kfree(vq->split.desc_extra);
> +		desc_cache_chain_free_split(vq->desc_cache_chain);
>  	}
>  	kfree(vq);
>  }
> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> index 41edbc01ffa4..d84b7b8f4070 100644
> --- a/include/linux/virtio.h
> +++ b/include/linux/virtio.h
> @@ -109,6 +109,7 @@ struct virtio_device {
>  	bool failed;
>  	bool config_enabled;
>  	bool config_change_pending;
> +	bool desc_cache;
>  	spinlock_t config_lock;
>  	spinlock_t vqs_list_lock; /* Protects VQs list access */
>  	struct device dev;
> @@ -130,6 +131,15 @@ int register_virtio_device(struct virtio_device *dev);
>  void unregister_virtio_device(struct virtio_device *dev);
>  bool is_virtio_device(struct device *dev);
>  
> +/**
> + * virtio_use_desc_cache - virtio ring use desc cache
> + *
> + * virtio will cache the allocated indirect desc.
> + *
> + * This function must be called before find_vqs.
> + */
> +void virtio_use_desc_cache(struct virtio_device *dev, bool val);
> +
>  void virtio_break_device(struct virtio_device *dev);
>  
>  void virtio_config_changed(struct virtio_device *dev);
> -- 
> 2.31.0
Dongli Zhang Oct. 27, 2021, 4:33 p.m. UTC | #2
On 10/26/21 11:19 PM, Xuan Zhuo wrote:
> In the case of using indirect, indirect desc must be allocated and
> released each time, which increases a lot of cpu overhead.
> 
> Here, a cache is added for indirect. If the number of indirect desc to be
> applied for is less than VIRT_QUEUE_CACHE_DESC_NUM, the desc array with
> the size of VIRT_QUEUE_CACHE_DESC_NUM is fixed and cached for reuse.
> 
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
>  drivers/virtio/virtio.c      |  6 ++++
>  drivers/virtio/virtio_ring.c | 63 ++++++++++++++++++++++++++++++------
>  include/linux/virtio.h       | 10 ++++++
>  3 files changed, 70 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> index 0a5b54034d4b..04bcb74e5b9a 100644
> --- a/drivers/virtio/virtio.c
> +++ b/drivers/virtio/virtio.c
> @@ -431,6 +431,12 @@ bool is_virtio_device(struct device *dev)
>  }
>  EXPORT_SYMBOL_GPL(is_virtio_device);
>  
> +void virtio_use_desc_cache(struct virtio_device *dev, bool val)
> +{
> +	dev->desc_cache = val;
> +}
> +EXPORT_SYMBOL_GPL(virtio_use_desc_cache);
> +
>  void unregister_virtio_device(struct virtio_device *dev)
>  {
>  	int index = dev->index; /* save for after device release */
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index dd95dfd85e98..0b9a8544b0e8 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -117,6 +117,10 @@ struct vring_virtqueue {
>  	/* Hint for event idx: already triggered no need to disable. */
>  	bool event_triggered;
>  
> +	/* Is indirect cache used? */
> +	bool use_desc_cache;
> +	void *desc_cache_chain;
> +
>  	union {
>  		/* Available for split ring */
>  		struct {
> @@ -423,12 +427,47 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
>  	return extra[i].next;
>  }
>  
> -static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> +#define VIRT_QUEUE_CACHE_DESC_NUM 4
> +
> +static void desc_cache_chain_free_split(void *chain)
> +{
> +	struct vring_desc *desc;
> +
> +	while (chain) {
> +		desc = chain;
> +		chain = (void *)desc->addr;
> +		kfree(desc);
> +	}
> +}
> +
> +static void desc_cache_put_split(struct vring_virtqueue *vq,
> +				 struct vring_desc *desc, int n)
> +{
> +	if (vq->use_desc_cache && n <= VIRT_QUEUE_CACHE_DESC_NUM) {
> +		desc->addr = (u64)vq->desc_cache_chain;
> +		vq->desc_cache_chain = desc;
> +	} else {
> +		kfree(desc);
> +	}
> +}
> +
> +static struct vring_desc *alloc_indirect_split(struct vring_virtqueue *vq,
>  					       unsigned int total_sg,
>  					       gfp_t gfp)
>  {
>  	struct vring_desc *desc;
> -	unsigned int i;
> +	unsigned int i, n;
> +
> +	if (vq->use_desc_cache && total_sg <= VIRT_QUEUE_CACHE_DESC_NUM) {
> +		if (vq->desc_cache_chain) {
> +			desc = vq->desc_cache_chain;
> +			vq->desc_cache_chain = (void *)desc->addr;
> +			goto got;
> +		}
> +		n = VIRT_QUEUE_CACHE_DESC_NUM;

How about to make the VIRT_QUEUE_CACHE_DESC_NUM configurable (at least during
driver probing) unless there is a reason that the default value is 4.

Thank you very much!

Dongli Zhang



> +	} else {
> +		n = total_sg;
> +	}
>  
>  	/*
>  	 * We require lowmem mappings for the descriptors because
> @@ -437,12 +476,13 @@ static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
>  	 */
>  	gfp &= ~__GFP_HIGHMEM;
>  
> -	desc = kmalloc_array(total_sg, sizeof(struct vring_desc), gfp);
> +	desc = kmalloc_array(n, sizeof(struct vring_desc), gfp);
>  	if (!desc)
>  		return NULL;
>  
> +got:
>  	for (i = 0; i < total_sg; i++)
> -		desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
> +		desc[i].next = cpu_to_virtio16(vq->vq.vdev, i + 1);
>  	return desc;
>  }
>  
> @@ -508,7 +548,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
>  	head = vq->free_head;
>  
>  	if (virtqueue_use_indirect(_vq, total_sg))
> -		desc = alloc_indirect_split(_vq, total_sg, gfp);
> +		desc = alloc_indirect_split(vq, total_sg, gfp);
>  	else {
>  		desc = NULL;
>  		WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect);
> @@ -652,7 +692,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
>  	}
>  
>  	if (indirect)
> -		kfree(desc);
> +		desc_cache_put_split(vq, desc, total_sg);
>  
>  	END_USE(vq);
>  	return -ENOMEM;
> @@ -717,7 +757,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
>  	if (vq->indirect) {
>  		struct vring_desc *indir_desc =
>  				vq->split.desc_state[head].indir_desc;
> -		u32 len;
> +		u32 len, n;
>  
>  		/* Free the indirect table, if any, now that it's unmapped. */
>  		if (!indir_desc)
> @@ -729,10 +769,12 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
>  				VRING_DESC_F_INDIRECT));
>  		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
>  
> -		for (j = 0; j < len / sizeof(struct vring_desc); j++)
> +		n = len / sizeof(struct vring_desc);
> +
> +		for (j = 0; j < n; j++)
>  			vring_unmap_one_split_indirect(vq, &indir_desc[j]);
>  
> -		kfree(indir_desc);
> +		desc_cache_put_split(vq, indir_desc, n);
>  		vq->split.desc_state[head].indir_desc = NULL;
>  	} else if (ctx) {
>  		*ctx = vq->split.desc_state[head].indir_desc;
> @@ -2199,6 +2241,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>  	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
>  		!context;
>  	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
> +	vq->desc_cache_chain = NULL;
> +	vq->use_desc_cache = vdev->desc_cache;
>  
>  	if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
>  		vq->weak_barriers = false;
> @@ -2329,6 +2373,7 @@ void vring_del_virtqueue(struct virtqueue *_vq)
>  	if (!vq->packed_ring) {
>  		kfree(vq->split.desc_state);
>  		kfree(vq->split.desc_extra);
> +		desc_cache_chain_free_split(vq->desc_cache_chain);
>  	}
>  	kfree(vq);
>  }
> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> index 41edbc01ffa4..d84b7b8f4070 100644
> --- a/include/linux/virtio.h
> +++ b/include/linux/virtio.h
> @@ -109,6 +109,7 @@ struct virtio_device {
>  	bool failed;
>  	bool config_enabled;
>  	bool config_change_pending;
> +	bool desc_cache;
>  	spinlock_t config_lock;
>  	spinlock_t vqs_list_lock; /* Protects VQs list access */
>  	struct device dev;
> @@ -130,6 +131,15 @@ int register_virtio_device(struct virtio_device *dev);
>  void unregister_virtio_device(struct virtio_device *dev);
>  bool is_virtio_device(struct device *dev);
>  
> +/**
> + * virtio_use_desc_cache - virtio ring use desc cache
> + *
> + * virtio will cache the allocated indirect desc.
> + *
> + * This function must be called before find_vqs.
> + */
> +void virtio_use_desc_cache(struct virtio_device *dev, bool val);
> +
>  void virtio_break_device(struct virtio_device *dev);
>  
>  void virtio_config_changed(struct virtio_device *dev);
>
Michael S. Tsirkin Oct. 27, 2021, 5:07 p.m. UTC | #3
On Wed, Oct 27, 2021 at 02:19:11PM +0800, Xuan Zhuo wrote:
> In the case of using indirect, indirect desc must be allocated and
> released each time, which increases a lot of cpu overhead.
> 
> Here, a cache is added for indirect. If the number of indirect desc to be
> applied for is less than VIRT_QUEUE_CACHE_DESC_NUM, the desc array with
> the size of VIRT_QUEUE_CACHE_DESC_NUM is fixed and cached for reuse.
> 
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
>  drivers/virtio/virtio.c      |  6 ++++
>  drivers/virtio/virtio_ring.c | 63 ++++++++++++++++++++++++++++++------
>  include/linux/virtio.h       | 10 ++++++
>  3 files changed, 70 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> index 0a5b54034d4b..04bcb74e5b9a 100644
> --- a/drivers/virtio/virtio.c
> +++ b/drivers/virtio/virtio.c
> @@ -431,6 +431,12 @@ bool is_virtio_device(struct device *dev)
>  }
>  EXPORT_SYMBOL_GPL(is_virtio_device);
>  
> +void virtio_use_desc_cache(struct virtio_device *dev, bool val)
> +{
> +	dev->desc_cache = val;
> +}
> +EXPORT_SYMBOL_GPL(virtio_use_desc_cache);
> +
>  void unregister_virtio_device(struct virtio_device *dev)
>  {
>  	int index = dev->index; /* save for after device release */
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index dd95dfd85e98..0b9a8544b0e8 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -117,6 +117,10 @@ struct vring_virtqueue {
>  	/* Hint for event idx: already triggered no need to disable. */
>  	bool event_triggered;
>  
> +	/* Is indirect cache used? */
> +	bool use_desc_cache;
> +	void *desc_cache_chain;
> +
>  	union {
>  		/* Available for split ring */
>  		struct {
> @@ -423,12 +427,47 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
>  	return extra[i].next;
>  }
>  
> -static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> +#define VIRT_QUEUE_CACHE_DESC_NUM 4
> +
> +static void desc_cache_chain_free_split(void *chain)
> +{
> +	struct vring_desc *desc;
> +
> +	while (chain) {
> +		desc = chain;
> +		chain = (void *)desc->addr;
> +		kfree(desc);
> +	}
> +}
> +
> +static void desc_cache_put_split(struct vring_virtqueue *vq,
> +				 struct vring_desc *desc, int n)
> +{
> +	if (vq->use_desc_cache && n <= VIRT_QUEUE_CACHE_DESC_NUM) {
> +		desc->addr = (u64)vq->desc_cache_chain;
> +		vq->desc_cache_chain = desc;
> +	} else {
> +		kfree(desc);
> +	}
> +}
> +


So I have a question here. What happens if we just do:

if (n <= VIRT_QUEUE_CACHE_DESC_NUM) {
	return kmem_cache_alloc(VIRT_QUEUE_CACHE_DESC_NUM * sizeof desc, gfp)
} else {
	return kmalloc_arrat(n, sizeof desc, gfp)
}

A small change and won't we reap most performance benefits?
Michael S. Tsirkin Oct. 27, 2021, 7:36 p.m. UTC | #4
On Wed, Oct 27, 2021 at 09:33:46AM -0700, Dongli Zhang wrote:
> 
> 
> On 10/26/21 11:19 PM, Xuan Zhuo wrote:
> > In the case of using indirect, indirect desc must be allocated and
> > released each time, which increases a lot of cpu overhead.
> > 
> > Here, a cache is added for indirect. If the number of indirect desc to be
> > applied for is less than VIRT_QUEUE_CACHE_DESC_NUM, the desc array with
> > the size of VIRT_QUEUE_CACHE_DESC_NUM is fixed and cached for reuse.
> > 
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> >  drivers/virtio/virtio.c      |  6 ++++
> >  drivers/virtio/virtio_ring.c | 63 ++++++++++++++++++++++++++++++------
> >  include/linux/virtio.h       | 10 ++++++
> >  3 files changed, 70 insertions(+), 9 deletions(-)
> > 
> > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > index 0a5b54034d4b..04bcb74e5b9a 100644
> > --- a/drivers/virtio/virtio.c
> > +++ b/drivers/virtio/virtio.c
> > @@ -431,6 +431,12 @@ bool is_virtio_device(struct device *dev)
> >  }
> >  EXPORT_SYMBOL_GPL(is_virtio_device);
> >  
> > +void virtio_use_desc_cache(struct virtio_device *dev, bool val)
> > +{
> > +	dev->desc_cache = val;
> > +}
> > +EXPORT_SYMBOL_GPL(virtio_use_desc_cache);
> > +
> >  void unregister_virtio_device(struct virtio_device *dev)
> >  {
> >  	int index = dev->index; /* save for after device release */
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index dd95dfd85e98..0b9a8544b0e8 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -117,6 +117,10 @@ struct vring_virtqueue {
> >  	/* Hint for event idx: already triggered no need to disable. */
> >  	bool event_triggered;
> >  
> > +	/* Is indirect cache used? */
> > +	bool use_desc_cache;
> > +	void *desc_cache_chain;
> > +
> >  	union {
> >  		/* Available for split ring */
> >  		struct {
> > @@ -423,12 +427,47 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
> >  	return extra[i].next;
> >  }
> >  
> > -static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> > +#define VIRT_QUEUE_CACHE_DESC_NUM 4
> > +
> > +static void desc_cache_chain_free_split(void *chain)
> > +{
> > +	struct vring_desc *desc;
> > +
> > +	while (chain) {
> > +		desc = chain;
> > +		chain = (void *)desc->addr;
> > +		kfree(desc);
> > +	}
> > +}
> > +
> > +static void desc_cache_put_split(struct vring_virtqueue *vq,
> > +				 struct vring_desc *desc, int n)
> > +{
> > +	if (vq->use_desc_cache && n <= VIRT_QUEUE_CACHE_DESC_NUM) {
> > +		desc->addr = (u64)vq->desc_cache_chain;
> > +		vq->desc_cache_chain = desc;
> > +	} else {
> > +		kfree(desc);
> > +	}
> > +}
> > +
> > +static struct vring_desc *alloc_indirect_split(struct vring_virtqueue *vq,
> >  					       unsigned int total_sg,
> >  					       gfp_t gfp)
> >  {
> >  	struct vring_desc *desc;
> > -	unsigned int i;
> > +	unsigned int i, n;
> > +
> > +	if (vq->use_desc_cache && total_sg <= VIRT_QUEUE_CACHE_DESC_NUM) {
> > +		if (vq->desc_cache_chain) {
> > +			desc = vq->desc_cache_chain;
> > +			vq->desc_cache_chain = (void *)desc->addr;
> > +			goto got;
> > +		}
> > +		n = VIRT_QUEUE_CACHE_DESC_NUM;
> 
> How about to make the VIRT_QUEUE_CACHE_DESC_NUM configurable (at least during
> driver probing) unless there is a reason that the default value is 4.
> 
> Thank you very much!
> 
> Dongli Zhang


I would start with some experimentation showing that it actually makes a
difference in performance.

> 
> 
> > +	} else {
> > +		n = total_sg;
> > +	}
> >  
> >  	/*
> >  	 * We require lowmem mappings for the descriptors because
> > @@ -437,12 +476,13 @@ static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> >  	 */
> >  	gfp &= ~__GFP_HIGHMEM;
> >  
> > -	desc = kmalloc_array(total_sg, sizeof(struct vring_desc), gfp);
> > +	desc = kmalloc_array(n, sizeof(struct vring_desc), gfp);
> >  	if (!desc)
> >  		return NULL;
> >  
> > +got:
> >  	for (i = 0; i < total_sg; i++)
> > -		desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
> > +		desc[i].next = cpu_to_virtio16(vq->vq.vdev, i + 1);
> >  	return desc;
> >  }
> >  
> > @@ -508,7 +548,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> >  	head = vq->free_head;
> >  
> >  	if (virtqueue_use_indirect(_vq, total_sg))
> > -		desc = alloc_indirect_split(_vq, total_sg, gfp);
> > +		desc = alloc_indirect_split(vq, total_sg, gfp);
> >  	else {
> >  		desc = NULL;
> >  		WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect);
> > @@ -652,7 +692,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
> >  	}
> >  
> >  	if (indirect)
> > -		kfree(desc);
> > +		desc_cache_put_split(vq, desc, total_sg);
> >  
> >  	END_USE(vq);
> >  	return -ENOMEM;
> > @@ -717,7 +757,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
> >  	if (vq->indirect) {
> >  		struct vring_desc *indir_desc =
> >  				vq->split.desc_state[head].indir_desc;
> > -		u32 len;
> > +		u32 len, n;
> >  
> >  		/* Free the indirect table, if any, now that it's unmapped. */
> >  		if (!indir_desc)
> > @@ -729,10 +769,12 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
> >  				VRING_DESC_F_INDIRECT));
> >  		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
> >  
> > -		for (j = 0; j < len / sizeof(struct vring_desc); j++)
> > +		n = len / sizeof(struct vring_desc);
> > +
> > +		for (j = 0; j < n; j++)
> >  			vring_unmap_one_split_indirect(vq, &indir_desc[j]);
> >  
> > -		kfree(indir_desc);
> > +		desc_cache_put_split(vq, indir_desc, n);
> >  		vq->split.desc_state[head].indir_desc = NULL;
> >  	} else if (ctx) {
> >  		*ctx = vq->split.desc_state[head].indir_desc;
> > @@ -2199,6 +2241,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> >  	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> >  		!context;
> >  	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
> > +	vq->desc_cache_chain = NULL;
> > +	vq->use_desc_cache = vdev->desc_cache;
> >  
> >  	if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
> >  		vq->weak_barriers = false;
> > @@ -2329,6 +2373,7 @@ void vring_del_virtqueue(struct virtqueue *_vq)
> >  	if (!vq->packed_ring) {
> >  		kfree(vq->split.desc_state);
> >  		kfree(vq->split.desc_extra);
> > +		desc_cache_chain_free_split(vq->desc_cache_chain);
> >  	}
> >  	kfree(vq);
> >  }
> > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > index 41edbc01ffa4..d84b7b8f4070 100644
> > --- a/include/linux/virtio.h
> > +++ b/include/linux/virtio.h
> > @@ -109,6 +109,7 @@ struct virtio_device {
> >  	bool failed;
> >  	bool config_enabled;
> >  	bool config_change_pending;
> > +	bool desc_cache;
> >  	spinlock_t config_lock;
> >  	spinlock_t vqs_list_lock; /* Protects VQs list access */
> >  	struct device dev;
> > @@ -130,6 +131,15 @@ int register_virtio_device(struct virtio_device *dev);
> >  void unregister_virtio_device(struct virtio_device *dev);
> >  bool is_virtio_device(struct device *dev);
> >  
> > +/**
> > + * virtio_use_desc_cache - virtio ring use desc cache
> > + *
> > + * virtio will cache the allocated indirect desc.
> > + *
> > + * This function must be called before find_vqs.
> > + */
> > +void virtio_use_desc_cache(struct virtio_device *dev, bool val);
> > +
> >  void virtio_break_device(struct virtio_device *dev);
> >  
> >  void virtio_config_changed(struct virtio_device *dev);
> >
kernel test robot Oct. 27, 2021, 11:02 p.m. UTC | #5
Hi Xuan,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on horms-ipvs/master]
[also build test WARNING on linus/master v5.15-rc7 next-20211027]
[cannot apply to mst-vhost/linux-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Xuan-Zhuo/virtio-support-cache-indirect-desc/20211027-142025
base:   https://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs.git master
config: arm-defconfig (attached as .config)
compiler: arm-linux-gnueabi-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/8935e116c155fb7d484bad35a42b2ca98f75e384
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Xuan-Zhuo/virtio-support-cache-indirect-desc/20211027-142025
        git checkout 8935e116c155fb7d484bad35a42b2ca98f75e384
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross ARCH=arm 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   drivers/virtio/virtio_ring.c: In function 'desc_cache_chain_free_split':
>> drivers/virtio/virtio_ring.c:438:25: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
     438 |                 chain = (void *)desc->addr;
         |                         ^
   drivers/virtio/virtio_ring.c: In function 'desc_cache_put_split':
>> drivers/virtio/virtio_ring.c:447:30: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
     447 |                 desc->addr = (u64)vq->desc_cache_chain;
         |                              ^
   drivers/virtio/virtio_ring.c: In function 'alloc_indirect_split':
   drivers/virtio/virtio_ring.c:464:48: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
     464 |                         vq->desc_cache_chain = (void *)desc->addr;
         |                                                ^


vim +438 drivers/virtio/virtio_ring.c

   431	
   432	static void desc_cache_chain_free_split(void *chain)
   433	{
   434		struct vring_desc *desc;
   435	
   436		while (chain) {
   437			desc = chain;
 > 438			chain = (void *)desc->addr;
   439			kfree(desc);
   440		}
   441	}
   442	
   443	static void desc_cache_put_split(struct vring_virtqueue *vq,
   444					 struct vring_desc *desc, int n)
   445	{
   446		if (vq->use_desc_cache && n <= VIRT_QUEUE_CACHE_DESC_NUM) {
 > 447			desc->addr = (u64)vq->desc_cache_chain;
   448			vq->desc_cache_chain = desc;
   449		} else {
   450			kfree(desc);
   451		}
   452	}
   453	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot Oct. 28, 2021, 12:57 a.m. UTC | #6
Hi Xuan,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on horms-ipvs/master]
[also build test WARNING on linus/master v5.15-rc7 next-20211027]
[cannot apply to mst-vhost/linux-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Xuan-Zhuo/virtio-support-cache-indirect-desc/20211027-142025
base:   https://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs.git master
config: i386-randconfig-s002-20211027 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.4-dirty
        # https://github.com/0day-ci/linux/commit/8935e116c155fb7d484bad35a42b2ca98f75e384
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Xuan-Zhuo/virtio-support-cache-indirect-desc/20211027-142025
        git checkout 8935e116c155fb7d484bad35a42b2ca98f75e384
        # save the attached .config to linux build tree
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=i386 SHELL=/bin/bash drivers/virtio/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>


sparse warnings: (new ones prefixed by >>)
>> drivers/virtio/virtio_ring.c:438:26: sparse: sparse: cast from restricted __virtio64
>> drivers/virtio/virtio_ring.c:447:28: sparse: sparse: incorrect type in assignment (different base types) @@     expected restricted __virtio64 [usertype] addr @@     got unsigned long long [usertype] @@
   drivers/virtio/virtio_ring.c:447:28: sparse:     expected restricted __virtio64 [usertype] addr
   drivers/virtio/virtio_ring.c:447:28: sparse:     got unsigned long long [usertype]
   drivers/virtio/virtio_ring.c:464:49: sparse: sparse: cast from restricted __virtio64

vim +438 drivers/virtio/virtio_ring.c

   431	
   432	static void desc_cache_chain_free_split(void *chain)
   433	{
   434		struct vring_desc *desc;
   435	
   436		while (chain) {
   437			desc = chain;
 > 438			chain = (void *)desc->addr;
   439			kfree(desc);
   440		}
   441	}
   442	
   443	static void desc_cache_put_split(struct vring_virtqueue *vq,
   444					 struct vring_desc *desc, int n)
   445	{
   446		if (vq->use_desc_cache && n <= VIRT_QUEUE_CACHE_DESC_NUM) {
 > 447			desc->addr = (u64)vq->desc_cache_chain;
   448			vq->desc_cache_chain = desc;
   449		} else {
   450			kfree(desc);
   451		}
   452	}
   453	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Jason Wang Oct. 28, 2021, 2:16 a.m. UTC | #7
On Thu, Oct 28, 2021 at 1:07 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Oct 27, 2021 at 02:19:11PM +0800, Xuan Zhuo wrote:
> > In the case of using indirect, indirect desc must be allocated and
> > released each time, which increases a lot of cpu overhead.
> >
> > Here, a cache is added for indirect. If the number of indirect desc to be
> > applied for is less than VIRT_QUEUE_CACHE_DESC_NUM, the desc array with
> > the size of VIRT_QUEUE_CACHE_DESC_NUM is fixed and cached for reuse.
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > ---
> >  drivers/virtio/virtio.c      |  6 ++++
> >  drivers/virtio/virtio_ring.c | 63 ++++++++++++++++++++++++++++++------
> >  include/linux/virtio.h       | 10 ++++++
> >  3 files changed, 70 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > index 0a5b54034d4b..04bcb74e5b9a 100644
> > --- a/drivers/virtio/virtio.c
> > +++ b/drivers/virtio/virtio.c
> > @@ -431,6 +431,12 @@ bool is_virtio_device(struct device *dev)
> >  }
> >  EXPORT_SYMBOL_GPL(is_virtio_device);
> >
> > +void virtio_use_desc_cache(struct virtio_device *dev, bool val)
> > +{
> > +     dev->desc_cache = val;
> > +}
> > +EXPORT_SYMBOL_GPL(virtio_use_desc_cache);
> > +
> >  void unregister_virtio_device(struct virtio_device *dev)
> >  {
> >       int index = dev->index; /* save for after device release */
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index dd95dfd85e98..0b9a8544b0e8 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -117,6 +117,10 @@ struct vring_virtqueue {
> >       /* Hint for event idx: already triggered no need to disable. */
> >       bool event_triggered;
> >
> > +     /* Is indirect cache used? */
> > +     bool use_desc_cache;
> > +     void *desc_cache_chain;
> > +
> >       union {
> >               /* Available for split ring */
> >               struct {
> > @@ -423,12 +427,47 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
> >       return extra[i].next;
> >  }
> >
> > -static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> > +#define VIRT_QUEUE_CACHE_DESC_NUM 4
> > +
> > +static void desc_cache_chain_free_split(void *chain)
> > +{
> > +     struct vring_desc *desc;
> > +
> > +     while (chain) {
> > +             desc = chain;
> > +             chain = (void *)desc->addr;
> > +             kfree(desc);
> > +     }
> > +}
> > +
> > +static void desc_cache_put_split(struct vring_virtqueue *vq,
> > +                              struct vring_desc *desc, int n)
> > +{
> > +     if (vq->use_desc_cache && n <= VIRT_QUEUE_CACHE_DESC_NUM) {
> > +             desc->addr = (u64)vq->desc_cache_chain;
> > +             vq->desc_cache_chain = desc;
> > +     } else {
> > +             kfree(desc);
> > +     }
> > +}
> > +
>
>
> So I have a question here. What happens if we just do:
>
> if (n <= VIRT_QUEUE_CACHE_DESC_NUM) {
>         return kmem_cache_alloc(VIRT_QUEUE_CACHE_DESC_NUM * sizeof desc, gfp)
> } else {
>         return kmalloc_arrat(n, sizeof desc, gfp)
> }
>
> A small change and won't we reap most performance benefits?

Yes, I think we need a benchmark to use private cache to see how much
it can help.

Thanks

>
> --
> MST
>
Michael S. Tsirkin Oct. 31, 2021, 2:47 p.m. UTC | #8
On Thu, Oct 28, 2021 at 02:16:03PM +0800, Xuan Zhuo wrote:
> On Thu, 28 Oct 2021 10:16:10 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > On Thu, Oct 28, 2021 at 1:07 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Oct 27, 2021 at 02:19:11PM +0800, Xuan Zhuo wrote:
> > > > In the case of using indirect, indirect desc must be allocated and
> > > > released each time, which increases a lot of cpu overhead.
> > > >
> > > > Here, a cache is added for indirect. If the number of indirect desc to be
> > > > applied for is less than VIRT_QUEUE_CACHE_DESC_NUM, the desc array with
> > > > the size of VIRT_QUEUE_CACHE_DESC_NUM is fixed and cached for reuse.
> > > >
> > > > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > > ---
> > > >  drivers/virtio/virtio.c      |  6 ++++
> > > >  drivers/virtio/virtio_ring.c | 63 ++++++++++++++++++++++++++++++------
> > > >  include/linux/virtio.h       | 10 ++++++
> > > >  3 files changed, 70 insertions(+), 9 deletions(-)
> > > >
> > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > index 0a5b54034d4b..04bcb74e5b9a 100644
> > > > --- a/drivers/virtio/virtio.c
> > > > +++ b/drivers/virtio/virtio.c
> > > > @@ -431,6 +431,12 @@ bool is_virtio_device(struct device *dev)
> > > >  }
> > > >  EXPORT_SYMBOL_GPL(is_virtio_device);
> > > >
> > > > +void virtio_use_desc_cache(struct virtio_device *dev, bool val)
> > > > +{
> > > > +     dev->desc_cache = val;
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(virtio_use_desc_cache);
> > > > +
> > > >  void unregister_virtio_device(struct virtio_device *dev)
> > > >  {
> > > >       int index = dev->index; /* save for after device release */
> > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > index dd95dfd85e98..0b9a8544b0e8 100644
> > > > --- a/drivers/virtio/virtio_ring.c
> > > > +++ b/drivers/virtio/virtio_ring.c
> > > > @@ -117,6 +117,10 @@ struct vring_virtqueue {
> > > >       /* Hint for event idx: already triggered no need to disable. */
> > > >       bool event_triggered;
> > > >
> > > > +     /* Is indirect cache used? */
> > > > +     bool use_desc_cache;
> > > > +     void *desc_cache_chain;
> > > > +
> > > >       union {
> > > >               /* Available for split ring */
> > > >               struct {
> > > > @@ -423,12 +427,47 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
> > > >       return extra[i].next;
> > > >  }
> > > >
> > > > -static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> > > > +#define VIRT_QUEUE_CACHE_DESC_NUM 4
> > > > +
> > > > +static void desc_cache_chain_free_split(void *chain)
> > > > +{
> > > > +     struct vring_desc *desc;
> > > > +
> > > > +     while (chain) {
> > > > +             desc = chain;
> > > > +             chain = (void *)desc->addr;
> > > > +             kfree(desc);
> > > > +     }
> > > > +}
> > > > +
> > > > +static void desc_cache_put_split(struct vring_virtqueue *vq,
> > > > +                              struct vring_desc *desc, int n)
> > > > +{
> > > > +     if (vq->use_desc_cache && n <= VIRT_QUEUE_CACHE_DESC_NUM) {
> > > > +             desc->addr = (u64)vq->desc_cache_chain;
> > > > +             vq->desc_cache_chain = desc;
> > > > +     } else {
> > > > +             kfree(desc);
> > > > +     }
> > > > +}
> > > > +
> > >
> > >
> > > So I have a question here. What happens if we just do:
> > >
> > > if (n <= VIRT_QUEUE_CACHE_DESC_NUM) {
> > >         return kmem_cache_alloc(VIRT_QUEUE_CACHE_DESC_NUM * sizeof desc, gfp)
> > > } else {
> > >         return kmalloc_arrat(n, sizeof desc, gfp)
> > > }
> > >
> > > A small change and won't we reap most performance benefits?
> >
> > Yes, I think we need a benchmark to use private cache to see how much
> > it can help.
> 
> I did a test, the code is as follows:
> 
> +static void desc_cache_put_packed(struct vring_virtqueue *vq,
> +                                  struct vring_packed_desc *desc, int n)
> + {
> +       if (n <= VIRT_QUEUE_CACHE_DESC_NUM) {
> +               kmem_cache_free(vq->desc_cache, desc);
> +       } else {
> +               kfree(desc);
> +       }
> 
> 
> @@ -476,11 +452,14 @@ static struct vring_desc *alloc_indirect_split(struct vring_virtqueue *vq,
>          */
>         gfp &= ~__GFP_HIGHMEM;
> 
> -       desc = kmalloc_array(n, sizeof(struct vring_desc), gfp);
> +       if (total_sg <= VIRT_QUEUE_CACHE_DESC_NUM)
> +               desc = kmem_cache_alloc(vq->desc_cache, gfp);
> +       else
> +               desc = kmalloc_array(total_sg, sizeof(struct vring_desc), gfp);
> +
> 
> 	.......
> 
> +       vq->desc_cache = kmem_cache_create("virio_desc",
> +                                          4 * sizeof(struct vring_desc),
> +                                          0, 0, NULL);
> 
> The effect is not good, basically there is no improvement, using perf top can
> see that the overhead of kmem_cache_alloc/kmem_cache_free is also relatively
> large:
> 
>          26.91%  [kernel]  [k] virtqueue_add
>          15.35%  [kernel]  [k] detach_buf_split
>          14.15%  [kernel]  [k] virtnet_xsk_xmit
>          13.24%  [kernel]  [k] virtqueue_add_outbuf
>        >  9.30%  [kernel]  [k] __slab_free
>        >  3.91%  [kernel]  [k] kmem_cache_alloc
>           2.85%  [kernel]  [k] virtqueue_get_buf_ctx
>        >  2.82%  [kernel]  [k] kmem_cache_free
>           2.54%  [kernel]  [k] memset_erms
>           2.37%  [kernel]  [k] xsk_tx_peek_desc
>           1.20%  [kernel]  [k] virtnet_xsk_run
>           0.81%  [kernel]  [k] vring_map_one_sg
>           0.69%  [kernel]  [k] __free_old_xmit_ptr
>           0.69%  [kernel]  [k] virtqueue_kick_prepare
>           0.43%  [kernel]  [k] sg_init_table
>           0.41%  [kernel]  [k] sg_next
>           0.28%  [kernel]  [k] vring_unmap_one_split
>           0.25%  [kernel]  [k] vring_map_single.constprop.34
>           0.24%  [kernel]  [k] net_rx_action
> 
> Thanks.


How about batching these?  E.g. kmem_cache_alloc_bulk/kmem_cache_free_bulk?


> >
> > Thanks
> >
> > >
> > > --
> > > MST
> > >
> >
diff mbox series

Patch

diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 0a5b54034d4b..04bcb74e5b9a 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -431,6 +431,12 @@  bool is_virtio_device(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(is_virtio_device);
 
+void virtio_use_desc_cache(struct virtio_device *dev, bool val)
+{
+	dev->desc_cache = val;
+}
+EXPORT_SYMBOL_GPL(virtio_use_desc_cache);
+
 void unregister_virtio_device(struct virtio_device *dev)
 {
 	int index = dev->index; /* save for after device release */
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index dd95dfd85e98..0b9a8544b0e8 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -117,6 +117,10 @@  struct vring_virtqueue {
 	/* Hint for event idx: already triggered no need to disable. */
 	bool event_triggered;
 
+	/* Is indirect cache used? */
+	bool use_desc_cache;
+	void *desc_cache_chain;
+
 	union {
 		/* Available for split ring */
 		struct {
@@ -423,12 +427,47 @@  static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
 	return extra[i].next;
 }
 
-static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
+#define VIRT_QUEUE_CACHE_DESC_NUM 4
+
+static void desc_cache_chain_free_split(void *chain)
+{
+	struct vring_desc *desc;
+
+	while (chain) {
+		desc = chain;
+		chain = (void *)desc->addr;
+		kfree(desc);
+	}
+}
+
+static void desc_cache_put_split(struct vring_virtqueue *vq,
+				 struct vring_desc *desc, int n)
+{
+	if (vq->use_desc_cache && n <= VIRT_QUEUE_CACHE_DESC_NUM) {
+		desc->addr = (u64)vq->desc_cache_chain;
+		vq->desc_cache_chain = desc;
+	} else {
+		kfree(desc);
+	}
+}
+
+static struct vring_desc *alloc_indirect_split(struct vring_virtqueue *vq,
 					       unsigned int total_sg,
 					       gfp_t gfp)
 {
 	struct vring_desc *desc;
-	unsigned int i;
+	unsigned int i, n;
+
+	if (vq->use_desc_cache && total_sg <= VIRT_QUEUE_CACHE_DESC_NUM) {
+		if (vq->desc_cache_chain) {
+			desc = vq->desc_cache_chain;
+			vq->desc_cache_chain = (void *)desc->addr;
+			goto got;
+		}
+		n = VIRT_QUEUE_CACHE_DESC_NUM;
+	} else {
+		n = total_sg;
+	}
 
 	/*
 	 * We require lowmem mappings for the descriptors because
@@ -437,12 +476,13 @@  static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
 	 */
 	gfp &= ~__GFP_HIGHMEM;
 
-	desc = kmalloc_array(total_sg, sizeof(struct vring_desc), gfp);
+	desc = kmalloc_array(n, sizeof(struct vring_desc), gfp);
 	if (!desc)
 		return NULL;
 
+got:
 	for (i = 0; i < total_sg; i++)
-		desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
+		desc[i].next = cpu_to_virtio16(vq->vq.vdev, i + 1);
 	return desc;
 }
 
@@ -508,7 +548,7 @@  static inline int virtqueue_add_split(struct virtqueue *_vq,
 	head = vq->free_head;
 
 	if (virtqueue_use_indirect(_vq, total_sg))
-		desc = alloc_indirect_split(_vq, total_sg, gfp);
+		desc = alloc_indirect_split(vq, total_sg, gfp);
 	else {
 		desc = NULL;
 		WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect);
@@ -652,7 +692,7 @@  static inline int virtqueue_add_split(struct virtqueue *_vq,
 	}
 
 	if (indirect)
-		kfree(desc);
+		desc_cache_put_split(vq, desc, total_sg);
 
 	END_USE(vq);
 	return -ENOMEM;
@@ -717,7 +757,7 @@  static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
 	if (vq->indirect) {
 		struct vring_desc *indir_desc =
 				vq->split.desc_state[head].indir_desc;
-		u32 len;
+		u32 len, n;
 
 		/* Free the indirect table, if any, now that it's unmapped. */
 		if (!indir_desc)
@@ -729,10 +769,12 @@  static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
 				VRING_DESC_F_INDIRECT));
 		BUG_ON(len == 0 || len % sizeof(struct vring_desc));
 
-		for (j = 0; j < len / sizeof(struct vring_desc); j++)
+		n = len / sizeof(struct vring_desc);
+
+		for (j = 0; j < n; j++)
 			vring_unmap_one_split_indirect(vq, &indir_desc[j]);
 
-		kfree(indir_desc);
+		desc_cache_put_split(vq, indir_desc, n);
 		vq->split.desc_state[head].indir_desc = NULL;
 	} else if (ctx) {
 		*ctx = vq->split.desc_state[head].indir_desc;
@@ -2199,6 +2241,8 @@  struct virtqueue *__vring_new_virtqueue(unsigned int index,
 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
 		!context;
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
+	vq->desc_cache_chain = NULL;
+	vq->use_desc_cache = vdev->desc_cache;
 
 	if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
 		vq->weak_barriers = false;
@@ -2329,6 +2373,7 @@  void vring_del_virtqueue(struct virtqueue *_vq)
 	if (!vq->packed_ring) {
 		kfree(vq->split.desc_state);
 		kfree(vq->split.desc_extra);
+		desc_cache_chain_free_split(vq->desc_cache_chain);
 	}
 	kfree(vq);
 }
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 41edbc01ffa4..d84b7b8f4070 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -109,6 +109,7 @@  struct virtio_device {
 	bool failed;
 	bool config_enabled;
 	bool config_change_pending;
+	bool desc_cache;
 	spinlock_t config_lock;
 	spinlock_t vqs_list_lock; /* Protects VQs list access */
 	struct device dev;
@@ -130,6 +131,15 @@  int register_virtio_device(struct virtio_device *dev);
 void unregister_virtio_device(struct virtio_device *dev);
 bool is_virtio_device(struct device *dev);
 
+/**
+ * virtio_use_desc_cache - virtio ring use desc cache
+ *
+ * virtio will cache the allocated indirect desc.
+ *
+ * This function must be called before find_vqs.
+ */
+void virtio_use_desc_cache(struct virtio_device *dev, bool val);
+
 void virtio_break_device(struct virtio_device *dev);
 
 void virtio_config_changed(struct virtio_device *dev);