
[v3,net-next,1/5] net: xdp: introduce bulking for xdp tx return path

Message ID 5ef0c2886518d8ae1577c8b60ea6ef55d031673e.1604484917.git.lorenzo@kernel.org (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Series xdp: introduce bulking for page_pool tx return path

Commit Message

Lorenzo Bianconi Nov. 4, 2020, 10:22 a.m. UTC
XDP bulk APIs introduce a defer/flush mechanism to return
pages belonging to the same xdp_mem_allocator object
(identified via the mem.id field) in bulk to optimize
I-cache and D-cache since xdp_return_frame is usually run
inside the driver NAPI tx completion loop.
The bulk queue size is set to 16 to be aligned to how
XDP_REDIRECT bulking works. The bulk is flushed when
it is full or when mem.id changes.
xdp_frame_bulk is usually stored/allocated on the function
call-stack to avoid locking penalties.
The current implementation considers only the page_pool memory model.

Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 include/net/xdp.h |  9 +++++++
 net/core/xdp.c    | 61 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)
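
For context (not part of this patch), a rough sketch of how a driver NAPI
tx completion handler might use the new helpers; the driver structures and
names (my_ring, my_tx_buf, my_driver_tx_done) are made up for illustration:

#include <net/xdp.h>

/* Hypothetical driver state, for illustration only. */
struct my_tx_buf {
	struct xdp_frame *xdpf;
};

struct my_ring {
	struct my_tx_buf tx_buf[256];
};

static void my_driver_tx_done(struct my_ring *ring, int ndone)
{
	struct xdp_frame_bulk bq;
	int i;

	/* bq lives on the stack; a NULL xa tells xdp_return_frame_bulk()
	 * to do the first rhashtable lookup.
	 */
	bq.xa = NULL;
	bq.count = 0;

	for (i = 0; i < ndone; i++) {
		/* defer the page return; the queue is flushed internally
		 * when it is full or when mem.id changes
		 */
		xdp_return_frame_bulk(ring->tx_buf[i].xdpf, &bq);
	}

	/* return whatever is still pending in the on-stack queue */
	xdp_flush_frame_bulk(&bq);
}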

Comments

Jesper Dangaard Brouer Nov. 4, 2020, 11:11 a.m. UTC | #1
On Wed,  4 Nov 2020 11:22:54 +0100
Lorenzo Bianconi <lorenzo@kernel.org> wrote:

> XDP bulk APIs introduce a defer/flush mechanism to return
> pages belonging to the same xdp_mem_allocator object
> (identified via the mem.id field) in bulk to optimize
> I-cache and D-cache since xdp_return_frame is usually run
> inside the driver NAPI tx completion loop.
> The bulk queue size is set to 16 to be aligned to how
> XDP_REDIRECT bulking works. The bulk is flushed when
> it is full or when mem.id changes.
> xdp_frame_bulk is usually stored/allocated on the function
> call-stack to avoid locking penalties.
> The current implementation considers only the page_pool memory model.
> 
> Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  include/net/xdp.h |  9 +++++++
>  net/core/xdp.c    | 61 +++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 70 insertions(+)
> 
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 3814fb631d52..a1f48a73e6df 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -104,6 +104,12 @@ struct xdp_frame {
>  	struct net_device *dev_rx; /* used by cpumap */
>  };
>  
> +#define XDP_BULK_QUEUE_SIZE	16
> +struct xdp_frame_bulk {
> +	int count;
> +	void *xa;
> +	void *q[XDP_BULK_QUEUE_SIZE];
> +};
>  
>  static inline struct skb_shared_info *
>  xdp_get_shared_info_from_frame(struct xdp_frame *frame)
> @@ -194,6 +200,9 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
>  void xdp_return_frame(struct xdp_frame *xdpf);
>  void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
>  void xdp_return_buff(struct xdp_buff *xdp);
> +void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq);
> +void xdp_return_frame_bulk(struct xdp_frame *xdpf,
> +			   struct xdp_frame_bulk *bq);
>  
>  /* When sending xdp_frame into the network stack, then there is no
>   * return point callback, which is needed to release e.g. DMA-mapping
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 48aba933a5a8..66ac275a0360 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -380,6 +380,67 @@ void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
>  }
>  EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
>  
> +/* XDP bulk APIs introduce a defer/flush mechanism to return
> + * pages belonging to the same xdp_mem_allocator object
> + * (identified via the mem.id field) in bulk to optimize
> + * I-cache and D-cache.
> + * The bulk queue size is set to 16 to be aligned to how
> + * XDP_REDIRECT bulking works. The bulk is flushed when

If this is connected, then why have you not redefined DEV_MAP_BULK_SIZE?

Cc. DPAA2 maintainers as they use this define in their drivers.
You want to make sure this driver is flexible enough for future changes.

Like:

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 3814fb631d52..44440a36f96f 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -245,6 +245,6 @@ bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
 void xdp_attachment_setup(struct xdp_attachment_info *info,
                          struct netdev_bpf *bpf);
 
-#define DEV_MAP_BULK_SIZE 16
+#define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE
 
 #endif /* __LINUX_NET_XDP_H__ */


> + * it is full or when mem.id changes.
> + * xdp_frame_bulk is usually stored/allocated on the function
> + * call-stack to avoid locking penalties.
> + */
> +void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
> +{
> +	struct xdp_mem_allocator *xa = bq->xa;
> +	int i;
> +
> +	if (unlikely(!xa))
> +		return;
> +
> +	for (i = 0; i < bq->count; i++) {
> +		struct page *page = virt_to_head_page(bq->q[i]);
> +
> +		page_pool_put_full_page(xa->page_pool, page, false);
> +	}
> +	bq->count = 0;
> +}
> +EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk);
> +
> +void xdp_return_frame_bulk(struct xdp_frame *xdpf,
> +			   struct xdp_frame_bulk *bq)
> +{
> +	struct xdp_mem_info *mem = &xdpf->mem;
> +	struct xdp_mem_allocator *xa;
> +
> +	if (mem->type != MEM_TYPE_PAGE_POOL) {
> +		__xdp_return(xdpf->data, &xdpf->mem, false);
> +		return;
> +	}
>

I cannot make up my mind: it would be a micro-optimization to move
this if-statement to include/net/xdp.h, but it would make the code harder
to read/follow, and the call you replace, xdp_return_frame(), also lives
in xdp.c with the same call to __xdp_return().  Let's keep it as-is.
(We can follow up with micro-optimizations.)
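
As an aside, a rough sketch (not what the patch implements) of what such a
header-side fast path might look like, with a hypothetical out-of-line
__xdp_return_frame_bulk() carrying the page_pool bulking logic from
net/core/xdp.c:

void __xdp_return_frame_bulk(struct xdp_frame *xdpf,
			     struct xdp_frame_bulk *bq);

static __always_inline void
xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq)
{
	/* non-page_pool frames take the existing single-frame return path */
	if (xdpf->mem.type != MEM_TYPE_PAGE_POOL) {
		xdp_return_frame(xdpf);
		return;
	}

	__xdp_return_frame_bulk(xdpf, bq);
}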


> +	rcu_read_lock();
> +
> +	xa = bq->xa;
> +	if (unlikely(!xa)) {
> +		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
> +		bq->count = 0;
> +		bq->xa = xa;
> +	}
> +
> +	if (bq->count == XDP_BULK_QUEUE_SIZE)
> +		xdp_flush_frame_bulk(bq);
> +
> +	if (mem->id != xa->mem.id) {
> +		xdp_flush_frame_bulk(bq);
> +		bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
> +	}
> +
> +	bq->q[bq->count++] = xdpf->data;
> +
> +	rcu_read_unlock();
> +}
> +EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
> +
>  void xdp_return_buff(struct xdp_buff *xdp)
>  {
>  	__xdp_return(xdp->data, &xdp->rxq->mem, true);
Lorenzo Bianconi Nov. 4, 2020, 11:19 a.m. UTC | #2
> On Wed,  4 Nov 2020 11:22:54 +0100
> Lorenzo Bianconi <lorenzo@kernel.org> wrote:
> 

[...]

> > +/* XDP bulk APIs introduce a defer/flush mechanism to return
> > + * pages belonging to the same xdp_mem_allocator object
> > + * (identified via the mem.id field) in bulk to optimize
> > + * I-cache and D-cache.
> > + * The bulk queue size is set to 16 to be aligned to how
> > + * XDP_REDIRECT bulking works. The bulk is flushed when
> 
> If this is connected, then why have you not redefined DEV_MAP_BULK_SIZE?
> 
> Cc. DPAA2 maintainers as they use this define in their drivers.
> You want to make sure this driver is flexible enough for future changes.
> 
> Like:
> 
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 3814fb631d52..44440a36f96f 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -245,6 +245,6 @@ bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
>  void xdp_attachment_setup(struct xdp_attachment_info *info,
>                           struct netdev_bpf *bpf);
>  
> -#define DEV_MAP_BULK_SIZE 16
> +#define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE

my idea was to address it in a separate patch, but if you prefer I can merge
this change into v4

>  
>  #endif /* __LINUX_NET_XDP_H__ */
> 
> 
> > + * it is full or when mem.id changes.
> > + * xdp_frame_bulk is usually stored/allocated on the function
> > + * call-stack to avoid locking penalties.
> > + */
> > +void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
> > +{
> > +	struct xdp_mem_allocator *xa = bq->xa;
> > +	int i;
> > +
> > +	if (unlikely(!xa))
> > +		return;
> > +
> > +	for (i = 0; i < bq->count; i++) {
> > +		struct page *page = virt_to_head_page(bq->q[i]);
> > +
> > +		page_pool_put_full_page(xa->page_pool, page, false);
> > +	}
> > +	bq->count = 0;
> > +}
> > +EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk);
> > +
> > +void xdp_return_frame_bulk(struct xdp_frame *xdpf,
> > +			   struct xdp_frame_bulk *bq)
> > +{
> > +	struct xdp_mem_info *mem = &xdpf->mem;
> > +	struct xdp_mem_allocator *xa;
> > +
> > +	if (mem->type != MEM_TYPE_PAGE_POOL) {
> > +		__xdp_return(xdpf->data, &xdpf->mem, false);
> > +		return;
> > +	}
> >
> 
> I cannot make up my mind: it would be a micro-optimization to move
> this if-statement to include/net/xdp.h, but it would make the code harder
> to read/follow, and the call you replace, xdp_return_frame(), also lives
> in xdp.c with the same call to __xdp_return().  Let's keep it as-is.
> (We can follow up with micro-optimizations.)

ack

Regards,
Lorenzo

> 
> 
> > +	rcu_read_lock();
> > +
> > +	xa = bq->xa;
> > +	if (unlikely(!xa)) {
> > +		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
> > +		bq->count = 0;
> > +		bq->xa = xa;
> > +	}
> > +
> > +	if (bq->count == XDP_BULK_QUEUE_SIZE)
> > +		xdp_flush_frame_bulk(bq);
> > +
> > +	if (mem->id != xa->mem.id) {
> > +		xdp_flush_frame_bulk(bq);
> > +		bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
> > +	}
> > +
> > +	bq->q[bq->count++] = xdpf->data;
> > +
> > +	rcu_read_unlock();
> > +}
> > +EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
> > +
> >  void xdp_return_buff(struct xdp_buff *xdp)
> >  {
> >  	__xdp_return(xdp->data, &xdp->rxq->mem, true);
> 
> 
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer
>
Jesper Dangaard Brouer Nov. 4, 2020, 12:28 p.m. UTC | #3
On Wed, 4 Nov 2020 12:19:02 +0100
Lorenzo Bianconi <lorenzo.bianconi@redhat.com> wrote:

> > On Wed,  4 Nov 2020 11:22:54 +0100
> > Lorenzo Bianconi <lorenzo@kernel.org> wrote:
> >   
> 
> [...]
> 
> > > +/* XDP bulk APIs introduce a defer/flush mechanism to return
> > > + * pages belonging to the same xdp_mem_allocator object
> > > + * (identified via the mem.id field) in bulk to optimize
> > > + * I-cache and D-cache.
> > > + * The bulk queue size is set to 16 to be aligned to how
> > > + * XDP_REDIRECT bulking works. The bulk is flushed when  
> > 
> > If this is connected, then why have you not redefined DEV_MAP_BULK_SIZE?
> > 
> > Cc. DPAA2 maintainers as they use this define in their drivers.
> > You want to make sure this driver is flexible enough for future changes.
> > 
> > Like:
> > 
> > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > index 3814fb631d52..44440a36f96f 100644
> > --- a/include/net/xdp.h
> > +++ b/include/net/xdp.h
> > @@ -245,6 +245,6 @@ bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
> >  void xdp_attachment_setup(struct xdp_attachment_info *info,
> >                           struct netdev_bpf *bpf);
> >  
> > -#define DEV_MAP_BULK_SIZE 16
> > +#define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE  
> 
> my idea was to address it in a separate patch, but if you prefer I can merge
> this change into v4

Please merge it in v4, as this patch contains the explanation and we
want to avoid too much churn (remember our colleagues need to backport
and review this).
Lorenzo Bianconi Nov. 4, 2020, 12:53 p.m. UTC | #4
> On Wed, 4 Nov 2020 12:19:02 +0100
> Lorenzo Bianconi <lorenzo.bianconi@redhat.com> wrote:
> 
> > > On Wed,  4 Nov 2020 11:22:54 +0100
> > > Lorenzo Bianconi <lorenzo@kernel.org> wrote:
> > >   
> > 
> > [...]
> > 
> > > > +/* XDP bulk APIs introduce a defer/flush mechanism to return
> > > > + * pages belonging to the same xdp_mem_allocator object
> > > > + * (identified via the mem.id field) in bulk to optimize
> > > > + * I-cache and D-cache.
> > > > + * The bulk queue size is set to 16 to be aligned to how
> > > > + * XDP_REDIRECT bulking works. The bulk is flushed when  
> > > 
> > > If this is connected, then why have you not redefined DEV_MAP_BULK_SIZE?
> > > 
> > > Cc. DPAA2 maintainers as they use this define in their drivers.
> > > You want to make sure this driver is flexible enough for future changes.
> > > 
> > > Like:
> > > 
> > > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > > index 3814fb631d52..44440a36f96f 100644
> > > --- a/include/net/xdp.h
> > > +++ b/include/net/xdp.h
> > > @@ -245,6 +245,6 @@ bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
> > >  void xdp_attachment_setup(struct xdp_attachment_info *info,
> > >                           struct netdev_bpf *bpf);
> > >  
> > > -#define DEV_MAP_BULK_SIZE 16
> > > +#define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE  
> > 
> > my idea was to address it in a separate patch, but if you prefer I can merge
> > this change into v4

sure, will do in v4.

Regards,
Lorenzo

> 
> Please merge it in v4, as this patch contains the explanation and we
> want to avoid too much churn (remember our colleagues need to backport
> and review this).
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer
>

Patch

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 3814fb631d52..a1f48a73e6df 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -104,6 +104,12 @@  struct xdp_frame {
 	struct net_device *dev_rx; /* used by cpumap */
 };
 
+#define XDP_BULK_QUEUE_SIZE	16
+struct xdp_frame_bulk {
+	int count;
+	void *xa;
+	void *q[XDP_BULK_QUEUE_SIZE];
+};
 
 static inline struct skb_shared_info *
 xdp_get_shared_info_from_frame(struct xdp_frame *frame)
@@ -194,6 +200,9 @@  struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
 void xdp_return_frame(struct xdp_frame *xdpf);
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);
+void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq);
+void xdp_return_frame_bulk(struct xdp_frame *xdpf,
+			   struct xdp_frame_bulk *bq);
 
 /* When sending xdp_frame into the network stack, then there is no
  * return point callback, which is needed to release e.g. DMA-mapping
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 48aba933a5a8..66ac275a0360 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -380,6 +380,67 @@  void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
 
+/* XDP bulk APIs introduce a defer/flush mechanism to return
+ * pages belonging to the same xdp_mem_allocator object
+ * (identified via the mem.id field) in bulk to optimize
+ * I-cache and D-cache.
+ * The bulk queue size is set to 16 to be aligned to how
+ * XDP_REDIRECT bulking works. The bulk is flushed when
+ * it is full or when mem.id changes.
+ * xdp_frame_bulk is usually stored/allocated on the function
+ * call-stack to avoid locking penalties.
+ */
+void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
+{
+	struct xdp_mem_allocator *xa = bq->xa;
+	int i;
+
+	if (unlikely(!xa))
+		return;
+
+	for (i = 0; i < bq->count; i++) {
+		struct page *page = virt_to_head_page(bq->q[i]);
+
+		page_pool_put_full_page(xa->page_pool, page, false);
+	}
+	bq->count = 0;
+}
+EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk);
+
+void xdp_return_frame_bulk(struct xdp_frame *xdpf,
+			   struct xdp_frame_bulk *bq)
+{
+	struct xdp_mem_info *mem = &xdpf->mem;
+	struct xdp_mem_allocator *xa;
+
+	if (mem->type != MEM_TYPE_PAGE_POOL) {
+		__xdp_return(xdpf->data, &xdpf->mem, false);
+		return;
+	}
+
+	rcu_read_lock();
+
+	xa = bq->xa;
+	if (unlikely(!xa)) {
+		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+		bq->count = 0;
+		bq->xa = xa;
+	}
+
+	if (bq->count == XDP_BULK_QUEUE_SIZE)
+		xdp_flush_frame_bulk(bq);
+
+	if (mem->id != xa->mem.id) {
+		xdp_flush_frame_bulk(bq);
+		bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+	}
+
+	bq->q[bq->count++] = xdpf->data;
+
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
+
 void xdp_return_buff(struct xdp_buff *xdp)
 {
 	__xdp_return(xdp->data, &xdp->rxq->mem, true);