
[net-next,v5,1/2] page_pool: Add page_pool stat counters

Message ID 1644868949-24506-2-git-send-email-jdamato@fastly.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Series page_pool: Add page_pool stat counters

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success
netdev/cover_letter success Series has a cover letter
netdev/patch_count success
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 6019 this patch: 6019
netdev/cc_maintainers success CCed 5 of 5 maintainers
netdev/build_clang success Errors and warnings before: 878 this patch: 878
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6172 this patch: 6172
netdev/checkpatch warning CHECK: Macro argument 'pool' may be better as '(pool)' to avoid precedence issues
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Joe Damato Feb. 14, 2022, 8:02 p.m. UTC
Add per-cpu per-pool statistics counters for the allocation path of a page
pool.

These statistics are disabled by default, and a kernel config option is
provided for users who wish to enable them.

The statistics added are:
	- fast: successful fast path allocations
	- slow: slow path order-0 allocations
	- slow_high_order: slow path high order allocations
	- empty: ptr ring is empty, so a slow path allocation was forced.
	- refill: an allocation which triggered a refill of the cache
	- waive: pages obtained from the ptr ring that cannot be added to
	  the cache due to a NUMA mismatch.

Signed-off-by: Joe Damato <jdamato@fastly.com>
---
 include/net/page_pool.h | 18 ++++++++++++++++++
 net/Kconfig             | 13 +++++++++++++
 net/core/page_pool.c    | 37 +++++++++++++++++++++++++++++++++----
 3 files changed, 64 insertions(+), 4 deletions(-)
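
As a side note, a minimal sketch (not part of this patch, and assuming
CONFIG_PAGE_POOL_STATS=y) of how one of the new per-CPU counters could be
summed over all CPUs; the helper name page_pool_stats_sum_fast is
hypothetical, and real consumers would go through whatever export path a
driver provides:

#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <net/page_pool.h>

/* Hypothetical helper: total successful fast-path allocations for a pool.
 * Reads of the per-CPU counters are unsynchronized, which is acceptable
 * for debug statistics.
 */
static u64 page_pool_stats_sum_fast(struct page_pool *pool)
{
        u64 sum = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                const struct page_pool_stats *s = per_cpu_ptr(pool->stats, cpu);

                sum += s->alloc.fast;
        }
        return sum;
}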

Comments

Jesper Dangaard Brouer Feb. 15, 2022, 3:41 p.m. UTC | #1
On 14/02/2022 21.02, Joe Damato wrote:
> Add per-cpu per-pool statistics counters for the allocation path of a page
> pool.
> 
> This code is disabled by default and a kernel config option is provided for
> users who wish to enable them.
> 
> The statistics added are:
> 	- fast: successful fast path allocations
> 	- slow: slow path order-0 allocations
> 	- slow_high_order: slow path high order allocations
> 	- empty: ptr ring is empty, so a slow path allocation was forced.
> 	- refill: an allocation which triggered a refill of the cache
> 	- waive: pages obtained from the ptr ring that cannot be added to
> 	  the cache due to a NUMA mismatch.
> 
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---
>   include/net/page_pool.h | 18 ++++++++++++++++++
>   net/Kconfig             | 13 +++++++++++++
>   net/core/page_pool.c    | 37 +++++++++++++++++++++++++++++++++----
>   3 files changed, 64 insertions(+), 4 deletions(-)
> 
> diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> index 97c3c19..d827ab1 100644
> --- a/include/net/page_pool.h
> +++ b/include/net/page_pool.h
> @@ -135,7 +135,25 @@ struct page_pool {
>   	refcount_t user_cnt;
>   
>   	u64 destroy_cnt;
> +#ifdef CONFIG_PAGE_POOL_STATS
> +	struct page_pool_stats __percpu *stats;
> +#endif
> +};

You still have to consider cache-line locality, as I have pointed out 
before.
This placement is wrong!

Output from pahole:

  /* --- cacheline 23 boundary (1472 bytes) --- */
  atomic_t                   pages_state_release_cnt; /*  1472     4 */
  refcount_t                 user_cnt;             /*  1476     4 */
  u64                        destroy_cnt;          /*  1480     8 */

Your *stats pointer ends up on a cache-line that "remote" CPUs will write 
into (pages_state_release_cnt).
This is why we see a slowdown in the 'bench_page_pool_cross_cpu' test.

--Jesper
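
For reference, struct layouts like the one quoted above can be dumped with
pahole from a kernel image built with debug info (CONFIG_DEBUG_INFO), for
example:

  $ pahole -C page_pool vmlinux
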
Joe Damato Feb. 16, 2022, 5:04 p.m. UTC | #2
On Tue, Feb 15, 2022 at 7:41 AM Jesper Dangaard Brouer
<jbrouer@redhat.com> wrote:
>
>
> On 14/02/2022 21.02, Joe Damato wrote:
> > Add per-cpu per-pool statistics counters for the allocation path of a page
> > pool.
> >
> > This code is disabled by default and a kernel config option is provided for
> > users who wish to enable them.
> >
> > The statistics added are:
> >       - fast: successful fast path allocations
> >       - slow: slow path order-0 allocations
> >       - slow_high_order: slow path high order allocations
> >       - empty: ptr ring is empty, so a slow path allocation was forced.
> >       - refill: an allocation which triggered a refill of the cache
> >       - waive: pages obtained from the ptr ring that cannot be added to
> >         the cache due to a NUMA mismatch.
> >
> > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > ---
> >   include/net/page_pool.h | 18 ++++++++++++++++++
> >   net/Kconfig             | 13 +++++++++++++
> >   net/core/page_pool.c    | 37 +++++++++++++++++++++++++++++++++----
> >   3 files changed, 64 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/net/page_pool.h b/include/net/page_pool.h
> > index 97c3c19..d827ab1 100644
> > --- a/include/net/page_pool.h
> > +++ b/include/net/page_pool.h
> > @@ -135,7 +135,25 @@ struct page_pool {
> >       refcount_t user_cnt;
> >
> >       u64 destroy_cnt;
> > +#ifdef CONFIG_PAGE_POOL_STATS
> > +     struct page_pool_stats __percpu *stats;
> > +#endif
> > +};
>
> You still have to consider cache-line locality, as I have pointed out
> before.

You are right, I forgot to include that in this revision. Sorry about
that and thanks for the review and response.

> This placement is wrong!
>
> Output from pahole:
>
>   /* --- cacheline 23 boundary (1472 bytes) --- */
>   atomic_t                   pages_state_release_cnt; /*  1472     4 */
>   refcount_t                 user_cnt;             /*  1476     4 */
>   u64                        destroy_cnt;          /*  1480     8 */
>
> Your *stats pointer ends up on a cache-line that "remote" CPUs will write
> into (pages_state_release_cnt).
> This is why we see a slowdown in the 'bench_page_pool_cross_cpu' test.

If I give *stats its own cache-line by adding
____cacheline_aligned_in_smp (but leaving the placement at the end of
the page_pool struct), pahole reports:

/* --- cacheline 24 boundary (1536 bytes) --- */
atomic_t                   pages_state_release_cnt; /*  1536     4 */
refcount_t                 user_cnt;             /*  1540     4 */
u64                        destroy_cnt;          /*  1544     8 */

/* XXX 48 bytes hole, try to pack */

/* --- cacheline 25 boundary (1600 bytes) --- */
struct page_pool_stats *   stats;                /*  1600     8 */
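
The member annotation described above would look roughly like this (a
sketch of the idea, not the exact diff that was benchmarked):

#ifdef CONFIG_PAGE_POOL_STATS
        struct page_pool_stats __percpu *stats ____cacheline_aligned_in_smp;
#endif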

Re-running bench_page_pool_cross_cpu loops=20000000 returning_cpus=4
still shows fairly large run-to-run variation in cycles (measurement
period time of 34.128419339 sec); a delta of roughly 287 cycles between
the runs I just performed back to back.
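
For anyone reproducing this: bench_page_pool_cross_cpu is an out-of-tree
module from Jesper's prototype-kernel benchmark suite. Assuming it has been
built against the running kernel and that it reports its results via the
kernel log like the other page_pool micro-benchmarks, a run looks roughly
like:

  $ sudo insmod bench_page_pool_cross_cpu.ko loops=20000000 returning_cpus=4
  $ sudo dmesg | tail
  $ sudo rmmod bench_page_pool_cross_cpu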

The best measurements after making the cache-line change described
above are faster than compiling the kernel with stats disabled. The
worst measurements, however, are very close to the data I submitted in
the cover letter for this revision.

As far as I can tell, xdp_mem_id is not written to particularly often -
only when RX rings are configured in the driver - so I also tried
moving *stats above xdp_mem_id so that they share a cache-line and
reduce the size of the hole between xdp_mem_id and pp_alloc_cache.

pahole reports:

/* --- cacheline 3 boundary (192 bytes) --- */
struct page_pool_stats *   stats;                /*   192     8 */
u32                        xdp_mem_id;           /*   200     4 */
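
That alternative placement is roughly the following (again a sketch rather
than the tested diff):

        /* ... earlier struct page_pool members ... */
#ifdef CONFIG_PAGE_POOL_STATS
        struct page_pool_stats __percpu *stats;
#endif
        u32 xdp_mem_id;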

Results of bench_page_pool_cross_cpu loops=20000000 returning_cpus=4
are the same as above -- the best measurements are faster than with
stats disabled, and the worst are very close to what I mentioned in the
cover letter for this v5.

I am happy to submit a v6 with *stats placed in either location: on
its own cache-line at the expense of a larger page_pool struct, or
placed near xdp_mem_id to consume some of the hole between xdp_mem_id
and pp_alloc_cache.

Do you have a preference between giving the stats pointer its own
cache-line and sharing a line with xdp_mem_id?

In either case, the benchmarks don't seem to show a consistent,
significant improvement on my test hardware. Is there another
benchmark or a different set of arguments you think I should use when
running this test?

My apologies if I am missing something obvious here.

Thanks,
Joe

Patch

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 97c3c19..d827ab1 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -135,7 +135,25 @@  struct page_pool {
 	refcount_t user_cnt;
 
 	u64 destroy_cnt;
+#ifdef CONFIG_PAGE_POOL_STATS
+	struct page_pool_stats __percpu *stats;
+#endif
+};
+
+#ifdef CONFIG_PAGE_POOL_STATS
+struct page_pool_stats {
+	struct {
+		u64 fast; /* fast path allocations */
+		u64 slow; /* slow-path order 0 allocations */
+		u64 slow_high_order; /* slow-path high order allocations */
+		u64 empty; /* failed refills due to empty ptr ring, forcing
+			    * slow path allocation
+			    */
+		u64 refill; /* allocations via successful refill */
+		u64 waive;  /* failed refills due to numa zone mismatch */
+	} alloc;
 };
+#endif
 
 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
 
diff --git a/net/Kconfig b/net/Kconfig
index 8a1f9d0..8c07308 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -434,6 +434,19 @@  config NET_DEVLINK
 config PAGE_POOL
 	bool
 
+config PAGE_POOL_STATS
+	default n
+	bool "Page pool stats"
+	depends on PAGE_POOL
+	help
+	  Enable page pool statistics to track allocations. This option
+	  incurs additional CPU cost in allocation paths and additional
+	  memory cost to store the statistics. These statistics are only
+	  available if this option is enabled and if the driver using
+	  the page pool supports exporting this data.
+
+	  If unsure, say N.
+
 config FAILOVER
 	tristate "Generic failover module"
 	help
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index e25d359..f29dff9 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -26,6 +26,17 @@ 
 
 #define BIAS_MAX	LONG_MAX
 
+#ifdef CONFIG_PAGE_POOL_STATS
+/* this_cpu_inc_alloc_stat is intended to be used in softirq context */
+#define this_cpu_inc_alloc_stat(pool, __stat)				\
+	do {								\
+		struct page_pool_stats __percpu *s = pool->stats;	\
+		__this_cpu_inc(s->alloc.__stat);			\
+	} while (0)
+#else
+#define this_cpu_inc_alloc_stat(pool, __stat)
+#endif
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
@@ -73,6 +84,12 @@  static int page_pool_init(struct page_pool *pool,
 	    pool->p.flags & PP_FLAG_PAGE_FRAG)
 		return -EINVAL;
 
+#ifdef CONFIG_PAGE_POOL_STATS
+	pool->stats = alloc_percpu(struct page_pool_stats);
+	if (!pool->stats)
+		return -ENOMEM;
+#endif
+
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;
 
@@ -117,8 +134,10 @@  static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
 	int pref_nid; /* preferred NUMA node */
 
 	/* Quicker fallback, avoid locks when ring is empty */
-	if (__ptr_ring_empty(r))
+	if (__ptr_ring_empty(r)) {
+		this_cpu_inc_alloc_stat(pool, empty);
 		return NULL;
+	}
 
 	/* Softirq guarantee CPU and thus NUMA node is stable. This,
 	 * assumes CPU refilling driver RX-ring will also run RX-NAPI.
@@ -145,14 +164,17 @@  static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
 			 * This limit stress on page buddy alloactor.
 			 */
 			page_pool_return_page(pool, page);
+			this_cpu_inc_alloc_stat(pool, waive);
 			page = NULL;
 			break;
 		}
 	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
 
 	/* Return last page */
-	if (likely(pool->alloc.count > 0))
+	if (likely(pool->alloc.count > 0)) {
 		page = pool->alloc.cache[--pool->alloc.count];
+		this_cpu_inc_alloc_stat(pool, refill);
+	}
 
 	return page;
 }
@@ -166,6 +188,7 @@  static struct page *__page_pool_get_cached(struct page_pool *pool)
 	if (likely(pool->alloc.count)) {
 		/* Fast-path */
 		page = pool->alloc.cache[--pool->alloc.count];
+		this_cpu_inc_alloc_stat(pool, fast);
 	} else {
 		page = page_pool_refill_alloc_cache(pool);
 	}
@@ -239,6 +262,7 @@  static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 		return NULL;
 	}
 
+	this_cpu_inc_alloc_stat(pool, slow_high_order);
 	page_pool_set_pp_info(pool, page);
 
 	/* Track how many pages are held 'in-flight' */
@@ -293,10 +317,12 @@  static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	}
 
 	/* Return last page */
-	if (likely(pool->alloc.count > 0))
+	if (likely(pool->alloc.count > 0)) {
 		page = pool->alloc.cache[--pool->alloc.count];
-	else
+		this_cpu_inc_alloc_stat(pool, slow);
+	} else {
 		page = NULL;
+	}
 
 	/* When page just alloc'ed is should/must have refcnt 1. */
 	return page;
@@ -620,6 +646,9 @@  static void page_pool_free(struct page_pool *pool)
 	if (pool->p.flags & PP_FLAG_DMA_MAP)
 		put_device(pool->p.dev);
 
+#ifdef CONFIG_PAGE_POOL_STATS
+	free_percpu(pool->stats);
+#endif
 	kfree(pool);
 }
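
With the patch applied, the counters are only compiled in when the new
option is enabled in the kernel configuration, for example:

  CONFIG_PAGE_POOL_STATS=y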