
[RFC,v2] Introduce Hierarchical Per-CPU Counters

Message ID 20250408160508.991738-1-mathieu.desnoyers@efficios.com (mailing list archive)
State New

Commit Message

Mathieu Desnoyers April 8, 2025, 4:05 p.m. UTC
* Motivation

The purpose of this hierarchical split-counter scheme is to:

- Minimize contention when incrementing and decrementing counters,
- Provide fast access to a sum approximation,
- Provide a sum approximation with an acceptable accuracy level when
  scaling to many-core systems.
- Provide approximate and precise comparison of two counters, and
  between a counter and a value.

It aims at fixing the per-mm RSS tracking which has become too
inaccurate for OOM killer purposes on large many-core systems [1].

* Design

The hierarchical per-CPU counters propagate a sum approximation through
a binary tree. When reaching the batch size, the carry is propagated
through a binary tree which consists of log2(nr_cpu_ids) levels. The
batch size for each level is twice the batch size of the prior level.

Example propagation diagram with 8 cpus:

Level 0:  0    1    2    3    4    5    6    7
          |   /     |   /     |   /     |   /
          |  /      |  /      |  /      |  /
          | /       | /       | /       | /
Level 1:  0         1         2         3
          |       /           |       /
          |    /              |    /
          | /                 | /
Level 2:  0                   1
          |               /
          |         /
          |   /
Level 3:  0

The maximum inaccuracy is bounded by batch_size * log2(nr_cpus) * nr_cpus,
which grows as O(n*log(n)) with the number of CPUs.
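
For illustration, here is a minimal usage sketch against the API introduced
below; the example function, the batch size of 32 and the threshold of 1000
are arbitrary choices:

	#include <linux/percpu_counter_tree.h>
	#include <linux/printk.h>

	static int example_use(void)
	{
		struct percpu_counter_tree c;
		int ret;

		/* Batch size must be a power of 2. */
		ret = percpu_counter_tree_init(&c, 32);
		if (ret)
			return ret;

		/* Fast path: per-CPU add; carry propagates only on batch overflow. */
		percpu_counter_tree_add(&c, 1);
		percpu_counter_tree_add(&c, -1);

		/* Cheap approximate read vs. precise sum over all possible CPUs. */
		pr_info("approx=%d precise=%d\n",
			percpu_counter_tree_approximate_sum(&c),
			percpu_counter_tree_precise_sum(&c));

		/* Falls back to a precise sum only when the approximation lies
		 * within the counter's inaccuracy window. */
		if (percpu_counter_tree_precise_compare_value(&c, 1000) > 0)
			pr_info("counter exceeds 1000\n");

		percpu_counter_tree_destroy(&c);
		return 0;
	}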

Link: https://lore.kernel.org/lkml/20250331223516.7810-2-sweettea-kernel@dorminy.me/ # [1]
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Martin Liu <liumartin@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: christian.koenig@amd.com
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Sweet Tea Dorminy <sweettea@google.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Liam R . Howlett" <Liam.Howlett@Oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-mm@kvack.org
Cc: linux-trace-kernel@vger.kernel.org
Cc: Yu Zhao <yuzhao@google.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
---
Changes since v1:
- Remove percpu_counter_tree_precise_sum_unbiased from public header,
  make this function static,
- Introduce precise and approximate comparisons between two counters,
- Reorder the struct percpu_counter_tree fields,
- Introduce approx_sum field, which points to the approximate sum
  for the percpu_counter_tree_approximate_sum() fast path.
---
 include/linux/percpu_counter_tree.h | 102 +++++++++
 lib/Makefile                        |   1 +
 lib/percpu_counter_tree.c           | 313 ++++++++++++++++++++++++++++
 3 files changed, 416 insertions(+)
 create mode 100644 include/linux/percpu_counter_tree.h
 create mode 100644 lib/percpu_counter_tree.c

Comments

Christoph Lameter (Ampere) April 8, 2025, 4:37 p.m. UTC | #1
On Tue, 8 Apr 2025, Mathieu Desnoyers wrote:

> - Minimize contention when incrementing and decrementing counters,
> - Provide fast access to a sum approximation,

In general I like this as an abstraction of the Zoned VM counters in
vmstat.c that will make the scalable counters there useful elsewhere.

> It aims at fixing the per-mm RSS tracking which has become too
> inaccurate for OOM killer purposes on large many-core systems [1].

There are numerous cases where these issues occur. I know of a few where I
could use something like this.

> The hierarchical per-CPU counters propagate a sum approximation through
> a binary tree. When reaching the batch size, the carry is propagated
> through a binary tree which consists of log2(nr_cpu_ids) levels. The
> batch size for each level is twice the batch size of the prior level.

A binary tree? Could we do this N-way? Otherwise the tree will be 8 levels
on a 512 cpu machine. Given the inflation of the number of cpus this
scheme better work up to 8K cpus.

> +int percpu_counter_tree_precise_sum(struct percpu_counter_tree *counter);
> +int percpu_counter_tree_precise_compare(struct percpu_counter_tree *a, struct percpu_counter_tree *b);
> +int percpu_counter_tree_precise_compare_value(struct percpu_counter_tree *counter, int v);

Precise? Concurrent counter updates can occur while determining the global
value. People may get confused.

Also, maybe there would be a need for a function to collapse the values into
the global counter if e.g. a cpu goes offline, or in order to switch off OS
activities on a cpu.
Matthew Wilcox April 8, 2025, 5:01 p.m. UTC | #2
On Tue, Apr 08, 2025 at 09:37:18AM -0700, Christoph Lameter (Ampere) wrote:
> > The hierarchical per-CPU counters propagate a sum approximation through
> > a binary tree. When reaching the batch size, the carry is propagated
> > through a binary tree which consists of log2(nr_cpu_ids) levels. The
> > batch size for each level is twice the batch size of the prior level.
> 
> A binary tree? Could we do this N-way? Otherwise the tree will be 8 levels
> on a 512 cpu machine. Given the inflation of the number of cpus this
> scheme better work up to 8K cpus.

I find that a fan-out somewhere between 8 and 16 works well in practice.
log16(512) gives a 3 level tree as does a log8 tree.  log16(8192) is a 4
level tree whereas log8(8192) is a 5 level tree.  Not a big difference
either way.

Somebody was trying to persuade me that a new tree type that maintained
additional information at each level of the tree to make some operations
log(log(N)) would be a better idea than a B-tree that is log(N).  I
countered that a wider tree made the argument unsound at any size tree
up to 100k.  And we don't tend to have _that_ many objects in a
data structure inside the kernel.

ceil(log14(100,000)) = 5
ceil(log2(log2(100,000))) = 5

at a million, there's actually a gap, 6 vs 5.  But constant factors
become a much larger factor than scalability arguments at that point.
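
As a rough cross-check of these depth numbers, a trivial helper (editorial
illustration only, not part of the patch) that computes the number of tree
levels needed to cover n leaves at a given fan-out:

	static unsigned int tree_levels(unsigned long n, unsigned int fanout)
	{
		unsigned long span = 1;
		unsigned int levels = 0;

		while (span < n) {
			span *= fanout;
			levels++;
		}
		return levels;
	}

	/*
	 * tree_levels(512, 16) == 3,  tree_levels(512, 8) == 3,
	 * tree_levels(8192, 16) == 4, tree_levels(8192, 8) == 5,
	 * tree_levels(512, 2) == 9.
	 */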
Liam R. Howlett April 8, 2025, 5:41 p.m. UTC | #3
* Matthew Wilcox <willy@infradead.org> [250408 13:03]:
> On Tue, Apr 08, 2025 at 09:37:18AM -0700, Christoph Lameter (Ampere) wrote:
> > > The hierarchical per-CPU counters propagate a sum approximation through
> > > a binary tree. When reaching the batch size, the carry is propagated
> > > through a binary tree which consists of log2(nr_cpu_ids) levels. The
> > > batch size for each level is twice the batch size of the prior level.
> > 
> > A binary tree? Could we do this N-way? Otherwise the tree will be 8 levels
> > on a 512 cpu machine. Given the inflation of the number of cpus this
> > scheme better work up to 8K cpus.
> 
> I find that a fan-out somewhere between 8 and 16 works well in practice.
> log16(512) gives a 3 level tree as does a log8 tree.  log16(8192) is a 4
> level tree whereas log8(8192) is a 5 level tree.  Not a big difference
> either way.
> 
> Somebody was trying to persuade me that a new tree type that maintained
> additional information at each level of the tree to make some operations
> log(log(N)) would be a better idea than a B-tree that is log(N).  I
> countered that a wider tree made the argument unsound at any size tree
> up to 100k.  And we don't tend to have _that_ many objects in a
> data structure inside the kernel.

I still maintain vEB trees are super cool, but I am glad we didn't try
to implement an RCU safe version.

> 
> ceil(log14(100,000)) = 5
> ceil(log2(log2(100,000))) = 5
> 
> at a million, there's actually a gap, 6 vs 5.  But constant factors
> become a much larger factor than scalability arguments at that point.

In retrospect, it seems more of a math win than a practical win - and
only really the O(n) bounds.  Beyond what willy points out, writes
rippling up the tree should be a concern for most users since it will
impact the restart of readers and negatively affect the writer speed -
but probably not here (hot plug?).

Working in (multiples of) cacheline sized b-tree nodes makes the most
sense, in my experience.

Thanks,
Liam
Liam R. Howlett April 8, 2025, 8:08 p.m. UTC | #4
* Mathieu Desnoyers <mathieu.desnoyers@efficios.com> [250408 15:40]:
> On 2025-04-08 13:41, Liam R. Howlett wrote:
> > * Matthew Wilcox <willy@infradead.org> [250408 13:03]:
> > > On Tue, Apr 08, 2025 at 09:37:18AM -0700, Christoph Lameter (Ampere) wrote:
> > > > > The hierarchical per-CPU counters propagate a sum approximation through
> > > > > a binary tree. When reaching the batch size, the carry is propagated
> > > > > through a binary tree which consists of log2(nr_cpu_ids) levels. The
> > > > > batch size for each level is twice the batch size of the prior level.
> > > > 
> > > > A binary tree? Could we do this N-way? Otherwise the tree will be 8 levels
> > > > on a 512 cpu machine. Given the inflation of the number of cpus this
> > > > scheme better work up to 8K cpus.
> > > 
> > > I find that a fan-out somewhere between 8 and 16 works well in practice.
> > > log16(512) gives a 3 level tree as does a log8 tree.  log16(8192) is a 4
> > > level tree whereas log8(8192) is a 5 level tree.  Not a big difference
> > > either way.
> > > 
> > > Somebody was trying to persuade me that a new tree type that maintained
> > > additional information at each level of the tree to make some operations
> > > log(log(N)) would be a better idea than a B-tree that is log(N).  I
> > > countered that a wider tree made the argument unsound at any size tree
> > > up to 100k.  And we don't tend to have _that_ many objects in a
> > > data structure inside the kernel.
> > 
> > I still maintain vEB trees are super cool, but I am glad we didn't try
> > to implement an RCU safe version.
> > 
> > > 
> > > ceil(log14(100,000)) = 5
> > > ceil(log2(log2(100,000))) = 5
> > > 
> > > at a million, there's actually a gap, 6 vs 5.  But constant factors
> > > become a much larger factor than scalability arguments at that point.
> > 
> > In retrospect, it seems more of a math win than a practical win - and
> > only really the O(n) bounds.  Beyond what willy points out, writes
> > rippling up the tree should be a concern for most users since it will
> > impact the restart of readers and negatively affect the writer speed -
> > but probably not here (hot plug?).
> 
> This implementation of hierarchical per-cpu counters is lock-free
> for increment/decrement *and* for precise/approximate sums.
> 
> The increment/decrement use:
> 
> - this_cpu_add_return on the fast-path,
> - atomic_add_return_relaxed for intermediate levels carry propagation,
> - atomic_add for approximate sum updates.
> 
> The precise sum iterates on all possible cpus, loading their current
> value. The approximate sum simply loads the current value of the
> approximate sum.
> 
> So I'm unsure about your concern of writers restarting readers, because
> this tree does not rely on mutual exclusion between updaters and
> readers, nor does it rely on cmpxchg-based retry mechanisms in readers.

I don't think it matters, but I'm not sure how hot-plug affects the
tree.

> 
> I agree with you that updates going all the way up the tree may
> negatively affect the updater and approximate sum reader performance due
> to bouncing of the counter cache line across CPUs.
> 
> > 
> > Working in (multiples of) cacheline sized b-tree nodes makes the most
> > sense, in my experience.
> 
> I'm confused. Can you explain how this recommendation can practically
> apply to the hierarchical counters ?

It would apply if you switch to a b-tree with a larger branching factor.

Thanks,
Liam
Christoph Lameter (Ampere) April 8, 2025, 8:44 p.m. UTC | #5
On Tue, 8 Apr 2025, Mathieu Desnoyers wrote:

> Currently percpu_counter_tree_precise_sum_unbiased() iterates on each
> possible cpu, which does not require cpu hotplug integration.

Well that looks like a performance issue if you have a system that can
expand to 8K cpus but currently only has 16 or so online.
Paul E. McKenney April 8, 2025, 9 p.m. UTC | #6
On Tue, Apr 08, 2025 at 01:44:25PM -0700, Christoph Lameter (Ampere) wrote:
> On Tue, 8 Apr 2025, Mathieu Desnoyers wrote:
> 
> > Currently percpu_counter_tree_precise_sum_unbiased() iterates on each
> > possible cpu, which does not require cpu hotplug integration.
> 
> Well that looks like a performance issue if you have a system that can
> expand to 8K cpus but currently only has 16 or so online.

RCU handles this by iterating from zero to nr_cpu_ids, which is set during
early boot.  It also builds its tree-shaped data structures during early
boot based on nr_cpu_ids.

							Thanx, Paul
Christoph Lameter (Ampere) April 8, 2025, 9:21 p.m. UTC | #7
On Tue, 8 Apr 2025, Paul E. McKenney wrote:

> RCU handles this by iterating from zero to nr_cpu_ids, which is set during
> early boot.  It also builds its tree-shaped data structures during early
> boot based on nr_cpu_ids.

nr_cpu_ids is better, but there are funky things like the default BIOS of a
major server vendor indicating 256 or so possible cpus although only 2
were installed. Thus nr_cpu_ids is 256. Presumably some hardware
configurations can support onlining 256 cpus....
Paul E. McKenney April 8, 2025, 9:46 p.m. UTC | #8
On Tue, Apr 08, 2025 at 02:21:59PM -0700, Christoph Lameter (Ampere) wrote:
> On Tue, 8 Apr 2025, Paul E. McKenney wrote:
> 
> > RCU handles this by iterating from zero to nr_cpu_ids, which is set during
> > early boot.  It also builds its tree-shaped data structures during early
> > boot based on nr_cpu_ids.
> 
> nr_cpu_ids is better, but there are funky things like the default BIOS of a
> major server vendor indicating 256 or so possible cpus although only 2
> were installed. Thus nr_cpu_ids is 256. Presumably some hardware
> configurations can support onlining 256 cpus....

Indeed there are.  So some portions of RCU build for nr_cpu_ids but
activate portions of the data structures (e.g., spawn kthreads) only
for those CPUs that actually come online at least once.

But should we really be optimizing to that degree for that sort of
breakage?  Just extra data structure, who cares?

Yes, I do understand that the vendor in question, whoever it is, would
not consider their default BIOS to be broken.  They are welcome to
their opinion.  ;-)

							Thanx, Paul
Roman Gushchin April 8, 2025, 10:12 p.m. UTC | #9
On Tue, Apr 08, 2025 at 12:05:08PM -0400, Mathieu Desnoyers wrote:
> * Motivation
> 
> The purpose of this hierarchical split-counter scheme is to:
> 
> - Minimize contention when incrementing and decrementing counters,
> - Provide fast access to a sum approximation,
> - Provide a sum approximation with an acceptable accuracy level when
>   scaling to many-core systems.
> - Provide approximate and precise comparison of two counters, and
>   between a counter and a value.
> 
> It aims at fixing the per-mm RSS tracking which has become too
> inaccurate for OOM killer purposes on large many-core systems [1].

It might be overkill for the task from the memory overhead perspective.
Sure, for a very large process on a large machine it makes total sense,
but for smaller processes it will waste a ton of memory.

Also, for a relatively small number of CPUs (e.g. 8) it's overkill
from the complexity standpoint.

But as an idea it makes total sense to me, maybe just applicable to some
other tasks, e.g. global memory stats.

For the RSS tracking I wonder if what we really need is to go back to
per-thread caching, but with some time-based propagation. E.g. a thread
should dump its cached value on going to sleep or being rescheduled.
This will bound the error to (64 * number of currently running threads),
which should be acceptable. We can also think of making the central counter
an upper bound by increasing it first and moving the "pre-charged" value
to per-thread counters.

Thanks!
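
A rough sketch of that per-thread caching idea (all names and the flush hook
below are hypothetical, not part of this patch):

	#include <linux/atomic.h>
	#include <linux/kernel.h>	/* abs() */

	#define RSS_CACHE_THRESHOLD	64

	/* Task-local cache; would live in task_struct in a real implementation. */
	struct mm_rss_cache {
		int cached;
	};

	static inline void rss_cache_add(atomic_long_t *global,
					 struct mm_rss_cache *c, int pages)
	{
		c->cached += pages;
		if (abs(c->cached) >= RSS_CACHE_THRESHOLD) {
			atomic_long_add(c->cached, global);
			c->cached = 0;
		}
	}

	/* Hypothetical hook: flush the cache when the thread schedules out. */
	static inline void rss_cache_flush(atomic_long_t *global,
					   struct mm_rss_cache *c)
	{
		if (c->cached) {
			atomic_long_add(c->cached, global);
			c->cached = 0;
		}
	}

With the cache flushed whenever a thread sleeps or is rescheduled, the global
counter lags the true value by less than the threshold per currently running
thread, matching the error bound described above.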

Patch

diff --git a/include/linux/percpu_counter_tree.h b/include/linux/percpu_counter_tree.h
new file mode 100644
index 000000000000..aac5a711de3e
--- /dev/null
+++ b/include/linux/percpu_counter_tree.h
@@ -0,0 +1,102 @@ 
+/* SPDX-License-Identifier: GPL-2.0+ OR MIT */
+/* SPDX-FileCopyrightText: 2025 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> */
+
+#ifndef _PERCPU_COUNTER_TREE_H
+#define _PERCPU_COUNTER_TREE_H
+
+#include <linux/cleanup.h>
+#include <linux/preempt.h>
+#include <linux/atomic.h>
+#include <linux/percpu.h>
+
+struct percpu_counter_tree_level_item {
+	atomic_t count;
+} ____cacheline_aligned_in_smp;
+
+struct percpu_counter_tree {
+	/* Fast-path fields. */
+	unsigned int __percpu *level0;
+	unsigned int level0_bit_mask;
+	atomic_t *approx_sum;
+	int bias;			/* bias for counter_set */
+
+	/* Slow-path fields. */
+	struct percpu_counter_tree_level_item *items;
+	unsigned int batch_size;
+	unsigned int inaccuracy;	/* approximation imprecise within ± inaccuracy */
+	unsigned int nr_levels;
+	unsigned int nr_cpus;
+};
+
+int percpu_counter_tree_init(struct percpu_counter_tree *counter, unsigned int batch_size);
+void percpu_counter_tree_destroy(struct percpu_counter_tree *counter);
+void percpu_counter_tree_add_slowpath(struct percpu_counter_tree *counter, int inc);
+int percpu_counter_tree_precise_sum(struct percpu_counter_tree *counter);
+int percpu_counter_tree_approximate_compare(struct percpu_counter_tree *a, struct percpu_counter_tree *b);
+int percpu_counter_tree_approximate_compare_value(struct percpu_counter_tree *counter, int v);
+int percpu_counter_tree_precise_compare(struct percpu_counter_tree *a, struct percpu_counter_tree *b);
+int percpu_counter_tree_precise_compare_value(struct percpu_counter_tree *counter, int v);
+void percpu_counter_tree_set_bias(struct percpu_counter_tree *counter, int bias);
+void percpu_counter_tree_set(struct percpu_counter_tree *counter, int v);
+unsigned int percpu_counter_tree_inaccuracy(struct percpu_counter_tree *counter);
+
+/* Fast paths */
+
+static inline
+int percpu_counter_tree_carry(int orig, int res, int inc, unsigned int bit_mask)
+{
+	if (inc < 0) {
+		inc = -(-inc & ~(bit_mask - 1));
+		/*
+		 * xor bit_mask: underflow.
+		 *
+		 * If inc has bit set, decrement an additional bit if
+		 * there is _no_ bit transition between orig and res.
+		 * Else, inc has bit cleared, decrement an additional
+		 * bit if there is a bit transition between orig and
+		 * res.
+		 */
+		if ((inc ^ orig ^ res) & bit_mask)
+			inc -= bit_mask;
+	} else {
+		inc &= ~(bit_mask - 1);
+		/*
+		 * xor bit_mask: overflow.
+		 *
+		 * If inc has bit set, increment an additional bit if
+		 * there is _no_ bit transition between orig and res.
+		 * Else, inc has bit cleared, increment an additional
+		 * bit if there is a bit transition between orig and
+		 * res.
+		 */
+		if ((inc ^ orig ^ res) & bit_mask)
+			inc += bit_mask;
+	}
+	return inc;
+}
+
+static inline
+void percpu_counter_tree_add(struct percpu_counter_tree *counter, int inc)
+{
+	unsigned int bit_mask = counter->level0_bit_mask, orig, res;
+
+	if (!inc)
+		return;
+	/* Make sure the fast and slow paths use the same cpu number. */
+	guard(migrate)();
+	res = this_cpu_add_return(*counter->level0, inc);
+	orig = res - inc;
+	inc = percpu_counter_tree_carry(orig, res, inc, bit_mask);
+	if (!inc)
+		return;
+	percpu_counter_tree_add_slowpath(counter, inc);
+}
+
+static inline
+int percpu_counter_tree_approximate_sum(struct percpu_counter_tree *counter)
+{
+	return (int) ((unsigned int)atomic_read(counter->approx_sum) +
+		      (unsigned int)READ_ONCE(counter->bias));
+}
+
+#endif  /* _PERCPU_COUNTER_TREE_H */
diff --git a/lib/Makefile b/lib/Makefile
index d5cfc7afbbb8..d803a3a63288 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -201,6 +201,7 @@  obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
 obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
 obj-$(CONFIG_SMP) += percpu_counter.o
+obj-$(CONFIG_SMP) += percpu_counter_tree.o
 obj-$(CONFIG_AUDIT_GENERIC) += audit.o
 obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o
 
diff --git a/lib/percpu_counter_tree.c b/lib/percpu_counter_tree.c
new file mode 100644
index 000000000000..b530ba9dd61b
--- /dev/null
+++ b/lib/percpu_counter_tree.c
@@ -0,0 +1,313 @@ 
+// SPDX-License-Identifier: GPL-2.0+ OR MIT
+// SPDX-FileCopyrightText: 2025 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+
+/*
+ * Split Counters With Binary Tree Approximation Propagation
+ *
+ * * Propagation diagram when reaching batch size thresholds (± batch size):
+ *
+ * Example diagram for 8 CPUs:
+ *
+ * log2(8) = 3 levels
+ *
+ * At each level, each pair propagates its values to the next level when
+ * reaching the batch size thresholds.
+ *
+ * Counters at levels 0, 1, 2 can be kept on a single byte (±128 range),
+ * although it may be relevant to keep them on 32-bit counters for
+ * simplicity. (complexity vs memory footprint tradeoff)
+ *
+ * Counter at level 3 can be kept on a 32-bit counter.
+ *
+ * Level 0:  0    1    2    3    4    5    6    7
+ *           |   /     |   /     |   /     |   /
+ *           |  /      |  /      |  /      |  /
+ *           | /       | /       | /       | /
+ * Level 1:  0         1         2         3
+ *           |       /           |       /
+ *           |    /              |    /
+ *           | /                 | /
+ * Level 2:  0                   1
+ *           |               /
+ *           |         /
+ *           |   /
+ * Level 3:  0
+ *
+ * * Approximation inaccuracy:
+ *
+ * BATCH(level N): Level N batch size.
+ *
+ * Example for BATCH(level 0) = 32.
+ *
+ * BATCH(level 0) =  32
+ * BATCH(level 1) =  64
+ * BATCH(level 2) = 128
+ * BATCH(level N) = BATCH(level 0) * 2^N
+ *
+ *            per-counter     global
+ *            inaccuracy      inaccuracy
+ * Level 0:   [ -32 ..  +31]  ±256  (8 * 32)
+ * Level 1:   [ -64 ..  +63]  ±256  (4 * 64)
+ * Level 2:   [-128 .. +127]  ±256  (2 * 128)
+ * Total:      ------         ±768  (log2(nr_cpu_ids) * BATCH(level 0) * nr_cpu_ids)
+ *
+ * -----
+ *
+ * Approximate Sum Carry Propagation
+ *
+ * Let's define a number of counter bits for each level, e.g.:
+ *
+ * log2(BATCH(level 0)) = log2(32) = 5
+ *
+ *               nr_bit        value_mask                      range
+ * Level 0:      5 bits        v                             0 ..  +31
+ * Level 1:      1 bit        (v & ~((1UL << 5) - 1))        0 ..  +63
+ * Level 2:      1 bit        (v & ~((1UL << 6) - 1))        0 .. +127
+ * Level 3:     25 bits       (v & ~((1UL << 7) - 1))        0 .. 2^32-1
+ *
+ * Note: Use a full 32-bit per-cpu counter at level 0 to allow precise sum.
+ *
+ * Note: Use cacheline aligned counters at levels above 0 to prevent false sharing.
+ *       If memory footprint is an issue, a specialized allocator could be used
+ *       to eliminate padding.
+ *
+ * Example with expanded values:
+ *
+ * counter_add(counter, inc):
+ *
+ *         if (!inc)
+ *                 return;
+ *
+ *         res = percpu_add_return(counter @ Level 0, inc);
+ *         orig = res - inc;
+ *         if (inc < 0) {
+ *                 inc = -(-inc & ~0b00011111);  // Clear used bits
+ *                 // xor bit 5: underflow
+ *                 if ((inc ^ orig ^ res) & 0b00100000)
+ *                         inc -= 0b00100000;
+ *         } else {
+ *                 inc &= ~0b00011111;           // Clear used bits
+ *                 // xor bit 5: overflow
+ *                 if ((inc ^ orig ^ res) & 0b00100000)
+ *                         inc += 0b00100000;
+ *         }
+ *         if (!inc)
+ *                 return;
+ *
+ *         res = atomic_add_return(counter @ Level 1, inc);
+ *         orig = res - inc;
+ *         if (inc < 0) {
+ *                 inc = -(-inc & ~0b00111111);  // Clear used bits
+ *                 // xor bit 6: underflow
+ *                 if ((inc ^ orig ^ res) & 0b01000000)
+ *                         inc -= 0b01000000;
+ *         } else {
+ *                 inc &= ~0b00111111;           // Clear used bits
+ *                 // xor bit 6: overflow
+ *                 if ((inc ^ orig ^ res) & 0b01000000)
+ *                         inc += 0b01000000;
+ *         }
+ *         if (!inc)
+ *                 return;
+ *
+ *         res = atomic_add_return(counter @ Level 2, inc);
+ *         orig = res - inc;
+ *         if (inc < 0) {
+ *                 inc = -(-inc & ~0b01111111);  // Clear used bits
+ *                 // xor bit 7: underflow
+ *                 if ((inc ^ orig ^ res) & 0b10000000)
+ *                         inc -= 0b10000000;
+ *         } else {
+ *                 inc &= ~0b01111111;           // Clear used bits
+ *                 // xor bit 7: overflow
+ *                 if ((inc ^ orig ^ res) & 0b10000000)
+ *                         inc += 0b10000000;
+ *         }
+ *         if (!inc)
+ *                 return;
+ *
+ *         atomic_add(counter @ Level 3, inc);
+ */
+
+#include <linux/percpu_counter_tree.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/atomic.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/math.h>
+
+int percpu_counter_tree_init(struct percpu_counter_tree *counter, unsigned int batch_size)
+{
+	/* Batch size must be power of 2 */
+	if (!batch_size || (batch_size & (batch_size - 1)))
+		return -EINVAL;
+	counter->nr_levels = get_count_order(nr_cpu_ids);
+	counter->nr_cpus = 1UL << counter->nr_levels;
+	counter->batch_size = batch_size;
+	counter->level0_bit_mask = 1UL << get_count_order(batch_size);
+	counter->inaccuracy = counter->nr_levels * batch_size * counter->nr_cpus;
+	counter->bias = 0;
+	counter->level0 = alloc_percpu(unsigned int);
+	if (!counter->level0)
+		return -ENOMEM;
+	counter->items = kcalloc(counter->nr_cpus - 1,
+				 sizeof(struct percpu_counter_tree_level_item),
+				 GFP_KERNEL);
+	if (!counter->items) {
+		free_percpu(counter->level0);
+		return -ENOMEM;
+	}
+	counter->approx_sum = &counter->items[counter->nr_cpus - 2].count;
+	return 0;
+}
+
+void percpu_counter_tree_destroy(struct percpu_counter_tree *counter)
+{
+	free_percpu(counter->level0);
+	kfree(counter->items);
+}
+
+/* Called with migration disabled. */
+void percpu_counter_tree_add_slowpath(struct percpu_counter_tree *counter, int inc)
+{
+	struct percpu_counter_tree_level_item *item = counter->items;
+	unsigned int level_items = counter->nr_cpus >> 1;
+	unsigned int level, nr_levels = counter->nr_levels;
+	unsigned int bit_mask = counter->level0_bit_mask;
+	unsigned int cpu = smp_processor_id();
+
+	for (level = 1; level < nr_levels; level++) {
+		atomic_t *count = &item[cpu & (level_items - 1)].count;
+		unsigned int orig, res;
+
+		bit_mask <<= 1;
+		res = atomic_add_return_relaxed(inc, count);
+		orig = res - inc;
+		inc = percpu_counter_tree_carry(orig, res, inc, bit_mask);
+		item += level_items;
+		level_items >>= 1;
+		if (!inc)
+			return;
+	}
+	atomic_add(inc, counter->approx_sum);
+}
+
+/*
+ * Precise sum. Perform the sum of all per-cpu counters.
+ */
+static
+int percpu_counter_tree_precise_sum_unbiased(struct percpu_counter_tree *counter)
+{
+	unsigned int sum = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		sum += *per_cpu_ptr(counter->level0, cpu);
+	return (int) sum;
+}
+
+int percpu_counter_tree_precise_sum(struct percpu_counter_tree *counter)
+{
+	return percpu_counter_tree_precise_sum_unbiased(counter) + READ_ONCE(counter->bias);
+}
+
+/*
+ * Do an approximate comparison of two counters.
+ * Return 0 if counters do not differ by more than the sum of their
+ * respective inaccuracy ranges,
+ * Return -1 if counter @a is less than counter @b,
+ * Return 1 if counter @a is greater than counter @b.
+ */
+int percpu_counter_tree_approximate_compare(struct percpu_counter_tree *a, struct percpu_counter_tree *b)
+{
+	int count_a = percpu_counter_tree_approximate_sum(a),
+	    count_b = percpu_counter_tree_approximate_sum(b);
+
+	if (abs(count_a - count_b) <= (a->inaccuracy + b->inaccuracy))
+		return 0;
+	if (count_a < count_b)
+		return -1;
+	return 1;
+}
+
+/*
+ * Do an approximate comparison of a counter against a given value.
+ * Return 0 if the value is within the inaccuracy range of the counter,
+ * Return -1 if the counter is less than the value,
+ * Return 1 if the counter is greater than the value.
+ */
+int percpu_counter_tree_approximate_compare_value(struct percpu_counter_tree *counter, int v)
+{
+	int count = percpu_counter_tree_approximate_sum(counter);
+
+	if (abs(v - count) <= counter->inaccuracy)
+		return 0;
+	if (count < v)
+		return -1;
+	return 1;
+}
+
+/*
+ * Do a precise comparison of two counters.
+ * Return 0 if the counters are equal,
+ * Return -1 if counter @a is less than counter @b,
+ * Return 1 if counter @a is greater than counter @b.
+ */
+int percpu_counter_tree_precise_compare(struct percpu_counter_tree *a, struct percpu_counter_tree *b)
+{
+	int count_a = percpu_counter_tree_approximate_sum(a),
+	    count_b = percpu_counter_tree_approximate_sum(b);
+
+	if (abs(count_a - count_b) <= (a->inaccuracy + b->inaccuracy)) {
+		if (b->inaccuracy < a->inaccuracy) {
+			count_a = percpu_counter_tree_precise_sum(a);
+			if (abs(count_a - count_b) <= b->inaccuracy)
+				count_b = percpu_counter_tree_precise_sum(b);
+		} else {
+			count_b = percpu_counter_tree_precise_sum(b);
+			if (abs(count_a - count_b) <= a->inaccuracy)
+				count_a = percpu_counter_tree_precise_sum(a);
+		}
+	}
+	if (count_a < count_b)
+		return -1;
+	if (count_a > count_b)
+		return 1;
+	return 0;
+}
+
+/*
+ * Do a precise comparison of a counter against a given value.
+ * Return 0 if the counter is equal to the value,
+ * Return -1 if the counter is less than the value,
+ * Return 1 if the counter is greater than the value.
+ */
+int percpu_counter_tree_precise_compare_value(struct percpu_counter_tree *counter, int v)
+{
+	int count = percpu_counter_tree_approximate_sum(counter);
+
+	if (abs(v - count) <= counter->inaccuracy)
+		count = percpu_counter_tree_precise_sum(counter);
+	if (count < v)
+		return -1;
+	if (count > v)
+		return 1;
+	return 0;
+}
+
+void percpu_counter_tree_set_bias(struct percpu_counter_tree *counter, int bias)
+{
+	WRITE_ONCE(counter->bias, bias);
+}
+
+void percpu_counter_tree_set(struct percpu_counter_tree *counter, int v)
+{
+	percpu_counter_tree_set_bias(counter,
+				     v - percpu_counter_tree_precise_sum_unbiased(counter));
+}
+
+unsigned int percpu_counter_tree_inaccuracy(struct percpu_counter_tree *counter)
+{
+	return counter->inaccuracy;
+}