@@ -60,6 +60,11 @@ struct sbitmap {
*/
unsigned int map_nr;
+ /**
+ * @map_nr_per_node: Number of words being used per NUMA node.
+ */
+ unsigned int map_nr_per_node;
+
/**
* @round_robin: Allocate bits in strict round-robin order.
*/
@@ -8,6 +8,17 @@
#include <linux/random.h>
#include <linux/sbitmap.h>
#include <linux/seq_file.h>
+#include <linux/mm.h>
+
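+/* Pick an initial allocation hint inside the word range of @cpu's NUMA node */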
+static unsigned int sbitmap_get_new_hint(struct sbitmap *sb, int cpu)
+{
+ unsigned int shift = sb->shift;
+ unsigned int map_nr_per_node = sb->map_nr_per_node;
+ unsigned int bit_per_node = map_nr_per_node << shift;
+ unsigned int hint_base = bit_per_node * cpu_to_node(cpu);
+
+ return hint_base + (prandom_u32() % bit_per_node);
+}
static int init_alloc_hint(struct sbitmap *sb, gfp_t flags)
{
@@ -20,8 +31,10 @@ static int init_alloc_hint(struct sbitmap *sb, gfp_t flags)
if (depth && !sb->round_robin) {
int i;
- for_each_possible_cpu(i)
- *per_cpu_ptr(sb->alloc_hint, i) = prandom_u32() % depth;
+ for_each_possible_cpu(i) {
+ *per_cpu_ptr(sb->alloc_hint, i) =
+ sbitmap_get_new_hint(sb, i);
+ }
}
return 0;
}
@@ -86,7 +99,8 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
{
unsigned int bits_per_word;
struct sbitmap_word *map;
- int index;
+	int index, num_nodes = num_online_nodes();
+	int nid;
if (shift < 0)
shift = sbitmap_calculate_shift(depth);
@@ -105,6 +119,11 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
return 0;
}
+	if (sb->map_nr < num_nodes)
+		sb->map_nr_per_node = 1;
+	else
+		sb->map_nr_per_node = sb->map_nr / num_nodes;
if (alloc_hint) {
if (init_alloc_hint(sb, flags))
return -ENOMEM;
@@ -113,23 +132,43 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
}
sb->map = kvzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node);
- if (!sb->map) {
- free_percpu(sb->alloc_hint);
- return -ENOMEM;
- }
-
- map = kvzalloc_node(sb->map_nr * sizeof(**sb->map), flags, node);
- if (!map)
- return -ENOMEM;
+ if (!sb->map)
+ goto err_map;
-	for (index = 0; index < sb->map_nr; index++, map++) {
+	for (index = 0, nid = 0; index < sb->map_nr; index++, map++) {
 		struct sbitmap_word **_map;
+
+		/* Allocate this node's chunk of words at each chunk boundary */
+		if ((index % sb->map_nr_per_node) == 0 && nid < num_nodes) {
+			int cnt;
+
+			/* The last node takes any remainder words */
+			if (nid == num_nodes - 1)
+				cnt = sb->map_nr - index;
+			else
+				cnt = sb->map_nr_per_node;
+
+			map = kvzalloc_node(cnt * sizeof(**sb->map), flags, nid);
+			if (!map)
+				goto err_map_numa;
+			nid++;
+		}
+
+
_map = &sb->map[index];
*_map = map;
}
return 0;
+err_map_numa:
+	/* Free the per-node word chunks allocated so far, then the pointer array */
+	while (index > 0) {
+		index--;
+		if ((index % sb->map_nr_per_node) == 0)
+			kvfree(sb->map[index]);
+	}
+	kvfree(sb->map);
+err_map:
+ free_percpu(sb->alloc_hint);
+
+ return -ENOMEM;
}
EXPORT_SYMBOL_GPL(sbitmap_init_node);
Currently sbitmap words are allocated in a single array. That array is
flagged for allocation at the NUMA node passed in sbitmap_init_node().

However, the sbitmap will often be accessed by all the CPUs in the
system - for example, when BLK_MQ_F_TAG_HCTX_SHARED is set for the
blk-mq tagset. This can lead to performance issues where all CPUs in
the system are doing cross-NUMA node accesses to read/set/unset sbitmap
tags.

Improve this by spreading the word allocations across all NUMA nodes as
evenly as possible. The per-CPU allocation hint is set to fall within
the range of words allocated for the NUMA node to which that CPU
belongs.

Known issues:
- sbitmap resize does not work well for this scheme
- Improve updating the hint for sbitmap_get() failure and sbitmap_put()
  when the hint is outside the range of the CPU's NUMA node words.
- Add intelligence for sub-arrays to be allocated at a single node,
  e.g. when node != NUMA_NO_NODE in sbitmap_init_node()

Signed-off-by: John Garry <john.garry@huawei.com>
---
 include/linux/sbitmap.h |  5 ++++
 lib/sbitmap.c           | 63 +++++++++++++++++++++++++++++++++--------
 2 files changed, 56 insertions(+), 12 deletions(-)
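To make the hint scheme concrete, here is a minimal standalone userspace
sketch (illustration only, not part of the patch and not kernel code); the
demo_* helper and the 8-word/2-node geometry are made-up examples:

/*
 * Each NUMA node owns map_nr_per_node consecutive words, so a CPU's
 * initial allocation hint is a random bit inside its node's word range.
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned int demo_hint(unsigned int map_nr_per_node,
			      unsigned int shift, unsigned int node)
{
	unsigned int bits_per_node = map_nr_per_node << shift;
	unsigned int hint_base = bits_per_node * node;

	/* Random starting point inside this node's range of bits */
	return hint_base + ((unsigned int)rand() % bits_per_node);
}

int main(void)
{
	/* e.g. 8 words of 64 bits spread over 2 nodes -> 4 words per node */
	unsigned int map_nr = 8, shift = 6, num_nodes = 2;
	unsigned int map_nr_per_node = map_nr / num_nodes;
	unsigned int node;

	for (node = 0; node < num_nodes; node++)
		printf("node %u owns bits [%u, %u), sample hint %u\n",
		       node, (map_nr_per_node << shift) * node,
		       (map_nr_per_node << shift) * (node + 1),
		       demo_hint(map_nr_per_node, shift, node));
	return 0;
}

With shift=6 (64-bit words), node 0 owns bits 0..255 and node 1 owns bits
256..511, so CPUs on node 1 start their tag search in the second half of
the bitmap and normally touch only words resident on their own node.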