
[v6,4/6] lib/dlock-list: Make sibling CPUs share the same linked list

Message ID 1507152007-28753-5-git-send-email-longman@redhat.com (mailing list archive)
State New, archived

Commit Message

Waiman Long Oct. 4, 2017, 9:20 p.m. UTC
The dlock list needs one list for each of the available CPUs. However,
sibling CPUs share the L2 cache and probably the L1 cache as well. As a
result, there is not much to gain in terms of avoiding cacheline
contention, while the cacheline footprint of the L1/L2 caches grows
because separate per-CPU lists may need to reside in the cache.

This patch makes all the sibling CPUs share the same list, thus
reducing the number of lists that need to be maintained in each
dlock list without any noticeable impact on performance. It also
improves dlock-list iteration performance, as fewer lists need to
be iterated.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
---
 lib/dlock-list.c | 61 ++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 15 deletions(-)
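
As an illustration of the resulting mapping (a hypothetical topology, not
taken from the patch): on a 4-CPU machine with two SMT siblings per core,
where cpu0/cpu2 form core 0 and cpu1/cpu3 form core 1, the cpu2idx_init()
routine in the patch below would produce

    cpu2idx[0] = 0    cpu2idx[2] = 0    /* core 0 -> list 0 */
    cpu2idx[1] = 1    cpu2idx[3] = 1    /* core 1 -> list 1 */
    nr_dlock_lists = 2                  /* instead of nr_cpu_ids = 4 */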

Comments

Jan Kara Oct. 5, 2017, 8:59 a.m. UTC | #1
On Wed 04-10-17 17:20:05, Waiman Long wrote:
>  int alloc_dlock_list_heads(struct dlock_list_heads *dlist)
>  {
> -	int idx;
> +	int idx, cnt = nr_dlock_lists ? nr_dlock_lists : nr_cpu_ids;

Hum, is this there for the case where alloc_dlock_list_heads() is called
before nr_dlock_lists is initialized? But how can the dlist be used later
when it has a larger number of lists and you don't know how many?

								Honza
Waiman Long Oct. 5, 2017, 2:57 p.m. UTC | #2
On 10/05/2017 04:59 AM, Jan Kara wrote:
> On Wed 04-10-17 17:20:05, Waiman Long wrote:
>>  int alloc_dlock_list_heads(struct dlock_list_heads *dlist)
>>  {
>> -	int idx;
>> +	int idx, cnt = nr_dlock_lists ? nr_dlock_lists : nr_cpu_ids;
> Hum, is this there for the case where alloc_dlock_list_heads() is called
> before nr_dlock_lists is initialized? But how can the dlist be used later
> when it has a larger number of lists and you don't know how many?

The point is that nr_dlock_lists <= nr_cpu_ids. Before nr_dlock_lists is
initialized, the mapping table maps all CPUs to the first entry, so the
extra entries simply stay unused. At free time, they will all be
deallocated. I will clarify that with a comment.

Cheers,
Longman
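
To make this concrete, here is a minimal sketch of the early-allocation case,
assuming (as the patch comment states) that every per-cpu cpu2idx entry is
still zero before cpu2idx_init() runs; the free path is not part of this hunk,
but it simply frees the whole heads array:

	/*
	 * Early caller, before postcore_initcall(cpu2idx_init) has run:
	 * nr_dlock_lists is still 0, so fall back to the worst-case size.
	 */
	int cnt = nr_dlock_lists ? nr_dlock_lists : nr_cpu_ids;

	dlist->heads = kcalloc(cnt, sizeof(struct dlock_list_head), GFP_KERNEL);

	/*
	 * All per_cpu(cpu2idx, cpu) entries are still 0 here, so every
	 * insertion lands on dlist->heads[0]; heads[1..cnt-1] stay empty
	 * and are released together when the array is freed.
	 */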
Davidlohr Bueso Oct. 5, 2017, 5:40 p.m. UTC | #3
This is still spitting:

lib/dlock-list.c: In function ‘cpu2idx_init’:
lib/dlock-list.c:75:16: error: incompatible types when assigning to type ‘cpumask_var_t’ from type ‘struct cpumask *’
   sibling_mask = topology_sibling_cpumask(cpu);
                ^
lib/dlock-list.c:76:7: warning: the address of ‘sibling_mask’ will always evaluate as ‘true’ [-Waddress]
   if (sibling_mask) {
       ^
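
A possible minimal fix suggested by the error above (a sketch only; the change
that went into the next revision is not shown in this thread): cpumask_var_t
is an array type when CONFIG_CPUMASK_OFFSTACK is off, so it cannot be assigned
the pointer returned by topology_sibling_cpumask(). Declaring the variable as
a plain const pointer avoids both the type error and the always-true NULL
check:

	const struct cpumask *sibling_mask;
	...
		/* topology_sibling_cpumask() returns a cpumask pointer. */
		sibling_mask = topology_sibling_cpumask(cpu);
		for_each_cpu(scpu, sibling_mask) {
			per_cpu(cpu2idx, scpu) = idx;
			cpumask_set_cpu(scpu, &mask);
		}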
kernel test robot Oct. 7, 2017, 1:44 p.m. UTC | #4
Hi Waiman,

[auto build test ERROR on linus/master]
[also build test ERROR on v4.14-rc3]
[cannot apply to next-20170929]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Waiman-Long/vfs-Use-dlock-list-for-SB-s-s_inodes-list/20171007-210724
config: i386-tinyconfig (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All error/warnings (new ones prefixed by >>):

   lib/dlock-list.c: In function 'cpu2idx_init':
>> lib/dlock-list.c:75:16: error: assignment to expression with array type
      sibling_mask = topology_sibling_cpumask(cpu);
                   ^
>> lib/dlock-list.c:76:7: warning: the address of 'sibling_mask' will always evaluate as 'true' [-Waddress]
      if (sibling_mask) {
          ^~~~~~~~~~~~

vim +75 lib/dlock-list.c

    44	
    45	/*
    46	 * Initialize cpu2idx mapping table & nr_dlock_lists.
    47	 *
    48	 * It is possible that a dlock-list can be allocated before the cpu2idx is
    49	 * initialized. In this case, all the cpus are mapped to the first entry
    50	 * before initialization.
    51	 *
    52	 * All the sibling CPUs of a sibling group will map to the same dlock list so
    53	 * as to reduce the number of dlock lists to be maintained while minimizing
    54	 * cacheline contention.
    55	 *
    56	 * As the sibling masks are set up in the core initcall phase, this function
    57	 * has to be done in the postcore phase to get the right data.
    58	 */
    59	static int __init cpu2idx_init(void)
    60	{
    61		int idx, cpu;
    62		cpumask_var_t sibling_mask;
    63		static struct cpumask mask __initdata;
    64	
    65		cpumask_clear(&mask);
    66		idx = 0;
    67		for_each_possible_cpu(cpu) {
    68			int scpu;
    69	
    70			if (cpumask_test_cpu(cpu, &mask))
    71				continue;
    72			per_cpu(cpu2idx, cpu) = idx;
    73			cpumask_set_cpu(cpu, &mask);
    74	
  > 75			sibling_mask = topology_sibling_cpumask(cpu);
  > 76			if (sibling_mask) {
    77				for_each_cpu(scpu, sibling_mask) {
    78					per_cpu(cpu2idx, scpu) = idx;
    79					cpumask_set_cpu(scpu, &mask);
    80				}
    81			}
    82			idx++;
    83		}
    84	
    85		/*
    86		 * nr_dlock_lists can only be set after cpu2idx is properly
    87		 * initialized.
    88		 */
    89		smp_mb();
    90		nr_dlock_lists = idx;
    91		pr_info("dlock-list: %d head entries per dlock list.\n",
    92			nr_dlock_lists);
    93		return 0;
    94	}
    95	postcore_initcall(cpu2idx_init);
    96	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot Oct. 7, 2017, 2:38 p.m. UTC | #5
Hi Waiman,

[auto build test ERROR on linus/master]
[also build test ERROR on v4.14-rc3]
[cannot apply to next-20170929]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Waiman-Long/vfs-Use-dlock-list-for-SB-s-s_inodes-list/20171007-210724
config: i386-randconfig-i0-201740 (attached as .config)
compiler: gcc-4.8 (Debian 4.8.4-1) 4.8.4
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   lib/dlock-list.c: In function 'cpu2idx_init':
>> lib/dlock-list.c:75:16: error: incompatible types when assigning to type 'cpumask_var_t' from type 'const struct cpumask *'
      sibling_mask = topology_sibling_cpumask(cpu);
                   ^
   lib/dlock-list.c:76:7: warning: the address of 'sibling_mask' will always evaluate as 'true' [-Waddress]
      if (sibling_mask) {
          ^

vim +75 lib/dlock-list.c

    44	
    45	/*
    46	 * Initialize cpu2idx mapping table & nr_dlock_lists.
    47	 *
    48	 * It is possible that a dlock-list can be allocated before the cpu2idx is
    49	 * initialized. In this case, all the cpus are mapped to the first entry
    50	 * before initialization.
    51	 *
    52	 * All the sibling CPUs of a sibling group will map to the same dlock list so
    53	 * as to reduce the number of dlock lists to be maintained while minimizing
    54	 * cacheline contention.
    55	 *
    56	 * As the sibling masks are set up in the core initcall phase, this function
    57	 * has to be done in the postcore phase to get the right data.
    58	 */
    59	static int __init cpu2idx_init(void)
    60	{
    61		int idx, cpu;
    62		cpumask_var_t sibling_mask;
    63		static struct cpumask mask __initdata;
    64	
    65		cpumask_clear(&mask);
    66		idx = 0;
    67		for_each_possible_cpu(cpu) {
    68			int scpu;
    69	
    70			if (cpumask_test_cpu(cpu, &mask))
    71				continue;
    72			per_cpu(cpu2idx, cpu) = idx;
    73			cpumask_set_cpu(cpu, &mask);
    74	
  > 75			sibling_mask = topology_sibling_cpumask(cpu);
    76			if (sibling_mask) {
    77				for_each_cpu(scpu, sibling_mask) {
    78					per_cpu(cpu2idx, scpu) = idx;
    79					cpumask_set_cpu(scpu, &mask);
    80				}
    81			}
    82			idx++;
    83		}
    84	
    85		/*
    86		 * nr_dlock_lists can only be set after cpu2idx is properly
    87		 * initialized.
    88		 */
    89		smp_mb();
    90		nr_dlock_lists = idx;
    91		pr_info("dlock-list: %d head entries per dlock list.\n",
    92			nr_dlock_lists);
    93		return 0;
    94	}
    95	postcore_initcall(cpu2idx_init);
    96	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Jan Kara Oct. 9, 2017, 7:26 a.m. UTC | #6
On Thu 05-10-17 10:57:07, Waiman Long wrote:
> On 10/05/2017 04:59 AM, Jan Kara wrote:
> > On Wed 04-10-17 17:20:05, Waiman Long wrote:
> >>  int alloc_dlock_list_heads(struct dlock_list_heads *dlist)
> >>  {
> >> -	int idx;
> >> +	int idx, cnt = nr_dlock_lists ? nr_dlock_lists : nr_cpu_ids;
> > Hum, is this there for the case where alloc_dlock_list_heads() is called
> > before nr_dlock_lists is initialized? But how can the dlist be used later
> > when it has a larger number of lists and you don't know how many?
> 
> The point is that nr_dlock_lists <= nr_cpu_ids. Before nr_dlock_lists is
> initialized, the mapping table maps all CPUs to the first entry, so the
> extra entries simply stay unused. At free time, they will all be
> deallocated. I will clarify that with a comment.

OK, nice trick.

								Honza

Patch

diff --git a/lib/dlock-list.c b/lib/dlock-list.c
index 2779e3e..a8c741d 100644
--- a/lib/dlock-list.c
+++ b/lib/dlock-list.c
@@ -25,15 +25,14 @@ 
  * The distributed and locked list is a distributed set of lists each of
  * which is protected by its own spinlock, but acts like a single
  * consolidated list to the callers. For scaling purpose, the number of
- * lists used is equal to the number of possible CPUs in the system to
- * minimize contention.
+ * lists used is equal to the number of possible cores in the system to
+ * minimize contention. All threads of the same CPU core will share the
+ * same list.
  *
- * However, it is possible that individual CPU numbers may be equal to
- * or greater than the number of possible CPUs when there are holes in
- * the CPU number list. As a result, we need to map the CPU number to a
- * list index.
+ * We need to map each CPU number to a list index.
  */
 static DEFINE_PER_CPU_READ_MOSTLY(int, cpu2idx);
+static int nr_dlock_lists __read_mostly;
 
 /*
  * As all the locks in the dlock list are dynamically allocated, they need
@@ -44,20 +43,53 @@ 
 static struct lock_class_key dlock_list_key;
 
 /*
- * Initialize cpu2idx mapping table
+ * Initialize cpu2idx mapping table & nr_dlock_lists.
  *
  * It is possible that a dlock-list can be allocated before the cpu2idx is
  * initialized. In this case, all the cpus are mapped to the first entry
  * before initialization.
  *
+ * All the sibling CPUs of a sibling group will map to the same dlock list so
+ * as to reduce the number of dlock lists to be maintained while minimizing
+ * cacheline contention.
+ *
+ * As the sibling masks are set up in the core initcall phase, this function
+ * has to be done in the postcore phase to get the right data.
  */
 static int __init cpu2idx_init(void)
 {
 	int idx, cpu;
+	cpumask_var_t sibling_mask;
+	static struct cpumask mask __initdata;
 
+	cpumask_clear(&mask);
 	idx = 0;
-	for_each_possible_cpu(cpu)
-		per_cpu(cpu2idx, cpu) = idx++;
+	for_each_possible_cpu(cpu) {
+		int scpu;
+
+		if (cpumask_test_cpu(cpu, &mask))
+			continue;
+		per_cpu(cpu2idx, cpu) = idx;
+		cpumask_set_cpu(cpu, &mask);
+
+		sibling_mask = topology_sibling_cpumask(cpu);
+		if (sibling_mask) {
+			for_each_cpu(scpu, sibling_mask) {
+				per_cpu(cpu2idx, scpu) = idx;
+				cpumask_set_cpu(scpu, &mask);
+			}
+		}
+		idx++;
+	}
+
+	/*
+	 * nr_dlock_lists can only be set after cpu2idx is properly
+	 * initialized.
+	 */
+	smp_mb();
+	nr_dlock_lists = idx;
+	pr_info("dlock-list: %d head entries per dlock list.\n",
+		nr_dlock_lists);
 	return 0;
 }
 postcore_initcall(cpu2idx_init);
@@ -74,15 +106,14 @@  static int __init cpu2idx_init(void)
  */
 int alloc_dlock_list_heads(struct dlock_list_heads *dlist)
 {
-	int idx;
+	int idx, cnt = nr_dlock_lists ? nr_dlock_lists : nr_cpu_ids;
 
-	dlist->heads = kcalloc(nr_cpu_ids, sizeof(struct dlock_list_head),
-			       GFP_KERNEL);
+	dlist->heads = kcalloc(cnt, sizeof(struct dlock_list_head), GFP_KERNEL);
 
 	if (!dlist->heads)
 		return -ENOMEM;
 
-	for (idx = 0; idx < nr_cpu_ids; idx++) {
+	for (idx = 0; idx < cnt; idx++) {
 		struct dlock_list_head *head = &dlist->heads[idx];
 
 		INIT_LIST_HEAD(&head->list);
@@ -118,7 +149,7 @@  bool dlock_lists_empty(struct dlock_list_heads *dlist)
 {
 	int idx;
 
-	for (idx = 0; idx < nr_cpu_ids; idx++)
+	for (idx = 0; idx < nr_dlock_lists; idx++)
 		if (!list_empty(&dlist->heads[idx].list))
 			return false;
 	return true;
@@ -207,7 +238,7 @@  struct dlock_list_node *__dlock_list_next_list(struct dlock_list_iter *iter)
 	/*
 	 * Try next list
 	 */
-	if (++iter->index >= nr_cpu_ids)
+	if (++iter->index >= nr_dlock_lists)
 		return NULL;	/* All the entries iterated */
 
 	if (list_empty(&iter->head[iter->index].list))