[v3,1/3] lib/percpu-list: Per-cpu list with associated per-cpu locks

Message ID 1456254272-42313-2-git-send-email-Waiman.Long@hpe.com (mailing list archive)
State New, archived

Commit Message

Waiman Long Feb. 23, 2016, 7:04 p.m. UTC
Linked lists are used everywhere in the Linux kernel. However, if many
threads are trying to add entries to or delete entries from the same
linked list, the list can become a performance bottleneck.

This patch introduces a new per-cpu list subsystem with associated
per-cpu locks for protecting each of the lists individually. This
allows list entry insertion and deletion operations to happen in
parallel instead of being serialized with a global list and lock.

List entry insertion is strictly per cpu. List deletion, however, can
happen on a cpu other than the one that did the insertion, so a lock
is still needed to protect the list. Because of that, there may still
be a small amount of contention when deletions are being done.

A new header file include/linux/percpu-list.h will be added with the
associated pcpu_list_head and pcpu_list_node structures. The following
functions are provided to manage the per-cpu list:

 1. int init_pcpu_list_head(struct pcpu_list_head **ppcpu_head)
 2. void pcpu_list_add(struct pcpu_list_node *node,
		       struct pcpu_list_head *head)
 3. void pcpu_list_del(struct pcpu_list_node *node)

Iteration of all the list entries within a group of per-cpu lists is
done by calling either the pcpu_list_iterate() or pcpu_list_iterate_safe()
function in a while loop. They correspond to the list_for_each_entry()
and list_for_each_entry_safe() macros respectively. The iteration state
is kept in a pcpu_list_state structure that is passed to the iteration
functions.
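
As an illustration, a caller could use the new API roughly as follows.
This is a hypothetical sketch only; struct foo, do_something() and the
calling context are made up for illustration and are not part of this
patch:

	struct foo {
		int data;
		struct pcpu_list_node pnode;
	};

	static struct pcpu_list_head *foo_list;	/* group of per-cpu lists */

	/* One-time setup of the group of per-cpu lists */
	if (init_pcpu_list_head(&foo_list))
		return -ENOMEM;

	/* Insertion goes to the list of the current cpu */
	struct foo *f = kmalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return -ENOMEM;
	init_pcpu_list_node(&f->pnode);
	pcpu_list_add(&f->pnode, foo_list);

	/* Iterate over all the entries in the group of per-cpu lists */
	DEFINE_PCPU_LIST_STATE(state);

	while (pcpu_list_iterate(foo_list, &state)) {
		struct foo *p = container_of(state.curr, struct foo, pnode);

		/* The per-cpu lock is held here, so do not sleep */
		do_something(p);
	}

	/* Deletion may be done from a different cpu */
	pcpu_list_del(&f->pnode);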

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
---
 include/linux/percpu-list.h |  235 +++++++++++++++++++++++++++++++++++++++++++
 lib/Makefile                |    2 +-
 lib/percpu-list.c           |   85 ++++++++++++++++
 3 files changed, 321 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/percpu-list.h
 create mode 100644 lib/percpu-list.c

Comments

Boqun Feng Feb. 24, 2016, 2 a.m. UTC | #1
Hi Waiman,

On Tue, Feb 23, 2016 at 02:04:30PM -0500, Waiman Long wrote:
> Linked list is used everywhere in the Linux kernel. However, if many
> threads are trying to add or delete entries into the same linked list,
> it can create a performance bottleneck.
> 
> This patch introduces a new per-cpu list subystem with associated
> per-cpu locks for protecting each of the lists individually. This
> allows list entries insertion and deletion operations to happen in
> parallel instead of being serialized with a global list and lock.
> 
> List entry insertion is strictly per cpu. List deletion, however, can
> happen in a cpu other than the one that did the insertion. So we still
> need lock to protect the list. Because of that, there may still be
> a small amount of contention when deletion is being done.
> 
> A new header file include/linux/percpu-list.h will be added with the
> associated pcpu_list_head and pcpu_list_node structures. The following
> functions are provided to manage the per-cpu list:
> 
>  1. int init_pcpu_list_head(struct pcpu_list_head **ppcpu_head)
>  2. void pcpu_list_add(struct pcpu_list_node *node,
> 		       struct pcpu_list_head *head)
>  3. void pcpu_list_del(struct pcpu_list *node)
> 
> Iteration of all the list entries within a group of per-cpu
> lists is done by calling either the pcpu_list_iterate() or
> pcpu_list_iterate_safe() functions in a while loop. They correspond
> to the list_for_each_entry() and list_for_each_entry_safe() macros
> respectively. The iteration states are keep in a pcpu_list_state
> structure that is passed to the iteration functions.
> 
> Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
> ---
>  include/linux/percpu-list.h |  235 +++++++++++++++++++++++++++++++++++++++++++
>  lib/Makefile                |    2 +-
>  lib/percpu-list.c           |   85 ++++++++++++++++
>  3 files changed, 321 insertions(+), 1 deletions(-)
>  create mode 100644 include/linux/percpu-list.h
>  create mode 100644 lib/percpu-list.c
> 
> diff --git a/include/linux/percpu-list.h b/include/linux/percpu-list.h
> new file mode 100644
> index 0000000..8759fec
> --- /dev/null
> +++ b/include/linux/percpu-list.h
> @@ -0,0 +1,235 @@
> +/*
> + * Per-cpu list
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
> + *
> + * Authors: Waiman Long <waiman.long@hpe.com>
> + */
> +#ifndef __LINUX_PERCPU_LIST_H
> +#define __LINUX_PERCPU_LIST_H
> +
> +#include <linux/spinlock.h>
> +#include <linux/list.h>
> +#include <linux/percpu.h>
> +
> +/*
> + * include/linux/percpu-list.h
> + *
> + * A per-cpu list protected by a per-cpu spinlock.
> + *
> + * The pcpu_list_head structure contains the spinlock, the other
> + * pcpu_list_node structures only contains a pointer to the spinlock in
> + * pcpu_list_head.
> + */
> +struct pcpu_list_head {
> +	struct list_head list;
> +	spinlock_t lock;
> +};
> +
> +struct pcpu_list_node {
> +	struct list_head list;
> +	spinlock_t *lockptr;
> +};
> +
> +/*
> + * Per-cpu list iteration state
> + */
> +struct pcpu_list_state {
> +	int			 cpu;
> +	spinlock_t		*lock;
> +	struct list_head	*head;	/* List head of current per-cpu list */
> +	struct pcpu_list_node	*curr;
> +	struct pcpu_list_node	*next;
> +};
> +
> +#define PCPU_LIST_HEAD_INIT(name)				\
> +	{							\
> +		.list.prev = &name.list,			\
> +		.list.next = &name.list,			\
> +		.list.lock = __SPIN_LOCK_UNLOCKED(name),	\
> +	}
> +
> +#define PCPU_LIST_NODE_INIT(name)		\
> +	{					\
> +		.list.prev = &name.list,	\
> +		.list.next = &name.list,	\
> +		.list.lockptr = NULL		\
> +	}
> +
> +#define PCPU_LIST_STATE_INIT()			\
> +	{					\
> +		.cpu  = -1,			\
> +		.lock = NULL,			\
> +		.head = NULL,			\
> +		.curr = NULL,			\
> +		.next = NULL,			\
> +	}
> +
> +#define DEFINE_PCPU_LIST_STATE(s)		\
> +	struct pcpu_list_state s = PCPU_LIST_STATE_INIT()
> +
> +#define pcpu_list_next_entry(pos, member) list_next_entry(pos, member.list)
> +
> +static inline void init_pcpu_list_node(struct pcpu_list_node *node)
> +{
> +	INIT_LIST_HEAD(&node->list);
> +	node->lockptr = NULL;
> +}
> +
> +static inline void free_pcpu_list_head(struct pcpu_list_head **ppcpu_head)
> +{
> +	free_percpu(*ppcpu_head);
> +	*ppcpu_head = NULL;
> +}
> +
> +static inline void init_pcpu_list_state(struct pcpu_list_state *state)
> +{
> +	state->cpu  = -1;
> +	state->lock = NULL;
> +	state->head = NULL;
> +	state->curr = NULL;
> +	state->next = NULL;
> +}
> +
> +#if NR_CPUS == 1
> +/*
> + * For uniprocessor, the list head and lock in struct pcpu_list_head are
> + * used directly.
> + */
> +static inline bool pcpu_list_empty(struct pcpu_list_head *pcpu_head)
> +{
> +	return list_empty(&pcpu_head->list);
> +}
> +
> +static __always_inline bool
> +__pcpu_list_next_cpu(struct pcpu_list_head *head, struct pcpu_list_state *state)
> +{
> +	if (state->lock)
> +		spin_unlock(state->lock);
> +
> +	if (state->cpu++ >= 0)
> +		return false;
> +
> +	state->curr = list_entry(head->list.next, struct pcpu_list_node, list);
> +	if (list_empty(&state->curr->list))
> +		return false;
> +	state->lock = &head->lock;
> +	spin_lock(state->lock);
> +	return true;
> +
> +}
> +#else /* NR_CPUS == 1 */
> +/*
> + * Multiprocessor
> + */
> +static inline bool pcpu_list_empty(struct pcpu_list_head *pcpu_head)
> +{
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu)
> +		if (!list_empty(&per_cpu_ptr(pcpu_head, cpu)->list))
> +			return false;
> +	return true;
> +}
> +
> +/*
> + * Helper function to find the first entry of the next per-cpu list
> + * It works somewhat like for_each_possible_cpu(cpu).
> + *
> + * Return: true if the entry is found, false if all the lists exhausted
> + */
> +static __always_inline bool
> +__pcpu_list_next_cpu(struct pcpu_list_head *head, struct pcpu_list_state *state)
> +{
> +	if (state->lock)
> +		spin_unlock(state->lock);
> +next_cpu:
> +	/*
> +	 * for_each_possible_cpu(cpu)
> +	 */
> +	state->cpu = cpumask_next(state->cpu, cpu_possible_mask);
> +	if (state->cpu >= nr_cpu_ids)
> +		return false;	/* All the per-cpu lists iterated */
> +
> +	state->head = &per_cpu_ptr(head, state->cpu)->list;
> +	state->lock = &per_cpu_ptr(head, state->cpu)->lock;
> +	state->curr = list_entry(state->head->next,
> +				 struct pcpu_list_node, list);
> +	if (&state->curr->list == state->head)
> +		goto next_cpu;
> +
> +	spin_lock(state->lock);
> +	return true;
> +}
> +#endif /* NR_CPUS == 1 */
> +
> +/*
> + * Iterate to the next entry of the group of per-cpu lists
> + *
> + * Return: true if the next entry is found, false if all the entries iterated
> + */
> +static inline bool pcpu_list_iterate(struct pcpu_list_head *head,
> +				     struct pcpu_list_state *state)
> +{
> +	/*
> +	 * Find next entry
> +	 */
> +	if (state->curr)
> +		state->curr = list_next_entry(state->curr, list);
> +
> +	if (!state->curr || (&state->curr->list == state->head)) {
> +		/*
> +		 * The current per-cpu list has been exhausted, try the next
> +		 * per-cpu list.
> +		 */
> +		if (!__pcpu_list_next_cpu(head, state))
> +			return false;
> +	}
> +	return true;	/* Continue the iteration */
> +}
> +
> +/*
> + * Iterate to the next entry of the group of per-cpu lists and safe
> + * against removal of list_entry
> + *
> + * Return: true if the next entry is found, false if all the entries iterated
> + */
> +static inline bool pcpu_list_iterate_safe(struct pcpu_list_head *head,
> +					  struct pcpu_list_state *state)
> +{
> +	/*
> +	 * Find next entry
> +	 */
> +	if (state->curr) {
> +		state->curr = state->next;
> +		state->next = list_next_entry(state->next, list);
> +	}
> +
> +	if (!state->curr || (&state->curr->list == state->head)) {
> +		/*
> +		 * The current per-cpu list has been exhausted, try the next
> +		 * per-cpu list.
> +		 */
> +		if (!__pcpu_list_next_cpu(head, state))
> +			return false;
> +		state->next = list_next_entry(state->curr, list);
> +	}
> +	return true;	/* Continue the iteration */
> +}
> +
> +extern int  init_pcpu_list_head(struct pcpu_list_head **ppcpu_head);
> +extern void pcpu_list_add(struct pcpu_list_node *node,
> +			  struct pcpu_list_head *head);
> +extern void pcpu_list_del(struct pcpu_list_node *node);
> +
> +#endif /* __LINUX_PERCPU_LIST_H */
> diff --git a/lib/Makefile b/lib/Makefile
> index a7c26a4..71a25d4 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -27,7 +27,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
>  	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
>  	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
>  	 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
> -	 once.o
> +	 once.o percpu-list.o
>  obj-y += string_helpers.o
>  obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
>  obj-y += hexdump.o
> diff --git a/lib/percpu-list.c b/lib/percpu-list.c
> new file mode 100644
> index 0000000..45bbb2a
> --- /dev/null
> +++ b/lib/percpu-list.c
> @@ -0,0 +1,85 @@
> +/*
> + * Per-cpu list
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
> + *
> + * Authors: Waiman Long <waiman.long@hpe.com>
> + */
> +#include <linux/percpu-list.h>
> +
> +/*
> + * Initialize the per-cpu list
> + */
> +int init_pcpu_list_head(struct pcpu_list_head **ppcpu_head)
> +{
> +	struct pcpu_list_head *pcpu_head = alloc_percpu(struct pcpu_list_head);
> +	int cpu;
> +
> +	if (!pcpu_head)
> +		return -ENOMEM;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct pcpu_list_head *head = per_cpu_ptr(pcpu_head, cpu);
> +
> +		INIT_LIST_HEAD(&head->list);
> +		head->lock = __SPIN_LOCK_UNLOCKED(&head->lock);
> +	}
> +
> +	*ppcpu_head = pcpu_head;
> +	return 0;
> +}
> +
> +/*
> + * List selection is based on the CPU being used when the pcpu_list_add()
> + * function is called. However, deletion may be done by a different CPU.
> + * So we still need to use a lock to protect the content of the list.
> + */
> +void pcpu_list_add(struct pcpu_list_node *node, struct pcpu_list_head *head)
> +{
> +	spinlock_t *lock;
> +
> +	/*
> +	 * There is a very slight chance the cpu will be changed
> +	 * (by preemption) before calling spin_lock(). We only need to put
> +	 * the node in one of the per-cpu lists. It may not need to be
> +	 * that of the current cpu.
> +	 */

Just curious about the comment here, what if the following happens:

	CPU 0				CPU 1
	=====================		=====================
	task_1:

	lock = this_cpu_ptr(&head->lock); // head->lock is on CPU0
	<preempted>
					continue to task_1:
					spin_lock(lock);
					node->lockptr = lock;
					// head->list is on CPU1
					list_add(&node->list, this_cpu_ptr(&head->list));
					spin_unlock(lock);

, which ends up with the node in the list on CPU1 while its ->lockptr
points to the lock on CPU0.

If there is another node whose ->lockptr points to the lock on CPU1 and
that node is in the list on CPU1, what will happen if these two nodes get
deleted simultaneously?

Regards,
Boqun

> +	lock = this_cpu_ptr(&head->lock);
> +	spin_lock(lock);
> +	node->lockptr = lock;
> +	list_add(&node->list, this_cpu_ptr(&head->list));
> +	spin_unlock(lock);
> +}
> +
> +/*
> + * Delete a node from a percpu list
> + *
> + * We need to check the lock pointer again after taking the lock to guard
> + * against concurrent delete of the same node. If the lock pointer changes
> + * (becomes NULL or to a different one), we assume that the deletion was done
> + * elsewhere.
> + */
> +void pcpu_list_del(struct pcpu_list_node *node)
> +{
> +	spinlock_t *lock = READ_ONCE(node->lockptr);
> +
> +	if (unlikely(!lock))
> +		return;
> +
> +	spin_lock(lock);
> +	if (likely(lock == node->lockptr)) {
> +		list_del_init(&node->list);
> +		node->lockptr = NULL;
> +	}
> +	spin_unlock(lock);
> +}
> -- 
> 1.7.1
> 
Waiman Long Feb. 24, 2016, 4:01 a.m. UTC | #2
On 02/23/2016 09:00 PM, Boqun Feng wrote:
> Hi Waiman,
>
> On Tue, Feb 23, 2016 at 02:04:30PM -0500, Waiman Long wrote:
>> }
>> +
>> +/*
>> + * List selection is based on the CPU being used when the pcpu_list_add()
>> + * function is called. However, deletion may be done by a different CPU.
>> + * So we still need to use a lock to protect the content of the list.
>> + */
>> +void pcpu_list_add(struct pcpu_list_node *node, struct pcpu_list_head *head)
>> +{
>> +	spinlock_t *lock;
>> +
>> +	/*
>> +	 * There is a very slight chance the cpu will be changed
>> +	 * (by preemption) before calling spin_lock(). We only need to put
>> +	 * the node in one of the per-cpu lists. It may not need to be
>> +	 * that of the current cpu.
>> +	 */
> Just curious about the comment here, what if the following happens:
>
> 	CPU 0				CPU 1
> 	=====================		=====================
> 	task_1:
>
> 	lock = this_cpu_ptr(&head->lock); // head->lock is on CPU0
> 	<preempted>
> 					continue to task_1:
> 					spin_lock(lock);
> 					node->lockptr = lock;
> 					// head->list is on CPU1
> 					list_add(&node->list, this_cpu_ptr(&head->list));
> 					spin_unlock(lock);
>
> , which ends up the node is in the list on CPU1 while ->lockptr pointing
> to the lock on CPU0.
>
> If there is another node whose ->lockptr points to the lock on CPU1 and
> the node is in list on CPU1, what will happen if these two nodes get
> deleted simultaneously?
>
> Regards,
> Boqun
>

Yes, you are right. I should have acquired the per-cpu head pointer 
first and used it onward instead of accessing the lock and list in 2 
separate operations. I will fix that in the next update.
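
For illustration, the fix could look roughly like the following sketch
(grab the per-cpu head pointer once and derive both the lock and the
list from it; this is not the final v4 code):

	void pcpu_list_add(struct pcpu_list_node *node, struct pcpu_list_head *head)
	{
		struct pcpu_list_head *myhead;

		/*
		 * Take the per-cpu pointer once. Even if the task is
		 * preempted and migrated afterwards, the lock and the list
		 * used below always belong to the same per-cpu structure,
		 * so ->lockptr can never point to the wrong lock.
		 */
		myhead = this_cpu_ptr(head);
		spin_lock(&myhead->lock);
		node->lockptr = &myhead->lock;
		list_add(&node->list, &myhead->list);
		spin_unlock(&myhead->lock);
	}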

Thanks for finding that.

Cheers,
Longman

Jan Kara Feb. 24, 2016, 7:56 a.m. UTC | #3
On Tue 23-02-16 14:04:30, Waiman Long wrote:
> Linked list is used everywhere in the Linux kernel. However, if many
> threads are trying to add or delete entries into the same linked list,
> it can create a performance bottleneck.
> 
> This patch introduces a new per-cpu list subystem with associated
> per-cpu locks for protecting each of the lists individually. This
> allows list entries insertion and deletion operations to happen in
> parallel instead of being serialized with a global list and lock.
> 
> List entry insertion is strictly per cpu. List deletion, however, can
> happen in a cpu other than the one that did the insertion. So we still
> need lock to protect the list. Because of that, there may still be
> a small amount of contention when deletion is being done.
> 
> A new header file include/linux/percpu-list.h will be added with the
> associated pcpu_list_head and pcpu_list_node structures. The following
> functions are provided to manage the per-cpu list:
> 
>  1. int init_pcpu_list_head(struct pcpu_list_head **ppcpu_head)
>  2. void pcpu_list_add(struct pcpu_list_node *node,
> 		       struct pcpu_list_head *head)
>  3. void pcpu_list_del(struct pcpu_list *node)
> 
> Iteration of all the list entries within a group of per-cpu
> lists is done by calling either the pcpu_list_iterate() or
> pcpu_list_iterate_safe() functions in a while loop. They correspond
> to the list_for_each_entry() and list_for_each_entry_safe() macros
> respectively. The iteration states are keep in a pcpu_list_state
> structure that is passed to the iteration functions.
> 
> Signed-off-by: Waiman Long <Waiman.Long@hpe.com>

Two comments below.

> +/*
> + * Helper function to find the first entry of the next per-cpu list
> + * It works somewhat like for_each_possible_cpu(cpu).
> + *
> + * Return: true if the entry is found, false if all the lists exhausted
> + */
> +static __always_inline bool
> +__pcpu_list_next_cpu(struct pcpu_list_head *head, struct pcpu_list_state *state)
> +{
> +	if (state->lock)
> +		spin_unlock(state->lock);
> +next_cpu:
> +	/*
> +	 * for_each_possible_cpu(cpu)
> +	 */
> +	state->cpu = cpumask_next(state->cpu, cpu_possible_mask);
> +	if (state->cpu >= nr_cpu_ids)
> +		return false;	/* All the per-cpu lists iterated */
> +
> +	state->head = &per_cpu_ptr(head, state->cpu)->list;
> +	state->lock = &per_cpu_ptr(head, state->cpu)->lock;
> +	state->curr = list_entry(state->head->next,
> +				 struct pcpu_list_node, list);
> +	if (&state->curr->list == state->head)
> +		goto next_cpu;

This might be more comprehensible as:

	if (list_empty(state->head))
		goto next_cpu;

and you can do it just after updating state->head (no need to init
state->lock & state->curr if the list is empty).

Another note: Initialization of state->curr is IMO racy - you need to hold
state->lock to grab state->curr reliably, don't you? Otherwise someone can
remove the entry while you are working with it. So you need to move that
down just before returning.

> +
> +	spin_lock(state->lock);
> +	return true;
> +}
> +#endif /* NR_CPUS == 1 */

...

> +/*
> + * Delete a node from a percpu list
> + *
> + * We need to check the lock pointer again after taking the lock to guard
> + * against concurrent delete of the same node. If the lock pointer changes
> + * (becomes NULL or to a different one), we assume that the deletion was done
> + * elsewhere.
> + */
> +void pcpu_list_del(struct pcpu_list_node *node)
> +{
> +	spinlock_t *lock = READ_ONCE(node->lockptr);
> +
> +	if (unlikely(!lock))
> +		return;
> +
> +	spin_lock(lock);
> +	if (likely(lock == node->lockptr)) {
> +		list_del_init(&node->list);
> +		node->lockptr = NULL;
> +	}

But someone changing lockptr under your hands would mean that there are
two processes racing to remove entries and that would generally point to a
problem (and likely use-after-free) in the caller, won't it? Or do you have
some particular usecase in mind?

								Honza
Waiman Long Feb. 24, 2016, 7:51 p.m. UTC | #4
On 02/24/2016 02:56 AM, Jan Kara wrote:
> On Tue 23-02-16 14:04:30, Waiman Long wrote:
>> Linked list is used everywhere in the Linux kernel. However, if many
>> threads are trying to add or delete entries into the same linked list,
>> it can create a performance bottleneck.
>>
>> This patch introduces a new per-cpu list subystem with associated
>> per-cpu locks for protecting each of the lists individually. This
>> allows list entries insertion and deletion operations to happen in
>> parallel instead of being serialized with a global list and lock.
>>
>> List entry insertion is strictly per cpu. List deletion, however, can
>> happen in a cpu other than the one that did the insertion. So we still
>> need lock to protect the list. Because of that, there may still be
>> a small amount of contention when deletion is being done.
>>
>> A new header file include/linux/percpu-list.h will be added with the
>> associated pcpu_list_head and pcpu_list_node structures. The following
>> functions are provided to manage the per-cpu list:
>>
>>   1. int init_pcpu_list_head(struct pcpu_list_head **ppcpu_head)
>>   2. void pcpu_list_add(struct pcpu_list_node *node,
>> 		       struct pcpu_list_head *head)
>>   3. void pcpu_list_del(struct pcpu_list *node)
>>
>> Iteration of all the list entries within a group of per-cpu
>> lists is done by calling either the pcpu_list_iterate() or
>> pcpu_list_iterate_safe() functions in a while loop. They correspond
>> to the list_for_each_entry() and list_for_each_entry_safe() macros
>> respectively. The iteration states are keep in a pcpu_list_state
>> structure that is passed to the iteration functions.
>>
>> Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
> Two comments below.
>
>> +/*
>> + * Helper function to find the first entry of the next per-cpu list
>> + * It works somewhat like for_each_possible_cpu(cpu).
>> + *
>> + * Return: true if the entry is found, false if all the lists exhausted
>> + */
>> +static __always_inline bool
>> +__pcpu_list_next_cpu(struct pcpu_list_head *head, struct pcpu_list_state *state)
>> +{
>> +	if (state->lock)
>> +		spin_unlock(state->lock);
>> +next_cpu:
>> +	/*
>> +	 * for_each_possible_cpu(cpu)
>> +	 */
>> +	state->cpu = cpumask_next(state->cpu, cpu_possible_mask);
>> +	if (state->cpu >= nr_cpu_ids)
>> +		return false;	/* All the per-cpu lists iterated */
>> +
>> +	state->head = &per_cpu_ptr(head, state->cpu)->list;
>> +	state->lock = &per_cpu_ptr(head, state->cpu)->lock;
>> +	state->curr = list_entry(state->head->next,
>> +				 struct pcpu_list_node, list);
>> +	if (&state->curr->list == state->head)
>> +		goto next_cpu;
> This might be more comprehensible as:
>
> 	if (list_empty(state->head))
> 		goto next_cpu;
>
> and you can do it just after updating state->head (no need to init
> state->lock & state->curr if the list is empty).

Thanks for the suggestion. Will change the code accordingly.
> Another note: Initialization of state->curr is IMO racy - you need to hold
> state->lock to grab state->curr reliably, don't you? Otherwise someone can
> remove the entry while you are working with it. So you need to move that
> down just before returning.

Right. I will move the initialization of state->curr after the spin_lock().
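
Putting both suggestions together, the helper might end up looking
roughly like this (just a sketch, not the final code; the extra locked
re-check is only one way of closing the window between the unlocked
emptiness test and taking the lock):

	static __always_inline bool
	__pcpu_list_next_cpu(struct pcpu_list_head *head, struct pcpu_list_state *state)
	{
		if (state->lock)
			spin_unlock(state->lock);
	next_cpu:
		/*
		 * for_each_possible_cpu(cpu)
		 */
		state->cpu = cpumask_next(state->cpu, cpu_possible_mask);
		if (state->cpu >= nr_cpu_ids)
			return false;	/* All the per-cpu lists iterated */

		state->head = &per_cpu_ptr(head, state->cpu)->list;
		if (list_empty(state->head))
			goto next_cpu;	/* Skip empty lists without locking */

		state->lock = &per_cpu_ptr(head, state->cpu)->lock;
		spin_lock(state->lock);
		if (list_empty(state->head)) {
			/* The list was emptied before we got the lock */
			spin_unlock(state->lock);
			state->lock = NULL;
			goto next_cpu;
		}
		/*
		 * Grab the first entry only while holding the lock, so a
		 * concurrent deletion cannot free it from under us.
		 */
		state->curr = list_entry(state->head->next,
					 struct pcpu_list_node, list);
		return true;
	}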

>> +
>> +	spin_lock(state->lock);
>> +	return true;
>> +}
>> +#endif /* NR_CPUS == 1 */
> ...
>
>> +/*
>> + * Delete a node from a percpu list
>> + *
>> + * We need to check the lock pointer again after taking the lock to guard
>> + * against concurrent delete of the same node. If the lock pointer changes
>> + * (becomes NULL or to a different one), we assume that the deletion was done
>> + * elsewhere.
>> + */
>> +void pcpu_list_del(struct pcpu_list_node *node)
>> +{
>> +	spinlock_t *lock = READ_ONCE(node->lockptr);
>> +
>> +	if (unlikely(!lock))
>> +		return;
>> +
>> +	spin_lock(lock);
>> +	if (likely(lock == node->lockptr)) {
>> +		list_del_init(&node->list);
>> +		node->lockptr = NULL;
>> +	}
> But someone changing lockptr under your hands would mean that there are
> two processes racing to remove entries and that would generally point to a
> problem (and likely use-after-free) in the caller, won't it? Or do you have
> some particular usecase in mind?
>
> 								Honza
>

This is just defensive programming to guard against an unforeseen case. I
don't have any particular use case in mind that would make that happen.
Maybe I should put a WARN_ON there if this really happens.
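
Something along these lines, for instance (sketch only):

	spin_lock(lock);
	if (likely(lock == node->lockptr)) {
		list_del_init(&node->list);
		node->lockptr = NULL;
	} else {
		/*
		 * A changed lockptr here means two callers raced to delete
		 * the same node, which would be a bug (and likely a
		 * use-after-free) in the caller.
		 */
		WARN_ON(1);
	}
	spin_unlock(lock);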

Cheers,
Longman


Patch

diff --git a/include/linux/percpu-list.h b/include/linux/percpu-list.h
new file mode 100644
index 0000000..8759fec
--- /dev/null
+++ b/include/linux/percpu-list.h
@@ -0,0 +1,235 @@ 
+/*
+ * Per-cpu list
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+#ifndef __LINUX_PERCPU_LIST_H
+#define __LINUX_PERCPU_LIST_H
+
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+
+/*
+ * include/linux/percpu-list.h
+ *
+ * A per-cpu list protected by a per-cpu spinlock.
+ *
+ * The pcpu_list_head structure contains the spinlock, the other
+ * pcpu_list_node structures only contains a pointer to the spinlock in
+ * pcpu_list_head.
+ */
+struct pcpu_list_head {
+	struct list_head list;
+	spinlock_t lock;
+};
+
+struct pcpu_list_node {
+	struct list_head list;
+	spinlock_t *lockptr;
+};
+
+/*
+ * Per-cpu list iteration state
+ */
+struct pcpu_list_state {
+	int			 cpu;
+	spinlock_t		*lock;
+	struct list_head	*head;	/* List head of current per-cpu list */
+	struct pcpu_list_node	*curr;
+	struct pcpu_list_node	*next;
+};
+
+#define PCPU_LIST_HEAD_INIT(name)				\
+	{							\
+		.list.prev = &name.list,			\
+		.list.next = &name.list,			\
+		.list.lock = __SPIN_LOCK_UNLOCKED(name),	\
+	}
+
+#define PCPU_LIST_NODE_INIT(name)		\
+	{					\
+		.list.prev = &name.list,	\
+		.list.next = &name.list,	\
+		.list.lockptr = NULL		\
+	}
+
+#define PCPU_LIST_STATE_INIT()			\
+	{					\
+		.cpu  = -1,			\
+		.lock = NULL,			\
+		.head = NULL,			\
+		.curr = NULL,			\
+		.next = NULL,			\
+	}
+
+#define DEFINE_PCPU_LIST_STATE(s)		\
+	struct pcpu_list_state s = PCPU_LIST_STATE_INIT()
+
+#define pcpu_list_next_entry(pos, member) list_next_entry(pos, member.list)
+
+static inline void init_pcpu_list_node(struct pcpu_list_node *node)
+{
+	INIT_LIST_HEAD(&node->list);
+	node->lockptr = NULL;
+}
+
+static inline void free_pcpu_list_head(struct pcpu_list_head **ppcpu_head)
+{
+	free_percpu(*ppcpu_head);
+	*ppcpu_head = NULL;
+}
+
+static inline void init_pcpu_list_state(struct pcpu_list_state *state)
+{
+	state->cpu  = -1;
+	state->lock = NULL;
+	state->head = NULL;
+	state->curr = NULL;
+	state->next = NULL;
+}
+
+#if NR_CPUS == 1
+/*
+ * For uniprocessor, the list head and lock in struct pcpu_list_head are
+ * used directly.
+ */
+static inline bool pcpu_list_empty(struct pcpu_list_head *pcpu_head)
+{
+	return list_empty(&pcpu_head->list);
+}
+
+static __always_inline bool
+__pcpu_list_next_cpu(struct pcpu_list_head *head, struct pcpu_list_state *state)
+{
+	if (state->lock)
+		spin_unlock(state->lock);
+
+	if (state->cpu++ >= 0)
+		return false;
+
+	state->curr = list_entry(head->list.next, struct pcpu_list_node, list);
+	if (list_empty(&state->curr->list))
+		return false;
+	state->lock = &head->lock;
+	spin_lock(state->lock);
+	return true;
+
+}
+#else /* NR_CPUS == 1 */
+/*
+ * Multiprocessor
+ */
+static inline bool pcpu_list_empty(struct pcpu_list_head *pcpu_head)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		if (!list_empty(&per_cpu_ptr(pcpu_head, cpu)->list))
+			return false;
+	return true;
+}
+
+/*
+ * Helper function to find the first entry of the next per-cpu list
+ * It works somewhat like for_each_possible_cpu(cpu).
+ *
+ * Return: true if the entry is found, false if all the lists exhausted
+ */
+static __always_inline bool
+__pcpu_list_next_cpu(struct pcpu_list_head *head, struct pcpu_list_state *state)
+{
+	if (state->lock)
+		spin_unlock(state->lock);
+next_cpu:
+	/*
+	 * for_each_possible_cpu(cpu)
+	 */
+	state->cpu = cpumask_next(state->cpu, cpu_possible_mask);
+	if (state->cpu >= nr_cpu_ids)
+		return false;	/* All the per-cpu lists iterated */
+
+	state->head = &per_cpu_ptr(head, state->cpu)->list;
+	state->lock = &per_cpu_ptr(head, state->cpu)->lock;
+	state->curr = list_entry(state->head->next,
+				 struct pcpu_list_node, list);
+	if (&state->curr->list == state->head)
+		goto next_cpu;
+
+	spin_lock(state->lock);
+	return true;
+}
+#endif /* NR_CPUS == 1 */
+
+/*
+ * Iterate to the next entry of the group of per-cpu lists
+ *
+ * Return: true if the next entry is found, false if all the entries iterated
+ */
+static inline bool pcpu_list_iterate(struct pcpu_list_head *head,
+				     struct pcpu_list_state *state)
+{
+	/*
+	 * Find next entry
+	 */
+	if (state->curr)
+		state->curr = list_next_entry(state->curr, list);
+
+	if (!state->curr || (&state->curr->list == state->head)) {
+		/*
+		 * The current per-cpu list has been exhausted, try the next
+		 * per-cpu list.
+		 */
+		if (!__pcpu_list_next_cpu(head, state))
+			return false;
+	}
+	return true;	/* Continue the iteration */
+}
+
+/*
+ * Iterate to the next entry of the group of per-cpu lists and safe
+ * against removal of list_entry
+ *
+ * Return: true if the next entry is found, false if all the entries iterated
+ */
+static inline bool pcpu_list_iterate_safe(struct pcpu_list_head *head,
+					  struct pcpu_list_state *state)
+{
+	/*
+	 * Find next entry
+	 */
+	if (state->curr) {
+		state->curr = state->next;
+		state->next = list_next_entry(state->next, list);
+	}
+
+	if (!state->curr || (&state->curr->list == state->head)) {
+		/*
+		 * The current per-cpu list has been exhausted, try the next
+		 * per-cpu list.
+		 */
+		if (!__pcpu_list_next_cpu(head, state))
+			return false;
+		state->next = list_next_entry(state->curr, list);
+	}
+	return true;	/* Continue the iteration */
+}
+
+extern int  init_pcpu_list_head(struct pcpu_list_head **ppcpu_head);
+extern void pcpu_list_add(struct pcpu_list_node *node,
+			  struct pcpu_list_head *head);
+extern void pcpu_list_del(struct pcpu_list_node *node);
+
+#endif /* __LINUX_PERCPU_LIST_H */
diff --git a/lib/Makefile b/lib/Makefile
index a7c26a4..71a25d4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -27,7 +27,7 @@  obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
 	 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
-	 once.o
+	 once.o percpu-list.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += hexdump.o
diff --git a/lib/percpu-list.c b/lib/percpu-list.c
new file mode 100644
index 0000000..45bbb2a
--- /dev/null
+++ b/lib/percpu-list.c
@@ -0,0 +1,85 @@ 
+/*
+ * Per-cpu list
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+#include <linux/percpu-list.h>
+
+/*
+ * Initialize the per-cpu list
+ */
+int init_pcpu_list_head(struct pcpu_list_head **ppcpu_head)
+{
+	struct pcpu_list_head *pcpu_head = alloc_percpu(struct pcpu_list_head);
+	int cpu;
+
+	if (!pcpu_head)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct pcpu_list_head *head = per_cpu_ptr(pcpu_head, cpu);
+
+		INIT_LIST_HEAD(&head->list);
+		head->lock = __SPIN_LOCK_UNLOCKED(&head->lock);
+	}
+
+	*ppcpu_head = pcpu_head;
+	return 0;
+}
+
+/*
+ * List selection is based on the CPU being used when the pcpu_list_add()
+ * function is called. However, deletion may be done by a different CPU.
+ * So we still need to use a lock to protect the content of the list.
+ */
+void pcpu_list_add(struct pcpu_list_node *node, struct pcpu_list_head *head)
+{
+	spinlock_t *lock;
+
+	/*
+	 * There is a very slight chance the cpu will be changed
+	 * (by preemption) before calling spin_lock(). We only need to put
+	 * the node in one of the per-cpu lists. It may not need to be
+	 * that of the current cpu.
+	 */
+	lock = this_cpu_ptr(&head->lock);
+	spin_lock(lock);
+	node->lockptr = lock;
+	list_add(&node->list, this_cpu_ptr(&head->list));
+	spin_unlock(lock);
+}
+
+/*
+ * Delete a node from a percpu list
+ *
+ * We need to check the lock pointer again after taking the lock to guard
+ * against concurrent delete of the same node. If the lock pointer changes
+ * (becomes NULL or to a different one), we assume that the deletion was done
+ * elsewhere.
+ */
+void pcpu_list_del(struct pcpu_list_node *node)
+{
+	spinlock_t *lock = READ_ONCE(node->lockptr);
+
+	if (unlikely(!lock))
+		return;
+
+	spin_lock(lock);
+	if (likely(lock == node->lockptr)) {
+		list_del_init(&node->list);
+		node->lockptr = NULL;
+	}
+	spin_unlock(lock);
+}