diff mbox

[v7,1/6] lib/dlock-list: Distributed and lock-protected lists

Message ID 1507229008-20569-2-git-send-email-longman@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Waiman Long Oct. 5, 2017, 6:43 p.m. UTC
Linked list is used everywhere in the Linux kernel. However, if many
threads are trying to add or delete entries into the same linked list,
it can create a performance bottleneck.

This patch introduces new list APIs that provide a set of distributed
lists (one per CPU), each of which is protected by its own spinlock.
To the callers, however, the set of lists acts like a single
consolidated list.  This allows list entries insertion and deletion
operations to happen in parallel instead of being serialized with a
global list and lock.

List entry insertion is strictly per cpu. List deletion, however, can
happen in a cpu other than the one that did the insertion. So we still
need lock to protect the list. Because of that, there may still be
a small amount of contention when deletion is being done.

A new header file include/linux/dlock-list.h will be added with the
associated dlock_list_head and dlock_list_node structures. The following
functions are provided to manage the per-cpu list:

 1. int alloc_dlock_list_heads(struct dlock_list_heads *dlist)
 2. void free_dlock_list_heads(struct dlock_list_heads *dlist)
 3. void dlock_lists_add(struct dlock_list_node *node,
		         struct dlock_list_heads *dlist)
 4. void dlock_lists_del(struct dlock_list_node *node)

Iteration of all the list entries within a dlock list array
is done by calling either the dlist_for_each_entry() or
dlist_for_each_entry_safe() macros. They correspond to the
list_for_each_entry() and list_for_each_entry_safe() macros
respectively. The iteration states are kept in a dlock_list_iter
structure that is passed to the iteration macros.

Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 include/linux/dlock-list.h | 224 +++++++++++++++++++++++++++++++++++++++++++
 lib/Makefile               |   2 +-
 lib/dlock-list.c           | 231 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 456 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/dlock-list.h
 create mode 100644 lib/dlock-list.c

Comments

Boqun Feng Oct. 10, 2017, 5:35 a.m. UTC | #1
On Thu, Oct 05, 2017 at 06:43:23PM +0000, Waiman Long wrote:
[...]
> +/*
> + * As all the locks in the dlock list are dynamically allocated, they need
> + * to belong to their own special lock class to avoid warning and stack
> + * trace in kernel log when lockdep is enabled. Statically allocated locks
> + * don't have this problem.
> + */
> +static struct lock_class_key dlock_list_key;
> +

So in this way, you make all dlock_lists share the same lock_class_key,
which means if there are two structures:

	struct some_a {
		...
		struct dlock_list_heads dlists;
	};

	struct some_b {
		...
		struct dlock_list_heads dlists;
	};

some_a::dlists and some_b::dlists are going to have the same lockdep
key, is this what you want? If not, you may want to do something like
init_srcu_struct() does.

> +/*
> + * Initialize cpu2idx mapping table
> + *
> + * It is possible that a dlock-list can be allocated before the cpu2idx is
> + * initialized. In this case, all the cpus are mapped to the first entry
> + * before initialization.
> + *
> + */
> +static int __init cpu2idx_init(void)
> +{
> +	int idx, cpu;
> +
> +	idx = 0;
> +	for_each_possible_cpu(cpu)
> +		per_cpu(cpu2idx, cpu) = idx++;
> +	return 0;
> +}
> +postcore_initcall(cpu2idx_init);
> +
> +/**
> + * alloc_dlock_list_heads - Initialize and allocate the list of head entries
> + * @dlist: Pointer to the dlock_list_heads structure to be initialized
> + * Return: 0 if successful, -ENOMEM if memory allocation error
> + *
> + * This function does not allocate the dlock_list_heads structure itself. The
> + * callers will have to do their own memory allocation, if necessary. However,
> + * this allows embedding the dlock_list_heads structure directly into other
> + * structures.
> + */
> +int alloc_dlock_list_heads(struct dlock_list_heads *dlist)
> +{
> +	int idx;
> +
> +	dlist->heads = kcalloc(nr_cpu_ids, sizeof(struct dlock_list_head),
> +			       GFP_KERNEL);
> +
> +	if (!dlist->heads)
> +		return -ENOMEM;
> +
> +	for (idx = 0; idx < nr_cpu_ids; idx++) {
> +		struct dlock_list_head *head = &dlist->heads[idx];
> +
> +		INIT_LIST_HEAD(&head->list);
> +		head->lock = __SPIN_LOCK_UNLOCKED(&head->lock);
> +		lockdep_set_class(&head->lock, &dlock_list_key);
> +	}
> +	return 0;
> +}
> +
> +/**
> + * free_dlock_list_heads - Free all the heads entries of the dlock list
> + * @dlist: Pointer of the dlock_list_heads structure to be freed
> + *
> + * This function doesn't free the dlock_list_heads structure itself. So
> + * the caller will have to do it, if necessary.
> + */
> +void free_dlock_list_heads(struct dlock_list_heads *dlist)
> +{
> +	kfree(dlist->heads);
> +	dlist->heads = NULL;
> +}
> +
> +/**
> + * dlock_lists_empty - Check if all the dlock lists are empty
> + * @dlist: Pointer to the dlock_list_heads structure
> + * Return: true if list is empty, false otherwise.
> + *
> + * This can be a pretty expensive function call. If this function is required
> + * in a performance critical path, we may have to maintain a global count
> + * of the list entries in the global dlock_list_heads structure instead.
> + */
> +bool dlock_lists_empty(struct dlock_list_heads *dlist)
> +{
> +	int idx;
> +
> +	for (idx = 0; idx < nr_cpu_ids; idx++)
> +		if (!list_empty(&dlist->heads[idx].list))
> +			return false;
> +	return true;
> +}
> +
> +/**
> + * dlock_lists_add - Adds a node to the given dlock list
> + * @node : The node to be added
> + * @dlist: The dlock list where the node is to be added
> + *
> + * List selection is based on the CPU being used when the dlock_list_add()
> + * function is called. However, deletion may be done by a different CPU.
> + */
> +void dlock_lists_add(struct dlock_list_node *node,
> +		     struct dlock_list_heads *dlist)
> +{
> +	struct dlock_list_head *head = &dlist->heads[this_cpu_read(cpu2idx)];
> +
> +	/*
> +	 * There is no need to disable preemption
> +	 */
> +	spin_lock(&head->lock);
> +	node->head = head;
> +	list_add(&node->list, &head->list);
> +	spin_unlock(&head->lock);
> +}
> +
> +/**
> + * dlock_lists_del - Delete a node from a dlock list
> + * @node : The node to be deleted
> + *
> + * We need to check the lock pointer again after taking the lock to guard
> + * against concurrent deletion of the same node. If the lock pointer changes
> + * (becomes NULL or to a different one), we assume that the deletion was done
> + * elsewhere. A warning will be printed if this happens as it is likely to be
> + * a bug.
> + */
> +void dlock_lists_del(struct dlock_list_node *node)
> +{
> +	struct dlock_list_head *head;
> +	bool retry;
> +
> +	do {
> +		head = READ_ONCE(node->head);

Since we read node->head locklessly here, I think we should use
WRITE_ONCE() for all the stores of node->head, to avoid store tearings?

Regards,
Boqun

> +		if (WARN_ONCE(!head, "%s: node 0x%lx has no associated head\n",
> +			      __func__, (unsigned long)node))
> +			return;
> +
> +		spin_lock(&head->lock);
> +		if (likely(head == node->head)) {
> +			list_del_init(&node->list);
> +			node->head = NULL;
> +			retry = false;
> +		} else {
> +			/*
> +			 * The lock has somehow changed. Retry again if it is
> +			 * not NULL. Otherwise, just ignore the delete
> +			 * operation.
> +			 */
> +			retry = (node->head != NULL);
> +		}
> +		spin_unlock(&head->lock);
> +	} while (retry);
> +}
> +
[...]
Waiman Long Oct. 13, 2017, 9:10 p.m. UTC | #2
On 10/10/2017 01:35 AM, Boqun Feng wrote:
> On Thu, Oct 05, 2017 at 06:43:23PM +0000, Waiman Long wrote:
> [...]
>> +/*
>> + * As all the locks in the dlock list are dynamically allocated, they need
>> + * to belong to their own special lock class to avoid warning and stack
>> + * trace in kernel log when lockdep is enabled. Statically allocated locks
>> + * don't have this problem.
>> + */
>> +static struct lock_class_key dlock_list_key;
>> +
> So in this way, you make all dlock_lists share the same lock_class_key,
> which means if there are two structures:
>
> 	struct some_a {
> 		...
> 		struct dlock_list_heads dlists;
> 	};
>
> 	struct some_b {
> 		...
> 		struct dlock_list_heads dlists;
> 	};
>
> some_a::dlists and some_b::dlists are going to have the same lockdep
> key, is this what you want? If not, you may want to do something like
> init_srcu_struct() does.

I think it will be a problem only if a task acquires a lock in a
dlock-list and then acquires another lock from another dlock-list. The
way the dlock-list is used, no more than one lock will be acquired at
any time, so there won't be nested locking within the same dlock-list.
It is not a problem with the current use cases that I and Davidlohr
have, but it may be a problem if dlock-list becomes more widely used. I
will take a look at what init_srcu_struct() does, and maybe update the
patch accordingly. Thanks for the suggestion.

>
>> + * dlock_lists_del - Delete a node from a dlock list
>> + * @node : The node to be deleted
>> + *
>> + * We need to check the lock pointer again after taking the lock to guard
>> + * against concurrent deletion of the same node. If the lock pointer changes
>> + * (becomes NULL or to a different one), we assume that the deletion was done
>> + * elsewhere. A warning will be printed if this happens as it is likely to be
>> + * a bug.
>> + */
>> +void dlock_lists_del(struct dlock_list_node *node)
>> +{
>> +	struct dlock_list_head *head;
>> +	bool retry;
>> +
>> +	do {
>> +		head = READ_ONCE(node->head);
> Since we read node->head locklessly here, I think we should use
> WRITE_ONCE() for all the stores of node->head, to avoid store tearings?

Yes, you are right. I will use WRITE_ONCE() in my next version.

Cheers,
Longman
Boqun Feng Oct. 18, 2017, 8:55 a.m. UTC | #3
On Thu, Oct 05, 2017 at 06:43:23PM +0000, Waiman Long wrote:
[...]
> +/*
> + * Find the first entry of the next available list.
> + */
> +extern struct dlock_list_node *
> +__dlock_list_next_list(struct dlock_list_iter *iter);
> +
> +/**
> + * __dlock_list_next_entry - Iterate to the next entry of the dlock list
> + * @curr : Pointer to the current dlock_list_node structure
> + * @iter : Pointer to the dlock list iterator structure
> + * Return: Pointer to the next entry or NULL if all the entries are iterated
> + *
> + * The iterator has to be properly initialized before calling this function.
> + */
> +static inline struct dlock_list_node *
> +__dlock_list_next_entry(struct dlock_list_node *curr,
> +			struct dlock_list_iter *iter)
> +{
> +	/*
> +	 * Find next entry
> +	 */
> +	if (curr)
> +		curr = list_next_entry(curr, list);
> +
> +	if (!curr || (&curr->list == &iter->entry->list)) {
> +		/*
> +		 * The current list has been exhausted, try the next available
> +		 * list.
> +		 */
> +		curr = __dlock_list_next_list(iter);
> +	}
> +
> +	return curr;	/* Continue the iteration */
> +}
> +
> +/**
> + * dlock_list_first_entry - get the first element from a list
> + * @iter  : The dlock list iterator.
> + * @type  : The type of the struct this is embedded in.
> + * @member: The name of the dlock_list_node within the struct.
> + * Return : Pointer to the next entry or NULL if all the entries are iterated.
> + */
> +#define dlock_list_first_entry(iter, type, member)			\
> +	({								\
> +		struct dlock_list_node *_n;				\
> +		_n = __dlock_list_next_entry(NULL, iter);		\
> +		_n ? list_entry(_n, type, member) : NULL;		\
> +	})
> +
> +/**
> + * dlock_list_next_entry - iterate to the next entry of the list
> + * @pos   : The type * to cursor
> + * @iter  : The dlock list iterator.
> + * @member: The name of the dlock_list_node within the struct.
> + * Return : Pointer to the next entry or NULL if all the entries are iterated.
> + *
> + * Note that pos can't be NULL.
> + */
> +#define dlock_list_next_entry(pos, iter, member)			\
> +	({								\
> +		struct dlock_list_node *_n;				\
> +		_n = __dlock_list_next_entry(&(pos)->member, iter);	\
> +		_n ? list_entry(_n, typeof(*(pos)), member) : NULL;	\
> +	})
> +

[...]

> +/**
> + * dlist_for_each_entry_safe - iterate over the dlock list & safe over removal
> + * @pos   : Type * to use as a loop cursor
> + * @n	  : Another type * to use as temporary storage
> + * @iter  : The dlock list iterator
> + * @member: The name of the dlock_list_node within the struct
> + *
> + * This iteration macro is safe with respect to list entry removal.
> + * However, it cannot correctly iterate newly added entries right after the
> + * current one.
> + */
> +#define dlist_for_each_entry_safe(pos, n, iter, member)			\

So I missed something interesting here ;-)

> +	for (pos = dlock_list_first_entry(iter, typeof(*(pos)), member);\
> +	    ({								\
> +		bool _b = (pos != NULL);				\
> +		if (_b)							\
> +			n = dlock_list_next_entry(pos, iter, member);	\

If @pos is the last item of the list of the index/cpu,
dlock_list_next_entry() will eventually call __dlock_list_next_list(),
which will drop the lock for the current list and grab the lock for the
next list, leaving @pos unprotected. But in the meanwhile, there could
be another thread deleting @pos via dlock_lists_del() and freeing it.
This is a use-after-free.

I think we can have something like:

	(by adding a ->prev_entry in dlock_list_iter and several helper
	functions.)

	bool dlist_is_last_perlist(struct dlock_list_node *n)
	{
		return list_is_last(&n->list, &n->head->list);
	
	}

	void dlock_list_release_prev(struct dlock_list_iter *iter)
	{
		spin_unlock(iter->prev_entry->lock);
		iter->prev_entry = NULL;
	}

	#define dlist_for_each_entry_safe(pos, n, iter, member)		\
		for (pos = dlock_list_first_entry(iter, typeof(*(pos)), member);	\
		    ({									\
			bool _b = (pos != NULL);					\
			if (_b) {							\
				if (dlist_is_last_perlist(&(pos)->member)) {		\
					iter->prev_entry = iter->entry;			\
					iter->entry = NULL;				\
					n = dlock_list_first_entry(NULL, iter, member);	\
				}							\
				else							\
					n = dlock_list_next_entry(pos, iter, member);	\
			}								\
			_b;								\
		    });									\
		    pos = n, iter->prev_entry && dlock_list_release_prev(iter))

Of course, the dlock_list_first_entry() here may need a better name ;-)

Thoughts?

Regards,
Boqun

> +		_b;							\
> +	    });								\
> +	    pos = n)
> +
> +#endif /* __LINUX_DLOCK_LIST_H */

[...]

> +/**
> + * __dlock_list_next_list: Find the first entry of the next available list
> + * @dlist: Pointer to the dlock_list_heads structure
> + * @iter : Pointer to the dlock list iterator structure
> + * Return: true if the entry is found, false if all the lists exhausted
> + *
> + * The information about the next available list will be put into the iterator.
> + */
> +struct dlock_list_node *__dlock_list_next_list(struct dlock_list_iter *iter)
> +{
> +	struct dlock_list_node *next;
> +	struct dlock_list_head *head;
> +
> +restart:
> +	if (iter->entry) {
> +		spin_unlock(&iter->entry->lock);
> +		iter->entry = NULL;
> +	}
> +
> +next_list:
> +	/*
> +	 * Try next list
> +	 */
> +	if (++iter->index >= nr_cpu_ids)
> +		return NULL;	/* All the entries iterated */
> +
> +	if (list_empty(&iter->head[iter->index].list))
> +		goto next_list;
> +
> +	head = iter->entry = &iter->head[iter->index];
> +	spin_lock(&head->lock);
> +	/*
> +	 * There is a slight chance that the list may become empty just
> +	 * before the lock is acquired. So an additional check is
> +	 * needed to make sure that a valid node will be returned.
> +	 */
> +	if (list_empty(&head->list))
> +		goto restart;
> +
> +	next = list_entry(head->list.next, struct dlock_list_node,
> +			  list);
> +	WARN_ON_ONCE(next->head != head);
> +
> +	return next;
> +}
> -- 
> 1.8.3.1
>
diff mbox

Patch

diff --git a/include/linux/dlock-list.h b/include/linux/dlock-list.h
new file mode 100644
index 0000000..7940e524
--- /dev/null
+++ b/include/linux/dlock-list.h
@@ -0,0 +1,224 @@ 
+/*
+ * Distributed and locked list
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
+ * (C) Copyright 2017 Red Hat, Inc.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ */
+#ifndef __LINUX_DLOCK_LIST_H
+#define __LINUX_DLOCK_LIST_H
+
+#include <linux/spinlock.h>
+#include <linux/list.h>
+
+/*
+ * Per-list head of a dlock list.
+ *
+ * Each dlock_list_head pairs a list with the spinlock that protects it and
+ * is cacheline aligned to reduce contention among different CPUs. The
+ * dlock_list_node structures hold a pointer back to their head instead.
+ */
+struct dlock_list_head {
+	struct list_head list;	/* entries queued on this list */
+	spinlock_t lock;	/* protects @list */
+} ____cacheline_aligned_in_smp;
+
+struct dlock_list_heads {
+	struct dlock_list_head *heads;	/* array of nr_cpu_ids list heads */
+};
+
+/*
+ * dlock list node: embedded in each object that is put on a dlock list
+ */
+struct dlock_list_node {
+	struct list_head list;		/* link in the owning head's list */
+	struct dlock_list_head *head;	/* owning head, NULL if on no list */
+};
+
+/*
+ * dlock list iteration state
+ *
+ * @index: list being walked (-1 before the first), @head: array of list
+ * heads, @entry: head of the list being walked, locked by the iterator.
+ * Opaque and subject to change: only use the helpers in this header file.
+ */
+struct dlock_list_iter {
+	int index;
+	struct dlock_list_head *head, *entry;
+};
+
+#define DLOCK_LIST_ITER_INIT(dlist)			\
+	{	/* .entry is left NULL: no list yet */	\
+		.index = -1,	/* before first list */	\
+		.head = (dlist)->heads,			\
+	}
+
+#define DEFINE_DLOCK_LIST_ITER(s, heads)	\
+	struct dlock_list_iter s = DLOCK_LIST_ITER_INIT(heads)
+
+static inline void init_dlock_list_iter(struct dlock_list_iter *iter,
+					struct dlock_list_heads *heads)
+{
+	*iter = (struct dlock_list_iter)DLOCK_LIST_ITER_INIT(heads);
+}
+
+#define DLOCK_LIST_NODE_INIT(name)				\
+	{	/* .head is left NULL: not on any list */	\
+		.list = LIST_HEAD_INIT(name)			\
+	}
+
+static inline void init_dlock_list_node(struct dlock_list_node *node)
+{
+	*node = (struct dlock_list_node)DLOCK_LIST_NODE_INIT(node->list);
+}
+
+/**
+ * dlock_list_unlock - unlock the spinlock that protects the current list
+ * @iter: Iterator; its ->entry must point to the currently locked head
+ */
+static inline void dlock_list_unlock(struct dlock_list_iter *iter)
+{
+	spin_unlock(&iter->entry->lock);
+}
+
+/**
+ * dlock_list_relock - lock the spinlock that protects the current list
+ * @iter: Iterator; its ->entry must point to the head to lock again
+ */
+static inline void dlock_list_relock(struct dlock_list_iter *iter)
+{
+	spin_lock(&iter->entry->lock);
+}
+
+/*
+ * Allocation and freeing of dlock list
+ */
+extern int  alloc_dlock_list_heads(struct dlock_list_heads *dlist);
+extern void free_dlock_list_heads(struct dlock_list_heads *dlist);
+
+/*
+ * Check if a dlock list is empty or not.
+ */
+extern bool dlock_lists_empty(struct dlock_list_heads *dlist);
+
+/*
+ * The dlock list addition and deletion functions here are not irq-safe.
+ * Special irq-safe variants will have to be added if we need them.
+ */
+extern void dlock_lists_add(struct dlock_list_node *node,
+			    struct dlock_list_heads *dlist);
+extern void dlock_lists_del(struct dlock_list_node *node);
+
+/*
+ * Find the first entry of the next available list.
+ */
+extern struct dlock_list_node *
+__dlock_list_next_list(struct dlock_list_iter *iter);
+
+/**
+ * __dlock_list_next_entry - Iterate to the next entry of the dlock list
+ * @curr : Current node, or NULL to fetch the first node of the next list
+ * @iter : Pointer to the dlock list iterator structure
+ * Return: Pointer to the next entry or NULL if all the entries are iterated
+ *
+ * The iterator has to be properly initialized before calling this function.
+ */
+static inline struct dlock_list_node *
+__dlock_list_next_entry(struct dlock_list_node *curr,
+			struct dlock_list_iter *iter)
+{
+	struct dlock_list_node *next;
+
+	/* Nothing to step from: go straight to the next available list */
+	if (!curr)
+		return __dlock_list_next_list(iter);
+
+	next = list_next_entry(curr, list);
+
+	/*
+	 * Arriving back at the list head means that the current list has
+	 * been exhausted, so move on to the next available list.
+	 */
+	if (!next || &next->list == &iter->entry->list)
+		return __dlock_list_next_list(iter);
+	return next;
+}
+
+/**
+ * dlock_list_first_entry - get the first element from a list
+ * @iter  : The dlock list iterator.
+ * @type  : The type of the struct this is embedded in.
+ * @member: The name of the dlock_list_node within the struct.
+ * Return : Pointer to the first entry of the next non-empty list, or NULL.
+ */
+#define dlock_list_first_entry(iter, type, member)			\
+	({								\
+		struct dlock_list_node *_n;				\
+		_n = __dlock_list_next_entry(NULL, iter);		\
+		_n ? list_entry(_n, type, member) : NULL;		\
+	})
+
+/**
+ * dlock_list_next_entry - iterate to the next entry of the list
+ * @pos   : The type * of the current entry (the cursor)
+ * @iter  : The dlock list iterator.
+ * @member: The name of the dlock_list_node within the struct.
+ * Return : Pointer to the next entry or NULL if all the entries are iterated.
+ *
+ * Note that pos can't be NULL.
+ */
+#define dlock_list_next_entry(pos, iter, member)			\
+	({								\
+		struct dlock_list_node *_n;				\
+		_n = __dlock_list_next_entry(&(pos)->member, iter);	\
+		_n ? list_entry(_n, typeof(*(pos)), member) : NULL;	\
+	})
+
+/**
+ * dlist_for_each_entry - iterate over the dlock list
+ * @pos   : Type * to use as a loop cursor
+ * @iter  : The dlock list iterator
+ * @member: The name of the dlock_list_node within the struct
+ *
+ * This iteration macro isn't safe with respect to list entry removal, but
+ * it can correctly iterate newly added entries right after the current one.
+ * The current list's lock is held in the body and stays held on a break.
+ */
+#define dlist_for_each_entry(pos, iter, member)				\
+	for (pos = dlock_list_first_entry(iter, typeof(*(pos)), member);\
+	     pos != NULL;						\
+	     pos = dlock_list_next_entry(pos, iter, member))
+
+/**
+ * dlist_for_each_entry_safe - iterate over the dlock list & safe over removal
+ * @pos   : Type * to use as a loop cursor
+ * @n	  : Another type * to use as temporary storage
+ * @iter  : The dlock list iterator
+ * @member: The name of the dlock_list_node within the struct
+ *
+ * Safe against removal of @pos by the loop body, but new entries added right
+ * after @pos are not visited. NOTE: for the last entry of a list, @pos's
+ * list lock is already dropped (and the next list's taken) when body runs.
+ */
+#define dlist_for_each_entry_safe(pos, n, iter, member)			\
+	for (pos = dlock_list_first_entry(iter, typeof(*(pos)), member);\
+	    ({								\
+		bool _b = (pos != NULL);				\
+		if (_b)							\
+			n = dlock_list_next_entry(pos, iter, member);	\
+		_b;							\
+	    });								\
+	    pos = n)
+
+#endif /* __LINUX_DLOCK_LIST_H */
diff --git a/lib/Makefile b/lib/Makefile
index dafa796..0536cd3 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -38,7 +38,7 @@  obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
 	 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
-	 once.o refcount.o usercopy.o errseq.o
+	 once.o refcount.o usercopy.o errseq.o dlock-list.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += hexdump.o
diff --git a/lib/dlock-list.c b/lib/dlock-list.c
new file mode 100644
index 0000000..2779e3e
--- /dev/null
+++ b/lib/dlock-list.c
@@ -0,0 +1,231 @@ 
+/*
+ * Distributed and locked list
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
+ * (C) Copyright 2017 Red Hat, Inc.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ */
+#include <linux/dlock-list.h>
+#include <linux/lockdep.h>
+#include <linux/slab.h>
+#include <linux/cpumask.h>
+
+/*
+ * The distributed and locked list is a distributed set of lists each of
+ * which is protected by its own spinlock, but acts like a single
+ * consolidated list to the callers. For scaling purpose, the number of
+ * lists used is equal to the number of possible CPUs in the system to
+ * minimize contention.
+ *
+ * However, it is possible that individual CPU numbers may be equal to
+ * or greater than the number of possible CPUs when there are holes in
+ * the CPU number list. As a result, we need to map the CPU number to a
+ * list index.
+ */
+static DEFINE_PER_CPU_READ_MOSTLY(int, cpu2idx);
+
+/*
+ * As all the locks in the dlock list are dynamically allocated, they need
+ * to belong to their own special lock class to avoid warning and stack
+ * trace in kernel log when lockdep is enabled. Statically allocated locks
+ * don't have this problem.
+ */
+static struct lock_class_key dlock_list_key;
+
+/*
+ * Build the cpu2idx mapping table
+ *
+ * Each possible CPU gets the next consecutive list index, starting at 0.
+ * A dlock-list can be allocated before this initcall has run; until then
+ * all the cpus are mapped to the first entry.
+ *
+ */
+static int __init cpu2idx_init(void)
+{
+	int cpu, next_idx = 0;
+
+	for_each_possible_cpu(cpu)
+		per_cpu(cpu2idx, cpu) = next_idx++;
+
+	return 0;
+}
+postcore_initcall(cpu2idx_init);
+
+/**
+ * alloc_dlock_list_heads - Allocate and initialize the array of list heads
+ * @dlist: Pointer to the dlock_list_heads structure to be initialized
+ * Return: 0 if successful, -ENOMEM if the array cannot be allocated
+ *
+ * Only the nr_cpu_ids list heads are allocated here (with GFP_KERNEL). The
+ * dlock_list_heads structure itself has to be provided by the caller, which
+ * allows it to be embedded directly into other structures. On failure,
+ * @dlist->heads is left NULL.
+ */
+int alloc_dlock_list_heads(struct dlock_list_heads *dlist)
+{
+	struct dlock_list_head *head;
+	int remaining;
+
+	head = kcalloc(nr_cpu_ids, sizeof(*head), GFP_KERNEL);
+	dlist->heads = head;
+	if (!head)
+		return -ENOMEM;
+
+	/* Give every head an empty list and an unlocked spinlock */
+	for (remaining = nr_cpu_ids; remaining > 0; remaining--, head++) {
+		INIT_LIST_HEAD(&head->list);
+		head->lock = __SPIN_LOCK_UNLOCKED(&head->lock);
+		/* All dlock list locks share one lockdep class */
+		lockdep_set_class(&head->lock, &dlock_list_key);
+	}
+	return 0;
+}
+
+/**
+ * free_dlock_list_heads - Free all the head entries of the dlock list
+ * @dlist: Pointer to the dlock_list_heads structure whose heads are freed
+ *
+ * Only the array of heads is freed and the pointer to it reset to NULL. The
+ * dlock_list_heads structure itself is left for the caller to free.
+ */
+void free_dlock_list_heads(struct dlock_list_heads *dlist)
+{
+	kfree(dlist->heads);
+	dlist->heads = NULL;
+}
+
+/**
+ * dlock_lists_empty - Check if all the dlock lists are empty
+ * @dlist: Pointer to the dlock_list_heads structure
+ * Return: true if every per-CPU list is empty, false otherwise.
+ *
+ * All nr_cpu_ids lists may have to be looked at, which can be expensive. A
+ * global entry count in the dlock_list_heads structure may be needed
+ * instead should this ever be called from a performance critical path.
+ */
+bool dlock_lists_empty(struct dlock_list_heads *dlist)
+{
+	int i = 0;
+
+	/* Lockless scan: stop at the first list that has an entry */
+	while (i < nr_cpu_ids && list_empty(&dlist->heads[i].list))
+		i++;
+	return i >= nr_cpu_ids;
+}
+
+/**
+ * dlock_lists_add - Adds a node to the given dlock list
+ * @node : The node to be added
+ * @dlist: The dlock list where the node is to be added
+ *
+ * The list is picked by the calling CPU; deletion may happen on any CPU.
+ */
+void dlock_lists_add(struct dlock_list_node *node,
+		     struct dlock_list_heads *dlist)
+{
+	struct dlock_list_head *head = &dlist->heads[this_cpu_read(cpu2idx)];
+
+	/*
+	 * There is no need to disable preemption: the chosen list is
+	 * protected by its own lock. WRITE_ONCE() avoids store tearing
+	 * against the lockless READ_ONCE() in dlock_lists_del().
+	 */
+	spin_lock(&head->lock);
+	WRITE_ONCE(node->head, head);
+	list_add(&node->list, &head->list);
+	spin_unlock(&head->lock);
+}
+
+/**
+ * dlock_lists_del - Delete a node from a dlock list
+ * @node : The node to be deleted
+ *
+ * The head pointer is read locklessly, so it is checked again after taking
+ * the lock to guard against a concurrent deletion of the same node. If it
+ * became NULL, the deletion was done elsewhere and nothing is done; if it
+ * points to a different head, the deletion is retried with that head. A
+ * warning is printed when the lockless read finds no head, a likely bug.
+ */
+void dlock_lists_del(struct dlock_list_node *node)
+{
+	struct dlock_list_head *head;
+	bool retry;
+
+	do {
+		head = READ_ONCE(node->head);
+		if (WARN_ONCE(!head, "%s: node 0x%lx has no associated head\n",
+			      __func__, (unsigned long)node))
+			return;
+
+		spin_lock(&head->lock);
+		if (likely(head == node->head)) {
+			list_del_init(&node->list);
+			WRITE_ONCE(node->head, NULL);
+			retry = false;
+		} else {
+			/*
+			 * The head has somehow changed. Retry again if it is
+			 * not NULL. Otherwise, just ignore the delete
+			 * operation.
+			 */
+			retry = (node->head != NULL);
+		}
+		spin_unlock(&head->lock);
+	} while (retry);
+}
+
+/**
+ * __dlock_list_next_list: Find the first entry of the next available list
+ * @iter : Pointer to the dlock list iterator structure
+ * Return: First node of the next non-empty list, whose lock is then held,
+ *	   or NULL (with no list locked) if all the lists are exhausted
+ *
+ * The information about the next available list will be put into the iterator.
+ */
+struct dlock_list_node *__dlock_list_next_list(struct dlock_list_iter *iter)
+{
+	struct dlock_list_node *next;
+	struct dlock_list_head *head;
+
+restart:
+	if (iter->entry) {
+		spin_unlock(&iter->entry->lock);
+		iter->entry = NULL;
+	}
+
+next_list:
+	/*
+	 * Try next list; lists that look empty are skipped without locking
+	 */
+	if (++iter->index >= nr_cpu_ids)
+		return NULL;	/* All the entries iterated */
+
+	if (list_empty(&iter->head[iter->index].list))
+		goto next_list;
+
+	head = iter->entry = &iter->head[iter->index];
+	spin_lock(&head->lock);
+	/*
+	 * There is a slight chance that the list may become empty just
+	 * before the lock is acquired. So an additional check is
+	 * needed to make sure that a valid node will be returned.
+	 */
+	if (list_empty(&head->list))
+		goto restart;
+
+	next = list_entry(head->list.next, struct dlock_list_node,
+			  list);
+	WARN_ON_ONCE(next->head != head);
+
+	return next;
+}