
[v4] lib/dlock-list: Scale dlock_lists_empty()

Message ID 20171106184708.kmwfcchjwjzucuja@linux-n805 (mailing list archive)
State New, archived

Commit Message

Davidlohr Bueso Nov. 6, 2017, 6:47 p.m. UTC
Instead of the current O(N) implementation, at the cost
of adding an atomic counter, we can convert the call to
an atomic_read(). The counter only serves for accounting
empty to non-empty transitions, and vice versa; therefore
it is only modified twice for each of the lists during the
lifetime of the dlock (while in use).

In addition, to be able to unaccount a list_del(), we
add a dlist pointer to each head, thus minimizing the
overall memory footprint.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
Changes from v3:
 - s/waiters/used_lists, more doc around the counter.
 - fixed racy scenario when the list empty/non-empty
   condition changes after taking the lock.
 - sprinkled unlikely() around all checks, these are
   only corner cases in the lifetime of the lock.

 include/linux/dlock-list.h |  8 ++++++
 lib/dlock-list.c           | 67 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 65 insertions(+), 10 deletions(-)
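
For readers coming to the series cold, the idea can be illustrated with a small,
self-contained userspace sketch (plain C11 with pthreads). The dl_* names,
NR_BUCKETS and the bucket layout below are invented for illustration and are
not the kernel API. Each bucket has its own lock, and one shared counter only
tracks empty <-> non-empty transitions, so the global emptiness check becomes a
single atomic load instead of an O(N) walk over all the heads. The sketch is
deliberately simplified by updating the counter under the per-bucket lock,
which sidesteps the memory-ordering discussion in the comments below; the
kernel patch instead bumps the counter opportunistically around the lock, and
stores a dlist back-pointer in each head so that the delete path, which only
sees the node, can find the counter.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

#define NR_BUCKETS	16

struct dl_node {
	struct dl_node *next, *prev;
};

struct dl_bucket {
	struct dl_node head;		/* circular list; empty when head.next == &head */
	pthread_mutex_t lock;
};

struct dl_set {
	struct dl_bucket buckets[NR_BUCKETS];
	atomic_int used_lists;		/* number of non-empty buckets */
};

static void dl_set_init(struct dl_set *s)
{
	for (int i = 0; i < NR_BUCKETS; i++) {
		s->buckets[i].head.next = &s->buckets[i].head;
		s->buckets[i].head.prev = &s->buckets[i].head;
		pthread_mutex_init(&s->buckets[i].lock, NULL);
	}
	atomic_init(&s->used_lists, 0);
}

static bool bucket_empty(struct dl_bucket *b)
{
	return b->head.next == &b->head;
}

static void dl_add(struct dl_set *s, unsigned int idx, struct dl_node *n)
{
	struct dl_bucket *b = &s->buckets[idx % NR_BUCKETS];

	pthread_mutex_lock(&b->lock);
	if (bucket_empty(b))			/* empty -> non-empty transition */
		atomic_fetch_add(&s->used_lists, 1);
	n->next = b->head.next;
	n->prev = &b->head;
	b->head.next->prev = n;
	b->head.next = n;
	pthread_mutex_unlock(&b->lock);
}

static void dl_del(struct dl_set *s, unsigned int idx, struct dl_node *n)
{
	struct dl_bucket *b = &s->buckets[idx % NR_BUCKETS];

	pthread_mutex_lock(&b->lock);
	n->prev->next = n->next;
	n->next->prev = n->prev;
	if (bucket_empty(b))			/* non-empty -> empty transition */
		atomic_fetch_sub(&s->used_lists, 1);
	pthread_mutex_unlock(&b->lock);
}

/* O(1): a point-in-time answer, just like the patched dlock_lists_empty(). */
static bool dl_empty(struct dl_set *s)
{
	return atomic_load(&s->used_lists) == 0;
}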

Comments

Waiman Long Nov. 6, 2017, 7:06 p.m. UTC | #1
On 11/06/2017 01:47 PM, Davidlohr Bueso wrote:
> Instead of the current O(N) implementation, at the cost
> of adding an atomic counter, we can convert the call to
> an atomic_read(). The counter only serves for accounting
> empty to non-empty transitions, and vice versa; therefore
> only modified twice for each of the lists during the
> lifetime of the dlock (while used).
>
> In addition, to be able to unaccount a list_del(), we
> add a dlist pointer to each head, thus minimizing the
> overall memory footprint.
>
> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
> ---
> Changes from v3:
> - s/waiters/used_lists, more doc around the counter.
> - fixed racy scenario when the list empty/non-empty
>   condition changes after taking the lock.
> - sprinkled unlikely() around all checks, these are
>   only corner cases in the lifetime of the lock.
>

This patch looks good to me.

Acked-by: Waiman Long <longman@redhat.com>
Jan Kara Nov. 7, 2017, 11:59 a.m. UTC | #2
On Mon 06-11-17 10:47:08, Davidlohr Bueso wrote:
> Instead of the current O(N) implementation, at the cost
> of adding an atomic counter, we can convert the call to
> an atomic_read(). The counter only serves for accounting
> empty to non-empty transitions, and vice versa; therefore
> only modified twice for each of the lists during the
> lifetime of the dlock (while used).
> 
> In addition, to be able to unaccount a list_del(), we
> add a dlist pointer to each head, thus minimizing the
> overall memory footprint.
> 
> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>

Looks good to me. You can add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza Kara

Andreas Dilger Nov. 7, 2017, 5:59 p.m. UTC | #3
On Nov 7, 2017, at 4:59 AM, Jan Kara <jack@suse.cz> wrote:
> On Mon 06-11-17 10:47:08, Davidlohr Bueso wrote:
>> +	/*
>> +	 * Serialize dlist->used_lists such that a 0->1 transition is not
>> +	 * missed by another thread checking if any of the dlock lists are
>> +	 * used.
>> +	 *
>> +	 * CPU0				    CPU1
>> +	 * dlock_list_add()                 dlock_lists_empty()
>> +	 *   [S] atomic_inc(used_lists);
>> +	 *       smp_mb__after_atomic();
>> +	 *					  smp_mb__before_atomic();
>> +	 *				      [L] atomic_read(used_lists)
>> +	 *       list_add()
>> +	 */
>> +	smp_mb__before_atomic();
>> +	return !atomic_read(&dlist->used_lists);

Just a general kernel programming question here - I thought the whole point
of atomics is that they are, well, atomic across all CPUs so there is no
need for a memory barrier?  If there is a need for a memory barrier for
each atomic access (assuming it isn't accessed under another lock, which would
make the use of atomic types pointless, IMHO) then I'd think there is a lot
of code in the kernel that isn't doing this properly.

What am I missing here?

I don't see how this helps if the operations are executed like:

	 * CPU0				    CPU1
	 * dlock_list_add()                 dlock_lists_empty()
	 *   [S] atomic_inc(used_lists);
	 *					  smp_mb__before_atomic();
	 *       smp_mb__after_atomic();
	 *				      [L] atomic_read(used_lists)

or alternately like:

	 * CPU0				    CPU1
	 * dlock_list_add()                 dlock_lists_empty()
	 *					  smp_mb__before_atomic();
	 *   [S] atomic_inc(used_lists);
	 *       smp_mb__after_atomic();
	 *				      [L] atomic_read(used_lists)

then the same problem would exist, unless those functions/macros are somehow
bound to the atomic operations themselves?  In that case, what makes the use
of atomic_{inc,dec,read}() in other parts of the code safe without them?

Cheers, Andreas
Waiman Long Nov. 7, 2017, 6:57 p.m. UTC | #4
On 11/07/2017 12:59 PM, Andreas Dilger wrote:
> On Nov 7, 2017, at 4:59 AM, Jan Kara <jack@suse.cz> wrote:
>> On Mon 06-11-17 10:47:08, Davidlohr Bueso wrote:
>>> +	/*
>>> +	 * Serialize dlist->used_lists such that a 0->1 transition is not
>>> +	 * missed by another thread checking if any of the dlock lists are
>>> +	 * used.
>>> +	 *
>>> +	 * CPU0				    CPU1
>>> +	 * dlock_list_add()                 dlock_lists_empty()
>>> +	 *   [S] atomic_inc(used_lists);
>>> +	 *       smp_mb__after_atomic();
>>> +	 *					  smp_mb__before_atomic();
>>> +	 *				      [L] atomic_read(used_lists)
>>> +	 *       list_add()
>>> +	 */
>>> +	smp_mb__before_atomic();
>>> +	return !atomic_read(&dlist->used_lists);
> Just a general kernel programming question here - I thought the whole point
> of atomics is that they are, well, atomic across all CPUs so there is no
> need for a memory barrier?  If there is a need for a memory barrier for
> each atomic access (assuming it isn't accessed under another lock, which would
> make the use of atomic types pointless, IMHO) then I'd think there is a lot
> of code in the kernel that isn't doing this properly.
>
> What am I missing here?

Atomic update and memory barrier are two different things. Atomic update
means other CPUs see either the value before or after the update; they
won't see anything in between. For a counter, it means we won't miss any
counts. However, not all atomic operations give an ordering guarantee.
The atomic_read() and atomic_inc() are examples that do not provide a
memory ordering guarantee. See Documentation/memory-barriers.txt for
more information about it.

A CPU can perform atomic operations 1 & 2 in program order, but other
CPUs may see operation 2 before operation 1. Here a memory barrier
can be used to guarantee that other CPUs see the memory updates in a
certain order.

Hope this helps.
Longman
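
To make the point concrete, here is a store-buffering litmus test in the style
of the kernel's tools/memory-model suite; it is not from the kernel tree, the
test and variable names are invented, and it assumes the herd7 tool with the
Linux-kernel memory model. P0 plays the role of dlock_lists_add() bumping
used_lists, P1 that of a checker which publishes some flag and then reads the
counter. With both barriers in place the "exists" outcome should be forbidden;
drop either barrier and it becomes reachable, i.e. atomic_inc()/atomic_read()
on their own do not order the surrounding accesses.

C SB+atomic_inc+mb

(*
 * Hypothetical example only.  r0 = 0 means P0 did not see P1's flag;
 * r1 = 0 means P1 did not see P0's increment.  Both being 0 at the end
 * is the "missed each other" case being discussed in this thread.
 *)

{}

P0(atomic_t *used, int *flag)
{
	int r0;

	atomic_inc(used);
	smp_mb__after_atomic();
	r0 = READ_ONCE(*flag);
}

P1(atomic_t *used, int *flag)
{
	int r1;

	WRITE_ONCE(*flag, 1);
	smp_mb();
	r1 = atomic_read(used);
}

exists (0:r0=0 /\ 1:r1=0)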
James Bottomley Nov. 7, 2017, 7:36 p.m. UTC | #5
On Tue, 2017-11-07 at 13:57 -0500, Waiman Long wrote:
> On 11/07/2017 12:59 PM, Andreas Dilger wrote:
> > 
> > On Nov 7, 2017, at 4:59 AM, Jan Kara <jack@suse.cz> wrote:
> > > 
> > > On Mon 06-11-17 10:47:08, Davidlohr Bueso wrote:
> > > > 
> > > > +	/*
> > > > +	 * Serialize dlist->used_lists such that a 0->1 transition is not
> > > > +	 * missed by another thread checking if any of the dlock lists are
> > > > +	 * used.
> > > > +	 *
> > > > +	 * CPU0				    CPU1
> > > > +	 * dlock_list_add()                 dlock_lists_empty()
> > > > +	 *   [S] atomic_inc(used_lists);
> > > > +	 *       smp_mb__after_atomic();
> > > > +	 *					  smp_mb__before_atomic();
> > > > +	 *				      [L] atomic_read(used_lists)
> > > > +	 *       list_add()
> > > > +	 */
> > > > +	smp_mb__before_atomic();
> > > > +	return !atomic_read(&dlist->used_lists);
> > Just a general kernel programming question here - I thought the
> > whole point of atomics is that they are, well, atomic across all
> > CPUs so there is no need for a memory barrier?  If there is a need
> > for a memory barrier for each atomic access (assuming it isn't
> > accessed under another lock, which would make the use of atomic
> > types pointless, IMHO) then I'd think there is a lot of code in the
> > kernel that isn't doing this properly.
> > 
> > What am I missing here?
> 
> Atomic update and memory barrier are 2 different things. Atomic
> update means other CPUs see either the value before or after the
> update. They won't see anything in between. For a counter, it means
> we won't miss any counts. However, not all atomic operations give an
> ordering guarantee. The atomic_read() and atomic_inc() are examples
> that do not provide memory ordering guarantee. See
> Documentation/memory-barriers.txt for more information about it.
> 
> A CPU can perform atomic operations 1 & 2 in program order, but other
> CPUs may see operation 2 first before operation 1. Here memory
> barrier can be used to guarantee that other CPUs see the memory
> updates in certain order.

There's an omission here which I think Andreas may have been referring
to:  atomic_inc/dec operations *are* strongly ordered with respect to
each other, so if two CPUs are simultaneously executing atomic_inc, the
order in which they execute isn't guaranteed, but it is guaranteed that
the losing atomic_inc will not begin until the winning one is
completed, so after both are done the value will have gone up by 2.  So although
atomic_read and atomic_inc have no ordering guarantee at all (the point
of the barrier above), if you're looking at the return values of
atomic_inc/dec you don't need a barrier because regardless of which
order the CPUs go in, they'll see distinct values (we use this for
reference counting).

James
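
Reference counting is indeed the textbook case of relying only on the RMW
return value. A minimal userspace C11 sketch of the pattern described above
(obj_alloc/obj_get/obj_put are invented names, not the kernel's refcount API,
and memory_order_acq_rel is a conservative choice):

#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int refcount;
};

static struct obj *obj_alloc(void)
{
	struct obj *o = malloc(sizeof(*o));

	if (o)
		atomic_init(&o->refcount, 1);
	return o;
}

static struct obj *obj_get(struct obj *o)
{
	/* The caller already holds a reference, so a relaxed bump is enough. */
	atomic_fetch_add_explicit(&o->refcount, 1, memory_order_relaxed);
	return o;
}

static void obj_put(struct obj *o)
{
	/*
	 * The decrements are totally ordered against each other, so exactly
	 * one caller observes the count dropping from 1 to 0 and frees the
	 * object; no extra barrier is needed to pick the "winner".
	 */
	if (atomic_fetch_sub_explicit(&o->refcount, 1, memory_order_acq_rel) == 1)
		free(o);
}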
Boqun Feng Nov. 8, 2017, 2:08 a.m. UTC | #6
On Tue, Nov 07, 2017 at 10:59:29AM -0700, Andreas Dilger wrote:
> On Nov 7, 2017, at 4:59 AM, Jan Kara <jack@suse.cz> wrote:
> > On Mon 06-11-17 10:47:08, Davidlohr Bueso wrote:
> >> +	/*
> >> +	 * Serialize dlist->used_lists such that a 0->1 transition is not
> >> +	 * missed by another thread checking if any of the dlock lists are
> >> +	 * used.
> >> +	 *
> >> +	 * CPU0				    CPU1
> >> +	 * dlock_list_add()                 dlock_lists_empty()
> >> +	 *   [S] atomic_inc(used_lists);
> >> +	 *       smp_mb__after_atomic();
> >> +	 *					  smp_mb__before_atomic();
> >> +	 *				      [L] atomic_read(used_lists)
> >> +	 *       list_add()
> >> +	 */
> >> +	smp_mb__before_atomic();
> >> +	return !atomic_read(&dlist->used_lists);
> 
> Just a general kernel programming question here - I thought the whole point
> of atomics is that they are, well, atomic across all CPUs so there is no
> need for a memory barrier?  If there is a need for a memory barrier for
> each atomic access (assuming it isn't accessed under another lock, which would
> make the use of atomic types pointless, IMHO) then I'd think there is a lot
> of code in the kernel that isn't doing this properly.
> 
> What am I missing here?
> 
> I don't see how this helps if the operations are executed like:
> 
> 	 * CPU0				    CPU1
> 	 * dlock_list_add()                 dlock_lists_empty()
> 	 *   [S] atomic_inc(used_lists);
> 	 *					  smp_mb__before_atomic();
> 	 *       smp_mb__after_atomic();
> 	 *				      [L] atomic_read(used_lists)
> 
> or alternately like:
> 
> 	 * CPU0				    CPU1
> 	 * dlock_list_add()                 dlock_lists_empty()
> 	 *					  smp_mb__before_atomic();
> 	 *   [S] atomic_inc(used_lists);
> 	 *       smp_mb__after_atomic();
> 	 *				      [L] atomic_read(used_lists)
> 

Or worse:

 	 * CPU0				    CPU1
 	 * dlock_list_add()                 dlock_lists_empty()
 	 *					  smp_mb__before_atomic();
 	 *				      [L] atomic_read(used_lists)
 	 *   [S] atomic_inc(used_lists);
 	 *       smp_mb__after_atomic();

, in which case dlock_lists_empty() misses an increment of used_lists.

That said, this may be fine for the usage of dlock_list. But I think the
comments are confusing or wrong. The reason is that you cannot use barriers
to order two memory ops on different CPUs, and usually we add comments
like:

	[S] ...			[S] ...
	<barrier>		<barrier>
	[L] ...			[L] ...

Davidlohr, could you try to improve the comments by finding both memory
ops preceding and following the barriers? Maybe you will find those
barriers are not necessary in the end.

Regards,
Boqun

> then the same problem would exist, unless those functions/macros are somehow
> bound to the atomic operations themselves?  In that case, what makes the use
> of atomic_{inc,dec,read}() in other parts of the code safe without them?
> 
> Cheers, Andreas
Davidlohr Bueso Nov. 9, 2017, 5:24 p.m. UTC | #7
On Wed, 08 Nov 2017, Boqun Feng wrote:
>Or worse:
>
> 	 * CPU0				    CPU1
> 	 * dlock_list_add()                 dlock_lists_empty()
> 	 *					  smp_mb__before_atomic();
> 	 *				      [L] atomic_read(used_lists)
> 	 *   [S] atomic_inc(used_lists);
> 	 *       smp_mb__after_atomic();
>
>, in which case dlock_lists_empty() misses a increment of used_lists.
>
>That said, this may be fine for the usage of dlock_list. But I think the
>comments are confusing or wrong. The reason is you can not use barriers
>to order two memory ops on different CPUs, and usually we add comments
>like:

So I think that case is OK as CPU1 legitimately reads a 0; the problem
would be if we miss the inc because we are doing spin_lock().

>
>	[S] ...			[S] ...
>	<barrier>		<barrier>
>	[L] ...			[L] ...

That is true.

>
>Davidlohr, could you try to improve the comments by finding both memory
>ops preceding and following the barriers? Maybe you will find those
>barriers are not necessary in the end.

Ok, so for dlock_lists_add() the first barrier is there so that the atomic_inc()
is not done inside the critical region, after the head->lock is taken.
The other barriers that follow I put in so that the respective atomic op
is done before the list_add(); however, thinking about it, I don't think
they are really needed.

Regarding dlock_lists_empty(), the smp_mb__before_atomic() is mainly for
pairing reasons; but at this point I don't see a respective counterpart
for the above diagram, so I'm unclear.

Thanks,
Davidlohr
Peter Zijlstra Nov. 9, 2017, 5:30 p.m. UTC | #8
On Thu, Nov 09, 2017 at 09:24:08AM -0800, Davidlohr Bueso wrote:
> On Wed, 08 Nov 2017, Boqun Feng wrote:
> > Or worse:
> > 
> > 	 * CPU0				    CPU1
> > 	 * dlock_list_add()                 dlock_lists_empty()
> > 	 *					  smp_mb__before_atomic();
> > 	 *				      [L] atomic_read(used_lists)

Note that this is broken; smp_mb__before_atomic() is not valid on
atomic_read().

> > 	 *   [S] atomic_inc(used_lists);
> > 	 *       smp_mb__after_atomic();
> >
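
For completeness, a hedged sketch of what the check might look like if full
ordering before the load really is wanted; this is only an illustration of the
remark above, not a proposed replacement, and whether any barrier is needed
here at all is left open in this thread. smp_mb() orders against any prior
access, whereas smp_mb__before_atomic() is documented for use only before a
following non-value-returning atomic RMW operation, which atomic_read() is not.

bool dlock_lists_empty(struct dlock_list_heads *dlist)
{
	/* Shouldn't be called before nr_dlock_lists is initialized */
	WARN_ON_ONCE(!nr_dlock_lists);

	/*
	 * Full barrier: order whatever accesses preceded this call before
	 * the counter read below; a plain atomic_read() does not provide
	 * any ordering on its own.
	 */
	smp_mb();
	return !atomic_read(&dlist->used_lists);
}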

Patch

diff --git a/include/linux/dlock-list.h b/include/linux/dlock-list.h
index c00c7f92ada4..e18690a9bba6 100644
--- a/include/linux/dlock-list.h
+++ b/include/linux/dlock-list.h
@@ -32,10 +32,18 @@ 
 struct dlock_list_head {
 	struct list_head list;
 	spinlock_t lock;
+	struct dlock_list_heads *dlist;
 } ____cacheline_aligned_in_smp;
 
+/*
+ * This is the main dlist data structure, with the array of heads
+ * and a counter that atomically tracks if any of the lists are
+ * being used. That is, empty to non-empty (and vice versa)
+ * head->list transitions.
+ */
 struct dlock_list_heads {
 	struct dlock_list_head *heads;
+	atomic_t used_lists;
 };
 
 /*
diff --git a/lib/dlock-list.c b/lib/dlock-list.c
index a4ddecc01b12..a9c855d492b8 100644
--- a/lib/dlock-list.c
+++ b/lib/dlock-list.c
@@ -122,8 +122,11 @@  int __alloc_dlock_list_heads(struct dlock_list_heads *dlist,
 
 		INIT_LIST_HEAD(&head->list);
 		head->lock = __SPIN_LOCK_UNLOCKED(&head->lock);
+		head->dlist = dlist;
 		lockdep_set_class(&head->lock, key);
 	}
+
+	atomic_set(&dlist->used_lists, 0);
 	return 0;
 }
 EXPORT_SYMBOL(__alloc_dlock_list_heads);
@@ -139,29 +142,36 @@  void free_dlock_list_heads(struct dlock_list_heads *dlist)
 {
 	kfree(dlist->heads);
 	dlist->heads = NULL;
+	atomic_set(&dlist->used_lists, 0);
 }
 EXPORT_SYMBOL(free_dlock_list_heads);
 
 /**
  * dlock_lists_empty - Check if all the dlock lists are empty
  * @dlist: Pointer to the dlock_list_heads structure
- * Return: true if list is empty, false otherwise.
  *
- * This can be a pretty expensive function call. If this function is required
- * in a performance critical path, we may have to maintain a global count
- * of the list entries in the global dlock_list_heads structure instead.
+ * Return: true if all dlock lists are empty, false otherwise.
  */
 bool dlock_lists_empty(struct dlock_list_heads *dlist)
 {
-	int idx;
-
 	/* Shouldn't be called before nr_dlock_lists is initialized */
 	WARN_ON_ONCE(!nr_dlock_lists);
 
-	for (idx = 0; idx < nr_dlock_lists; idx++)
-		if (!list_empty(&dlist->heads[idx].list))
-			return false;
-	return true;
+	/*
+	 * Serialize dlist->used_lists such that a 0->1 transition is not
+	 * missed by another thread checking if any of the dlock lists are
+	 * used.
+	 *
+	 * CPU0				    CPU1
+	 * dlock_list_add()                 dlock_lists_empty()
+	 *   [S] atomic_inc(used_lists);
+	 *       smp_mb__after_atomic();
+	 *					  smp_mb__before_atomic();
+	 *				      [L] atomic_read(used_lists)
+	 *       list_add()
+	 */
+	smp_mb__before_atomic();
+	return !atomic_read(&dlist->used_lists);
 }
 EXPORT_SYMBOL(dlock_lists_empty);
 
@@ -177,11 +187,39 @@  void dlock_lists_add(struct dlock_list_node *node,
 		     struct dlock_list_heads *dlist)
 {
 	struct dlock_list_head *head = &dlist->heads[this_cpu_read(cpu2idx)];
+	bool list_empty_before_lock = false;
+
+	/*
+	 * Optimistically bump the used_lists counter _before_ taking
+	 * the head->lock such that we don't miss a thread adding itself
+	 * to a list while spinning for the lock.
+	 *
+	 * Then, after taking the lock, recheck if the empty to non-empty
+	 * transition changed and (un)account for ourselves, accordingly.
+	 * Note that all these scenarios are corner cases, and not the
+	 * common scenario, where the lists are actually populated most
+	 * of the time.
+	 */
+	if (unlikely(list_empty_careful(&head->list))) {
+		list_empty_before_lock = true;
+		atomic_inc(&dlist->used_lists);
+		smp_mb__after_atomic();
+	}
 
 	/*
 	 * There is no need to disable preemption
 	 */
 	spin_lock(&head->lock);
+
+	if (unlikely(!list_empty_before_lock && list_empty(&head->list))) {
+		atomic_inc(&dlist->used_lists);
+		smp_mb__after_atomic();
+	}
+	if (unlikely(list_empty_before_lock && !list_empty(&head->list))) {
+		atomic_dec(&dlist->used_lists);
+		smp_mb__after_atomic();
+	}
+
 	node->head = head;
 	list_add(&node->list, &head->list);
 	spin_unlock(&head->lock);
@@ -212,6 +250,15 @@  void dlock_lists_del(struct dlock_list_node *node)
 		spin_lock(&head->lock);
 		if (likely(head == node->head)) {
 			list_del_init(&node->list);
+
+			if (unlikely(list_empty(&head->list))) {
+				struct dlock_list_heads *dlist;
+				dlist = node->head->dlist;
+
+				atomic_dec(&dlist->used_lists);
+				smp_mb__after_atomic();
+			}
+
 			node->head = NULL;
 			retry = false;
 		} else {