[3/3] radix-tree: support locking of individual exception entries.
diff mbox

Message ID 20160303131033.GC12118@quack.suse.cz
State New
Headers show

Commit Message

Jan Kara March 3, 2016, 1:10 p.m. UTC
Hi Neil,

On Sun 28-02-16 16:09:29, NeilBrown wrote:
> The least significant bit of an exception entry is used as a lock flag.
> A caller can:
>  - create a locked entry by simply adding an entry with this flag set
>  - lock an existing entry with radix_tree_lookup_lock().  This may return
>     NULL if the entry doesn't exists, or was deleted while waiting for
>     the lock.  It may return a non-exception entry if that is what is
>     found.  If it returns a locked entry then it has exclusive rights
>     to delete the entry.
>  - unlock an entry that is already locked.  This will wake any waiters.
>  - delete an entry that is locked.  This will wake waiters so that they
>    return NULL without looking at the slot in the radix tree.
> 
> These must all be called with the radix tree locked (i.e. a spinlock held).
> That spinlock is passed to radix_tree_lookup_lock() so that it can drop
> the lock while waiting.
> 
> This is a "demonstration of concept".  I haven't actually tested, only compiled.
> A possible use case is for the exception entries used by DAX.
> 
> It is possible that some of the lookups can be optimised away in some
> cases by storing a slot pointer.  I wanted to keep it reasonable
> simple until it was determined if it might be useful.

Thanks for having a look! So the patch looks like it would do the work but
frankly the amount of hackiness in it has exceeded my personal threshold...
several times ;)

In particular I don't quite understand why have you decided to re-lookup
the exceptional entry in the wake function? That seems to be the source of
a lot of a hackiness? I was hoping for something simpler like what I've
attached (compile tested only). What do you think?

To avoid false wakeups and thundering herd issues which my simple version does
have, we could do something like what I outline in the second patch. Now
that I look at the result that is closer to your patch, just cleaner IMHO :).
But I wanted to have it separated to see how much complexity does this
additional functionality brings...

Now I'm going to have a look how to use this in DAX...

								Honza


> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
>  include/linux/radix-tree.h |    8 ++
>  lib/radix-tree.c           |  158 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 166 insertions(+)
> 
> diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
> index 450c12b546b7..8f579f66574b 100644
> --- a/include/linux/radix-tree.h
> +++ b/include/linux/radix-tree.h
> @@ -308,6 +308,14 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
>  int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
>  unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
>  
> +void *radix_tree_lookup_lock(struct radix_tree_root *root, wait_queue_head_t *wq,
> +			     unsigned long index, spinlock_t *lock);
> +void radix_tree_unlock(struct radix_tree_root *root, wait_queue_head_t *wq,
> +		       unsigned long index);
> +void radix_tree_delete_unlock(struct radix_tree_root *root, wait_queue_head_t *wq,
> +			      unsigned long index);
> +
> +
>  static inline void radix_tree_preload_end(void)
>  {
>  	preempt_enable();
> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> index 37d4643ab5c0..a24ea002f3eb 100644
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -1500,3 +1500,161 @@ void __init radix_tree_init(void)
>  	radix_tree_init_maxindex();
>  	hotcpu_notifier(radix_tree_callback, 0);
>  }
> +
> +/* Exception entry locking.
> + * The least significant bit of an exception entry can be used as a
> + * "locked" flag.  Supported locking operations are:
> + * radix_tree_lookup_lock() - if the indexed entry exists, lock it and
> + *         return the value, else return NULL.  If the indexed entry is not
> + *         exceptional it is returned without locking.
> + * radix_tree_unlock() - release the lock on the indexed entry
> + * radix_tree_delete_unlock() - the entry must be locked.  It will be atomically
> + *     unlocked and removed.  Any threads sleeping in lookup_lock() will return.
> + * Each of these take a radix_tree_root, a wait_queue_head_t, and an index.
> + * The '*lock' function also takes a spinlock_t which must be held when any
> + * of the functions is called.  *lock will drop the spinlock while waiting for
> + * the entry lock.
> + *
> + * As delete_unlock could free the radix_tree_node, waiters much not touch it
> + * when woken.  We provide a wake function for the waitq which records when the
> + * item has been deleted.
> + *
> + * The wait_queue_head passed should be one that is used for bit_wait, such
> + * as zone->wait_table.  We re-use the 'flags' and 'timeout' fields of the
> + * wait_bit_key to store the root and index that we are waiting for.
> + * __wake_up may only be called on one of these keys while the radix tree
> + * is locked.  The wakeup function will take the lock itself if appropriate, or
> + * may record that the radix tree entry has been deleted.  In either case
> + * the waiting function just looks at the status reported by the wakeup function
> + * and doesn't look at the radix tree itself.
> + *
> + * There is no function for locking an entry while inserting it.  Simply
> + * insert an entry that is already marked as 'locked' - lsb set.
> + *
> + */
> +
> +struct wait_slot_queue {
> +	struct radix_tree_root	*root;
> +	unsigned long		index;
> +	wait_queue_t		wait;
> +	enum {SLOT_WAITING, SLOT_LOCKED, SLOT_GONE} state;
> +	void			*ret;
> +};
> +
> +static inline int slot_locked(void *v)
> +{
> +	unsigned long l = (unsigned long)v;
> +	return l & 1;
> +}
> +
> +static inline void *lock_slot(void **v)
> +{
> +	unsigned long *l = (unsigned long *)v;
> +	return (void*)(*l |= 1);
> +}
> +
> +static inline void * unlock_slot(void **v)
> +{
> +	unsigned long *l = (unsigned long *)v;
> +	return (void*)(*l &= ~1UL);
> +}
> +
> +static int wake_slot_function(wait_queue_t *wait, unsigned mode, int sync,
> +			      void *arg)
> +{
> +	struct wait_bit_key *key = arg;
> +	struct wait_slot_queue *wait_slot =
> +		container_of(wait, struct wait_slot_queue, wait);
> +	void **slot;
> +
> +	if (wait_slot->root != key->flags ||
> +	    wait_slot->index != key->timeout)
> +		/* Not waking this waiter */
> +		return 0;
> +	if (wait_slot->state != SLOT_WAITING)
> +		/* Should be impossible.... */
> +		return 1;
> +	if (key->bit_nr == -3)
> +		/* Was just deleted, no point in doing a lookup */
> +		wait_slot = NULL;
> +	else
> +		wait_slot->ret = __radix_tree_lookup(
> +			wait_slot->root, wait_slot->index, NULL, &slot);
> +	if (!wait_slot->ret || !radix_tree_exceptional_entry(wait_slot->ret)) {
> +		wait_slot->state = SLOT_GONE;
> +		return 1;
> +	}
> +	if (slot_locked(slot))
> +		/* still locked */
> +		return 0;
> +	wait_slot->ret = lock_slot(slot);
> +	wait_slot->state = SLOT_LOCKED;
> +	return 1;
> +}
> +
> +void *radix_tree_lookup_lock(struct radix_tree_root *root, wait_queue_head_t *wq,
> +			     unsigned long index, spinlock_t *lock)
> +{
> +	void *ret, **slot;
> +	struct wait_slot_queue wait;
> +
> +	ret = __radix_tree_lookup(root, index, NULL, &slot);
> +	if (!ret || !radix_tree_exceptional_entry(ret))
> +		return ret;
> +	if (!slot_locked(slot))
> +		return lock_slot(slot);
> +
> +	wait.wait.private = current;
> +	wait.wait.func = wake_slot_function;
> +	INIT_LIST_HEAD(&wait.wait.task_list);
> +	wait.state = SLOT_WAITING;
> +	wait.root = root;
> +	wait.index = index;
> +	wait.ret = NULL;
> +	for (;;) {
> +		prepare_to_wait(wq, &wait.wait,
> +				TASK_UNINTERRUPTIBLE);
> +		if (wait.state != SLOT_WAITING)
> +			break;
> +
> +		spin_unlock(lock);
> +		schedule();
> +		spin_lock(lock);
> +	}
> +	finish_wait(wq, &wait.wait);
> +	return wait.ret;
> +}
> +EXPORT_SYMBOL(radix_tree_lookup_lock);
> +
> +void radix_tree_unlock(struct radix_tree_root *root, wait_queue_head_t *wq,
> +			unsigned long index)
> +{
> +	void *ret, **slot;
> +
> +	ret = __radix_tree_lookup(root, index, NULL, &slot);
> +	if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret)))
> +		return;
> +	if (WARN_ON_ONCE(!slot_locked(slot)))
> +		return;
> +	unlock_slot(slot);
> +
> +	if (waitqueue_active(wq)) {
> +		struct wait_bit_key key = {.flags = root, .bit_nr = -2,
> +					   .timeout = index};
> +		__wake_up(wq, TASK_NORMAL, 1, &key);
> +	}
> +}
> +EXPORT_SYMBOL(radix_tree_unlock);
> +
> +void radix_tree_delete_unlock(struct radix_tree_root *root, wait_queue_head_t *wq,
> +			      unsigned long index)
> +{
> +	radix_tree_delete(root, index);
> +	if (waitqueue_active(wq)) {
> +		/* -3 here indicates deletion */
> +		struct wait_bit_key key = {.flags = root, .bit_nr = -3,
> +					   .timeout = index};
> +		__wake_up(wq, TASK_NORMAL, 1, &key);
> +	}
> +}
> +EXPORT_SYMBOL(radix_tree_delete_unlock);
> 
>

Comments

NeilBrown March 3, 2016, 11:51 p.m. UTC | #1
On Fri, Mar 04 2016, Jan Kara wrote:

> Hi Neil,
>
> On Sun 28-02-16 16:09:29, NeilBrown wrote:
>> The least significant bit of an exception entry is used as a lock flag.
>> A caller can:
>>  - create a locked entry by simply adding an entry with this flag set
>>  - lock an existing entry with radix_tree_lookup_lock().  This may return
>>     NULL if the entry doesn't exists, or was deleted while waiting for
>>     the lock.  It may return a non-exception entry if that is what is
>>     found.  If it returns a locked entry then it has exclusive rights
>>     to delete the entry.
>>  - unlock an entry that is already locked.  This will wake any waiters.
>>  - delete an entry that is locked.  This will wake waiters so that they
>>    return NULL without looking at the slot in the radix tree.
>> 
>> These must all be called with the radix tree locked (i.e. a spinlock held).
>> That spinlock is passed to radix_tree_lookup_lock() so that it can drop
>> the lock while waiting.
>> 
>> This is a "demonstration of concept".  I haven't actually tested, only compiled.
>> A possible use case is for the exception entries used by DAX.
>> 
>> It is possible that some of the lookups can be optimised away in some
>> cases by storing a slot pointer.  I wanted to keep it reasonable
>> simple until it was determined if it might be useful.
>
> Thanks for having a look! So the patch looks like it would do the work but
> frankly the amount of hackiness in it has exceeded my personal threshold...
> several times ;)

Achievement unlocked ? :-)

>
> In particular I don't quite understand why have you decided to re-lookup
> the exceptional entry in the wake function? That seems to be the source of
> a lot of a hackiness?

Yes.....

My original idea was that there would only be a single lookup.  If the
slot was found to be locked, the address of the slot would be stored in
the key so the wakeup function could trivially detect if it was being
deleted, or could claim the lock, and would signal the result to the
caller.

But then I realized that the address of the slot is not necessarily
stable.  In particular the slot for address 0 can be in the root, or it
can be in a node.  I could special-case address zero but it was easier
just to do the search.

Of course the slot disappears if the entry is deleted.  That is why the
wakeup function (which is called under the tree lock so can still safely
inspect the slot) would signal to the caller that the delete had
happened.

So the patch was a little confused....

>                       I was hoping for something simpler like what I've
> attached (compile tested only). What do you think?

Certainly simpler.

By not layering on top of wait_bit_key, you've precluded the use of the
current page wait_queues for these locks - you need to allocate new wait
queue heads.

If in

> +struct wait_exceptional_entry_queue {
> +	wait_queue_t wait;
> +	struct exceptional_entry_key key;
> +};

you had the exceptional_entry_key first (like wait_bit_queue does) you
would be closer to being able to re-use the queues.

Also I don't think it is safe to use an exclusive wait.  When a slot is
deleted, you need to wake up *all* the waiters.

Thanks,
NeilBrown


>
> To avoid false wakeups and thundering herd issues which my simple version does
> have, we could do something like what I outline in the second patch. Now
> that I look at the result that is closer to your patch, just cleaner IMHO :).
> But I wanted to have it separated to see how much complexity does this
> additional functionality brings...
>
> Now I'm going to have a look how to use this in DAX...
>
> 								Honza
NeilBrown March 4, 2016, 10:14 a.m. UTC | #2
On Fri, Mar 04 2016, NeilBrown wrote:

>
> By not layering on top of wait_bit_key, you've precluded the use of the
> current page wait_queues for these locks - you need to allocate new wait
> queue heads.
>
> If in
>
>> +struct wait_exceptional_entry_queue {
>> +	wait_queue_t wait;
>> +	struct exceptional_entry_key key;
>> +};
>
> you had the exceptional_entry_key first (like wait_bit_queue does) you
> would be closer to being able to re-use the queues.

Scratch that bit, I was confusing myself again.  Sorry.
Each wait_queue_t has it's own function so one function will never be
called on other items in the queue - of course.

>
> Also I don't think it is safe to use an exclusive wait.  When a slot is
> deleted, you need to wake up *all* the waiters.

I think this issue is still valid.

NeilBrown
Jan Kara March 4, 2016, 12:31 p.m. UTC | #3
On Fri 04-03-16 21:14:24, NeilBrown wrote:
> On Fri, Mar 04 2016, NeilBrown wrote:
> 
> >
> > By not layering on top of wait_bit_key, you've precluded the use of the
> > current page wait_queues for these locks - you need to allocate new wait
> > queue heads.
> >
> > If in
> >
> >> +struct wait_exceptional_entry_queue {
> >> +	wait_queue_t wait;
> >> +	struct exceptional_entry_key key;
> >> +};
> >
> > you had the exceptional_entry_key first (like wait_bit_queue does) you
> > would be closer to being able to re-use the queues.
> 
> Scratch that bit, I was confusing myself again.  Sorry.
> Each wait_queue_t has it's own function so one function will never be
> called on other items in the queue - of course.

Yes.

> > Also I don't think it is safe to use an exclusive wait.  When a slot is
> > deleted, you need to wake up *all* the waiters.
> 
> I think this issue is still valid.

Yes, you are right. I have deleted your function radix_tree_delete_unlock()
because I thought it won't be needed - but if we use exclusive waits (which
I think we want to) we need to wakeup all waiters when deleting entry as you
properly spotted.

Currently I'm undecided how we want to deal with that. The thing is - when
exceptional entries use locking, we need deleting of a radix tree entry to
avoid deleting locked entry so the only proper way to delete entry would be
via something like radix_tree_delete_unlock(). OTOH when entry locking is
not used (like for tmpfs exceptional entries), we don't want to bother with
passing waitqueues around and locking entry just to delete it. The best I
came up with was that radix_tree_delete_item() would complain about
deleting locked entry so that we catch when someone doesn't properly obey
the locking protocol... But I'm still somewhat hesitating whether it would
not be better to move the locking out of generic radix tree code since it
is not quite as generic as I'd like and e.g. clear_exceptional_entry()
would use locked delete only for DAX mappings anyway.

								Honza
NeilBrown March 9, 2016, 2:13 a.m. UTC | #4
On Fri, Mar 04 2016, Jan Kara wrote:

> On Fri 04-03-16 21:14:24, NeilBrown wrote:
>> On Fri, Mar 04 2016, NeilBrown wrote:
>> 
>> >
>> > By not layering on top of wait_bit_key, you've precluded the use of the
>> > current page wait_queues for these locks - you need to allocate new wait
>> > queue heads.
>> >
>> > If in
>> >
>> >> +struct wait_exceptional_entry_queue {
>> >> +	wait_queue_t wait;
>> >> +	struct exceptional_entry_key key;
>> >> +};
>> >
>> > you had the exceptional_entry_key first (like wait_bit_queue does) you
>> > would be closer to being able to re-use the queues.
>> 
>> Scratch that bit, I was confusing myself again.  Sorry.
>> Each wait_queue_t has it's own function so one function will never be
>> called on other items in the queue - of course.
>
> Yes.

I was thinking about this some more, wondering why I thought what I did,
and realised there is a small issue that it is worth being aware of.

If different users of the same work queue use different "keys", a wake
function can get a key of a different type to the one it is expecting.

e.g. __wake_up_bit passes the address of a "struct wait_bit_key" to
__wake_up which is then passed as a "void* arg" to the
wait_queue_func_t.

With your code, a "struct exceptional_entry_key" could be passed to
wake_bit_function, or a "struct wait_bit_key" could be passed to
wake_exceptional_entry_func.

Both structures start with a pointer which will never appear in the
wrong type, and both function test that pointer first and access nothing
else if it doesn't match, so the code is safe.  But it could be seen as
a bit fragile, and doing something to make it obvious that the different
key types need to align on that primary key field would not be a bad
thing. .... If we end up using this code.

Thanks,
NeilBrown

Patch
diff mbox

From 982c02870b262da742f593e695b4050aa99fabd3 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 3 Mar 2016 14:06:42 +0100
Subject: [PATCH] radix-tree: Avoid false wakeups when waiting for exceptional
 entry lock

Signed-off-by: Jan Kara <jack@suse.cz>
---
 lib/radix-tree.c | 42 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index a97231ab66f0..be9763dd9de5 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1522,6 +1522,28 @@  static inline void * unlock_slot(void **v)
 	return (void*)(*l &= ~1UL);
 }
 
+struct exceptional_entry_key {
+	struct radix_tree_root *root;
+	unsigned long index;
+};
+
+struct wait_exceptional_entry_queue {
+	wait_queue_t wait;
+	struct exceptional_entry_key key;
+};
+
+static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned mode,
+				       int sync, void *keyp)
+{
+	struct exceptional_entry_key *key = keyp;
+	struct wait_exceptional_entry_queue *ewait =
+		container_of(wait, struct wait_exceptional_entry_queue, wait);
+
+	if (key->root != ewait->key.root || key->index != ewait->key.index)
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, NULL);
+}
+
 /**
  *	radix_tree_lookup_lock - lookup and lock exceptional entry if found
  *	@root:		radix tree root
@@ -1540,7 +1562,12 @@  void *radix_tree_lookup_lock(struct radix_tree_root *root, unsigned long index,
 			     wait_queue_head_t *wq, spinlock_t *lock)
 {
 	void *ret, **slot;
-	DEFINE_WAIT(wait);
+	struct wait_exceptional_entry_queue wait;
+
+	init_wait(&wait.wait);
+	wait.wait.func = wake_exceptional_entry_func;
+	wait.key.root = root;
+	wait.key.index = index;
 
 	for (;;) {
 		ret = __radix_tree_lookup(root, index, NULL, &slot);
@@ -1548,10 +1575,10 @@  void *radix_tree_lookup_lock(struct radix_tree_root *root, unsigned long index,
 			return ret;
 		if (!slot_locked(slot))
 			return lock_slot(slot);
-		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 		spin_unlock(lock);
 		schedule();
-		finish_wait(wq, &wait);
+		finish_wait(wq, &wait.wait);
 		spin_lock(lock);
 	}
 }
@@ -1579,7 +1606,12 @@  void radix_tree_unlock(struct radix_tree_root *root, unsigned long index,
 		return;
 	unlock_slot(slot);
 
-	if (waitqueue_active(wq))
-		wake_up(wq);
+	if (waitqueue_active(wq)) {
+		struct exceptional_entry_key key;
+
+		key.root = root;
+		key.index = index;
+		__wake_up(wq, TASK_NORMAL, 1, &key);
+	}
 }
 EXPORT_SYMBOL(radix_tree_unlock);
-- 
2.6.2