diff mbox

[02/10] radix-tree: make 'indirect' bit available to exception entries.

Message ID 20160321173458.GO23727@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Matthew Wilcox March 21, 2016, 5:34 p.m. UTC
On Mon, Mar 21, 2016 at 02:22:47PM +0100, Jan Kara wrote:
> A pointer to a radix_tree_node will always have the 'exception'
> bit cleared, so if the exception bit is set the value cannot
> be an indirect pointer.  Thus it is safe to make the 'indirect bit'
> available to store extra information in exception entries.
> 
> This patch adds a 'PTR_MASK' and a value is only treated as
> an indirect (pointer) entry the 2 ls-bits are '01'.

Nitpick: it's called INDIRECT_MASK, not PTR_MASK.

> The change in radix-tree.c ensures the stored value still looks like an
> indirect pointer, and saves a load as well.
> 
> We could swap the two bits and so keep all the exectional bits contigious.

typo "exceptional"

> But I have other plans for that bit....
> 
> Signed-off-by: NeilBrown <neilb@suse.com>
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  include/linux/radix-tree.h | 11 +++++++++--
>  lib/radix-tree.c           |  2 +-
>  2 files changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
> index d08d6ec3bf53..2bc8c5829441 100644
> --- a/include/linux/radix-tree.h
> +++ b/include/linux/radix-tree.h
> @@ -41,8 +41,13 @@
>   * Indirect pointer in fact is also used to tag the last pointer of a node
>   * when it is shrunk, before we rcu free the node. See shrink code for
>   * details.
> + *
> + * To allow an exception entry to only lose one bit, we ignore
> + * the INDIRECT bit when the exception bit is set.  So an entry is
> + * indirect if the least significant 2 bits are 01.
>   */
>  #define RADIX_TREE_INDIRECT_PTR		1
> +#define RADIX_TREE_INDIRECT_MASK	3
>  /*
>   * A common use of the radix tree is to store pointers to struct pages;
>   * but shmem/tmpfs needs also to store swap entries in the same tree:
> @@ -54,7 +59,8 @@
>  
>  static inline int radix_tree_is_indirect_ptr(void *ptr)
>  {
> -	return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
> +	return ((unsigned long)ptr & RADIX_TREE_INDIRECT_MASK)
> +		== RADIX_TREE_INDIRECT_PTR;
>  }
>  
>  /*** radix-tree API starts here ***/
> @@ -222,7 +228,8 @@ static inline void *radix_tree_deref_slot_protected(void **pslot,
>   */
>  static inline int radix_tree_deref_retry(void *arg)
>  {
> -	return unlikely((unsigned long)arg & RADIX_TREE_INDIRECT_PTR);
> +	return unlikely(((unsigned long)arg & RADIX_TREE_INDIRECT_MASK)
> +			== RADIX_TREE_INDIRECT_PTR);
>  }
>  
>  /**
> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> index 1624c4117961..c6af1a445b67 100644
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -1412,7 +1412,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
>  		 * to force callers to retry.
>  		 */
>  		if (root->height == 0)
> -			*((unsigned long *)&to_free->slots[0]) |=
> +			*((unsigned long *)&to_free->slots[0]) =
>  						RADIX_TREE_INDIRECT_PTR;

I have a patch currently in my tree which has the same effect, but looks
a little neater:


What do you think to doing it this way?

It might be slightly neater to replace the first hunk with this:

#define RADIX_TREE_RETRY       ((void *)RADIX_TREE_INDIRECT_PTR)

I also considered putting that define in radix-tree.h instead of
radix-tree.c, but on the whole I don't think it'll be useful outside
radix-tree.h.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Jan Kara March 22, 2016, 9:12 a.m. UTC | #1
On Mon 21-03-16 13:34:58, Matthew Wilcox wrote:
> I have a patch currently in my tree which has the same effect, but looks
> a little neater:
> 
> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> index b77c31c..06dfed5 100644
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -70,6 +70,8 @@ struct radix_tree_preload {
>  };
>  static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
>  
> +#define RADIX_TREE_RETRY       ((void *)1)
> +
>  static inline void *ptr_to_indirect(void *ptr)
>  {
>         return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
> @@ -934,7 +936,7 @@ restart:
>                 }
>  
>                 slot = rcu_dereference_raw(node->slots[offset]);
> -               if (slot == NULL)
> +               if ((slot == NULL) || (slot == RADIX_TREE_RETRY))
>                         goto restart;
>                 offset = follow_sibling(node, &slot, offset);
>                 if (!radix_tree_is_indirect_ptr(slot))
> @@ -1443,8 +1455,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
>                  * to force callers to retry.
>                  */
>                 if (!radix_tree_is_indirect_ptr(slot))
> -                       *((unsigned long *)&to_free->slots[0]) |=
> -                                               RADIX_TREE_INDIRECT_PTR;
> +                       to_free->slots[0] = RADIX_TREE_RETRY;
>  
>                 radix_tree_node_free(to_free);
>         }
> 
> What do you think to doing it this way?
> 
> It might be slightly neater to replace the first hunk with this:
> 
> #define RADIX_TREE_RETRY       ((void *)RADIX_TREE_INDIRECT_PTR)
> 
> I also considered putting that define in radix-tree.h instead of
> radix-tree.c, but on the whole I don't think it'll be useful outside
> radix-tree.h.

So after spending over and hour reading radix tree code back and forth (and
also digging into historical versions where stuff is easier to understand) I
think I can finally fully appreciate subtlety of the retry logic ;). And
actually now I think that Neil's variant is buggy because in his case
radix_tree_lookup() could return NULL if it raced with radix_tree_shrink()
for index 0, although there is valid entry at index 0.

Your variant actually doesn't make things much better. See e.g.
mm/filemap.c: find_get_entry()

	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
	// pagep points to node that is under RCU freeing
	if (pagep) {
		page = radix_tree_deref_slot(pagep);
		if (unlikely(!page))	// False since
					// RADIX_TREE_INDIRECT_PTR is set
		if (radix_tree_exception(page))	// False - no exeptional bit
		if (!page_cache_get_speculative(page)) // oops...

What we need to do is either to make all radix_tree_deref_slot() callers
check return value immediately with something like radix_tree_deref_retry()
(but that is still prone to hard to debug bugs when someone forgets to call
radix_tree_deref_retry() in some place) or we just give up the idea of
using INDIRECT bit in exceptional entries.

								Honza
Matthew Wilcox March 22, 2016, 9:27 a.m. UTC | #2
On Tue, Mar 22, 2016 at 10:12:32AM +0100, Jan Kara wrote:
> 		if (unlikely(!page))	// False since
> 					// RADIX_TREE_INDIRECT_PTR is set
> 		if (radix_tree_exception(page))	// False - no exeptional bit

Oops, you got confused:

static inline int radix_tree_exception(void *arg)
{
        return unlikely((unsigned long)arg &
                (RADIX_TREE_INDIRECT_PTR | RADIX_TREE_EXCEPTIONAL_ENTRY));
}

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara March 22, 2016, 10:37 a.m. UTC | #3
On Tue 22-03-16 05:27:08, Matthew Wilcox wrote:
> On Tue, Mar 22, 2016 at 10:12:32AM +0100, Jan Kara wrote:
> > 		if (unlikely(!page))	// False since
> > 					// RADIX_TREE_INDIRECT_PTR is set
> > 		if (radix_tree_exception(page))	// False - no exeptional bit
> 
> Oops, you got confused:
> 
> static inline int radix_tree_exception(void *arg)
> {
>         return unlikely((unsigned long)arg &
>                 (RADIX_TREE_INDIRECT_PTR | RADIX_TREE_EXCEPTIONAL_ENTRY));
> }

Ah, I've confused radix_tree_exception() and
radix_tree_exceptional_entry(). OK, so your code works AFAICT. But using
RADIX_TREE_RETRY still doesn't make things clearer to me - you still need
to check for INDIRECT bit in the retry logic to catch the
radix_tree_extend() race as well...

As a side note I think we should do away with radix_tree_exception() - it
isn't very useful (doesn't simplify any of its callers) and only creates
possibility for confusion.

								Honza
Ross Zwisler March 23, 2016, 4:41 p.m. UTC | #4
On Tue, Mar 22, 2016 at 11:37:54AM +0100, Jan Kara wrote:
> On Tue 22-03-16 05:27:08, Matthew Wilcox wrote:
> > On Tue, Mar 22, 2016 at 10:12:32AM +0100, Jan Kara wrote:
> > > 		if (unlikely(!page))	// False since
> > > 					// RADIX_TREE_INDIRECT_PTR is set
> > > 		if (radix_tree_exception(page))	// False - no exeptional bit
> > 
> > Oops, you got confused:
> > 
> > static inline int radix_tree_exception(void *arg)
> > {
> >         return unlikely((unsigned long)arg &
> >                 (RADIX_TREE_INDIRECT_PTR | RADIX_TREE_EXCEPTIONAL_ENTRY));
> > }
> 
> Ah, I've confused radix_tree_exception() and
> radix_tree_exceptional_entry(). OK, so your code works AFAICT. But using
> RADIX_TREE_RETRY still doesn't make things clearer to me - you still need
> to check for INDIRECT bit in the retry logic to catch the
> radix_tree_extend() race as well...
> 
> As a side note I think we should do away with radix_tree_exception() - it
> isn't very useful (doesn't simplify any of its callers) and only creates
> possibility for confusion.

Perhaps it would be clearer if we explicitly enumerated the four radix tree
entry types?

#define RADIX_TREE_TYPE_MASK		3

#define	RADIX_TREE_TYPE_DATA		0
#define RADIX_TREE_TYPE_INDIRECT	1
#define	RADIX_TREE_TYPE_EXCEPTIONAL	2
#define RADIX_TREE_TYPE_LOCKED_EXC	3

This would make radix_tree_exception (which we could rename so it doesn't
get confused with "exceptional" entries):

static inline int radix_tree_non_data(void *arg)
{
        return unlikely((unsigned long)arg & RADIX_TREE_TYPE_MASK);
}

Etc?  I guess we'd have to code it up and see if the result was simpler, but
it seems like it might be.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kara March 24, 2016, 12:31 p.m. UTC | #5
On Wed 23-03-16 10:41:44, Ross Zwisler wrote:
> On Tue, Mar 22, 2016 at 11:37:54AM +0100, Jan Kara wrote:
> > On Tue 22-03-16 05:27:08, Matthew Wilcox wrote:
> > > On Tue, Mar 22, 2016 at 10:12:32AM +0100, Jan Kara wrote:
> > > > 		if (unlikely(!page))	// False since
> > > > 					// RADIX_TREE_INDIRECT_PTR is set
> > > > 		if (radix_tree_exception(page))	// False - no exeptional bit
> > > 
> > > Oops, you got confused:
> > > 
> > > static inline int radix_tree_exception(void *arg)
> > > {
> > >         return unlikely((unsigned long)arg &
> > >                 (RADIX_TREE_INDIRECT_PTR | RADIX_TREE_EXCEPTIONAL_ENTRY));
> > > }
> > 
> > Ah, I've confused radix_tree_exception() and
> > radix_tree_exceptional_entry(). OK, so your code works AFAICT. But using
> > RADIX_TREE_RETRY still doesn't make things clearer to me - you still need
> > to check for INDIRECT bit in the retry logic to catch the
> > radix_tree_extend() race as well...
> > 
> > As a side note I think we should do away with radix_tree_exception() - it
> > isn't very useful (doesn't simplify any of its callers) and only creates
> > possibility for confusion.
> 
> Perhaps it would be clearer if we explicitly enumerated the four radix tree
> entry types?
> 
> #define RADIX_TREE_TYPE_MASK		3
> 
> #define	RADIX_TREE_TYPE_DATA		0
> #define RADIX_TREE_TYPE_INDIRECT	1
> #define	RADIX_TREE_TYPE_EXCEPTIONAL	2
> #define RADIX_TREE_TYPE_LOCKED_EXC	3
> 
> This would make radix_tree_exception (which we could rename so it doesn't
> get confused with "exceptional" entries):
> 
> static inline int radix_tree_non_data(void *arg)
> {
>         return unlikely((unsigned long)arg & RADIX_TREE_TYPE_MASK);
> }
> 
> Etc?  I guess we'd have to code it up and see if the result was simpler, but
> it seems like it might be.

Well, for now I have decided to postpone tricks with saving exceptional
entry bits and just used bit 2 for the lock bit for DAX exceptional entries
because the retry logic in the RCU walking code got rather convoluted with
that. If we ever feel we are running out of bits in the entry, we can
always look again at compressing the contents more.

								Honza
diff mbox

Patch

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index b77c31c..06dfed5 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -70,6 +70,8 @@  struct radix_tree_preload {
 };
 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
 
+#define RADIX_TREE_RETRY       ((void *)1)
+
 static inline void *ptr_to_indirect(void *ptr)
 {
        return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
@@ -934,7 +936,7 @@  restart:
                }
 
                slot = rcu_dereference_raw(node->slots[offset]);
-               if (slot == NULL)
+               if ((slot == NULL) || (slot == RADIX_TREE_RETRY))
                        goto restart;
                offset = follow_sibling(node, &slot, offset);
                if (!radix_tree_is_indirect_ptr(slot))
@@ -1443,8 +1455,7 @@  static inline void radix_tree_shrink(struct radix_tree_root *root)
                 * to force callers to retry.
                 */
                if (!radix_tree_is_indirect_ptr(slot))
-                       *((unsigned long *)&to_free->slots[0]) |=
-                                               RADIX_TREE_INDIRECT_PTR;
+                       to_free->slots[0] = RADIX_TREE_RETRY;
 
                radix_tree_node_free(to_free);
        }