Message ID | 20231206060629.2827226-11-david@fromorbit.com (mailing list archive) |
---|---|
State | Handled Elsewhere |
Delegated to: | Paul Moore |
Headers | show |
Series | vfs: inode cache scalability improvements | expand |
On Wed, Dec 06, 2023 at 05:05:39PM +1100, Dave Chinner wrote: > From: Dave Chinner <dchinner@redhat.com> > > hash-bl nests spinlocks inside the bit locks. This causes problems > for CONFIG_PREEMPT_RT which converts spin locks to sleeping locks, > and we're not allowed to sleep while holding a spinning lock. > > Further, lockdep does not support bit locks, so we lose lockdep > coverage of the inode hash table with the hash-bl conversion. > > To enable these configs to work, add an external per-chain spinlock > to the hlist_bl_head() and add helpers to use this instead of the > bit spinlock when preempt_rt or lockdep are enabled. > > This converts all users of hlist-bl to use the external spinlock in > these situations, so we also gain lockdep coverage of things like > the dentry cache hash table with this change. > > Signed-off-by: Dave Chinner <dchinner@redhat.com> Sleepable bit locks can be done with wait_on_bit(), is that worth considering for PREEMPT_RT? Or are the other features of real locks important there? (not a request for the current patchset, just perhaps a note for future work) Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev> > --- > include/linux/list_bl.h | 126 ++++++++++++++++++++++++++++--------- > include/linux/rculist_bl.h | 13 ++++ > 2 files changed, 110 insertions(+), 29 deletions(-) > > diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h > index 8ee2bf5af131..990ad8e24e0b 100644 > --- a/include/linux/list_bl.h > +++ b/include/linux/list_bl.h > @@ -4,14 +4,27 @@ > > #include <linux/list.h> > #include <linux/bit_spinlock.h> > +#include <linux/spinlock.h> > > /* > * Special version of lists, where head of the list has a lock in the lowest > * bit. This is useful for scalable hash tables without increasing memory > * footprint overhead. > * > - * For modification operations, the 0 bit of hlist_bl_head->first > - * pointer must be set. 
> + * Whilst the general use of bit spin locking is considered safe, PREEMPT_RT > + * introduces a problem with nesting spin locks inside bit locks: spin locks > + * become sleeping locks, and we can't sleep inside spinning locks such as bit > + * locks. However, for RTPREEMPT, performance is less of an issue than > + * correctness, so we trade off the memory and cache footprint of a spinlock per > + * list so the list locks are converted to sleeping locks and work correctly > + * with PREEMPT_RT kernels. > + * > + * An added advantage of this is that we can use the same trick when lockdep is > + * enabled (again, performance doesn't matter) and gain lockdep coverage of all > + * the hash-bl operations. > + * > + * For modification operations when using pure bit locking, the 0 bit of > + * hlist_bl_head->first pointer must be set. > * > * With some small modifications, this can easily be adapted to store several > * arbitrary bits (not just a single lock bit), if the need arises to store > @@ -30,16 +43,21 @@ > #define LIST_BL_BUG_ON(x) > #endif > > +#undef LIST_BL_USE_SPINLOCKS > +#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_LOCKDEP) > +#define LIST_BL_USE_SPINLOCKS 1 > +#endif > > struct hlist_bl_head { > struct hlist_bl_node *first; > +#ifdef LIST_BL_USE_SPINLOCKS > + spinlock_t lock; > +#endif > }; > > struct hlist_bl_node { > struct hlist_bl_node *next, **pprev; > }; > -#define INIT_HLIST_BL_HEAD(ptr) \ > - ((ptr)->first = NULL) > > static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h) > { > @@ -54,6 +72,69 @@ static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h) > return !h->pprev; > } > > +#ifdef LIST_BL_USE_SPINLOCKS > +#define INIT_HLIST_BL_HEAD(ptr) do { \ > + (ptr)->first = NULL; \ > + spin_lock_init(&(ptr)->lock); \ > +} while (0) > + > +static inline void hlist_bl_lock(struct hlist_bl_head *b) > +{ > + spin_lock(&b->lock); > +} > + > +static inline void hlist_bl_unlock(struct hlist_bl_head *b) > +{ > + spin_unlock(&b->lock); 
> +} > + > +static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) > +{ > + return spin_is_locked(&b->lock); > +} > + > +static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h) > +{ > + return h->first; > +} > + > +static inline void hlist_bl_set_first(struct hlist_bl_head *h, > + struct hlist_bl_node *n) > +{ > + h->first = n; > +} > + > +static inline void hlist_bl_set_before(struct hlist_bl_node **pprev, > + struct hlist_bl_node *n) > +{ > + WRITE_ONCE(*pprev, n); > +} > + > +static inline bool hlist_bl_empty(const struct hlist_bl_head *h) > +{ > + return !READ_ONCE(h->first); > +} > + > +#else /* !LIST_BL_USE_SPINLOCKS */ > + > +#define INIT_HLIST_BL_HEAD(ptr) \ > + ((ptr)->first = NULL) > + > +static inline void hlist_bl_lock(struct hlist_bl_head *b) > +{ > + bit_spin_lock(0, (unsigned long *)b); > +} > + > +static inline void hlist_bl_unlock(struct hlist_bl_head *b) > +{ > + __bit_spin_unlock(0, (unsigned long *)b); > +} > + > +static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) > +{ > + return bit_spin_is_locked(0, (unsigned long *)b); > +} > + > static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h) > { > return (struct hlist_bl_node *) > @@ -69,11 +150,21 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h, > h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); > } > > +static inline void hlist_bl_set_before(struct hlist_bl_node **pprev, > + struct hlist_bl_node *n) > +{ > + WRITE_ONCE(*pprev, > + (struct hlist_bl_node *) > + ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK))); > +} > + > static inline bool hlist_bl_empty(const struct hlist_bl_head *h) > { > return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); > } > > +#endif /* LIST_BL_USE_SPINLOCKS */ > + > static inline void hlist_bl_add_head(struct hlist_bl_node *n, > struct hlist_bl_head *h) > { > @@ -94,11 +185,7 @@ static inline void hlist_bl_add_before(struct hlist_bl_node *n, > n->pprev 
= pprev; > n->next = next; > next->pprev = &n->next; > - > - /* pprev may be `first`, so be careful not to lose the lock bit */ > - WRITE_ONCE(*pprev, > - (struct hlist_bl_node *) > - ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK))); > + hlist_bl_set_before(pprev, n); > } > > static inline void hlist_bl_add_behind(struct hlist_bl_node *n, > @@ -119,11 +206,7 @@ static inline void __hlist_bl_del(struct hlist_bl_node *n) > > LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); > > - /* pprev may be `first`, so be careful not to lose the lock bit */ > - WRITE_ONCE(*pprev, > - (struct hlist_bl_node *) > - ((unsigned long)next | > - ((unsigned long)*pprev & LIST_BL_LOCKMASK))); > + hlist_bl_set_before(pprev, next); > if (next) > next->pprev = pprev; > } > @@ -165,21 +248,6 @@ static inline bool hlist_bl_fake(struct hlist_bl_node *n) > return n->pprev == &n->next; > } > > -static inline void hlist_bl_lock(struct hlist_bl_head *b) > -{ > - bit_spin_lock(0, (unsigned long *)b); > -} > - > -static inline void hlist_bl_unlock(struct hlist_bl_head *b) > -{ > - __bit_spin_unlock(0, (unsigned long *)b); > -} > - > -static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) > -{ > - return bit_spin_is_locked(0, (unsigned long *)b); > -} > - > /** > * hlist_bl_for_each_entry - iterate over list of given type > * @tpos: the type * to use as a loop cursor. 
> diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h > index 0b952d06eb0b..2d5eb5153121 100644 > --- a/include/linux/rculist_bl.h > +++ b/include/linux/rculist_bl.h > @@ -8,6 +8,18 @@ > #include <linux/list_bl.h> > #include <linux/rcupdate.h> > > +#ifdef LIST_BL_USE_SPINLOCKS > +static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, > + struct hlist_bl_node *n) > +{ > + rcu_assign_pointer(h->first, n); > +} > + > +static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) > +{ > + return rcu_dereference_check(h->first, hlist_bl_is_locked(h)); > +} > +#else /* !LIST_BL_USE_SPINLOCKS */ > static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, > struct hlist_bl_node *n) > { > @@ -23,6 +35,7 @@ static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) > return (struct hlist_bl_node *) > ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); > } > +#endif /* LIST_BL_USE_SPINLOCKS */ > > /** > * hlist_bl_del_rcu - deletes entry from hash list without re-initialization > -- > 2.42.0 >
On Wed, Dec 06, 2023 at 11:16:50PM -0500, Kent Overstreet wrote:
> On Wed, Dec 06, 2023 at 05:05:39PM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> >
> > hash-bl nests spinlocks inside the bit locks. This causes problems
> > for CONFIG_PREEMPT_RT which converts spin locks to sleeping locks,
> > and we're not allowed to sleep while holding a spinning lock.
> >
> > Further, lockdep does not support bit locks, so we lose lockdep
> > coverage of the inode hash table with the hash-bl conversion.
> >
> > To enable these configs to work, add an external per-chain spinlock
> > to the hlist_bl_head() and add helpers to use this instead of the
> > bit spinlock when preempt_rt or lockdep are enabled.
> >
> > This converts all users of hlist-bl to use the external spinlock in
> > these situations, so we also gain lockdep coverage of things like
> > the dentry cache hash table with this change.
> >
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
>
> Sleepable bit locks can be done with wait_on_bit(), is that worth
> considering for PREEMPT_RT? Or are the other features of real locks
> important there?

I think wait_on_bit() is not scalable. It hashes down to one of 256
shared struct wait_queue_heads which have thundering herd behaviours,
and it requires the locker to always run prepare_to_wait() and
finish_wait(). This means there is at least one
spinlock_irqsave()/unlock pair needed, sometimes two, just to get an
uncontended sleeping bit lock.

So as a fast path operation that requires lock scalability, it's going
to be better to use a straight spinlock that doesn't require irq safety
as it's far less expensive than a sleeping bit lock.

Whether CONFIG_PREEMPT_RT changes that equation at all is not at all
clear to me, and so I'll leave that consideration to RT people if they
see a need to address it.
In the meantime, we need to use an external spinlock for lockdep validation so it really doesn't make any sense at all to add a third locking variant with completely different semantics just for PREEMPT_RT... -Dave.
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index 8ee2bf5af131..990ad8e24e0b 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -4,14 +4,27 @@ #include <linux/list.h> #include <linux/bit_spinlock.h> +#include <linux/spinlock.h> /* * Special version of lists, where head of the list has a lock in the lowest * bit. This is useful for scalable hash tables without increasing memory * footprint overhead. * - * For modification operations, the 0 bit of hlist_bl_head->first - * pointer must be set. + * Whilst the general use of bit spin locking is considered safe, PREEMPT_RT + * introduces a problem with nesting spin locks inside bit locks: spin locks + * become sleeping locks, and we can't sleep inside spinning locks such as bit + * locks. However, for RTPREEMPT, performance is less of an issue than + * correctness, so we trade off the memory and cache footprint of a spinlock per + * list so the list locks are converted to sleeping locks and work correctly + * with PREEMPT_RT kernels. + * + * An added advantage of this is that we can use the same trick when lockdep is + * enabled (again, performance doesn't matter) and gain lockdep coverage of all + * the hash-bl operations. + * + * For modification operations when using pure bit locking, the 0 bit of + * hlist_bl_head->first pointer must be set. 
* * With some small modifications, this can easily be adapted to store several * arbitrary bits (not just a single lock bit), if the need arises to store @@ -30,16 +43,21 @@ #define LIST_BL_BUG_ON(x) #endif +#undef LIST_BL_USE_SPINLOCKS +#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_LOCKDEP) +#define LIST_BL_USE_SPINLOCKS 1 +#endif struct hlist_bl_head { struct hlist_bl_node *first; +#ifdef LIST_BL_USE_SPINLOCKS + spinlock_t lock; +#endif }; struct hlist_bl_node { struct hlist_bl_node *next, **pprev; }; -#define INIT_HLIST_BL_HEAD(ptr) \ - ((ptr)->first = NULL) static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h) { @@ -54,6 +72,69 @@ static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h) return !h->pprev; } +#ifdef LIST_BL_USE_SPINLOCKS +#define INIT_HLIST_BL_HEAD(ptr) do { \ + (ptr)->first = NULL; \ + spin_lock_init(&(ptr)->lock); \ +} while (0) + +static inline void hlist_bl_lock(struct hlist_bl_head *b) +{ + spin_lock(&b->lock); +} + +static inline void hlist_bl_unlock(struct hlist_bl_head *b) +{ + spin_unlock(&b->lock); +} + +static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) +{ + return spin_is_locked(&b->lock); +} + +static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h) +{ + return h->first; +} + +static inline void hlist_bl_set_first(struct hlist_bl_head *h, + struct hlist_bl_node *n) +{ + h->first = n; +} + +static inline void hlist_bl_set_before(struct hlist_bl_node **pprev, + struct hlist_bl_node *n) +{ + WRITE_ONCE(*pprev, n); +} + +static inline bool hlist_bl_empty(const struct hlist_bl_head *h) +{ + return !READ_ONCE(h->first); +} + +#else /* !LIST_BL_USE_SPINLOCKS */ + +#define INIT_HLIST_BL_HEAD(ptr) \ + ((ptr)->first = NULL) + +static inline void hlist_bl_lock(struct hlist_bl_head *b) +{ + bit_spin_lock(0, (unsigned long *)b); +} + +static inline void hlist_bl_unlock(struct hlist_bl_head *b) +{ + __bit_spin_unlock(0, (unsigned long *)b); +} + +static inline bool hlist_bl_is_locked(struct 
hlist_bl_head *b) +{ + return bit_spin_is_locked(0, (unsigned long *)b); +} + static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h) { return (struct hlist_bl_node *) @@ -69,11 +150,21 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h, h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); } +static inline void hlist_bl_set_before(struct hlist_bl_node **pprev, + struct hlist_bl_node *n) +{ + WRITE_ONCE(*pprev, + (struct hlist_bl_node *) + ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK))); +} + static inline bool hlist_bl_empty(const struct hlist_bl_head *h) { return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); } +#endif /* LIST_BL_USE_SPINLOCKS */ + static inline void hlist_bl_add_head(struct hlist_bl_node *n, struct hlist_bl_head *h) { @@ -94,11 +185,7 @@ static inline void hlist_bl_add_before(struct hlist_bl_node *n, n->pprev = pprev; n->next = next; next->pprev = &n->next; - - /* pprev may be `first`, so be careful not to lose the lock bit */ - WRITE_ONCE(*pprev, - (struct hlist_bl_node *) - ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK))); + hlist_bl_set_before(pprev, n); } static inline void hlist_bl_add_behind(struct hlist_bl_node *n, @@ -119,11 +206,7 @@ static inline void __hlist_bl_del(struct hlist_bl_node *n) LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); - /* pprev may be `first`, so be careful not to lose the lock bit */ - WRITE_ONCE(*pprev, - (struct hlist_bl_node *) - ((unsigned long)next | - ((unsigned long)*pprev & LIST_BL_LOCKMASK))); + hlist_bl_set_before(pprev, next); if (next) next->pprev = pprev; } @@ -165,21 +248,6 @@ static inline bool hlist_bl_fake(struct hlist_bl_node *n) return n->pprev == &n->next; } -static inline void hlist_bl_lock(struct hlist_bl_head *b) -{ - bit_spin_lock(0, (unsigned long *)b); -} - -static inline void hlist_bl_unlock(struct hlist_bl_head *b) -{ - __bit_spin_unlock(0, (unsigned long *)b); -} - -static inline bool 
hlist_bl_is_locked(struct hlist_bl_head *b) -{ - return bit_spin_is_locked(0, (unsigned long *)b); -} - /** * hlist_bl_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h index 0b952d06eb0b..2d5eb5153121 100644 --- a/include/linux/rculist_bl.h +++ b/include/linux/rculist_bl.h @@ -8,6 +8,18 @@ #include <linux/list_bl.h> #include <linux/rcupdate.h> +#ifdef LIST_BL_USE_SPINLOCKS +static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, + struct hlist_bl_node *n) +{ + rcu_assign_pointer(h->first, n); +} + +static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) +{ + return rcu_dereference_check(h->first, hlist_bl_is_locked(h)); +} +#else /* !LIST_BL_USE_SPINLOCKS */ static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { @@ -23,6 +35,7 @@ static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) return (struct hlist_bl_node *) ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); } +#endif /* LIST_BL_USE_SPINLOCKS */ /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization