Message ID | 20181130160118.24683-2-axboe@kernel.dk (mailing list archive) |
---|---|
State | New, archived |
Series | [1/2] sbitmap: ammortize cost of clearing bits |
On Fri, Nov 30, 2018 at 09:01:17AM -0700, Jens Axboe wrote:
> sbitmap maintains a set of words that we use to set and clear bits, with
> each bit representing a tag for blk-mq. Even though we spread the bits
> out and maintain a hint cache, one particular bit allocated will end up
> being cleared in the exact same spot.
>
> This introduces batched clearing of bits. Instead of clearing a given
> bit, the same bit is set in a cleared/free mask instead. If we fail
> allocating a bit from a given word, then we check the free mask, and
> batch move those cleared bits at that time. This trades 64 atomic bitops
> for 2 cmpxchg().
>
> In a threaded poll test case, half the overhead of getting and clearing
> tags is removed with this change. On another poll test case with a
> single thread, performance is unchanged.
>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  include/linux/sbitmap.h | 31 +++++++++++++---
>  lib/sbitmap.c           | 80 +++++++++++++++++++++++++++++++++++++----
>  2 files changed, 100 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
> index 804a50983ec5..07f117ee19dc 100644
> --- a/include/linux/sbitmap.h
> +++ b/include/linux/sbitmap.h
> @@ -30,14 +30,24 @@ struct seq_file;
>   */
>  struct sbitmap_word {
>  	/**
> -	 * @word: The bitmap word itself.
> +	 * @depth: Number of bits being used in @word/@cleared
>  	 */
> -	unsigned long word;
> +	unsigned long depth;
>
>  	/**
> -	 * @depth: Number of bits being used in @word.
> +	 * @word: word holding free bits
>  	 */
> -	unsigned long depth;
> +	unsigned long word ____cacheline_aligned_in_smp;

Still splitting up word and depth in separate cachelines?

> +
> +	/**
> +	 * @cleared: word holding cleared bits
> +	 */
> +	unsigned long cleared ____cacheline_aligned_in_smp;
> +
> +	/**
> +	 * @swap_lock: Held while swapping word <-> cleared
> +	 */
> +	spinlock_t swap_lock;
>  } ____cacheline_aligned_in_smp;

[...]

> +/*
> + * See if we have deferred clears that we can batch move
> + */
> +static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
> +{
> +	unsigned long mask, val;
> +	bool ret = false;
> +
> +	spin_lock(&sb->map[index].swap_lock);
> +
> +	if (!sb->map[index].cleared)
> +		goto out_unlock;
> +
> +	/*
> +	 * First get a stable cleared mask, setting the old mask to 0.
> +	 */
> +	do {
> +		mask = sb->map[index].cleared;
> +	} while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask);
> +
> +	/*
> +	 * Now clear the masked bits in our free word
> +	 */
> +	do {
> +		val = sb->map[index].word;
> +	} while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val);
> +
> +	ret = true;
> +out_unlock:
> +	spin_unlock(&sb->map[index].swap_lock);
> +	return ret;
> +}

Okay, I couldn't find any holes in this one :)

[...]

> -unsigned int sbitmap_weight(const struct sbitmap *sb)
> +static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
>  {
>  	unsigned int i, weight = 0;
>
>  	for (i = 0; i < sb->map_nr; i++) {
>  		const struct sbitmap_word *word = &sb->map[i];
>
> -		weight += bitmap_weight(&word->word, word->depth);
> +		if (set)
> +			weight += bitmap_weight(&word->word, word->depth);

Should probably do

	weight -= bitmap_weight(&word->cleared, word->depth);

too, right?

[...]
On 11/30/18 1:03 PM, Omar Sandoval wrote:
> On Fri, Nov 30, 2018 at 09:01:17AM -0700, Jens Axboe wrote:

[...]

>> -	unsigned long depth;
>> +	unsigned long word ____cacheline_aligned_in_smp;
>
> Still splitting up word and depth in separate cachelines?

Yeah, I mentioned that in one of the other postings, there's still a
definite win to doing that.

> Okay, I couldn't find any holes in this one :)

Good to hear that :-)

>> -		weight += bitmap_weight(&word->word, word->depth);
>> +		if (set)
>> +			weight += bitmap_weight(&word->word, word->depth);
>
> Should probably do
>
> 	weight -= bitmap_weight(&word->cleared, word->depth);
>
> too, right?

We only use these for the debugfs stuff, how about I just make it static
instead?
On Fri, Nov 30, 2018 at 01:10:47PM -0700, Jens Axboe wrote:
> On 11/30/18 1:03 PM, Omar Sandoval wrote:

[...]

> > Should probably do
> >
> > 	weight -= bitmap_weight(&word->cleared, word->depth);
> >
> > too, right?
>
> We only use these for the debugfs stuff, how about I just make it static
> instead?

Yeah, with that,

Reviewed-by: Omar Sandoval <osandov@fb.com>
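To make the accounting settled on in this exchange concrete: with deferred clearing, a freed-but-not-yet-reclaimed bit is set in both ->word and ->cleared, so the busy count reported by sbitmap_show() is weight(->word) minus weight(->cleared). A minimal standalone illustration, not kernel code; __builtin_popcountl stands in here for bitmap_weight(), and the values are made up:

```c
/* Illustration of the busy/cleared accounting discussed above.
 * Standalone example; assumes a 64-bit unsigned long. */
#include <stdio.h>

int main(void)
{
	unsigned long word    = 0xffUL; /* bits 0-7 were allocated        */
	unsigned long cleared = 0x0aUL; /* bits 1 and 3 freed, clear deferred */

	/* A deferred-freed bit is still set in 'word', so subtract the
	 * cleared weight, as sbitmap_show() does for "busy=". */
	int busy = __builtin_popcountl(word) - __builtin_popcountl(cleared);

	printf("busy=%d cleared=%d\n", busy, __builtin_popcountl(cleared));
	return 0;	/* prints: busy=6 cleared=2 */
}
```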
```diff
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 804a50983ec5..07f117ee19dc 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -30,14 +30,24 @@ struct seq_file;
  */
 struct sbitmap_word {
 	/**
-	 * @word: The bitmap word itself.
+	 * @depth: Number of bits being used in @word/@cleared
 	 */
-	unsigned long word;
+	unsigned long depth;
 
 	/**
-	 * @depth: Number of bits being used in @word.
+	 * @word: word holding free bits
 	 */
-	unsigned long depth;
+	unsigned long word ____cacheline_aligned_in_smp;
+
+	/**
+	 * @cleared: word holding cleared bits
+	 */
+	unsigned long cleared ____cacheline_aligned_in_smp;
+
+	/**
+	 * @swap_lock: Held while swapping word <-> cleared
+	 */
+	spinlock_t swap_lock;
 } ____cacheline_aligned_in_smp;
 
 /**
@@ -310,6 +320,19 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
 	clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
 }
 
+/*
+ * This one is special, since it doesn't actually clear the bit, rather it
+ * sets the corresponding bit in the ->cleared mask instead. Paired with
+ * the caller doing sbitmap_batch_clear() if a given index is full, which
+ * will clear the previously freed entries in the corresponding ->word.
+ */
+static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
+{
+	unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;
+
+	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
+}
+
 static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb,
 					    unsigned int bitnr)
 {
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 45cab6bbc1c7..f6a9553617bd 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -59,6 +59,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 	for (i = 0; i < sb->map_nr; i++) {
 		sb->map[i].depth = min(depth, bits_per_word);
 		depth -= sb->map[i].depth;
+		spin_lock_init(&sb->map[i].swap_lock);
 	}
 	return 0;
 }
@@ -111,6 +112,57 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
 	return nr;
 }
 
+/*
+ * See if we have deferred clears that we can batch move
+ */
+static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
+{
+	unsigned long mask, val;
+	bool ret = false;
+
+	spin_lock(&sb->map[index].swap_lock);
+
+	if (!sb->map[index].cleared)
+		goto out_unlock;
+
+	/*
+	 * First get a stable cleared mask, setting the old mask to 0.
+	 */
+	do {
+		mask = sb->map[index].cleared;
+	} while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask);
+
+	/*
+	 * Now clear the masked bits in our free word
+	 */
+	do {
+		val = sb->map[index].word;
+	} while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val);
+
+	ret = true;
+out_unlock:
+	spin_unlock(&sb->map[index].swap_lock);
+	return ret;
+}
+
+static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
+				     unsigned int alloc_hint, bool round_robin)
+{
+	int nr;
+
+	do {
+		nr = __sbitmap_get_word(&sb->map[index].word,
+					sb->map[index].depth, alloc_hint,
+					!round_robin);
+		if (nr != -1)
+			break;
+		if (!sbitmap_deferred_clear(sb, index))
+			break;
+	} while (1);
+
+	return nr;
+}
+
 int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 {
 	unsigned int i, index;
@@ -129,9 +181,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 		alloc_hint = 0;
 
 	for (i = 0; i < sb->map_nr; i++) {
-		nr = __sbitmap_get_word(&sb->map[index].word,
-					sb->map[index].depth, alloc_hint,
-					!round_robin);
+		nr = sbitmap_find_bit_in_index(sb, index, alloc_hint,
+					       round_robin);
 		if (nr != -1) {
 			nr += index << sb->shift;
 			break;
@@ -206,23 +257,37 @@ bool sbitmap_any_bit_clear(const struct sbitmap *sb)
 }
 EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear);
 
-unsigned int sbitmap_weight(const struct sbitmap *sb)
+static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
 {
 	unsigned int i, weight = 0;
 
 	for (i = 0; i < sb->map_nr; i++) {
 		const struct sbitmap_word *word = &sb->map[i];
 
-		weight += bitmap_weight(&word->word, word->depth);
+		if (set)
+			weight += bitmap_weight(&word->word, word->depth);
+		else
+			weight += bitmap_weight(&word->cleared, word->depth);
 	}
 	return weight;
 }
+
+unsigned int sbitmap_weight(const struct sbitmap *sb)
+{
+	return __sbitmap_weight(sb, true);
+}
 EXPORT_SYMBOL_GPL(sbitmap_weight);
 
+static unsigned int sbitmap_cleared(const struct sbitmap *sb)
+{
+	return __sbitmap_weight(sb, false);
+}
+
 void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
 {
 	seq_printf(m, "depth=%u\n", sb->depth);
-	seq_printf(m, "busy=%u\n", sbitmap_weight(sb));
+	seq_printf(m, "busy=%u\n", sbitmap_weight(sb) - sbitmap_cleared(sb));
+	seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb));
 	seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
 	seq_printf(m, "map_nr=%u\n", sb->map_nr);
 }
@@ -514,7 +579,8 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu)
 {
-	sbitmap_clear_bit_unlock(&sbq->sb, nr);
+	sbitmap_deferred_clear_bit(&sbq->sb, nr);
+
 	/*
 	 * Pairs with the memory barrier in set_current_state() to ensure the
 	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
```
sbitmap maintains a set of words that we use to set and clear bits, with
each bit representing a tag for blk-mq. Even though we spread the bits
out and maintain a hint cache, one particular bit allocated will end up
being cleared in the exact same spot.

This introduces batched clearing of bits. Instead of clearing a given
bit, the same bit is set in a cleared/free mask instead. If we fail
allocating a bit from a given word, then we check the free mask, and
batch move those cleared bits at that time. This trades 64 atomic bitops
for 2 cmpxchg().

In a threaded poll test case, half the overhead of getting and clearing
tags is removed with this change. On another poll test case with a
single thread, performance is unchanged.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 31 +++++++++++++---
 lib/sbitmap.c           | 80 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 100 insertions(+), 11 deletions(-)
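To make the batching scheme concrete outside the kernel, here is a minimal single-word userspace sketch of the same idea, using C11 atomics in place of the kernel's bitops and cmpxchg(). Everything here is invented for illustration (struct bitmap_word, word_free(), batch_move(), word_alloc()); it omits the per-word depth, the swap_lock, and the wait-queue side, and simply assumes a 64-bit unsigned long:

```c
/*
 * Userspace sketch of deferred bit clearing: frees set a bit in
 * 'cleared'; allocation only folds 'cleared' back into 'word' when
 * the word looks full. Illustrative only -- not the kernel API.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define WORD_BITS (8 * (int)sizeof(unsigned long))

struct bitmap_word {
	_Atomic unsigned long word;    /* allocated bits */
	_Atomic unsigned long cleared; /* freed bits, clear deferred */
};

/* Free path: one atomic OR into 'cleared', never touches 'word'. */
static void word_free(struct bitmap_word *w, int bit)
{
	atomic_fetch_or(&w->cleared, 1UL << bit);
}

/*
 * Batch move: grab a stable snapshot of 'cleared' (the kernel uses a
 * cmpxchg() loop; an atomic exchange is equivalent here), then knock
 * those bits out of 'word' with one CAS loop -- two atomics total,
 * no matter how many bits were freed since the last move.
 */
static bool batch_move(struct bitmap_word *w)
{
	unsigned long mask = atomic_exchange(&w->cleared, 0);
	unsigned long val;

	if (!mask)
		return false;

	val = atomic_load(&w->word);
	while (!atomic_compare_exchange_weak(&w->word, &val, val & ~mask))
		; /* 'val' is refreshed on failure; just retry */
	return true;
}

static int find_zero_bit(unsigned long val)
{
	for (int bit = 0; bit < WORD_BITS; bit++)
		if (!(val & (1UL << bit)))
			return bit;
	return -1;
}

/* Alloc path: only a full-looking word pays for reclaiming freed bits. */
static int word_alloc(struct bitmap_word *w)
{
	unsigned long val = atomic_load(&w->word);

	for (;;) {
		int bit = find_zero_bit(val);

		if (bit < 0) {
			if (!batch_move(w))
				return -1; /* genuinely out of bits */
			val = atomic_load(&w->word);
			continue;
		}
		/* CAS failure reloads 'val'; rescan from the new value. */
		if (atomic_compare_exchange_weak(&w->word, &val,
						 val | (1UL << bit)))
			return bit;
	}
}

int main(void)
{
	struct bitmap_word w = { ~0UL, 0 };      /* every bit taken */

	word_free(&w, 3);                        /* clear is deferred... */
	printf("alloc -> %d\n", word_alloc(&w)); /* ...reclaimed here: 3 */
	return 0;
}
```

Note that each free still costs one atomic op (the set_bit() into ->cleared); the saving the commit message cites is on the other side, where clearing up to 64 freed bits out of the contended ->word collapses into two cmpxchg() per word instead of one atomic per bit.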