Message ID | 1479341856-30320-59-git-send-email-mawilcox@linuxonhyperv.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Nov 17, 2016 at 3:17 AM, Matthew Wilcox <mawilcox@linuxonhyperv.com> wrote: > From: Matthew Wilcox <mawilcox@microsoft.com> This code still looks overengineered to me. > > This fixes several interlinked problems with the iterators in the > presence of multiorder entries. > > 1. radix_tree_iter_next() would only advance by one slot, which would > result in the iterators returning the same entry more than once if there > were sibling entries. Is this a problem? Do we have users who cannot evaluate the length of an entry by looking into its head? > > 2. radix_tree_next_slot() could return an internal pointer instead of > a user pointer if a tagged multiorder entry was immediately followed by > an entry of lower order. > > 3. radix_tree_next_slot() expanded to a lot more code than it used to > when multiorder support was compiled in. And I wasn't comfortable with > entry_to_node() being in a header file. > > Fixing radix_tree_iter_next() for the presence of sibling entries > necessarily involves examining the contents of the radix tree, so we now > need to pass 'slot' to radix_tree_iter_next(), and we need to change the > calling convention so it is called *before* dropping the lock which > protects the tree. Fortunately, unconverted code won't compile. > > radix_tree_next_slot() becomes closer to how it looked before multiorder > support was introduced. It only checks to see if the next entry in the > chunk is a sibling entry or a pointer to a node; this should be rare > enough that handling this case out of line is not a performance impact > (and such impact is amortised by the fact that the entry we just > processed was a multiorder entry). 
Also, radix_tree_next_slot() used > to force a new chunk lookup for untagged entries, which is more > expensive than the out of line sibling entry > > Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com> > --- > fs/btrfs/tests/btrfs-tests.c | 2 +- > include/linux/radix-tree.h | 63 ++++++++------------ > lib/radix-tree.c | 94 ++++++++++++++++++++++++++++++ > mm/khugepaged.c | 2 +- > mm/shmem.c | 6 +- > tools/testing/radix-tree/iteration_check.c | 4 +- > tools/testing/radix-tree/multiorder.c | 12 ++++ > tools/testing/radix-tree/regression3.c | 6 +- > tools/testing/radix-tree/test.h | 1 + > 9 files changed, 142 insertions(+), 48 deletions(-) > > diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c > index 73076a0..6d3457a 100644 > --- a/fs/btrfs/tests/btrfs-tests.c > +++ b/fs/btrfs/tests/btrfs-tests.c > @@ -162,7 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) > slot = radix_tree_iter_retry(&iter); > continue; > } > - slot = radix_tree_iter_next(&iter); > + slot = radix_tree_iter_next(slot, &iter); > spin_unlock(&fs_info->buffer_lock); > free_extent_buffer_stale(eb); > spin_lock(&fs_info->buffer_lock); > diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h > index 66fb8c0..36c6175 100644 > --- a/include/linux/radix-tree.h > +++ b/include/linux/radix-tree.h > @@ -424,15 +424,10 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) > * > * If the iterator needs to release then reacquire a lock, the chunk may > * have been invalidated by an insertion or deletion. Call this function > - * to continue the iteration from the next index. > + * before releasing the lock to continue the iteration from the next index. 
> */ > -static inline __must_check > -void **radix_tree_iter_next(struct radix_tree_iter *iter) > -{ > - iter->next_index = __radix_tree_iter_add(iter, 1); > - iter->tags = 0; > - return NULL; > -} > +void **__must_check radix_tree_iter_next(void **slot, > + struct radix_tree_iter *iter); > > /** > * radix_tree_chunk_size - get current chunk size > @@ -446,10 +441,17 @@ radix_tree_chunk_size(struct radix_tree_iter *iter) > return (iter->next_index - iter->index) >> iter_shift(iter); > } > > -static inline struct radix_tree_node *entry_to_node(void *ptr) > +#ifdef CONFIG_RADIX_TREE_MULTIORDER > +void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, > + unsigned flags); > +#else > +/* Can't happen without sibling entries, but the compiler can't tell that */ > +static inline void ** __radix_tree_next_slot(void **slot, > + struct radix_tree_iter *iter, unsigned flags) > { > - return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); > + return slot; > } > +#endif > > /** > * radix_tree_next_slot - find next slot in chunk > @@ -474,51 +476,31 @@ static __always_inline void ** > radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) > { > if (flags & RADIX_TREE_ITER_TAGGED) { > - void *canon = slot; > - > iter->tags >>= 1; > if (unlikely(!iter->tags)) > return NULL; > - while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && > - radix_tree_is_internal_node(slot[1])) { > - if (entry_to_node(slot[1]) == canon) { > - iter->tags >>= 1; > - iter->index = __radix_tree_iter_add(iter, 1); > - slot++; > - continue; > - } > - iter->next_index = __radix_tree_iter_add(iter, 1); > - return NULL; > - } > if (likely(iter->tags & 1ul)) { > iter->index = __radix_tree_iter_add(iter, 1); > - return slot + 1; > + slot++; > + goto found; > } > if (!(flags & RADIX_TREE_ITER_CONTIG)) { > unsigned offset = __ffs(iter->tags); > > - iter->tags >>= offset; > - iter->index = __radix_tree_iter_add(iter, offset + 1); > - return slot + offset + 1; > + 
iter->tags >>= offset++; > + iter->index = __radix_tree_iter_add(iter, offset); > + slot += offset; > + goto found; > } > } else { > long count = radix_tree_chunk_size(iter); > - void *canon = slot; > > while (--count > 0) { > slot++; > iter->index = __radix_tree_iter_add(iter, 1); > > - if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && > - radix_tree_is_internal_node(*slot)) { > - if (entry_to_node(*slot) == canon) > - continue; > - iter->next_index = iter->index; > - break; > - } > - > if (likely(*slot)) > - return slot; > + goto found; > if (flags & RADIX_TREE_ITER_CONTIG) { > /* forbid switching to the next chunk */ > iter->next_index = 0; > @@ -527,6 +509,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) > } > } > return NULL; > + > + found: > + if (unlikely(radix_tree_is_internal_node(*slot))) > + return __radix_tree_next_slot(slot, iter, flags); > + return slot; > } > > /** > @@ -577,6 +564,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) > slot || (slot = radix_tree_next_chunk(root, iter, \ > RADIX_TREE_ITER_TAGGED | tag)) ; \ > slot = radix_tree_next_slot(slot, iter, \ > - RADIX_TREE_ITER_TAGGED)) > + RADIX_TREE_ITER_TAGGED | tag)) > > #endif /* _LINUX_RADIX_TREE_H */ > diff --git a/lib/radix-tree.c b/lib/radix-tree.c > index 09c5f1d..27b53ef 100644 > --- a/lib/radix-tree.c > +++ b/lib/radix-tree.c > @@ -69,6 +69,11 @@ struct radix_tree_preload { > }; > static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; > > +static inline struct radix_tree_node *entry_to_node(void *ptr) > +{ > + return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); > +} > + > static inline void *node_to_entry(void *ptr) > { > return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); > @@ -1138,6 +1143,95 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter, > #endif > } > > +static void ** __radix_tree_iter_next(struct radix_tree_node **nodep, > + void **slot, 
struct radix_tree_iter *iter) > +{ > + void *sib = node_to_entry(slot - 1); > + > + while (iter->index < iter->next_index) { > + *nodep = rcu_dereference_raw(*slot); > + if (*nodep && *nodep != sib) > + return slot; > + slot++; > + iter->index = __radix_tree_iter_add(iter, 1); > + iter->tags >>= 1; > + } > + > + *nodep = NULL; > + return NULL; > +} > + > +#ifdef CONFIG_RADIX_TREE_MULTIORDER > +void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, > + unsigned flags) > +{ > + unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; > + struct radix_tree_node *node = rcu_dereference_raw(*slot); > + > + slot = __radix_tree_iter_next(&node, slot, iter); > + > + while (radix_tree_is_internal_node(node)) { > + unsigned offset; > + > + if (node == RADIX_TREE_RETRY) > + return slot; > + node = entry_to_node(node); > + > + if (flags & RADIX_TREE_ITER_TAGGED) { > + unsigned tag_long, tag_bit; > + offset = radix_tree_find_next_bit(node, tag, 0); > + if (offset == RADIX_TREE_MAP_SIZE) > + return NULL; > + slot = &node->slots[offset]; > + > + tag_long = offset / BITS_PER_LONG; > + tag_bit = offset % BITS_PER_LONG; > + iter->tags = node->tags[tag][tag_long] >> tag_bit; > + BUG_ON(iter->tags >= (RADIX_TREE_MAP_SIZE * 2)); > + node = rcu_dereference_raw(*slot); > + } else { > + offset = 0; > + slot = &node->slots[0]; > + for (;;) { > + node = rcu_dereference_raw(*slot); > + if (node) > + break; > + slot++; > + offset++; > + if (offset == RADIX_TREE_MAP_SIZE) > + return NULL; > + } > + } > + if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0)) > + goto none; > + iter->shift -= RADIX_TREE_MAP_SHIFT; > + iter->index = __radix_tree_iter_add(iter, offset); > + iter->next_index = (iter->index | shift_maxindex(iter->shift)) + > + 1; > + } > + > + return slot; > + none: > + iter->next_index = 0; > + return NULL; > +} > +EXPORT_SYMBOL(__radix_tree_next_slot); > +#endif > + > +void **radix_tree_iter_next(void **slot, struct radix_tree_iter *iter) > +{ > + struct radix_tree_node 
*node; > + > + slot++; > + iter->index = __radix_tree_iter_add(iter, 1); > + node = rcu_dereference_raw(*slot); > + __radix_tree_iter_next(&node, slot, iter); > + iter->next_index = iter->index; > + iter->tags = 0; > + return NULL; > +} > +EXPORT_SYMBOL(radix_tree_iter_next); > + > /** > * radix_tree_next_chunk - find next chunk of slots for iteration > * > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index 728d779..46155d1 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -1614,8 +1614,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, > present++; > > if (need_resched()) { > + slot = radix_tree_iter_next(slot, &iter); > cond_resched_rcu(); > - slot = radix_tree_iter_next(&iter); > } > } > rcu_read_unlock(); > diff --git a/mm/shmem.c b/mm/shmem.c > index 166ebf5..0b3fe33 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -658,8 +658,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, > swapped++; > > if (need_resched()) { > + slot = radix_tree_iter_next(slot, &iter); > cond_resched_rcu(); > - slot = radix_tree_iter_next(&iter); > } > } > > @@ -2434,8 +2434,8 @@ static void shmem_tag_pins(struct address_space *mapping) > } > > if (need_resched()) { > + slot = radix_tree_iter_next(slot, &iter); > cond_resched_rcu(); > - slot = radix_tree_iter_next(&iter); > } > } > rcu_read_unlock(); > @@ -2504,8 +2504,8 @@ static int shmem_wait_for_pins(struct address_space *mapping) > spin_unlock_irq(&mapping->tree_lock); > continue_resched: > if (need_resched()) { > + slot = radix_tree_iter_next(slot, &iter); > cond_resched_rcu(); > - slot = radix_tree_iter_next(&iter); > } > } > rcu_read_unlock(); > diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c > index df71cb8..6eca4c2 100644 > --- a/tools/testing/radix-tree/iteration_check.c > +++ b/tools/testing/radix-tree/iteration_check.c > @@ -79,7 +79,7 @@ static void *tagged_iteration_fn(void *arg) > } > > if (rand_r(&seeds[0]) % 50 == 0) 
{ > - slot = radix_tree_iter_next(&iter); > + slot = radix_tree_iter_next(slot, &iter); > rcu_read_unlock(); > rcu_barrier(); > rcu_read_lock(); > @@ -127,7 +127,7 @@ static void *untagged_iteration_fn(void *arg) > } > > if (rand_r(&seeds[1]) % 50 == 0) { > - slot = radix_tree_iter_next(&iter); > + slot = radix_tree_iter_next(slot, &iter); > rcu_read_unlock(); > rcu_barrier(); > rcu_read_lock(); > diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c > index 25e0463..588209a 100644 > --- a/tools/testing/radix-tree/multiorder.c > +++ b/tools/testing/radix-tree/multiorder.c > @@ -232,10 +232,14 @@ void multiorder_iteration(void) > int height = order[i] / RADIX_TREE_MAP_SHIFT; > int shift = height * RADIX_TREE_MAP_SHIFT; > int mask = (1 << order[i]) - 1; > + struct item *item = *slot; > > assert(iter.index >= (index[i] &~ mask)); > assert(iter.index <= (index[i] | mask)); > assert(iter.shift == shift); > + assert(!radix_tree_is_internal_node(item)); > + assert(item->index >= (index[i] &~ mask)); > + assert(item->index <= (index[i] | mask)); > i++; > } > } > @@ -279,12 +283,16 @@ void multiorder_tagged_iteration(void) > } > > radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) { > + struct item *item = *slot; > for (k = i; index[k] < tag_index[i]; k++) > ; > mask = (1 << order[k]) - 1; > > assert(iter.index >= (tag_index[i] &~ mask)); > assert(iter.index <= (tag_index[i] | mask)); > + assert(!radix_tree_is_internal_node(item)); > + assert(item->index >= (tag_index[i] &~ mask)); > + assert(item->index <= (tag_index[i] | mask)); > i++; > } > } > @@ -303,12 +311,16 @@ void multiorder_tagged_iteration(void) > } > > radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) { > + struct item *item = *slot; > for (k = i; index[k] < tag_index[i]; k++) > ; > mask = (1 << order[k]) - 1; > > assert(iter.index >= (tag_index[i] &~ mask)); > assert(iter.index <= (tag_index[i] | mask)); > + assert(!radix_tree_is_internal_node(item)); > + 
assert(item->index >= (tag_index[i] &~ mask)); > + assert(item->index <= (tag_index[i] | mask)); > i++; > } > } > diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c > index 1f06ed7..4d28eeb 100644 > --- a/tools/testing/radix-tree/regression3.c > +++ b/tools/testing/radix-tree/regression3.c > @@ -88,7 +88,7 @@ void regression3_test(void) > printf("slot %ld %p\n", iter.index, *slot); > if (!iter.index) { > printf("next at %ld\n", iter.index); > - slot = radix_tree_iter_next(&iter); > + slot = radix_tree_iter_next(slot, &iter); > } > } > > @@ -96,7 +96,7 @@ void regression3_test(void) > printf("contig %ld %p\n", iter.index, *slot); > if (!iter.index) { > printf("next at %ld\n", iter.index); > - slot = radix_tree_iter_next(&iter); > + slot = radix_tree_iter_next(slot, &iter); > } > } > > @@ -106,7 +106,7 @@ void regression3_test(void) > printf("tagged %ld %p\n", iter.index, *slot); > if (!iter.index) { > printf("next at %ld\n", iter.index); > - slot = radix_tree_iter_next(&iter); > + slot = radix_tree_iter_next(slot, &iter); > } > } > > diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h > index 33d2b6b..b678f13 100644 > --- a/tools/testing/radix-tree/test.h > +++ b/tools/testing/radix-tree/test.h > @@ -43,6 +43,7 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag); > extern int nr_allocated; > > /* Normally private parts of lib/radix-tree.c */ > +struct radix_tree_node *entry_to_node(void *ptr); > void radix_tree_dump(struct radix_tree_root *root); > int root_tag_get(struct radix_tree_root *root, unsigned int tag); > unsigned long node_maxindex(struct radix_tree_node *); > -- > 2.10.2 > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . 
> Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a> -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RnJvbTogS29uc3RhbnRpbiBLaGxlYm5pa292IFttYWlsdG86a29jdDlpQGdtYWlsLmNvbV0NCj4g T24gVGh1LCBOb3YgMTcsIDIwMTYgYXQgMzoxNyBBTSwgTWF0dGhldyBXaWxjb3gNCj4gPG1hd2ls Y294QGxpbnV4b25oeXBlcnYuY29tPiB3cm90ZToNCj4gPiBUaGlzIGZpeGVzIHNldmVyYWwgaW50 ZXJsaW5rZWQgcHJvYmxlbXMgd2l0aCB0aGUgaXRlcmF0b3JzIGluIHRoZQ0KPiA+IHByZXNlbmNl IG9mIG11bHRpb3JkZXIgZW50cmllcy4NCj4gPg0KPiA+IDEuIHJhZGl4X3RyZWVfaXRlcl9uZXh0 KCkgd291bGQgb25seSBhZHZhbmNlIGJ5IG9uZSBzbG90LCB3aGljaCB3b3VsZA0KPiA+IHJlc3Vs dCBpbiB0aGUgaXRlcmF0b3JzIHJldHVybmluZyB0aGUgc2FtZSBlbnRyeSBtb3JlIHRoYW4gb25j ZSBpZiB0aGVyZQ0KPiA+IHdlcmUgc2libGluZyBlbnRyaWVzLg0KPiANCj4gSXMgdGhpcyBhIHBy b2JsZW0/IERvIHdlIGhhdmUgdXNlcnMgd2hvIGNhbm5vdCBldmFsYXRlIGxlbmd0aCBvZiBlbnRy eQ0KPiBieSBsb29raW5nIGludG8gaXQgaGVhZD8NCg0KQXQgdGhlIG1vbWVudCB3ZSBoYXZlIG5v IHVzZXJzIGluIHRyZWUgOi0pICBUaGUgdHdvIHVzZXJzIEkga25vdyBvZiBhcmUgdGhlIHBhZ2Ug Y2FjaGUgYW5kIERBWC4gIFRoZSBwYWdlIGNhY2hlIHN0b3JlcyBhIHBvaW50ZXIgdG8gYSBzdHJ1 Y3QgcGFnZSwgd2hpY2ggaGFzIGNvbXBvdW5kX29yZGVyKCkgdG8gdGVsbCB5b3UgdGhlIHNpemUu ICBEQVggdXNlcyBhIGNvdXBsZSBvZiBiaXRzIGluIHRoZSByYWRpeCB0cmVlIGVudHJ5IHRvIGRl c2NyaWJlIHdoZXRoZXIgdGhpcyBpcyBhIFBURS9QTUQvUFVEIGFuZCBzbyBhbHNvIGtub3dzIHRo ZSBzaXplIG9mIHRoZSBlbnRyeSB0aGF0IGl0IGZvdW5kLiAgV2UgYWxzbyBzdG9yZSBzd2FwIGNh Y2hlIGVudHJpZXMgaW4gdGhlIHNhbWUgcmFkaXggdHJlZSAodGFnZ2VkIGV4Y2VwdGlvbmFsKS4g IFRoZXNlIGN1cnJlbnRseSBoYXZlIG5vIGluZm9ybWF0aW9uIGluIHRoZW0gdG8gZGVzY3JpYmUg dGhlaXIgc2l6ZSBiZWNhdXNlIGVhY2ggb25lIHJlcHJlc2VudHMgb25seSBvbmUgcGFnZS4gIFRo ZSBsYXRlc3QgcGF0Y2hzZXQgdG8gc3VwcG9ydCBzd2FwcGluZyBodWdlIHBhZ2VzIGluc2VydHMg NTEyIGVudHJpZXMgaW50byB0aGUgcmFkaXggdHJlZSBpbnN0ZWFkIG9mIHRha2luZyBhZHZhbnRh Z2Ugb2YgdGhlIG11bHRpb3JkZXIgZW50cnkgaW5mcmFzdHJ1Y3R1cmUuICBIb3BlZnVsbHkgdGhh dCBnZXRzIGZpeGVkIHNvb24sIGJ1dCBpdCB3aWxsIHJlcXVpcmUgc3RlYWxpbmcgYSBiaXQgZnJv bSBlaXRoZXIgdGhlIG51bWJlciBvZiBzd2FwIGZpbGVzIGFsbG93ZWQgb3IgZnJvbSB0aGUgbWF4 aW11bSBzaXplIG9mIGVhY2ggc3dhcCBmaWxlIChjdXJyZW50bHkgMzIvMTI4R0IpDQoNCkkgdGhp 
bmsgd2hhdCB5b3UncmUgc3VnZ2VzdGluZyBpcyB0aGF0IHdlIGludHJvZHVjZSBhIG5ldyBBUEk6 DQoNCiBzbG90ID0gcmFkaXhfdHJlZV9pdGVyX3NhdmUoJml0ZXIsIG9yZGVyKTsNCg0Kd2hlcmUg dGhlIGNhbGxlciB0ZWxscyB1cyB0aGUgb3JkZXIgb2YgdGhlIGVudHJ5IGl0IGp1c3QgY29uc3Vt ZWQuICBPciBtYXliZSB5b3UncmUgc3VnZ2VzdGluZw0KDQogc2xvdCA9IHJhZGl4X3RyZWVfaXRl cl9hZHZhbmNlKCZpdGVyLCBuZXdpbmRleCkNCg0Kd2hpY2ggd291bGQgYWxsb3cgdXMgdG8gc2tp cCB0byBhbnkgaW5kZXguICBBbHRob3VnaCAuLi4gaXNuJ3QgdGhhdCBqdXN0IHJhZGl4X3RyZWVf aXRlcl9pbml0KCk/DQoNCkl0IGRvZXMgcHVzaCBhIGJpdCBvZiBjb21wbGV4aXR5IG9udG8gdGhl IGNhbGxlcnMuICBXZSBoYXZlIDcgY2FsbGVycyBvZiByYWRpeF90cmVlX2l0ZXJfbmV4dCgpIGlu IG15IGN1cnJlbnQgdHJlZSAoYWZ0ZXIgYXBwbHlpbmcgdGhpcyBwYXRjaCBzZXQsIHNvIHJhbmdl X3RhZ19pZl90YWdnZWQgYW5kIGxvY2F0ZV9pdGVtIGhhdmUgYmVlbiBwdXNoZWQgaW50byB0aGVp ciBjYWxsZXJzKTogYnRyZnMsIGt1Z2VwYWdlZCwgcGFnZS13cml0ZWJhY2sgYW5kIHNobWVtLiAg YnRyZnMga25vd3MgaXRzIG9iamVjdHMgb2NjdXB5IG9uZSBzbG90LiAga2h1Z2VwYWdlZCBrbm93 cyB0aGF0IGl0cyBwYWdlIGlzIG9yZGVyIDAgYXQgdGhlIHRpbWUgaXQgY2FsbHMgcmFkaXhfdHJl ZV9pdGVyX25leHQoKS4gIFBhZ2Utd3JpdGViYWNrIGhhcyBhIHN0cnVjdCBwYWdlIGFuZCBjYW4g c2ltcGx5IHVzZSBjb21wb3VuZF9vcmRlcigpLiAgSXQncyBzaG1lbSB3aGVyZSB0aGluZ3MgZ2V0 IHN0aWNreSwgYWx0aG91Z2ggaXQncyBhbGwgc29sdmFibGUgd2l0aCBzb21lIHRlbXBvcmFyeSB2 YXJpYWJsZXMuDQoNCk1NIHBlb3BsZSwgd2hhdCBkbyB5b3UgdGhpbmsgb2YgdGhpcyBwYXRjaD8g IEl0J3Mgb24gdG9wIG9mIG15IGN1cnJlbnQgSURSIHRyZWUsIGFsdGhvdWdoIEknZCBmb2xkIGl0 IGludG8gcGF0Y2ggMjAgb2YgdGhlIHNlcmllcyBpZiBpdCdzIGFjY2VwdGFibGUuDQoNCmRpZmYg LS1naXQgYS9mcy9idHJmcy90ZXN0cy9idHJmcy10ZXN0cy5jIGIvZnMvYnRyZnMvdGVzdHMvYnRy ZnMtdGVzdHMuYw0KaW5kZXggNmQzNDU3YS4uN2Y4NjRlYyAxMDA2NDQNCi0tLSBhL2ZzL2J0cmZz L3Rlc3RzL2J0cmZzLXRlc3RzLmMNCisrKyBiL2ZzL2J0cmZzL3Rlc3RzL2J0cmZzLXRlc3RzLmMN CkBAIC0xNjIsNyArMTYyLDcgQEAgdm9pZCBidHJmc19mcmVlX2R1bW15X2ZzX2luZm8oc3RydWN0 IGJ0cmZzX2ZzX2luZm8gKmZzX2luZm8pDQogCQkJCXNsb3QgPSByYWRpeF90cmVlX2l0ZXJfcmV0 cnkoJml0ZXIpOw0KIAkJCWNvbnRpbnVlOw0KIAkJfQ0KLQkJc2xvdCA9IHJhZGl4X3RyZWVfaXRl 
cl9uZXh0KHNsb3QsICZpdGVyKTsNCisJCXNsb3QgPSByYWRpeF90cmVlX2l0ZXJfc2F2ZSgmaXRl ciwgMCk7DQogCQlzcGluX3VubG9jaygmZnNfaW5mby0+YnVmZmVyX2xvY2spOw0KIAkJZnJlZV9l eHRlbnRfYnVmZmVyX3N0YWxlKGViKTsNCiAJCXNwaW5fbG9jaygmZnNfaW5mby0+YnVmZmVyX2xv Y2spOw0KZGlmZiAtLWdpdCBhL2luY2x1ZGUvbGludXgvcmFkaXgtdHJlZS5oIGIvaW5jbHVkZS9s aW51eC9yYWRpeC10cmVlLmgNCmluZGV4IDRlNDJkNGQuLjQ0MTkzMjUgMTAwNjQ0DQotLS0gYS9p bmNsdWRlL2xpbnV4L3JhZGl4LXRyZWUuaA0KKysrIGIvaW5jbHVkZS9saW51eC9yYWRpeC10cmVl LmgNCkBAIC00MjEsMTUgKzQyMSwyMiBAQCBfX3JhZGl4X3RyZWVfaXRlcl9hZGQoc3RydWN0IHJh ZGl4X3RyZWVfaXRlciAqaXRlciwgdW5zaWduZWQgbG9uZyBzbG90cykNCiB9DQogDQogLyoqDQot ICogcmFkaXhfdHJlZV9pdGVyX25leHQgLSByZXN1bWUgaXRlcmF0aW5nIHdoZW4gdGhlIGNodW5r IG1heSBiZSBpbnZhbGlkDQotICogQGl0ZXI6CWl0ZXJhdG9yIHN0YXRlDQorICogcmFkaXhfdHJl ZV9pdGVyX3NhdmUgLSByZXN1bWUgaXRlcmF0aW5nIHdoZW4gdGhlIGNodW5rIG1heSBiZSBpbnZh bGlkDQorICogQGl0ZXI6IGl0ZXJhdG9yIHN0YXRlDQorICogQG9yZGVyOiBvcmRlciBvZiB0aGUg ZW50cnkgdGhhdCB3YXMganVzdCBwcm9jZXNzZWQNCiAgKg0KLSAqIElmIHRoZSBpdGVyYXRvciBu ZWVkcyB0byByZWxlYXNlIHRoZW4gcmVhY3F1aXJlIGEgbG9jaywgdGhlIGNodW5rIG1heQ0KLSAq IGhhdmUgYmVlbiBpbnZhbGlkYXRlZCBieSBhbiBpbnNlcnRpb24gb3IgZGVsZXRpb24uICBDYWxs IHRoaXMgZnVuY3Rpb24NCisgKiBJZiB0aGUgaXRlcmF0aW9uIG5lZWRzIHRvIHJlbGVhc2UgdGhl biByZWFjcXVpcmUgYSBsb2NrLCB0aGUgY2h1bmsgbWF5DQorICogYmUgaW52YWxpZGF0ZWQgYnkg YW4gaW5zZXJ0aW9uIG9yIGRlbGV0aW9uLiAgQ2FsbCB0aGlzIGZ1bmN0aW9uDQogICogYmVmb3Jl IHJlbGVhc2luZyB0aGUgbG9jayB0byBjb250aW51ZSB0aGUgaXRlcmF0aW9uIGZyb20gdGhlIG5l eHQgaW5kZXguDQogICovDQotdm9pZCAqKl9fbXVzdF9jaGVjayByYWRpeF90cmVlX2l0ZXJfbmV4 dCh2b2lkICoqc2xvdCwNCi0JCQkJCXN0cnVjdCByYWRpeF90cmVlX2l0ZXIgKml0ZXIpOw0KK3N0 YXRpYyBpbmxpbmUgdm9pZCAqKl9fbXVzdF9jaGVjaw0KK3JhZGl4X3RyZWVfaXRlcl9zYXZlKHN0 cnVjdCByYWRpeF90cmVlX2l0ZXIgKml0ZXIsIHVuc2lnbmVkIG9yZGVyKQ0KK3sNCisJaXRlci0+ bmV4dF9pbmRleCA9IHJvdW5kX3VwKGl0ZXItPmluZGV4LCAxIDw8IG9yZGVyKTsNCisJaXRlci0+ aW5kZXggPSAwOw0KKwlpdGVyLT50YWdzID0gMDsNCisJcmV0dXJuIE5VTEw7DQorfQ0KIA0KIC8q 
Kg0KICAqIHJhZGl4X3RyZWVfY2h1bmtfc2l6ZSAtIGdldCBjdXJyZW50IGNodW5rIHNpemUNCkBA IC00NjcsNyArNDc0LDcgQEAgc3RhdGljIGlubGluZSB2b2lkICoqIF9fcmFkaXhfdHJlZV9uZXh0 X3Nsb3Qodm9pZCAqKnNsb3QsDQogICogRm9yIHRhZ2dlZCBsb29rdXAgaXQgYWxzbyBlYXRzIEBp dGVyLT50YWdzLg0KICAqDQogICogVGhlcmUgYXJlIHNldmVyYWwgY2FzZXMgd2hlcmUgJ3Nsb3Qn IGNhbiBiZSBwYXNzZWQgaW4gYXMgTlVMTCB0byB0aGlzDQotICogZnVuY3Rpb24uICBUaGVzZSBj YXNlcyByZXN1bHQgZnJvbSB0aGUgdXNlIG9mIHJhZGl4X3RyZWVfaXRlcl9uZXh0KCkgb3INCisg KiBmdW5jdGlvbi4gIFRoZXNlIGNhc2VzIHJlc3VsdCBmcm9tIHRoZSB1c2Ugb2YgcmFkaXhfdHJl ZV9pdGVyX3NhdmUoKSBvcg0KICAqIHJhZGl4X3RyZWVfaXRlcl9yZXRyeSgpLiAgSW4gdGhlc2Ug Y2FzZXMgd2UgZG9uJ3QgZW5kIHVwIGRlcmVmZXJlbmNpbmcNCiAgKiAnc2xvdCcgYmVjYXVzZSBl aXRoZXI6DQogICogYSkgd2UgYXJlIGRvaW5nIHRhZ2dlZCBpdGVyYXRpb24gYW5kIGl0ZXItPnRh Z3MgaGFzIGJlZW4gc2V0IHRvIDAsIG9yDQpkaWZmIC0tZ2l0IGEvbGliL3JhZGl4LXRyZWUuYyBi L2xpYi9yYWRpeC10cmVlLmMNCmluZGV4IDdlMjQ2OWIuLmZjYmFkYWQgMTAwNjQ0DQotLS0gYS9s aWIvcmFkaXgtdHJlZS5jDQorKysgYi9saWIvcmFkaXgtdHJlZS5jDQpAQCAtMTI0NSw2ICsxMjQ1 LDcgQEAgc3RhdGljIGlubGluZSB2b2lkIF9fc2V0X2l0ZXJfc2hpZnQoc3RydWN0IHJhZGl4X3Ry ZWVfaXRlciAqaXRlciwNCiAjZW5kaWYNCiB9DQogDQorI2lmZGVmIENPTkZJR19SQURJWF9UUkVF X01VTFRJT1JERVINCiBzdGF0aWMgdm9pZCAqKiBfX3JhZGl4X3RyZWVfaXRlcl9uZXh0KHN0cnVj dCByYWRpeF90cmVlX25vZGUgKipub2RlcCwNCiAJCQl2b2lkICoqc2xvdCwgc3RydWN0IHJhZGl4 X3RyZWVfaXRlciAqaXRlcikNCiB7DQpAQCAtMTI2Myw3ICsxMjY0LDYgQEAgc3RhdGljIHZvaWQg KiogX19yYWRpeF90cmVlX2l0ZXJfbmV4dChzdHJ1Y3QgcmFkaXhfdHJlZV9ub2RlICoqbm9kZXAs DQogCXJldHVybiBOVUxMOw0KIH0NCiANCi0jaWZkZWYgQ09ORklHX1JBRElYX1RSRUVfTVVMVElP UkRFUg0KIHZvaWQgKiogX19yYWRpeF90cmVlX25leHRfc2xvdCh2b2lkICoqc2xvdCwgc3RydWN0 IHJhZGl4X3RyZWVfaXRlciAqaXRlciwNCiAJCQkJCXVuc2lnbmVkIGZsYWdzKQ0KIHsNCkBAIC0x MzIxLDIwICsxMzIxLDYgQEAgdm9pZCAqKiBfX3JhZGl4X3RyZWVfbmV4dF9zbG90KHZvaWQgKipz bG90LCBzdHJ1Y3QgcmFkaXhfdHJlZV9pdGVyICppdGVyLA0KIEVYUE9SVF9TWU1CT0woX19yYWRp eF90cmVlX25leHRfc2xvdCk7DQogI2VuZGlmDQogDQotdm9pZCAqKnJhZGl4X3RyZWVfaXRlcl9u 
ZXh0KHZvaWQgKipzbG90LCBzdHJ1Y3QgcmFkaXhfdHJlZV9pdGVyICppdGVyKQ0KLXsNCi0Jc3Ry dWN0IHJhZGl4X3RyZWVfbm9kZSAqbm9kZTsNCi0NCi0Jc2xvdCsrOw0KLQlpdGVyLT5pbmRleCA9 IF9fcmFkaXhfdHJlZV9pdGVyX2FkZChpdGVyLCAxKTsNCi0Jbm9kZSA9IHJjdV9kZXJlZmVyZW5j ZV9yYXcoKnNsb3QpOw0KLQlfX3JhZGl4X3RyZWVfaXRlcl9uZXh0KCZub2RlLCBzbG90LCBpdGVy KTsNCi0JaXRlci0+bmV4dF9pbmRleCA9IGl0ZXItPmluZGV4Ow0KLQlpdGVyLT50YWdzID0gMDsN Ci0JcmV0dXJuIE5VTEw7DQotfQ0KLUVYUE9SVF9TWU1CT0wocmFkaXhfdHJlZV9pdGVyX25leHQp Ow0KLQ0KIC8qKg0KICAqIHJhZGl4X3RyZWVfbmV4dF9jaHVuayAtIGZpbmQgbmV4dCBjaHVuayBv ZiBzbG90cyBmb3IgaXRlcmF0aW9uDQogICoNCmRpZmYgLS1naXQgYS9tbS9raHVnZXBhZ2VkLmMg Yi9tbS9raHVnZXBhZ2VkLmMNCmluZGV4IDQ2MTU1ZDEuLjU0NDQ2ZTYgMTAwNjQ0DQotLS0gYS9t bS9raHVnZXBhZ2VkLmMNCisrKyBiL21tL2todWdlcGFnZWQuYw0KQEAgLTE2MTQsNyArMTYxNCw3 IEBAIHN0YXRpYyB2b2lkIGtodWdlcGFnZWRfc2Nhbl9zaG1lbShzdHJ1Y3QgbW1fc3RydWN0ICpt bSwNCiAJCXByZXNlbnQrKzsNCiANCiAJCWlmIChuZWVkX3Jlc2NoZWQoKSkgew0KLQkJCXNsb3Qg PSByYWRpeF90cmVlX2l0ZXJfbmV4dChzbG90LCAmaXRlcik7DQorCQkJc2xvdCA9IHJhZGl4X3Ry ZWVfaXRlcl9zYXZlKCZpdGVyLCAwKTsNCiAJCQljb25kX3Jlc2NoZWRfcmN1KCk7DQogCQl9DQog CX0NCmRpZmYgLS1naXQgYS9tbS9wYWdlLXdyaXRlYmFjay5jIGIvbW0vcGFnZS13cml0ZWJhY2su Yw0KaW5kZXggYzU5MzcxNS4uN2Q2Yjg3MCAxMDA2NDQNCi0tLSBhL21tL3BhZ2Utd3JpdGViYWNr LmMNCisrKyBiL21tL3BhZ2Utd3JpdGViYWNrLmMNCkBAIC0yMTE5LDcgKzIxMTksNyBAQCB2b2lk IHRhZ19wYWdlc19mb3Jfd3JpdGViYWNrKHN0cnVjdCBhZGRyZXNzX3NwYWNlICptYXBwaW5nLA0K IAkJdGFnZ2VkKys7DQogCQlpZiAoKHRhZ2dlZCAlIFdSSVRFQkFDS19UQUdfQkFUQ0gpICE9IDAp DQogCQkJY29udGludWU7DQotCQlzbG90ID0gcmFkaXhfdHJlZV9pdGVyX25leHQoc2xvdCwgJml0 ZXIpOw0KKwkJc2xvdCA9IHJhZGl4X3RyZWVfaXRlcl9zYXZlKCZpdGVyLCBjb21wb3VuZF9vcmRl cigqc2xvdCkpOw0KIAkJc3Bpbl91bmxvY2tfaXJxKCZtYXBwaW5nLT50cmVlX2xvY2spOw0KIAkJ Y29uZF9yZXNjaGVkKCk7DQogCQlzcGluX2xvY2tfaXJxKCZtYXBwaW5nLT50cmVlX2xvY2spOw0K ZGlmZiAtLWdpdCBhL21tL3NobWVtLmMgYi9tbS9zaG1lbS5jDQppbmRleCA4ZjljOWFhLi4zZjJk MDdhIDEwMDY0NA0KLS0tIGEvbW0vc2htZW0uYw0KKysrIGIvbW0vc2htZW0uYw0KQEAgLTY0NCw2 
ICs2NDQsNyBAQCB1bnNpZ25lZCBsb25nIHNobWVtX3BhcnRpYWxfc3dhcF91c2FnZShzdHJ1Y3Qg YWRkcmVzc19zcGFjZSAqbWFwcGluZywNCiAJcmN1X3JlYWRfbG9jaygpOw0KIA0KIAlyYWRpeF90 cmVlX2Zvcl9lYWNoX3Nsb3Qoc2xvdCwgJm1hcHBpbmctPnBhZ2VfdHJlZSwgJml0ZXIsIHN0YXJ0 KSB7DQorCQl1bnNpZ25lZCBpbnQgb3JkZXIgPSAwOw0KIAkJaWYgKGl0ZXIuaW5kZXggPj0gZW5k KQ0KIAkJCWJyZWFrOw0KIA0KQEAgLTY1Niw5ICs2NTcsMTEgQEAgdW5zaWduZWQgbG9uZyBzaG1l bV9wYXJ0aWFsX3N3YXBfdXNhZ2Uoc3RydWN0IGFkZHJlc3Nfc3BhY2UgKm1hcHBpbmcsDQogDQog CQlpZiAocmFkaXhfdHJlZV9leGNlcHRpb25hbF9lbnRyeShwYWdlKSkNCiAJCQlzd2FwcGVkKys7 DQorCQllbHNlDQorCQkJb3JkZXIgPSBjb21wb3VuZF9vcmRlcihwYWdlKTsNCiANCiAJCWlmIChu ZWVkX3Jlc2NoZWQoKSkgew0KLQkJCXNsb3QgPSByYWRpeF90cmVlX2l0ZXJfbmV4dChzbG90LCAm aXRlcik7DQorCQkJc2xvdCA9IHJhZGl4X3RyZWVfaXRlcl9zYXZlKCZpdGVyLCBvcmRlcik7DQog CQkJY29uZF9yZXNjaGVkX3JjdSgpOw0KIAkJfQ0KIAl9DQpAQCAtMTA2Miw3ICsxMDY1LDcgQEAg c3RhdGljIHVuc2lnbmVkIGxvbmcgZmluZF9zd2FwX2VudHJ5KHN0cnVjdCByYWRpeF90cmVlX3Jv b3QgKnJvb3QsIHZvaWQgKml0ZW0pDQogCQljaGVja2VkKys7DQogCQlpZiAoKGNoZWNrZWQgJSA0 MDk2KSAhPSAwKQ0KIAkJCWNvbnRpbnVlOw0KLQkJc2xvdCA9IHJhZGl4X3RyZWVfaXRlcl9uZXh0 KHNsb3QsICZpdGVyKTsNCisJCXNsb3QgPSByYWRpeF90cmVlX2l0ZXJfc2F2ZSgmaXRlciwgMCk7 DQogCQljb25kX3Jlc2NoZWRfcmN1KCk7DQogCX0NCiANCkBAIC0yNDQ0LDIxICsyNDQ3LDI1IEBA IHN0YXRpYyB2b2lkIHNobWVtX3RhZ19waW5zKHN0cnVjdCBhZGRyZXNzX3NwYWNlICptYXBwaW5n KQ0KIAlyY3VfcmVhZF9sb2NrKCk7DQogDQogCXJhZGl4X3RyZWVfZm9yX2VhY2hfc2xvdChzbG90 LCAmbWFwcGluZy0+cGFnZV90cmVlLCAmaXRlciwgc3RhcnQpIHsNCisJCXVuc2lnbmVkIGludCBv cmRlciA9IDA7DQogCQlwYWdlID0gcmFkaXhfdHJlZV9kZXJlZl9zbG90KHNsb3QpOw0KIAkJaWYg KCFwYWdlIHx8IHJhZGl4X3RyZWVfZXhjZXB0aW9uKHBhZ2UpKSB7DQogCQkJaWYgKHJhZGl4X3Ry ZWVfZGVyZWZfcmV0cnkocGFnZSkpIHsNCiAJCQkJc2xvdCA9IHJhZGl4X3RyZWVfaXRlcl9yZXRy eSgmaXRlcik7DQogCQkJCWNvbnRpbnVlOw0KIAkJCX0NCi0JCX0gZWxzZSBpZiAocGFnZV9jb3Vu dChwYWdlKSAtIHBhZ2VfbWFwY291bnQocGFnZSkgPiAxKSB7DQotCQkJc3Bpbl9sb2NrX2lycSgm bWFwcGluZy0+dHJlZV9sb2NrKTsNCi0JCQlyYWRpeF90cmVlX3RhZ19zZXQoJm1hcHBpbmctPnBh 
Z2VfdHJlZSwgaXRlci5pbmRleCwNCi0JCQkJCSAgIFNITUVNX1RBR19QSU5ORUQpOw0KLQkJCXNw aW5fdW5sb2NrX2lycSgmbWFwcGluZy0+dHJlZV9sb2NrKTsNCisJCX0gZWxzZSB7DQorCQkJb3Jk ZXIgPSBjb21wb3VuZF9vcmRlcihwYWdlKTsNCisJCQlpZiAocGFnZV9jb3VudChwYWdlKSAtIHBh Z2VfbWFwY291bnQocGFnZSkgPiAxKSB7DQorCQkJCXNwaW5fbG9ja19pcnEoJm1hcHBpbmctPnRy ZWVfbG9jayk7DQorCQkJCXJhZGl4X3RyZWVfdGFnX3NldCgmbWFwcGluZy0+cGFnZV90cmVlLA0K KwkJCQkJCWl0ZXIuaW5kZXgsIFNITUVNX1RBR19QSU5ORUQpOw0KKwkJCQlzcGluX3VubG9ja19p cnEoJm1hcHBpbmctPnRyZWVfbG9jayk7DQorCQkJfQ0KIAkJfQ0KIA0KIAkJaWYgKG5lZWRfcmVz Y2hlZCgpKSB7DQotCQkJc2xvdCA9IHJhZGl4X3RyZWVfaXRlcl9uZXh0KHNsb3QsICZpdGVyKTsN CisJCQlzbG90ID0gcmFkaXhfdHJlZV9pdGVyX3NhdmUoJml0ZXIsIG9yZGVyKTsNCiAJCQljb25k X3Jlc2NoZWRfcmN1KCk7DQogCQl9DQogCX0NCkBAIC0yNTI4LDcgKzI1MzUsMTAgQEAgc3RhdGlj IGludCBzaG1lbV93YWl0X2Zvcl9waW5zKHN0cnVjdCBhZGRyZXNzX3NwYWNlICptYXBwaW5nKQ0K IAkJCXNwaW5fdW5sb2NrX2lycSgmbWFwcGluZy0+dHJlZV9sb2NrKTsNCiBjb250aW51ZV9yZXNj aGVkOg0KIAkJCWlmIChuZWVkX3Jlc2NoZWQoKSkgew0KLQkJCQlzbG90ID0gcmFkaXhfdHJlZV9p dGVyX25leHQoc2xvdCwgJml0ZXIpOw0KKwkJCQl1bnNpZ25lZCBpbnQgb3JkZXIgPSAwOw0KKwkJ CQlpZiAocGFnZSkNCisJCQkJCW9yZGVyID0gY29tcG91bmRfb3JkZXIocGFnZSk7DQorCQkJCXNs b3QgPSByYWRpeF90cmVlX2l0ZXJfc2F2ZSgmaXRlciwgb3JkZXIpOw0KIAkJCQljb25kX3Jlc2No ZWRfcmN1KCk7DQogCQkJfQ0KIAkJfQ0K -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Nov 18, 2016 at 7:31 PM, Matthew Wilcox <mawilcox@microsoft.com> wrote: > From: Konstantin Khlebnikov [mailto:koct9i@gmail.com] >> On Thu, Nov 17, 2016 at 3:17 AM, Matthew Wilcox >> <mawilcox@linuxonhyperv.com> wrote: >> > This fixes several interlinked problems with the iterators in the >> > presence of multiorder entries. >> > >> > 1. radix_tree_iter_next() would only advance by one slot, which would >> > result in the iterators returning the same entry more than once if there >> > were sibling entries. >> >> Is this a problem? Do we have users who cannot evaluate the length of an entry >> by looking into its head? > > At the moment we have no users in tree :-) The two users I know of are the page cache and DAX. The page cache stores a pointer to a struct page, which has compound_order() to tell you the size. DAX uses a couple of bits in the radix tree entry to describe whether this is a PTE/PMD/PUD and so also knows the size of the entry that it found. We also store swap cache entries in the same radix tree (tagged exceptional). These currently have no information in them to describe their size because each one represents only one page. The latest patchset to support swapping huge pages inserts 512 entries into the radix tree instead of taking advantage of the multiorder entry infrastructure. Hopefully that gets fixed soon, but it will require stealing a bit from either the number of swap files allowed or from the maximum size of each swap file (currently 32/128GB) > > I think what you're suggesting is that we introduce a new API: > > slot = radix_tree_iter_save(&iter, order); > > where the caller tells us the order of the entry it just consumed. Or maybe you're suggesting > > slot = radix_tree_iter_advance(&iter, newindex) Yes, something like that. > > which would allow us to skip to any index. Although ... isn't that just radix_tree_iter_init()? The iterator could keep a pointer to the current node and reuse it for the next iteration if possible. 
> > It does push a bit of complexity onto the callers. We have 7 callers of radix_tree_iter_next() in my current tree (after applying this patch set, so range_tag_if_tagged and locate_item have been pushed into their callers): btrfs, kugepaged, page-writeback and shmem. btrfs knows its objects occupy one slot. khugepaged knows that its page is order 0 at the time it calls radix_tree_iter_next(). Page-writeback has a struct page and can simply use compound_order(). It's shmem where things get sticky, although it's all solvable with some temporary variables. > Users who work only with single slot enties don't get any complications, all other already manage these multiorder entries somehow. > MM people, what do you think of this patch? It's on top of my current IDR tree, although I'd fold it into patch 20 of the series if it's acceptable. > > diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c > index 6d3457a..7f864ec 100644 > --- a/fs/btrfs/tests/btrfs-tests.c > +++ b/fs/btrfs/tests/btrfs-tests.c > @@ -162,7 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) > slot = radix_tree_iter_retry(&iter); > continue; > } > - slot = radix_tree_iter_next(slot, &iter); > + slot = radix_tree_iter_save(&iter, 0); > spin_unlock(&fs_info->buffer_lock); > free_extent_buffer_stale(eb); > spin_lock(&fs_info->buffer_lock); > diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h > index 4e42d4d..4419325 100644 > --- a/include/linux/radix-tree.h > +++ b/include/linux/radix-tree.h > @@ -421,15 +421,22 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) > } > > /** > - * radix_tree_iter_next - resume iterating when the chunk may be invalid > - * @iter: iterator state > + * radix_tree_iter_save - resume iterating when the chunk may be invalid > + * @iter: iterator state > + * @order: order of the entry that was just processed > * > - * If the iterator needs to release then reacquire a lock, the chunk may > - * 
have been invalidated by an insertion or deletion. Call this function > + * If the iteration needs to release then reacquire a lock, the chunk may > + * be invalidated by an insertion or deletion. Call this function > * before releasing the lock to continue the iteration from the next index. > */ > -void **__must_check radix_tree_iter_next(void **slot, > - struct radix_tree_iter *iter); > +static inline void **__must_check > +radix_tree_iter_save(struct radix_tree_iter *iter, unsigned order) > +{ > + iter->next_index = round_up(iter->index, 1 << order); > + iter->index = 0; > + iter->tags = 0; > + return NULL; > +} > > /** > * radix_tree_chunk_size - get current chunk size > @@ -467,7 +474,7 @@ static inline void ** __radix_tree_next_slot(void **slot, > * For tagged lookup it also eats @iter->tags. > * > * There are several cases where 'slot' can be passed in as NULL to this > - * function. These cases result from the use of radix_tree_iter_next() or > + * function. These cases result from the use of radix_tree_iter_save() or > * radix_tree_iter_retry(). 
In these cases we don't end up dereferencing > * 'slot' because either: > * a) we are doing tagged iteration and iter->tags has been set to 0, or > diff --git a/lib/radix-tree.c b/lib/radix-tree.c > index 7e2469b..fcbadad 100644 > --- a/lib/radix-tree.c > +++ b/lib/radix-tree.c > @@ -1245,6 +1245,7 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter, > #endif > } > > +#ifdef CONFIG_RADIX_TREE_MULTIORDER > static void ** __radix_tree_iter_next(struct radix_tree_node **nodep, > void **slot, struct radix_tree_iter *iter) > { > @@ -1263,7 +1264,6 @@ static void ** __radix_tree_iter_next(struct radix_tree_node **nodep, > return NULL; > } > > -#ifdef CONFIG_RADIX_TREE_MULTIORDER > void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, > unsigned flags) > { > @@ -1321,20 +1321,6 @@ void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, > EXPORT_SYMBOL(__radix_tree_next_slot); > #endif > > -void **radix_tree_iter_next(void **slot, struct radix_tree_iter *iter) > -{ > - struct radix_tree_node *node; > - > - slot++; > - iter->index = __radix_tree_iter_add(iter, 1); > - node = rcu_dereference_raw(*slot); > - __radix_tree_iter_next(&node, slot, iter); > - iter->next_index = iter->index; > - iter->tags = 0; > - return NULL; > -} > -EXPORT_SYMBOL(radix_tree_iter_next); > - > /** > * radix_tree_next_chunk - find next chunk of slots for iteration > * > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index 46155d1..54446e6 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -1614,7 +1614,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, > present++; > > if (need_resched()) { > - slot = radix_tree_iter_next(slot, &iter); > + slot = radix_tree_iter_save(&iter, 0); > cond_resched_rcu(); > } > } > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > index c593715..7d6b870 100644 > --- a/mm/page-writeback.c > +++ b/mm/page-writeback.c > @@ -2119,7 +2119,7 @@ void tag_pages_for_writeback(struct address_space 
*mapping, > tagged++; > if ((tagged % WRITEBACK_TAG_BATCH) != 0) > continue; > - slot = radix_tree_iter_next(slot, &iter); > + slot = radix_tree_iter_save(&iter, compound_order(*slot)); > spin_unlock_irq(&mapping->tree_lock); > cond_resched(); > spin_lock_irq(&mapping->tree_lock); > diff --git a/mm/shmem.c b/mm/shmem.c > index 8f9c9aa..3f2d07a 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -644,6 +644,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, > rcu_read_lock(); > > radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { > + unsigned int order = 0; > if (iter.index >= end) > break; > > @@ -656,9 +657,11 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, > > if (radix_tree_exceptional_entry(page)) > swapped++; > + else > + order = compound_order(page); > > if (need_resched()) { > - slot = radix_tree_iter_next(slot, &iter); > + slot = radix_tree_iter_save(&iter, order); > cond_resched_rcu(); > } > } > @@ -1062,7 +1065,7 @@ static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) > checked++; > if ((checked % 4096) != 0) > continue; > - slot = radix_tree_iter_next(slot, &iter); > + slot = radix_tree_iter_save(&iter, 0); > cond_resched_rcu(); > } > > @@ -2444,21 +2447,25 @@ static void shmem_tag_pins(struct address_space *mapping) > rcu_read_lock(); > > radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { > + unsigned int order = 0; > page = radix_tree_deref_slot(slot); > if (!page || radix_tree_exception(page)) { > if (radix_tree_deref_retry(page)) { > slot = radix_tree_iter_retry(&iter); > continue; > } > - } else if (page_count(page) - page_mapcount(page) > 1) { > - spin_lock_irq(&mapping->tree_lock); > - radix_tree_tag_set(&mapping->page_tree, iter.index, > - SHMEM_TAG_PINNED); > - spin_unlock_irq(&mapping->tree_lock); > + } else { > + order = compound_order(page); > + if (page_count(page) - page_mapcount(page) > 1) { > + spin_lock_irq(&mapping->tree_lock); > + 
radix_tree_tag_set(&mapping->page_tree, > + iter.index, SHMEM_TAG_PINNED); > + spin_unlock_irq(&mapping->tree_lock); > + } > } > > if (need_resched()) { > - slot = radix_tree_iter_next(slot, &iter); > + slot = radix_tree_iter_save(&iter, order); > cond_resched_rcu(); > } > } > @@ -2528,7 +2535,10 @@ static int shmem_wait_for_pins(struct address_space *mapping) > spin_unlock_irq(&mapping->tree_lock); > continue_resched: > if (need_resched()) { > - slot = radix_tree_iter_next(slot, &iter); > + unsigned int order = 0; > + if (page) > + order = compound_order(page); > + slot = radix_tree_iter_save(&iter, order); > cond_resched_rcu(); > } > } -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 73076a0..6d3457a 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -162,7 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) slot = radix_tree_iter_retry(&iter); continue; } - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_next(slot, &iter); spin_unlock(&fs_info->buffer_lock); free_extent_buffer_stale(eb); spin_lock(&fs_info->buffer_lock); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 66fb8c0..36c6175 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -424,15 +424,10 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) * * If the iterator needs to release then reacquire a lock, the chunk may * have been invalidated by an insertion or deletion. Call this function - * to continue the iteration from the next index. + * before releasing the lock to continue the iteration from the next index. 
*/ -static inline __must_check -void **radix_tree_iter_next(struct radix_tree_iter *iter) -{ - iter->next_index = __radix_tree_iter_add(iter, 1); - iter->tags = 0; - return NULL; -} +void **__must_check radix_tree_iter_next(void **slot, + struct radix_tree_iter *iter); /** * radix_tree_chunk_size - get current chunk size @@ -446,10 +441,17 @@ radix_tree_chunk_size(struct radix_tree_iter *iter) return (iter->next_index - iter->index) >> iter_shift(iter); } -static inline struct radix_tree_node *entry_to_node(void *ptr) +#ifdef CONFIG_RADIX_TREE_MULTIORDER +void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, + unsigned flags); +#else +/* Can't happen without sibling entries, but the compiler can't tell that */ +static inline void ** __radix_tree_next_slot(void **slot, + struct radix_tree_iter *iter, unsigned flags) { - return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); + return slot; } +#endif /** * radix_tree_next_slot - find next slot in chunk @@ -474,51 +476,31 @@ static __always_inline void ** radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) { if (flags & RADIX_TREE_ITER_TAGGED) { - void *canon = slot; - iter->tags >>= 1; if (unlikely(!iter->tags)) return NULL; - while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && - radix_tree_is_internal_node(slot[1])) { - if (entry_to_node(slot[1]) == canon) { - iter->tags >>= 1; - iter->index = __radix_tree_iter_add(iter, 1); - slot++; - continue; - } - iter->next_index = __radix_tree_iter_add(iter, 1); - return NULL; - } if (likely(iter->tags & 1ul)) { iter->index = __radix_tree_iter_add(iter, 1); - return slot + 1; + slot++; + goto found; } if (!(flags & RADIX_TREE_ITER_CONTIG)) { unsigned offset = __ffs(iter->tags); - iter->tags >>= offset; - iter->index = __radix_tree_iter_add(iter, offset + 1); - return slot + offset + 1; + iter->tags >>= offset++; + iter->index = __radix_tree_iter_add(iter, offset); + slot += offset; + goto found; } } else { long count = 
radix_tree_chunk_size(iter); - void *canon = slot; while (--count > 0) { slot++; iter->index = __radix_tree_iter_add(iter, 1); - if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && - radix_tree_is_internal_node(*slot)) { - if (entry_to_node(*slot) == canon) - continue; - iter->next_index = iter->index; - break; - } - if (likely(*slot)) - return slot; + goto found; if (flags & RADIX_TREE_ITER_CONTIG) { /* forbid switching to the next chunk */ iter->next_index = 0; @@ -527,6 +509,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) } } return NULL; + + found: + if (unlikely(radix_tree_is_internal_node(*slot))) + return __radix_tree_next_slot(slot, iter, flags); + return slot; } /** @@ -577,6 +564,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) slot || (slot = radix_tree_next_chunk(root, iter, \ RADIX_TREE_ITER_TAGGED | tag)) ; \ slot = radix_tree_next_slot(slot, iter, \ - RADIX_TREE_ITER_TAGGED)) + RADIX_TREE_ITER_TAGGED | tag)) #endif /* _LINUX_RADIX_TREE_H */ diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 09c5f1d..27b53ef 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -69,6 +69,11 @@ struct radix_tree_preload { }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +static inline struct radix_tree_node *entry_to_node(void *ptr) +{ + return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); +} + static inline void *node_to_entry(void *ptr) { return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); @@ -1138,6 +1143,95 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter, #endif } +static void ** __radix_tree_iter_next(struct radix_tree_node **nodep, + void **slot, struct radix_tree_iter *iter) +{ + void *sib = node_to_entry(slot - 1); + + while (iter->index < iter->next_index) { + *nodep = rcu_dereference_raw(*slot); + if (*nodep && *nodep != sib) + return slot; + slot++; + iter->index = __radix_tree_iter_add(iter, 1); + 
iter->tags >>= 1; + } + + *nodep = NULL; + return NULL; +} + +#ifdef CONFIG_RADIX_TREE_MULTIORDER +void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, + unsigned flags) +{ + unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; + struct radix_tree_node *node = rcu_dereference_raw(*slot); + + slot = __radix_tree_iter_next(&node, slot, iter); + + while (radix_tree_is_internal_node(node)) { + unsigned offset; + + if (node == RADIX_TREE_RETRY) + return slot; + node = entry_to_node(node); + + if (flags & RADIX_TREE_ITER_TAGGED) { + unsigned tag_long, tag_bit; + offset = radix_tree_find_next_bit(node, tag, 0); + if (offset == RADIX_TREE_MAP_SIZE) + return NULL; + slot = &node->slots[offset]; + + tag_long = offset / BITS_PER_LONG; + tag_bit = offset % BITS_PER_LONG; + iter->tags = node->tags[tag][tag_long] >> tag_bit; + BUG_ON(iter->tags >= (RADIX_TREE_MAP_SIZE * 2)); + node = rcu_dereference_raw(*slot); + } else { + offset = 0; + slot = &node->slots[0]; + for (;;) { + node = rcu_dereference_raw(*slot); + if (node) + break; + slot++; + offset++; + if (offset == RADIX_TREE_MAP_SIZE) + return NULL; + } + } + if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0)) + goto none; + iter->shift -= RADIX_TREE_MAP_SHIFT; + iter->index = __radix_tree_iter_add(iter, offset); + iter->next_index = (iter->index | shift_maxindex(iter->shift)) + + 1; + } + + return slot; + none: + iter->next_index = 0; + return NULL; +} +EXPORT_SYMBOL(__radix_tree_next_slot); +#endif + +void **radix_tree_iter_next(void **slot, struct radix_tree_iter *iter) +{ + struct radix_tree_node *node; + + slot++; + iter->index = __radix_tree_iter_add(iter, 1); + node = rcu_dereference_raw(*slot); + __radix_tree_iter_next(&node, slot, iter); + iter->next_index = iter->index; + iter->tags = 0; + return NULL; +} +EXPORT_SYMBOL(radix_tree_iter_next); + /** * radix_tree_next_chunk - find next chunk of slots for iteration * diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 728d779..46155d1 100644 --- 
a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1614,8 +1614,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, present++; if (need_resched()) { + slot = radix_tree_iter_next(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); diff --git a/mm/shmem.c b/mm/shmem.c index 166ebf5..0b3fe33 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -658,8 +658,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, swapped++; if (need_resched()) { + slot = radix_tree_iter_next(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } @@ -2434,8 +2434,8 @@ static void shmem_tag_pins(struct address_space *mapping) } if (need_resched()) { + slot = radix_tree_iter_next(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -2504,8 +2504,8 @@ static int shmem_wait_for_pins(struct address_space *mapping) spin_unlock_irq(&mapping->tree_lock); continue_resched: if (need_resched()) { + slot = radix_tree_iter_next(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c index df71cb8..6eca4c2 100644 --- a/tools/testing/radix-tree/iteration_check.c +++ b/tools/testing/radix-tree/iteration_check.c @@ -79,7 +79,7 @@ static void *tagged_iteration_fn(void *arg) } if (rand_r(&seeds[0]) % 50 == 0) { - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_next(slot, &iter); rcu_read_unlock(); rcu_barrier(); rcu_read_lock(); @@ -127,7 +127,7 @@ static void *untagged_iteration_fn(void *arg) } if (rand_r(&seeds[1]) % 50 == 0) { - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_next(slot, &iter); rcu_read_unlock(); rcu_barrier(); rcu_read_lock(); diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 25e0463..588209a 100644 --- a/tools/testing/radix-tree/multiorder.c +++ 
b/tools/testing/radix-tree/multiorder.c @@ -232,10 +232,14 @@ void multiorder_iteration(void) int height = order[i] / RADIX_TREE_MAP_SHIFT; int shift = height * RADIX_TREE_MAP_SHIFT; int mask = (1 << order[i]) - 1; + struct item *item = *slot; assert(iter.index >= (index[i] &~ mask)); assert(iter.index <= (index[i] | mask)); assert(iter.shift == shift); + assert(!radix_tree_is_internal_node(item)); + assert(item->index >= (index[i] &~ mask)); + assert(item->index <= (index[i] | mask)); i++; } } @@ -279,12 +283,16 @@ void multiorder_tagged_iteration(void) } radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) { + struct item *item = *slot; for (k = i; index[k] < tag_index[i]; k++) ; mask = (1 << order[k]) - 1; assert(iter.index >= (tag_index[i] &~ mask)); assert(iter.index <= (tag_index[i] | mask)); + assert(!radix_tree_is_internal_node(item)); + assert(item->index >= (tag_index[i] &~ mask)); + assert(item->index <= (tag_index[i] | mask)); i++; } } @@ -303,12 +311,16 @@ void multiorder_tagged_iteration(void) } radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) { + struct item *item = *slot; for (k = i; index[k] < tag_index[i]; k++) ; mask = (1 << order[k]) - 1; assert(iter.index >= (tag_index[i] &~ mask)); assert(iter.index <= (tag_index[i] | mask)); + assert(!radix_tree_is_internal_node(item)); + assert(item->index >= (tag_index[i] &~ mask)); + assert(item->index <= (tag_index[i] | mask)); i++; } } diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c index 1f06ed7..4d28eeb 100644 --- a/tools/testing/radix-tree/regression3.c +++ b/tools/testing/radix-tree/regression3.c @@ -88,7 +88,7 @@ void regression3_test(void) printf("slot %ld %p\n", iter.index, *slot); if (!iter.index) { printf("next at %ld\n", iter.index); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_next(slot, &iter); } } @@ -96,7 +96,7 @@ void regression3_test(void) printf("contig %ld %p\n", iter.index, *slot); if (!iter.index) { printf("next at 
%ld\n", iter.index); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_next(slot, &iter); } } @@ -106,7 +106,7 @@ void regression3_test(void) printf("tagged %ld %p\n", iter.index, *slot); if (!iter.index) { printf("next at %ld\n", iter.index); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_next(slot, &iter); } } diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 33d2b6b..b678f13 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -43,6 +43,7 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag); extern int nr_allocated; /* Normally private parts of lib/radix-tree.c */ +struct radix_tree_node *entry_to_node(void *ptr); void radix_tree_dump(struct radix_tree_root *root); int root_tag_get(struct radix_tree_root *root, unsigned int tag); unsigned long node_maxindex(struct radix_tree_node *);