From patchwork Tue Jun 25 21:17:56 2024
From: "Matthew Wilcox (Oracle)" 
To: linux-kernel@vger.kernel.org
Cc: "Matthew Wilcox (Oracle)" , linux-fsdevel@vger.kernel.org, maple-tree@lists.infradead.org
Subject: [PATCH v2 1/5] tools: Add kernel stubs needed for rosebush
Date: Tue, 25 Jun 2024 22:17:56 +0100
Message-ID: <20240625211803.2750563-2-willy@infradead.org>
In-Reply-To: <20240625211803.2750563-1-willy@infradead.org>
References: <20240625211803.2750563-1-willy@infradead.org>

Various parts of the kernel API have not been needed in the tools headers
before, so add stubs for them:

 - spin_trylock()
 - DECLARE_FLEX_ARRAY
 - vmalloc
 - __vmalloc_node
 - kvmalloc
 - kvfree_rcu_mightsleep

Signed-off-by: Matthew Wilcox (Oracle) 
---
 tools/include/linux/compiler.h | 4 ++++
tools/include/linux/compiler_types.h | 2 ++ tools/include/linux/spinlock.h | 2 ++ tools/include/linux/stddef.h | 3 +++ tools/testing/radix-tree/linux/kernel.h | 2 ++ tools/testing/radix-tree/linux/vmalloc.h | 14 ++++++++++++++ 6 files changed, 27 insertions(+) create mode 100644 tools/include/linux/stddef.h create mode 100644 tools/testing/radix-tree/linux/vmalloc.h diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index 8a63a9913495..936b5153b0fe 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -62,6 +62,10 @@ #define __nocf_check __attribute__((nocf_check)) #endif +#ifndef __refdata +#define __refdata +#endif + /* Are two types/vars the same type (ignoring qualifiers)? */ #ifndef __same_type # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h index d09f9dc172a4..18bd8767cebc 100644 --- a/tools/include/linux/compiler_types.h +++ b/tools/include/linux/compiler_types.h @@ -17,6 +17,7 @@ /* context/locking */ # define __must_hold(x) __attribute__((context(x,1,1))) # define __acquires(x) __attribute__((context(x,0,1))) +# define __cond_acquires(x) __attribute__((context(x,0,-1))) # define __releases(x) __attribute__((context(x,1,0))) # define __acquire(x) __context__(x,1) # define __release(x) __context__(x,-1) @@ -27,6 +28,7 @@ # define __acquires(x) # define __releases(x) # define __acquire(x) (void)0 +# define __cond_acquires(x) # define __release(x) (void)0 # define __cond_lock(x,c) (c) #endif /* __CHECKER__ */ diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index a6cdf25b6b9d..871a6a305b9d 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -20,6 +20,8 @@ #define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x) #define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x) +#define spin_trylock(x) (pthread_mutex_trylock(x) == 0) + #define arch_spinlock_t pthread_mutex_t #define __ARCH_SPIN_LOCK_UNLOCKED PTHREAD_MUTEX_INITIALIZER diff --git a/tools/include/linux/stddef.h b/tools/include/linux/stddef.h new file mode 100644 index 000000000000..337f520eba1d --- /dev/null +++ b/tools/include/linux/stddef.h @@ -0,0 +1,3 @@ +#include + +#define DECLARE_FLEX_ARRAY(TYPE, NAME) __DECLARE_FLEX_ARRAY(TYPE, NAME) diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index c0a2bb785b92..72a95fd9708c 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -26,4 +26,6 @@ #define __must_hold(x) #define EXPORT_PER_CPU_SYMBOL_GPL(x) + +#define PAGE_SIZE 4096 #endif /* _KERNEL_H */ diff --git a/tools/testing/radix-tree/linux/vmalloc.h b/tools/testing/radix-tree/linux/vmalloc.h new file mode 100644 index 000000000000..2857c37472bb --- /dev/null +++ b/tools/testing/radix-tree/linux/vmalloc.h @@ -0,0 +1,14 @@ +#include + +#define vmalloc(x) malloc(x) +#define __vmalloc_node(size, align, gfp, node, caller) memalign(size, align) +#define vfree(x) free(x) + +#define kvmalloc(x, gfp) memalign(x, x) +#define kvfree(x) free(x) + +/* A correct but slow implementation */ +#define kvfree_rcu_mightsleep(x) { \ + synchronize_rcu(); \ + free(x); \ +} From patchwork Tue Jun 25 21:17:57 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Matthew Wilcox (Oracle)" X-Patchwork-Id: 13712062 Received: from casper.infradead.org 
From: "Matthew Wilcox (Oracle)" 
To: linux-kernel@vger.kernel.org
Cc: "Matthew Wilcox (Oracle)" , linux-fsdevel@vger.kernel.org, maple-tree@lists.infradead.org
Subject: [PATCH v2 2/5] rosebush: Add new data structure
Date: Tue, 25 Jun 2024 22:17:57 +0100
Message-ID: <20240625211803.2750563-3-willy@infradead.org>
In-Reply-To: <20240625211803.2750563-1-willy@infradead.org>
References: <20240625211803.2750563-1-willy@infradead.org>

Rosebush is a resizing hash table. See Documentation/core-api/rosebush.rst
for details.
Signed-off-by: Matthew Wilcox (Oracle) --- Documentation/core-api/index.rst | 1 + Documentation/core-api/rosebush.rst | 121 +++++ MAINTAINERS | 8 + include/linux/rosebush.h | 62 +++ lib/Makefile | 2 +- lib/rosebush.c | 654 ++++++++++++++++++++++++++++ 6 files changed, 847 insertions(+), 1 deletion(-) create mode 100644 Documentation/core-api/rosebush.rst create mode 100644 include/linux/rosebush.h create mode 100644 lib/rosebush.c diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index f147854700e4..8984a7d6ad73 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -36,6 +36,7 @@ Library functionality that is used throughout the kernel. kobject kref assoc_array + rosebush xarray maple_tree idr diff --git a/Documentation/core-api/rosebush.rst b/Documentation/core-api/rosebush.rst new file mode 100644 index 000000000000..f0c0bc690e48 --- /dev/null +++ b/Documentation/core-api/rosebush.rst @@ -0,0 +1,121 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +======== +Rosebush +======== + +:Author: Matthew Wilcox + +Overview +======== + +Rosebush is a hashtable, different from the rhashtable. It is scalable +(one spinlock per bucket), resizing in two dimensions (number and size +of buckets), and concurrent (can be iterated under the RCU read lock). +It is designed to minimise dependent cache misses, which can stall a +modern CPU for thousands of instructions. + +Objects stored in a rosebush do not have an embedded linked list. +They can therefore be placed into the same rosebush multiple times and +be placed in multiple rosebushes. It is also possible to store pointers +which have special meaning like ERR_PTR(). It is not possible to store +a NULL pointer in a rosebush, as this is the return value that indicates +the iteration has finished. + +The user of the rosebush is responsible for calculating their own hash. +A high quality hash is desirable to keep the scalable properties of +the rosebush, but a hash with poor distribution in the lower bits will +not lead to a catastrophic breakdown. It may lead to excessive memory +consumption and a lot of CPU time spent during lookup. + +Rosebush is not yet IRQ or BH safe. It can be iterated in interrupt +context, but not modified. + +RCU Iteration +------------- + +There is no rosebush_lookup() function. This is because the hash value +may not be unique. Instead, the user should iterate the rosebush, +which will return pointers which probably have matching hash values. +It is the user's responsibility to determine if the returned pointer is +one they are interested in. + +Rosebush iteration guarantees to return all pointers which have a +matching hash, were in the rosebush before the iteration started and +remain in the rosebush after iteration ends. It may return additional +pointers, including pointers which do not have a matching hash value, +but it guarantees not to skip any pointers, and it guarantees to only +return pointers which have (at some point) been stored in the rosebush. + +If the rosebush is modified while the iteration is in progress, newly +added entries may or may not be returned and removed entries may or may +not be returned. Causality is not honoured; e.g. if Entry A is known +to be inserted before Entry B, it is possible for an iteration to return +Entry B and not Entry A. + +Functions and structures +======================== + +.. kernel-doc:: include/linux/rosebush.h +.. 
kernel-doc:: lib/rosebush.c + +Internals +========= + +The rosebush is organised into a top level table which contains pointers +to buckets. Each bucket contains a spinlock (for modifications to the +bucket), the number of entries in the bucket and a contention counter. + +The top level table is a power of two in size. The bottom M bits of the +hash are used to index into this table. The bucket contains hash values +followed by their corresponding pointers. We also track a contention +count, which lets us know if this spinlock is overloaded and we should +split the bucket to improve scalability. + +A bucket may be shared between multiple table entries. For simplicity, +we require that all buckets are shared between a power-of-two number +of slots. For example, a table with 8 entries might have entries that +point to buckets A, B, A, B, A, C, A, D. If we were to then split bucket +A, we would replace the first pair of pointers with pointers to bucket +E and the second pair with pointers to bucket F. This is akin to the +buddy page allocator. + +When we decide that the table needs to be resized, we allocate a new +table, and duplicate the current contents of the table into each half +of the new table. At this point, all buckets in the table are shared. +We then split the bucket that we're trying to insert into. + +IRQ / BH safety +--------------- + +If we decide to make the rosebush modifiable in IRQ context, we need +to take the locks in an irq-safe way; we need to figure out how to +allocate the top level table without vmalloc(), and we need to manage +without kvfree_rcu_mightsleep(). These all have solutions, but those +solutions have a cost that isn't worth paying until we have users. + +Some of those problems go away if we limit our support to removal in IRQ +context and only allow insertions in process context (as we do not need +to reallocate the table or bucket when removing an item). + +Small rosebushes +---------------- + +As an optimisation, if the rosebush has no entries, the buckets pointer +is NULL. If the rosebush has only a few entries, there are only two +buckets (allocated as a single allocation) and the table pointer points +directly to the first one instead of pointing to a table. + +Shrinking the rosebush +---------------------- + +We do not currently attempt either to join buckets or to shrink the table +if sufficiently many buckets are shared. If this proves to be a desirable +course of action, we can add support for it, with sufficient hysteresis +that adding/removing a single entry will not cause bouncing. + +Rosebush statistics +------------------- + +It would probably be wise to be able to gather statistics about bucket +occupancy rates, but this work has not yet been done. 
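[Editorial illustration, not part of the patch: a minimal sketch of how the
API documented above is meant to be used. It assumes the stored objects are
reference-counted or freed via RCU; struct foo, foo_table and the choice of
hash_long() for the hash are invented for the example.]

#include <linux/hash.h>
#include <linux/rcupdate.h>
#include <linux/rosebush.h>

struct foo {
	unsigned long key;
	/* ... payload ... */
};

static DEFINE_ROSEBUSH(foo_table);

static int foo_add(struct foo *p)
{
	/* The caller computes the hash; rosebush never hashes for you. */
	return rbh_insert(&foo_table, hash_long(p->key, 32), p);
}

static struct foo *foo_find(unsigned long key)
{
	RBH_ITER(iter, &foo_table, hash_long(key, 32));
	struct foo *p;

	rcu_read_lock();
	while ((p = rbh_next(&iter)) != NULL) {
		/* Hash values may collide, so filter the candidates. */
		if (p->key == key)
			break;
	}
	/* A real user would take a reference on p before dropping the lock. */
	rcu_read_unlock();

	return p;
}

static int foo_del(struct foo *p)
{
	return rbh_remove(&foo_table, hash_long(p->key, 32), p);
}

Because the iteration may return pointers whose hash does not match (see
"RCU Iteration" above), the explicit key comparison in foo_find() is not
optional.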
diff --git a/MAINTAINERS b/MAINTAINERS index a39c237edb95..d4a8e99919a4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19467,6 +19467,14 @@ F: include/net/rose.h F: include/uapi/linux/rose.h F: net/rose/ +ROSEBUSH DATA STRUCTURE +M: Matthew Wilcox +L: maple-tree@lists.infradead.org +S: Supported +F: Documentation/core-api/rosebush.rst +F: include/linux/rosebush.h +F: lib/*rosebush.c + ROTATION DRIVER FOR ALLWINNER A83T M: Jernej Skrabec L: linux-media@vger.kernel.org diff --git a/include/linux/rosebush.h b/include/linux/rosebush.h new file mode 100644 index 000000000000..57f4dfa3f93d --- /dev/null +++ b/include/linux/rosebush.h @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* See lib/rosebush.c */ + +#ifndef _LINUX_ROSEBUSH_H +#define _LINUX_ROSEBUSH_H + +#include + +/* + * Embed this struct in your struct, don't allocate it separately. + * None of this is for customers to use; treat it as opaque. + * In particular, taking the rbh_resize_lock will prevent only rbh_table + * from being reallocated; buckets can still be grown and split without + * the lock. But you will get incomprehensible lockdep warnings! + */ +struct rbh { + spinlock_t rbh_resize_lock; + unsigned long rbh_table; /* A tagged pointer */ +}; + +#define DEFINE_ROSEBUSH(name) struct rbh name = \ + { .rbh_resize_lock = __SPIN_LOCK_UNLOCKED(name.lock), } + +int rbh_insert(struct rbh *rbh, u32 hash, void *p); +int rbh_reserve(struct rbh *rbh, u32 hash); +int rbh_use(struct rbh *rbh, u32 hash, void *p); +int rbh_remove(struct rbh *rbh, u32 hash, void *p); +void rbh_destroy(struct rbh *rbh); + +/** + * rbh_release - Release a reserved slot in a rosebush. + * @rbh: The rosebush. + * @hash: The hash value that was reserved. + * + * If you decide that you do not need to use a reserved slot, call this + * function to release the slot. + * + * Return: 0 on success, -ENOENT if no reserved slot was found. + */ + +static inline int rbh_release(struct rbh *rbh, u32 hash) +{ + return rbh_remove(rbh, hash, NULL); +} + +struct rbh_iter { + struct rbh *rbh; + struct rbh_bucket *bucket; + u32 hash; + unsigned int index; +}; + +#define RBH_ITER(name, _rbh, _hash) \ + struct rbh_iter name = { .rbh = _rbh, .hash = _hash, } + +void *rbh_next(struct rbh_iter *rbhi); + +void rbh_iter_remove(struct rbh_iter *rbhi, void *p); +void rbh_iter_lock(struct rbh_iter *rbhi); +void rbh_iter_unlock(struct rbh_iter *rbhi); + +#endif diff --git a/lib/Makefile b/lib/Makefile index 3b1769045651..723e6c90b58d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -28,7 +28,7 @@ CFLAGS_string.o += -fno-stack-protector endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o timerqueue.o xarray.o \ + rosebush.o rbtree.o radix-tree.o timerqueue.o xarray.o \ maple_tree.o idr.o extable.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ diff --git a/lib/rosebush.c b/lib/rosebush.c new file mode 100644 index 000000000000..47106a04d11d --- /dev/null +++ b/lib/rosebush.c @@ -0,0 +1,654 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Rosebush, a resizing bucket hash table + * Copyright (c) 2024 Oracle Corporation + * Author: Matthew Wilcox + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * The lock is held whenever we are modifying the contents of the bucket. + * The contention counter tracks whether we need to split the bucket due + * to contention on the spinlock. 
+ * The bucket is 256 bytes in size (20 * 12 = 240, plus parent, lock, etc) + */ +#ifdef CONFIG_64BIT +#define RBH_ENTRIES 20 +#else +#define RBH_ENTRIES 30 +#endif + +struct rbh_bucket { + u32 rbh_hashes[RBH_ENTRIES]; + void __rcu *rbh_slots[RBH_ENTRIES]; + const struct rbh *rbh; + spinlock_t rbh_lock; + u32 rbh_contention; +}; + +struct rbh_table { + DECLARE_FLEX_ARRAY(struct rbh_bucket __rcu *, buckets); +}; + +struct rbh_initial_table { + struct rbh_bucket buckets[2]; +}; + +/* + * As far as lockdep is concerned, all buckets in the same rosebush use + * the same lock. We use the classes to distinguish the rbh resize lock + * from the bucket locks. The only time we take two bucket locks is + * when we already hold the resize lock, so there is no need to define + * an order between bucket locks. + */ +#ifdef CONFIG_DEBUG_SPINLOCK +#define bucket_lock_init(rbh, bucket) \ + __raw_spin_lock_init(spinlock_check(&(bucket)->rbh_lock), \ + "rbh", (rbh)->rbh_resize_lock.dep_map.key, LD_WAIT_SPIN) +#else +#define bucket_lock_init(rbh, bucket) \ + spin_lock_init(&(bucket)->rbh_lock) +#endif + +#define rbh_resize_lock(rbh) spin_lock_nested(&(rbh)->rbh_resize_lock, 2) +#define rbh_resize_unlock(rbh) spin_unlock(&(rbh)->rbh_resize_lock) + +struct table_mask { + struct rbh_table *table; + u32 mask; +}; + +/* + * A very complicated way of spelling &rbh->bucket[hash]. + * + * The first complication is that we encode the number of buckets + * in the pointer so that we can get both in an atomic load. + * + * The second complication is that small hash tables don't have a top + * level table; instead the buckets pointer points to a pair of buckets. + */ +static struct rbh_bucket *get_bucket(struct rbh *rbh, u32 hash, + struct table_mask *state) +{ + unsigned long tagged; + struct rbh_initial_table *initial_table; + unsigned int bnr; + + /* rcu_dereference(), but not a pointer */ + tagged = READ_ONCE(rbh->rbh_table); + if (!tagged) + return NULL; + + /* The lowest bits indicates how many buckets the table holds */ + state->table = (struct rbh_table *)(tagged & (tagged + 1)); + state->mask = tagged - (unsigned long)state->table; + bnr = hash & state->mask; + if (state->mask != 1) + return rcu_dereference(state->table->buckets[bnr]); + + initial_table = (struct rbh_initial_table *)state->table; + return &initial_table->buckets[bnr]; +} + +static struct rbh_bucket *lock_bucket(struct rbh *rbh, u32 hash) + __acquires(&bucket->rbh_lock) +{ + struct rbh_bucket *bucket, *new_bucket; + struct table_mask state; + + bucket = get_bucket(rbh, hash, &state); + if (!bucket) + return bucket; +again: + spin_lock(&bucket->rbh_lock); + new_bucket = get_bucket(rbh, hash, &state); + if (bucket == new_bucket) + return bucket; + spin_unlock(&bucket->rbh_lock); + bucket = new_bucket; + goto again; +} + +static bool rbh_first(struct rbh *rbh, u32 hash) +{ + struct rbh_initial_table *table; + int i; + +//printk("%s: table size %zd\n", __func__, sizeof(*table)); + table = kmalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return false; + + rbh_resize_lock(rbh); + if (rbh->rbh_table) { + rbh_resize_unlock(rbh); + /* Somebody else resized it for us */ + kfree(table); + return true; + } + + bucket_lock_init(rbh, &table->buckets[0]); + table->buckets[0].rbh = rbh; + table->buckets[0].rbh_contention = 0; + bucket_lock_init(rbh, &table->buckets[1]); + table->buckets[1].rbh = rbh; + table->buckets[1].rbh_contention = 0; + for (i = 0; i < RBH_ENTRIES; i++) { + table->buckets[0].rbh_hashes[i] = ~0; + table->buckets[1].rbh_hashes[i] = 0; + } 
+ /* rcu_assign_pointer() but not a pointer */ + smp_store_release(&rbh->rbh_table, (unsigned long)table | 1); + rbh_resize_unlock(rbh); + +//printk("%s: new table = %px\n", __func__, table); + return true; +} + +static void copy_initial_buckets(const struct rbh *rbh, + struct rbh_table *table, struct rbh_initial_table *init_table) + __acquires(&init_table->buckets[0].rbh_lock) + __acquires(&init_table->buckets[1].rbh_lock) +{ + struct rbh_bucket *bucket; + + bucket = (void __force *)table->buckets[0]; + spin_lock(&init_table->buckets[0].rbh_lock); + memcpy(bucket, &init_table->buckets[0], sizeof(init_table->buckets[0])); + bucket_lock_init(rbh, bucket); + + bucket = (void __force *)table->buckets[1]; + spin_lock_nested(&init_table->buckets[1].rbh_lock, 1); + memcpy(bucket, &init_table->buckets[1], sizeof(init_table->buckets[1])); + bucket_lock_init(rbh, bucket); +} + +/* + * When we grow the table, we duplicate the bucket pointers so this + * thread doesn't pay the entire cost of growing the table. + */ +static int rbh_grow_table(struct rbh *rbh, u32 hash, struct table_mask *state) + __releases(RCU) + __acquires(RCU) +{ + struct rbh_table *table; + struct rbh_initial_table *init_table; + u32 mask = state->mask; + unsigned long tagged = (unsigned long)state->table | mask; + size_t size; + + rcu_read_unlock(); + + size = (mask + 1) * 2 * sizeof(void *); + if (size > 4 * PAGE_SIZE) + /* XXX: NUMA_NO_NODE doesn't necessarily interleave */ + table = __vmalloc_node(size, size, GFP_KERNEL, NUMA_NO_NODE, + &table); + else + table = kvmalloc(size, GFP_KERNEL); + if (!table) { + rcu_read_lock(); + /* Maybe somebody resized it for us */ + if (READ_ONCE(rbh->rbh_table) != tagged) + return 0; + return -ENOMEM; + } + BUG_ON((unsigned long)table & (size - 1)); + + if (mask == 1) { + /* Don't need to bother with RCU until we publish the table */ + table->buckets[0] = (void __rcu *)kmalloc(sizeof(struct rbh_bucket), GFP_KERNEL); + if (!table->buckets[0]) + goto free_all; + table->buckets[1] = (void __rcu *)kmalloc(sizeof(struct rbh_bucket), GFP_KERNEL); + if (!table->buckets[1]) + goto free_all; + } + + rbh_resize_lock(rbh); + if (rbh->rbh_table != tagged) { + rbh_resize_unlock(rbh); + /* Somebody else resized it for us */ + kvfree(table); + rcu_read_lock(); + return 0; + } + +//printk("%s: replacing table %p with table %p mask %d\n", __func__, state->table, table, mask); + if (mask == 1) { + init_table = (void *)state->table; + copy_initial_buckets(rbh, table, init_table); + } else { + memcpy(&table->buckets, &state->table->buckets, + (mask + 1) * sizeof(void *)); + } + memcpy(&table->buckets[mask + 1], &table->buckets[0], + (mask + 1) * sizeof(void *)); + + tagged = ((unsigned long)table) | (mask << 1) | 1; + /* rcu_assign_pointer() but not a pointer */ + smp_store_release(&rbh->rbh_table, tagged); + rbh_resize_unlock(rbh); + if (mask == 1) { + spin_unlock(&init_table->buckets[0].rbh_lock); + spin_unlock(&init_table->buckets[1].rbh_lock); + } + kvfree_rcu_mightsleep(state->table); + + rcu_read_lock(); + return 0; +free_all: + kfree((void __force *)table->buckets[0]); + kvfree(table); + rcu_read_lock(); + return -ENOMEM; +} + +static void bucket_copy(const struct rbh *rbh, struct rbh_bucket *buckets[2], + const struct rbh_bucket *old_bucket, u32 hash, u32 mask) +{ + unsigned int i; + unsigned int js[2] = {0, 0}; + + bucket_lock_init(rbh, buckets[0]); + buckets[0]->rbh_contention = 0; + buckets[0]->rbh = rbh; + bucket_lock_init(rbh, buckets[1]); + buckets[1]->rbh_contention = 0; + buckets[1]->rbh = rbh; + 
for (i = 0; i < RBH_ENTRIES; i++) { + unsigned int nr = !!(old_bucket->rbh_hashes[i] & mask); + buckets[nr]->rbh_hashes[js[nr]] = old_bucket->rbh_hashes[i]; + buckets[nr]->rbh_slots[js[nr]++] = old_bucket->rbh_slots[i]; + } +//printk("%s: bucket:%p copied %d entries from %p hash:%x mask:%x\n", __func__, buckets[0], js[0], old_bucket, hash, mask); +//printk("%s: bucket:%p copied %d entries from %p hash:%x mask:%x\n", __func__, buckets[1], js[1], old_bucket, hash, mask); + + /* Fill the new buckets with deleted entries */ + while (js[0] < RBH_ENTRIES) + buckets[0]->rbh_hashes[js[0]++] = ~hash; + while (js[1] < RBH_ENTRIES) + buckets[1]->rbh_hashes[js[1]++] = ~hash; +} + +#define rbh_dereference_protected(p, rbh) \ + rcu_dereference_protected(p, lockdep_is_held(&(rbh)->rbh_resize_lock)) + +static int rbh_split_bucket(struct rbh *rbh, struct rbh_bucket *bucket, + u32 hash, struct table_mask *state) +{ + unsigned long tagged; + u32 bit, mask; + struct rbh_table *table; + struct rbh_bucket *buckets[2]; + int err = 0; + + if (state->mask == 1) { + err = rbh_grow_table(rbh, hash, state); + } else { + u32 buddy = (hash & state->mask) ^ ((state->mask + 1) / 2); + if (rcu_dereference(state->table->buckets[buddy]) != bucket) + err = rbh_grow_table(rbh, hash, state); + } + if (err < 0) + return err; + + rcu_read_unlock(); + + /* XXX: use slab */ + buckets[0] = kmalloc(sizeof(*bucket), GFP_KERNEL); + if (!buckets[0]) + return -ENOMEM; + buckets[1] = kmalloc(sizeof(*bucket), GFP_KERNEL); + if (!buckets[1]) { + kfree(buckets[0]); + return -ENOMEM; + } + +//printk("%s: adding buckets %p %p for hash %d\n", __func__, buckets[0], buckets[1], hash); + rbh_resize_lock(rbh); + tagged = rbh->rbh_table; + table = (struct rbh_table *)(tagged & (tagged + 1)); + mask = tagged - (unsigned long)table; + hash &= mask; + if (rbh_dereference_protected(table->buckets[hash], rbh) != bucket) + goto free; + + /* Figure out which entries we need to take to the new bucket */ + bit = mask + 1; + while (bit > 1) { + bit /= 2; +//printk("hash:%d buddy:%d\n", hash, hash ^ bit); + if (rbh_dereference_protected(table->buckets[hash ^ bit], rbh) + != bucket) + break; + } + bit *= 2; + if (bit == mask + 1) + goto free; + + spin_lock(&bucket->rbh_lock); + bucket_copy(rbh, buckets, bucket, hash, bit); + +//printk("hash:%d mask:%d bit:%d\n", hash, mask, bit); + hash &= (bit - 1); + do { +//printk("assigning bucket %p to index %d\n", buckets[0], hash); + rcu_assign_pointer(table->buckets[hash], buckets[0]); + hash += bit; +//printk("assigning bucket %p to index %d\n", buckets[1], hash); + rcu_assign_pointer(table->buckets[hash], buckets[1]); + hash += bit; + } while (hash < mask); +//printk("owner:%px\n", bucket->rbh_lock.owner) + spin_unlock(&bucket->rbh_lock); + rbh_resize_unlock(rbh); + kvfree_rcu_mightsleep(bucket); + + return 0; +free: + rbh_resize_unlock(rbh); +//printk("%s: freeing bucket %p\n", __func__, bucket); + kfree(buckets[0]); + kfree(buckets[1]); + + return 0; +} + +static int __rbh_insert(struct rbh *rbh, u32 hash, void *p) +{ + struct table_mask state; + struct rbh_bucket *bucket, *new_bucket; + unsigned int i; + int err; + +restart: + rcu_read_lock(); + bucket = get_bucket(rbh, hash, &state); + if (unlikely(!bucket)) { + rcu_read_unlock(); + if (!rbh_first(rbh, hash)) + return -ENOMEM; + goto restart; + } + +again: + if (spin_trylock(&bucket->rbh_lock)) { + if (bucket->rbh_contention) + bucket->rbh_contention--; + } else { + spin_lock(&bucket->rbh_lock); + /* Numbers chosen ad-hoc */ + bucket->rbh_contention += 10; + if 
(unlikely(bucket->rbh_contention > 5000)) { + spin_unlock(&bucket->rbh_lock); + /* OK if this fails; it's only contention */ + rbh_split_bucket(rbh, bucket, hash, &state); + + bucket = get_bucket(rbh, hash, &state); + spin_lock(&bucket->rbh_lock); + } + } + + new_bucket = get_bucket(rbh, hash, &state); + if (bucket != new_bucket) { + spin_unlock(&bucket->rbh_lock); + bucket = new_bucket; + goto again; + } + + /* Deleted elements differ in their bottom bit */ + for (i = 0; i < RBH_ENTRIES; i++) { + u32 bhash = bucket->rbh_hashes[i]; + + if ((bhash & 1) == (hash & 1)) + continue; + rcu_assign_pointer(bucket->rbh_slots[i], p); + /* This array is read under RCU */ + WRITE_ONCE(bucket->rbh_hashes[i], hash); + + spin_unlock(&bucket->rbh_lock); + rcu_read_unlock(); + return 0; + } + + /* No space in this bucket */ + spin_unlock(&bucket->rbh_lock); + + err = rbh_split_bucket(rbh, bucket, hash, &state); + rcu_read_unlock(); + if (err) + return err; + goto restart; +} + +/** + * rbh_insert - Add a pointer to a rosebush. + * @rbh: The rosebush. + * @hash: The hash value for this pointer. + * @p: The pointer to add. + * + * Return: 0 on success, -ENOMEM if memory allocation fails, + * -EINVAL if @p is NULL. + */ +int rbh_insert(struct rbh *rbh, u32 hash, void *p) +{ + if (p == NULL) + return -EINVAL; + return __rbh_insert(rbh, hash, p); +} +EXPORT_SYMBOL(rbh_insert); + +/** + * rbh_remove - Remove a pointer from a rosebush. + * @rbh: The rosebush. + * @hash: The hash value for this pointer. + * @p: The pointer to remove. + * + * Return: 0 on success, -ENOENT if this pointer could not be found. + */ +int rbh_remove(struct rbh *rbh, u32 hash, void *p) +{ + struct rbh_bucket *bucket; + unsigned int i; + int err = -ENOENT; + + rcu_read_lock(); + bucket = lock_bucket(rbh, hash); + if (!bucket) + goto rcu_unlock; + + for (i = 0; i < RBH_ENTRIES; i++) { + if (bucket->rbh_hashes[i] != hash) + continue; + if (rcu_dereference_protected(bucket->rbh_slots[i], + lockdep_is_held(&bucket->rbh_lock)) != p) + continue; + bucket->rbh_hashes[i] = ~hash; + /* Do not modify the slot */ + err = 0; + break; + } + + spin_unlock(&bucket->rbh_lock); +rcu_unlock: + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(rbh_remove); + +/** + * rbh_reserve - Reserve a slot in a rosebush for later use. + * @rbh: The rosebush. + * @hash: The hash value that will be used. + * + * Some callers need to take another lock before inserting an object + * into the rosebush. This function reserves space for them to do that. + * A subsequent call to rbh_use() will not allocate memory. If you find + * that you do not need the reserved space any more, call rbh_remove(), + * passing NULL as the pointer. + * + * Return: 0 on success, -ENOMEM on failure. + */ +int rbh_reserve(struct rbh *rbh, u32 hash) +{ + return __rbh_insert(rbh, hash, NULL); +} +EXPORT_SYMBOL(rbh_reserve); + +/** + * rbh_use - Use a reserved slot in a rosebush. + * @rbh: The rosebush. + * @hash: The hash value for this pointer. + * @p: The pointer to add. + * + * Return: 0 on success, -EINVAL if @p is NULL, + * -ENOENT if no reserved slot could be found. 
+ */ +int rbh_use(struct rbh *rbh, u32 hash, void *p) +{ + struct rbh_bucket *bucket; + unsigned int i; + int err = -ENOENT; + + rcu_read_lock(); + bucket = lock_bucket(rbh, hash); + if (!bucket) + goto rcu_unlock; + + for (i = 0; i < RBH_ENTRIES; i++) { + if (bucket->rbh_hashes[i] != hash) + continue; + if (bucket->rbh_slots[i] != NULL) + continue; + rcu_assign_pointer(bucket->rbh_slots[i], p); + err = 0; + break; + } + + spin_unlock(&bucket->rbh_lock); +rcu_unlock: + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(rbh_use); + +/** + * rbh_next - Find the next entry matching this hash + * @rbhi: The rosebush iterator. + * + * Return: NULL if there are no more matching hash values, otherwise + * the next pointer. + */ +void *rbh_next(struct rbh_iter *rbhi) +{ + struct table_mask state; + struct rbh_bucket *bucket = rbhi->bucket; + void *p; + + if (!bucket) { + bucket = get_bucket(rbhi->rbh, rbhi->hash, &state); + if (!bucket) + return NULL; + rbhi->bucket = bucket; + rbhi->index = UINT_MAX; + } + + while (++rbhi->index < RBH_ENTRIES) { + if (READ_ONCE(bucket->rbh_hashes[rbhi->index]) != rbhi->hash) + continue; + p = rcu_dereference(bucket->rbh_slots[rbhi->index]); + if (p) + return p; + } + + return NULL; +} +EXPORT_SYMBOL(rbh_next); + +/** + * rbh_destroy - Destroy a rosebush + * @rbh: The rosebush to destroy + * + * The caller must ensure that no other threads will simultaneously + * attempt to use the rosebush. + */ +void rbh_destroy(struct rbh *rbh) +{ + unsigned long tagged = rbh->rbh_table; + struct rbh_table *table = (struct rbh_table *)(tagged & (tagged + 1)); + u32 mask = tagged - (unsigned long)table; + u32 i, j, k; + + for (i = 0; i <= mask; i++) { + struct rbh_bucket __rcu *bucket; + + bucket = table->buckets[i]; + if (!bucket) + continue; + kfree((void __force *)bucket); + for (j = (mask + 1) / 2; j > 0; j /= 2) { + if (table->buckets[i ^ j] == bucket) + continue; + j *= 2; + break; + } + k = i; + do { + table->buckets[k] = NULL; + k += j; + } while (k < mask && table->buckets[k] == bucket); + } + + kfree(table); + rbh->rbh_table = 0; +} + +#ifndef __KERNEL__ +static void dump_bucket(struct rbh_bucket *bucket, unsigned bnr) +{ + unsigned i; + + printk("bucket:%p index:%d rbh:%p\n", bucket, bnr, bucket->rbh); + for (i = 0; i < RBH_ENTRIES; i++) { + printk("hash:%x entry:%p (%x)\n", bucket->rbh_hashes[i], + bucket->rbh_slots[i], bucket->rbh_hashes[i] * 2 + 1); + } +} + +static void rbh_dump(struct rbh *rbh) +{ + unsigned long tagged = READ_ONCE(rbh->rbh_table); + struct rbh_table *table; + u32 mask, i; + + table = (struct rbh_table *)(tagged & (tagged + 1)); + mask = tagged - (unsigned long)table; + + printk("rosebush %p has %d buckets in table %p\n", rbh, mask + 1, table); + + if (mask == 1) { + dump_bucket((struct rbh_bucket *)table, 0); + dump_bucket((struct rbh_bucket *)table + 1, 1); + } else for (i = 0; i <= mask; i++) + dump_bucket(table->buckets[i], i); +} +#endif + +/* + * TODO: + * * convert the dcache + * * 2 byte hashes in the bucket. Once the table has 2^17 buckets, we can + * use 10 bytes per entry instead of 12 (24 entries/bucket instead of 20) + * * 1 byte hashes in the bucket. 
Once the table has 2^25 buckets, we can
+ *   use 9 bytes per entry instead of 10 (26 entries/bucket instead of 24)
+ */
From patchwork Tue Jun 25 21:17:58 2024
From: "Matthew Wilcox (Oracle)" 
To: linux-kernel@vger.kernel.org
Cc: "Matthew Wilcox (Oracle)" , linux-fsdevel@vger.kernel.org, maple-tree@lists.infradead.org
Subject: [PATCH v2 3/5] rosebush: Add test suite
Date: Tue, 25 Jun 2024 22:17:58 +0100
Message-ID: <20240625211803.2750563-4-willy@infradead.org>
In-Reply-To: <20240625211803.2750563-1-willy@infradead.org>
References: <20240625211803.2750563-1-willy@infradead.org>

This is not a very sophisticated test suite yet, but it helped find a
few bugs and provides a framework for adding more tests as more bugs
are found.
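[Editorial illustration, not part of the patch: the framework mentioned above
grows by adding a function and a KUNIT_CASE() entry to rosebush_cases[] in the
diff below; this "duplicate" case is hypothetical.]

/* Hypothetical extra case: the same pointer may be inserted more than
 * once, since rosebush entries carry no embedded list node.
 */
static void duplicate(struct kunit *test)
{
	void *p = (void *)0x5UL;	/* any non-NULL pointer will do */

	KUNIT_EXPECT_EQ(test, rbh_insert(&rosebush, 99, p), 0);
	KUNIT_EXPECT_EQ(test, rbh_insert(&rosebush, 99, p), 0);
	KUNIT_EXPECT_EQ(test, rbh_remove(&rosebush, 99, p), 0);
	KUNIT_EXPECT_EQ(test, rbh_remove(&rosebush, 99, p), 0);
}

/* ... plus KUNIT_CASE(duplicate) appended to rosebush_cases[]. */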
Signed-off-by: Matthew Wilcox (Oracle) --- lib/Kconfig.debug | 3 + lib/Makefile | 1 + lib/test_rosebush.c | 140 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+) create mode 100644 lib/test_rosebush.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 59b6765d86b8..f3cfd79d8dbd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2447,6 +2447,9 @@ config TEST_RHASHTABLE If unsure, say N. +config TEST_ROSEBUSH + tristate "Test the Rosebush data structure" + config TEST_IDA tristate "Perform selftest on IDA functions" diff --git a/lib/Makefile b/lib/Makefile index 723e6c90b58d..de4edefc2c11 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -76,6 +76,7 @@ obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_MIN_HEAP) += test_min_heap.o obj-$(CONFIG_TEST_LKM) += test_module.o obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o +obj-$(CONFIG_TEST_ROSEBUSH) += test_rosebush.o obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o obj-$(CONFIG_TEST_SORT) += test_sort.o obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o diff --git a/lib/test_rosebush.c b/lib/test_rosebush.c new file mode 100644 index 000000000000..59c342e7a5b3 --- /dev/null +++ b/lib/test_rosebush.c @@ -0,0 +1,140 @@ +#include +#include + +static void iter_rbh(struct kunit *test, struct rbh *rbh, u32 hash, void *p) +{ + RBH_ITER(iter, rbh, hash); + void *q; + + rcu_read_lock(); + q = rbh_next(&iter); + KUNIT_EXPECT_PTR_EQ_MSG(test, p, q, + "rbh_next hash:%u returned %px, expected %px", hash, q, p); + q = rbh_next(&iter); + KUNIT_EXPECT_PTR_EQ_MSG(test, NULL, q, + "rbh_next hash:%u returned %px, expected NULL", hash, q); + rcu_read_unlock(); +} + +static void check_empty_rbh(struct kunit *test, struct rbh *rbh) +{ + iter_rbh(test, rbh, 0, NULL); + iter_rbh(test, rbh, 1, NULL); + iter_rbh(test, rbh, 17, NULL); + iter_rbh(test, rbh, 42, NULL); +} + +static void test_insert(struct kunit *test, struct rbh *rbh, u32 hash) +{ + void *p = (void *)((hash << 1) | 1UL); + int err; + + err = rbh_insert(rbh, hash, p); + KUNIT_EXPECT_EQ(test, err, 0); + + iter_rbh(test, rbh, hash, p); +} + +static void test_reserve(struct kunit *test, struct rbh *rbh, u32 hash) +{ + int err; + + err = rbh_reserve(rbh, hash); + KUNIT_EXPECT_EQ(test, err, 0); + + iter_rbh(test, rbh, hash, NULL); +} + +static void test_use(struct kunit *test, struct rbh *rbh, u32 hash) +{ + void *p = (void *)((hash << 1) | 1UL); + int err; + + err = rbh_use(rbh, hash, p); + KUNIT_EXPECT_EQ(test, err, 0); + + iter_rbh(test, rbh, hash, p); +} + +static void test_remove(struct kunit *test, struct rbh *rbh, u32 hash) +{ + void *p = (void *)((hash << 1) | 1UL); + int err; + + err = rbh_remove(rbh, hash, p); + KUNIT_EXPECT_EQ(test, err, 0); + + iter_rbh(test, rbh, hash, NULL); +} + +static DEFINE_ROSEBUSH(rosebush); + +/* + * Conduct a number of tests on a rosebush that has never been used. + * They should all return NULL or an errno. We're looking for crashes + * here. 
+ */ +static void empty(struct kunit *test) +{ + int err; + + check_empty_rbh(test, &rosebush); + err = rbh_remove(&rosebush, 0, test); + KUNIT_EXPECT_EQ(test, err, -ENOENT); + err = rbh_use(&rosebush, 0, test); + KUNIT_EXPECT_EQ(test, err, -ENOENT); + KUNIT_EXPECT_EQ(test, rosebush.rbh_table, 0); +} + +static void first(struct kunit *test) +{ + int err; + + test_insert(test, &rosebush, 5); + check_empty_rbh(test, &rosebush); + test_remove(test, &rosebush, 5); + check_empty_rbh(test, &rosebush); + + err = rbh_remove(&rosebush, 5, NULL); + KUNIT_EXPECT_EQ(test, err, -ENOENT); + test_reserve(test, &rosebush, 5); + err = rbh_remove(&rosebush, 5, test); + KUNIT_EXPECT_EQ(test, err, -ENOENT); + err = rbh_remove(&rosebush, 5, NULL); + KUNIT_EXPECT_EQ(test, err, 0); + err = rbh_remove(&rosebush, 5, NULL); + KUNIT_EXPECT_EQ(test, err, -ENOENT); + + test_reserve(test, &rosebush, 5); + test_use(test, &rosebush, 5); + err = rbh_remove(&rosebush, 5, NULL); + KUNIT_EXPECT_EQ(test, err, -ENOENT); + test_remove(test, &rosebush, 5); +} + +static void grow(struct kunit *test) +{ + int i; + + for (i = 3; i < 3333; i += 2) + test_insert(test, &rosebush, i); + + rbh_destroy(&rosebush); +} + +static struct kunit_case rosebush_cases[] __refdata = { + KUNIT_CASE(empty), + KUNIT_CASE(first), + KUNIT_CASE(grow), + {} +}; + +static struct kunit_suite rosebush_suite = { + .name = "rosebush", + .test_cases = rosebush_cases, +}; + +kunit_test_suite(rosebush_suite); + +MODULE_AUTHOR("Matthew Wilcox (Oracle) "); +MODULE_LICENSE("GPL"); From patchwork Tue Jun 25 21:17:59 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Matthew Wilcox (Oracle)" X-Patchwork-Id: 13712064 Received: from casper.infradead.org (casper.infradead.org [90.155.50.34]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6E989176AAF; Tue, 25 Jun 2024 21:18:10 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=90.155.50.34 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719350293; cv=none; b=EPPValsNzz8UNx5UNqyiYy18swANRopNgnx2apRvIZ+oIXBV2uFMxN+uB9cYxUjNeV2NoISn8K82KuU0KJTq8TjcOO07D7BqxIc+8+5KUS7yvVDxVmsW4bbCor6YVOb8dvs+AaablVb5TYwD3KMdw3StXdJzGYD4okcUiwfCn1Q= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719350293; c=relaxed/simple; bh=z22rC2d+tdG992WQD+/IlVFv4wF2WVaOL0fVpFvwpBs=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Vb0mr09c88NSaopy15pZ175fkM5gm/1yltzgGoB9Cq8D0RArm3VyRLZu4COk2VOYn9UbbrUTIs9HSazI3IXfLPgpg2/dOWzGbi7jYNXfUuR0iCURsdY4GI4PDc0ErXeZ6siLnj+Uedj9Tiwz60LzuKTt01X4AYFqVTsDslLszJ0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=infradead.org; spf=none smtp.mailfrom=infradead.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b=lagVaS03; arc=none smtp.client-ip=90.155.50.34 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=infradead.org Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=infradead.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b="lagVaS03" DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=infradead.org; s=casper.20170209; h=Content-Transfer-Encoding:MIME-Version: 
References:In-Reply-To:Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To: Content-Type:Content-ID:Content-Description; bh=xaYHyg9fqZq3nYOQo5wO57AGN36TvtPlQVk14puk+DA=; b=lagVaS03mlmoCwvSrG6D+LUG2+ Fi55kw6uylROLhR/b3asWKar4z1DVfPI8Zx6eDePs065Aw2GJbU27/jXON88nl8ly2UFNzFMdyhq6 xTFAmVMHRX1eaXc7E6kr82Ajx4DFJnY3CJ7ihI9Pd0OhdowQEbi2q1kpQdaKWFIlTnN4CHIHEDDzL Se0Kh1j4fHy1a0QowgVfVWo9xFpFIqCzZR98LoKFyACcY7CrqGQwia1FtNalh4ZAIDFryr/Ei058o N/EIBYRHlOp4HycUSP75Jl20rW784nZ/HnkbYkyFPBbZYUE95akF2oceXMqy8WKuLf/cPCotiHCkR bMQscKvA==; Received: from willy by casper.infradead.org with local (Exim 4.97.1 #2 (Red Hat Linux)) id 1sMDYG-0000000BXYc-1UQV; Tue, 25 Jun 2024 21:18:08 +0000 From: "Matthew Wilcox (Oracle)" To: linux-kernel@vger.kernel.org Cc: "Matthew Wilcox (Oracle)" , linux-fsdevel@vger.kernel.org, maple-tree@lists.infradead.org Subject: [PATCH v2 4/5] tools: Add support for running rosebush tests in userspace Date: Tue, 25 Jun 2024 22:17:59 +0100 Message-ID: <20240625211803.2750563-5-willy@infradead.org> X-Mailer: git-send-email 2.45.1 In-Reply-To: <20240625211803.2750563-1-willy@infradead.org> References: <20240625211803.2750563-1-willy@infradead.org> Precedence: bulk X-Mailing-List: linux-fsdevel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Enable make -C tools/testing/radix-tree. Much easier to debug than an in-kernel module. Signed-off-by: Matthew Wilcox (Oracle) --- tools/include/linux/rosebush.h | 1 + tools/testing/radix-tree/.gitignore | 1 + tools/testing/radix-tree/Makefile | 6 ++++- tools/testing/radix-tree/kunit/test.h | 20 +++++++++++++++ tools/testing/radix-tree/rosebush.c | 36 +++++++++++++++++++++++++++ 5 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 tools/include/linux/rosebush.h create mode 100644 tools/testing/radix-tree/kunit/test.h create mode 100644 tools/testing/radix-tree/rosebush.c diff --git a/tools/include/linux/rosebush.h b/tools/include/linux/rosebush.h new file mode 100644 index 000000000000..3f12f4288250 --- /dev/null +++ b/tools/include/linux/rosebush.h @@ -0,0 +1 @@ +#include "../../../include/linux/rosebush.h" diff --git a/tools/testing/radix-tree/.gitignore b/tools/testing/radix-tree/.gitignore index 49bccb90c35b..fb154f26bdab 100644 --- a/tools/testing/radix-tree/.gitignore +++ b/tools/testing/radix-tree/.gitignore @@ -9,3 +9,4 @@ radix-tree.c xarray maple ma_xa_benchmark +rosebush diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index 7527f738b4a1..982ff4b7fdeb 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -4,7 +4,7 @@ CFLAGS += -I. -I../../include -I../../../lib -g -Og -Wall \ -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined LDFLAGS += -fsanitize=address -fsanitize=undefined LDLIBS+= -lpthread -lurcu -TARGETS = main idr-test multiorder xarray maple +TARGETS = main idr-test multiorder xarray maple rosebush CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o bitmap.o \ slab.o maple.o OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \ @@ -36,6 +36,8 @@ xarray: $(CORE_OFILES) maple: $(CORE_OFILES) +rosebush: $(CORE_OFILES) + multiorder: multiorder.o $(CORE_OFILES) clean: @@ -62,6 +64,8 @@ xarray.o: ../../../lib/xarray.c ../../../lib/test_xarray.c maple.o: ../../../lib/maple_tree.c ../../../lib/test_maple_tree.c +rosebush.o: ../../../lib/rosebush.c ../../../lib/test_rosebush.c + generated/map-shift.h: @if ! 
grep -qws $(SHIFT) generated/map-shift.h; then \ echo "#define XA_CHUNK_SHIFT $(SHIFT)" > \ diff --git a/tools/testing/radix-tree/kunit/test.h b/tools/testing/radix-tree/kunit/test.h new file mode 100644 index 000000000000..0805e3695762 --- /dev/null +++ b/tools/testing/radix-tree/kunit/test.h @@ -0,0 +1,20 @@ +struct kunit { +}; + +struct kunit_case { + void (*run_case)(struct kunit *test); +}; + +struct kunit_suite { + char *name; + struct kunit_case *test_cases; +}; + +#define KUNIT_CASE(test_name) { .run_case = test_name, } +#define kunit_test_suite(x) + +#define KUNIT_EXPECT_EQ(test, left, right) \ + KUNIT_EXPECT_PTR_EQ_MSG(test, left, right, NULL) +#define KUNIT_EXPECT_PTR_EQ_MSG(test, left, right, fmt, ...) \ + assert(left == right) + diff --git a/tools/testing/radix-tree/rosebush.c b/tools/testing/radix-tree/rosebush.c new file mode 100644 index 000000000000..51703737833e --- /dev/null +++ b/tools/testing/radix-tree/rosebush.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * rosebush.c: Userspace testing for rosebush test-suite + * Copyright (c) 2024 Oracle Corporation + * Author: Matthew Wilcox + */ + +#include "test.h" +#include +#include + +#define module_init(x) +#define module_exit(x) +#define MODULE_AUTHOR(x) +#define MODULE_LICENSE(x) +#define dump_stack() assert(0) + +#include "../../../lib/rosebush.c" +#include "../../../lib/test_rosebush.c" + +int __weak main(void) +{ + struct kunit test; + int i; + + assert(rosebush_suite.test_cases == rosebush_cases); + + for (i = 0; i < ARRAY_SIZE(rosebush_cases); i++) { + if (!rosebush_cases[i].run_case) + continue; + printf("i = %d %p\n", i, rosebush_cases[i].run_case); + rosebush_cases[i].run_case(&test); + } + + return 0; +} From patchwork Tue Jun 25 21:18:00 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Matthew Wilcox (Oracle)" X-Patchwork-Id: 13712063 Received: from casper.infradead.org (casper.infradead.org [90.155.50.34]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 935AD17CA16; Tue, 25 Jun 2024 21:18:10 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=90.155.50.34 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719350292; cv=none; b=XfpHFwdcSvFcXIvFVoCA1CbjzD7Is5nurT7aE/h3cRnaSfSG55hS+IZ/eMoB9RP5br7bti/nYbF2tNPxC71gHRuYu0w8ihbSY6bE/zC0be3a99RjsplakDjXNhk0h48PjkmqgihV11YE9ILnFOEFqXhYt621qWQ0Faap+Ms1ixY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1719350292; c=relaxed/simple; bh=h47z6HO+e2qPPToeoc9r/aR+oTeUb2HZTNOhNNGhHtk=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=GA+ZRQQlQOVNckILN+wjOAuLvEeaa2Lr7O3wwjvRGW7tGj9dMfbZ3JR8hrReW3Z0a80Mgpm3ejCLxnkeuoAZJfVKB101cAS/nlUxLnnVGm6aOgj/Gk6pmAPRit5q4YLdbk+UdMBXdOVfOyFmph2FabMyO0EpLJ60t7pfYxCpcFI= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=infradead.org; spf=none smtp.mailfrom=infradead.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b=ZeepE5g6; arc=none smtp.client-ip=90.155.50.34 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=infradead.org Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=infradead.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=infradead.org 
header.i=@infradead.org header.b="ZeepE5g6" DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=infradead.org; s=casper.20170209; h=Content-Transfer-Encoding:MIME-Version: References:In-Reply-To:Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To: Content-Type:Content-ID:Content-Description; bh=yWCNwPRRR/3MDBLoGzNNPVXBUmyUwBDeOmRQI2p75TA=; b=ZeepE5g6cH2FXhn8x08wOCbkto oowhRwK0grOPj8Hbb5ak2SAYrWuhjrc0lzmzO/4ysWZDwYfbnPvSZCniqq6Ydmtiztwn4Dvsg5kTg I3EHIcIX+0fNikUjrXJDA/3ZahfV+32UVxInV/3A5Ptx1gsnxlfZcmDatrMENp3kzlBLA4dbD4YHF 6/Bb/EHtKYq08SnzREQZWlPDeazqBl8CgJHqJrTN2roi6iUFi0prEKYAhov5W/KjZ4HPQThBtN/zR 3riO4XBYoDmHdgaeqfO/pBioX0mRilzSsjRHSA4IaI6bdW3WKjc9cpvbk7ITb6gyfgrDGfNhKdYJR WS+paHnQ==; Received: from willy by casper.infradead.org with local (Exim 4.97.1 #2 (Red Hat Linux)) id 1sMDYG-0000000BXYi-2wFT; Tue, 25 Jun 2024 21:18:08 +0000 From: "Matthew Wilcox (Oracle)" To: linux-kernel@vger.kernel.org Cc: "Matthew Wilcox (Oracle)" , linux-fsdevel@vger.kernel.org, maple-tree@lists.infradead.org Subject: [PATCH v2 5/5] dcache: Convert to use rosebush Date: Tue, 25 Jun 2024 22:18:00 +0100 Message-ID: <20240625211803.2750563-6-willy@infradead.org> X-Mailer: git-send-email 2.45.1 In-Reply-To: <20240625211803.2750563-1-willy@infradead.org> References: <20240625211803.2750563-1-willy@infradead.org> Precedence: bulk X-Mailing-List: linux-fsdevel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Use the rosebush instead of a custom hashtable. This is a deliberately awful patch because I wouldn't want anyone applying it yet. We need to figure out how to implement d_unhashed() properly, and what to do about IS_ROOT() dentries. Signed-off-by: Matthew Wilcox (Oracle) --- fs/dcache.c | 152 +++++++++++++++++++--------------------------------- 1 file changed, 54 insertions(+), 98 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 407095188f83..5c2ba8d5f8ff 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -87,23 +88,7 @@ EXPORT_SYMBOL(slash_name); const struct qstr dotdot_name = QSTR_INIT("..", 2); EXPORT_SYMBOL(dotdot_name); -/* - * This is the single most critical data structure when it comes - * to the dcache: the hashtable for lookups. Somebody should try - * to make this good - I've just made it work. - * - * This hash-function tries to avoid losing too many bits of hash - * information, yet avoid using a prime hash-size or similar. 
- */ - -static unsigned int d_hash_shift __ro_after_init; - -static struct hlist_bl_head *dentry_hashtable __ro_after_init; - -static inline struct hlist_bl_head *d_hash(unsigned int hash) -{ - return dentry_hashtable + (hash >> d_hash_shift); -} +static DEFINE_ROSEBUSH(all_dentries); #define IN_LOOKUP_SHIFT 10 static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT]; @@ -486,20 +471,22 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, static void ___d_drop(struct dentry *dentry) { - struct hlist_bl_head *b; /* * Hashed dentries are normally on the dentry hashtable, * with the exception of those newly allocated by * d_obtain_root, which are always IS_ROOT: */ - if (unlikely(IS_ROOT(dentry))) - b = &dentry->d_sb->s_roots; - else - b = d_hash(dentry->d_name.hash); + if (unlikely(IS_ROOT(dentry))) { + struct hlist_bl_head *b = &dentry->d_sb->s_roots; + hlist_bl_lock(b); + __hlist_bl_del(&dentry->d_hash); + hlist_bl_unlock(b); + } else { + dentry->d_hash.pprev = NULL; + } - hlist_bl_lock(b); - __hlist_bl_del(&dentry->d_hash); - hlist_bl_unlock(b); +//printk("removing dentry %p at %x\n", dentry, dentry->d_name.hash); + rbh_remove(&all_dentries, dentry->d_name.hash, dentry); } void __d_drop(struct dentry *dentry) @@ -2104,15 +2091,14 @@ static noinline struct dentry *__d_lookup_rcu_op_compare( unsigned *seqp) { u64 hashlen = name->hash_len; - struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen)); - struct hlist_bl_node *node; + RBH_ITER(iter, &all_dentries, hashlen_hash(hashlen)); struct dentry *dentry; - hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + while ((dentry = rbh_next(&iter)) != NULL) { int tlen; const char *tname; unsigned seq; - +//printk("%s: got dentry %p with hash %x\n", __func__, dentry, dentry->d_name.hash); seqretry: seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) @@ -2131,9 +2117,9 @@ static noinline struct dentry *__d_lookup_rcu_op_compare( if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0) continue; *seqp = seq; - return dentry; + break; } - return NULL; + return dentry; } /** @@ -2171,8 +2157,7 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, { u64 hashlen = name->hash_len; const unsigned char *str = name->name; - struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen)); - struct hlist_bl_node *node; + RBH_ITER(iter, &all_dentries, hashlen_hash(hashlen)); struct dentry *dentry; /* @@ -2182,6 +2167,8 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, * Keep the two functions in sync. */ +//printk("%s: Looking up %s with hash %x\n", __func__, name->name, name->hash); + if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) return __d_lookup_rcu_op_compare(parent, name, seqp); @@ -2198,9 +2185,10 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, * * See Documentation/filesystems/path-lookup.txt for more details. */ - hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + while ((dentry = rbh_next(&iter)) != NULL) { unsigned seq; +//printk("%s: got dentry %p with hash %x\n", __func__, dentry, dentry->d_name.hash); /* * The dentry sequence count protects us from concurrent * renames, and thus protects parent and name fields. 
@@ -2228,9 +2216,10 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
 		if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
 			continue;
 		*seqp = seq;
-		return dentry;
+		break;
 	}
-	return NULL;
+//printk("%s: found %p\n", __func__, dentry);
+	return dentry;
 }
 
 /**
@@ -2276,10 +2265,7 @@ EXPORT_SYMBOL(d_lookup);
  */
 struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
 {
-	unsigned int hash = name->hash;
-	struct hlist_bl_head *b = d_hash(hash);
-	struct hlist_bl_node *node;
-	struct dentry *found = NULL;
+	RBH_ITER(iter, &all_dentries, name->hash);
 	struct dentry *dentry;
 
 	/*
@@ -2289,6 +2275,8 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
 	 * Keep the two functions in sync.
 	 */
 
+//printk("%s: Looking up %s with hash %x\n", __func__, name->name, name->hash);
+
 	/*
 	 * The hash list is protected using RCU.
 	 *
@@ -2303,12 +2291,8 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
 	 * See Documentation/filesystems/path-lookup.txt for more details.
 	 */
 	rcu_read_lock();
-
-	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
-
-		if (dentry->d_name.hash != hash)
-			continue;
-
+	while ((dentry = rbh_next(&iter)) != NULL) {
+//printk("%s: got dentry %p with hash %x\n", __func__, dentry, dentry->d_name.hash);
 		spin_lock(&dentry->d_lock);
 		if (dentry->d_parent != parent)
 			goto next;
@@ -2319,15 +2303,15 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
 			goto next;
 
 		dentry->d_lockref.count++;
-		found = dentry;
 		spin_unlock(&dentry->d_lock);
 		break;
 next:
 		spin_unlock(&dentry->d_lock);
- 	}
- 	rcu_read_unlock();
+	}
+	rcu_read_unlock();
 
- 	return found;
+//printk("%s: found %p\n", __func__, dentry);
+	return dentry;
 }
 
 /**
@@ -2397,11 +2381,10 @@ EXPORT_SYMBOL(d_delete);
 
 static void __d_rehash(struct dentry *entry)
 {
-	struct hlist_bl_head *b = d_hash(entry->d_name.hash);
-
-	hlist_bl_lock(b);
-	hlist_bl_add_head_rcu(&entry->d_hash, b);
-	hlist_bl_unlock(b);
+//printk("%s: using dentry %p at %x\n", __func__, entry, entry->d_name.hash);
+
+	entry->d_hash.pprev = (void *)&all_dentries;
+	rbh_use(&all_dentries, entry->d_name.hash, entry);
 }
 
 /**
@@ -2413,6 +2396,8 @@ static void __d_rehash(struct dentry *entry)
  */
 void d_rehash(struct dentry * entry)
 {
+//printk("%s: reserving dentry %p at %x\n", __func__, entry, entry->d_name.hash);
+	rbh_reserve(&all_dentries, entry->d_name.hash);
 	spin_lock(&entry->d_lock);
 	__d_rehash(entry);
 	spin_unlock(&entry->d_lock);
@@ -2601,6 +2586,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
 	wait_queue_head_t *d_wait;
 	struct inode *dir = NULL;
 	unsigned n;
+
 	spin_lock(&dentry->d_lock);
 	if (unlikely(d_in_lookup(dentry))) {
 		dir = dentry->d_parent->d_inode;
@@ -2625,20 +2611,21 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
 
 /**
  * d_add - add dentry to hash queues
- * @entry: dentry to add
+ * @dentry: dentry to add
  * @inode: The inode to attach to this dentry
  *
  * This adds the entry to the hash queues and initializes @inode.
  * The entry was actually filled in earlier during d_alloc().
  */
-
-void d_add(struct dentry *entry, struct inode *inode)
+void d_add(struct dentry *dentry, struct inode *inode)
 {
+//printk("%s: reserving dentry %p at %x\n", __func__, dentry, dentry->d_name.hash);
+	rbh_reserve(&all_dentries, dentry->d_name.hash);
 	if (inode) {
-		security_d_instantiate(entry, inode);
+		security_d_instantiate(dentry, inode);
 		spin_lock(&inode->i_lock);
 	}
-	__d_add(entry, inode);
+	__d_add(dentry, inode);
 }
 EXPORT_SYMBOL(d_add);
 
@@ -2658,6 +2645,8 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
 	struct dentry *alias;
 	unsigned int hash = entry->d_name.hash;
 
+//printk("%s: reserving dentry %p at %x\n", __func__, entry, hash);
+	rbh_reserve(&all_dentries, hash);
 	spin_lock(&inode->i_lock);
 	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
 		/*
@@ -2675,6 +2664,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
 		if (!d_unhashed(alias)) {
 			spin_unlock(&alias->d_lock);
 			alias = NULL;
+			rbh_release(&all_dentries, hash);
 		} else {
 			dget_dlock(alias);
 			__d_rehash(alias);
@@ -2684,6 +2674,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
 		return alias;
 	}
 	spin_unlock(&inode->i_lock);
+	rbh_release(&all_dentries, hash);
 	return NULL;
 }
 EXPORT_SYMBOL(d_exact_alias);
@@ -2967,6 +2958,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 
 	BUG_ON(!d_unhashed(dentry));
 
+//printk("%s: reserving dentry %p at %x\n", __func__, dentry, dentry->d_name.hash);
+	rbh_reserve(&all_dentries, dentry->d_name.hash);
 	if (!inode)
 		goto out;
 
@@ -3002,6 +2995,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 				write_sequnlock(&rename_lock);
 			}
 			iput(inode);
+			rbh_release(&all_dentries, dentry->d_name.hash);
 			return new;
 		}
 	}
@@ -3110,27 +3104,6 @@ static int __init set_dhash_entries(char *str)
 }
 __setup("dhash_entries=", set_dhash_entries);
 
-static void __init dcache_init_early(void)
-{
-	/* If hashes are distributed across NUMA nodes, defer
-	 * hash allocation until vmalloc space is available.
-	 */
-	if (hashdist)
-		return;
-
-	dentry_hashtable =
-		alloc_large_system_hash("Dentry cache",
-					sizeof(struct hlist_bl_head),
-					dhash_entries,
-					13,
-					HASH_EARLY | HASH_ZERO,
-					&d_hash_shift,
-					NULL,
-					0,
-					0);
-	d_hash_shift = 32 - d_hash_shift;
-}
-
 static void __init dcache_init(void)
 {
 	/*
@@ -3141,22 +3114,6 @@ static void __init dcache_init(void)
 	dentry_cache = KMEM_CACHE_USERCOPY(dentry,
 		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
 		d_iname);
-
-	/* Hash may have been set up in dcache_init_early */
-	if (!hashdist)
-		return;
-
-	dentry_hashtable =
-		alloc_large_system_hash("Dentry cache",
-					sizeof(struct hlist_bl_head),
-					dhash_entries,
-					13,
-					HASH_ZERO,
-					&d_hash_shift,
-					NULL,
-					0,
-					0);
-	d_hash_shift = 32 - d_hash_shift;
 }
 
 /* SLAB cache for __getname() consumers */
@@ -3170,7 +3127,6 @@ void __init vfs_caches_init_early(void)
 	for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
 		INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
 
-	dcache_init_early();
 	inode_init_early();
 }
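
The conversion repeats one pattern throughout: call rbh_reserve() for the hash
before taking any spinlocks, then either publish the dentry with rbh_use() or
back out with rbh_release() on the failure path; removal is a single
rbh_remove(); lookups walk the candidates stored under the hash with
RBH_ITER()/rbh_next() and do their own parent/name comparison. The sketch
below restates that pattern in isolation, using only the calls visible in this
patch. The table name and helper functions are invented for illustration, and
the exact rosebush prototypes are defined by the earlier patches in this
series, so treat the shapes here as assumptions rather than the real API.

/* Hypothetical illustration, not part of the patch. */
static DEFINE_ROSEBUSH(example_table);

/* Insertion: reserve before taking spinlocks (reserving may allocate),
 * then either publish the dentry or give the reservation back. */
static void example_insert(struct dentry *dentry, bool publish)
{
	rbh_reserve(&example_table, dentry->d_name.hash);

	spin_lock(&dentry->d_lock);
	if (publish)
		rbh_use(&example_table, dentry->d_name.hash, dentry);
	else
		rbh_release(&example_table, dentry->d_name.hash);
	spin_unlock(&dentry->d_lock);
}

/* Removal mirrors ___d_drop() above: one call, keyed by hash and pointer. */
static void example_remove(struct dentry *dentry)
{
	rbh_remove(&example_table, dentry->d_name.hash, dentry);
}

/* Lookup: iterate the entries stored under this hash and filter them,
 * as __d_lookup() and __d_lookup_rcu() do after the conversion. */
static struct dentry *example_lookup(const struct qstr *name)
{
	RBH_ITER(iter, &example_table, name->hash);
	struct dentry *dentry;

	rcu_read_lock();
	while ((dentry = rbh_next(&iter)) != NULL) {
		/* Compare parent and name here, under d_lock or d_seq,
		 * exactly as the converted lookup functions do. */
		if (dentry->d_name.hash == name->hash)
			break;
	}
	rcu_read_unlock();
	return dentry;
}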