diff mbox series

[V2,1/3] dcache: add a new enum type for 'dentry_d_lock_class'

Message ID 20191130020225.20239-2-yukuai3@huawei.com (mailing list archive)
State New, archived
Headers show
Series [V2,1/3] dcache: add a new enum type for 'dentry_d_lock_class' | expand

Commit Message

Yu Kuai Nov. 30, 2019, 2:02 a.m. UTC
'dentry_d_lock_class' can be used for spin_lock_nested in case lockdep
confused about two different dentry take the 'd_lock'.

However, a single 'DENTRY_D_LOCK_NESTED' may not be enough if more than
two dentry are involed. So, add in 'DENTRY_D_LOCK_NESTED_TWICE'.

Signed-off-by: yu kuai <yukuai3@huawei.com>
---
 include/linux/dcache.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

Comments

Matthew Wilcox Nov. 30, 2019, 3:43 a.m. UTC | #1
On Sat, Nov 30, 2019 at 10:02:23AM +0800, yu kuai wrote:
> However, a single 'DENTRY_D_LOCK_NESTED' may not be enough if more than
> two dentry are involed. So, add in 'DENTRY_D_LOCK_NESTED_TWICE'.

No.  These need meaningful names.  Indeed, I think D_LOCK_NESTED is
a terrible name.

Looking at the calls:

$ git grep -n nested.*d_lock fs
fs/autofs/expire.c:82:          spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
fs/dcache.c:619:                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
fs/dcache.c:1086:       spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
fs/dcache.c:1303:               spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
fs/dcache.c:2822:               spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED);
fs/dcache.c:2827:                       spin_lock_nested(&target->d_parent->d_lock,
fs/dcache.c:2830:       spin_lock_nested(&dentry->d_lock, 2);
fs/dcache.c:2831:       spin_lock_nested(&target->d_lock, 3);
fs/dcache.c:3121:       spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
fs/libfs.c:112:                 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
fs/libfs.c:341:         spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
fs/notify/fsnotify.c:129:                       spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);

Most of these would be well-expressed by DENTRY_D_LOCK_CHILD.

The exception is __d_move() where I think we should actually name the
different lock classes instead of using a bare '2' and '3'.  Something
like this, perhaps:

diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 2866fabf497f..f604175243eb 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -79,7 +79,7 @@ static struct dentry *positive_after(struct dentry *p, struct dentry *child)
 		child = list_first_entry(&p->d_subdirs, struct dentry, d_child);
 
 	list_for_each_entry_from(child, &p->d_subdirs, d_child) {
-		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_CHILD);
 		if (simple_positive(child)) {
 			dget_dlock(child);
 			spin_unlock(&child->d_lock);
diff --git a/fs/dcache.c b/fs/dcache.c
index e88cf0554e65..c73b7d7bc785 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -616,7 +616,7 @@ static struct dentry *__lock_parent(struct dentry *dentry)
 	}
 	rcu_read_unlock();
 	if (parent != dentry)
-		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_CHILD);
 	else
 		parent = NULL;
 	return parent;
@@ -1083,7 +1083,7 @@ static bool shrink_lock_dentry(struct dentry *dentry)
 		spin_lock(&dentry->d_lock);
 		goto out;
 	}
-	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_CHILD);
 	if (likely(!dentry->d_lockref.count))
 		return true;
 	spin_unlock(&parent->d_lock);
@@ -1300,7 +1300,7 @@ static void d_walk(struct dentry *parent, void *data,
 		if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
 			continue;
 
-		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_CHILD);
 
 		ret = enter(data, dentry);
 		switch (ret) {
@@ -2819,16 +2819,16 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	} else if (!p) {
 		/* target is not a descendent of dentry->d_parent */
 		spin_lock(&target->d_parent->d_lock);
-		spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED);
+		spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_PARENT_2);
 	} else {
 		BUG_ON(p == dentry);
 		spin_lock(&old_parent->d_lock);
 		if (p != target)
 			spin_lock_nested(&target->d_parent->d_lock,
-					DENTRY_D_LOCK_NESTED);
+					DENTRY_D_LOCK_PARENT_2);
 	}
-	spin_lock_nested(&dentry->d_lock, 2);
-	spin_lock_nested(&target->d_lock, 3);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_CHILD);
+	spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_CHILD_2);
 
 	if (unlikely(d_in_lookup(target))) {
 		dir = target->d_parent->d_inode;
@@ -2837,7 +2837,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	}
 
 	write_seqcount_begin(&dentry->d_seq);
-	write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
+	write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_CHILD);
 
 	/* unhash both */
 	if (!d_unhashed(dentry))
@@ -3118,7 +3118,7 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode)
 		!hlist_unhashed(&dentry->d_u.d_alias) ||
 		!d_unlinked(dentry));
 	spin_lock(&dentry->d_parent->d_lock);
-	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_CHILD);
 	dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
 				(unsigned long long)inode->i_ino);
 	spin_unlock(&dentry->d_lock);
diff --git a/fs/libfs.c b/fs/libfs.c
index 1463b038ffc4..c68dedbf4ad2 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -109,7 +109,7 @@ static struct dentry *scan_positives(struct dentry *cursor,
 		if (d->d_flags & DCACHE_DENTRY_CURSOR)
 			continue;
 		if (simple_positive(d) && !--count) {
-			spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+			spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_CHILD);
 			if (simple_positive(d))
 				found = dget_dlock(d);
 			spin_unlock(&d->d_lock);
@@ -336,9 +336,9 @@ int simple_empty(struct dentry *dentry)
 	struct dentry *child;
 	int ret = 0;
 
-	spin_lock(&dentry->d_lock);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_PARENT_2);
 	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
-		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_CHILD_2);
 		if (simple_positive(child)) {
 			spin_unlock(&child->d_lock);
 			goto out;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 2ecef6155fc0..23492f2e4915 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -126,7 +126,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 			if (!child->d_inode)
 				continue;
 
-			spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+			spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_CHILD);
 			if (watched)
 				child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
 			else
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 10090f11ab95..6a497c63bd38 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -121,15 +121,20 @@ struct dentry {
 } __randomize_layout;
 
 /*
- * dentry->d_lock spinlock nesting subclasses:
+ * dentry->d_lock spinlock nesting subclasses.  Always taken in increasing
+ * order although some subclasses may be skipped.
  *
  * 0: normal
- * 1: nested
+ * 1: either a descendent of "normal" or a cousin.
+ * 2: child of the "normal" dentry
+ * 3: child of the "parent2" dentry
  */
 enum dentry_d_lock_class
 {
-	DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */
-	DENTRY_D_LOCK_NESTED
+	DENTRY_D_LOCK_NORMAL,   /* implicitly used by plain spin_lock() APIs */
+	DENTRY_D_LOCK_PARENT_2, /* not an ancestor of parent */
+	DENTRY_D_LOCK_CHILD,    /* nests under parent's lock */
+	DENTRY_D_LOCK_CHILD_2,  /* PARENT_2's child */
 };
 
 struct dentry_operations {
Yu Kuai Nov. 30, 2019, 7:53 a.m. UTC | #2
On 2019/11/30 11:43, Matthew Wilcox wrote:
> On Sat, Nov 30, 2019 at 10:02:23AM +0800, yu kuai wrote:
>> However, a single 'DENTRY_D_LOCK_NESTED' may not be enough if more than
>> two dentry are involed. So, add in 'DENTRY_D_LOCK_NESTED_TWICE'.
> 
> No.  These need meaningful names.  Indeed, I think D_LOCK_NESTED is
> a terrible name.
> 
> Looking at the calls:
> 
> $ git grep -n nested.*d_lock fs
> fs/autofs/expire.c:82:          spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
> fs/dcache.c:619:                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
> fs/dcache.c:1086:       spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
> fs/dcache.c:1303:               spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
> fs/dcache.c:2822:               spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED);
> fs/dcache.c:2827:                       spin_lock_nested(&target->d_parent->d_lock,
> fs/dcache.c:2830:       spin_lock_nested(&dentry->d_lock, 2);
> fs/dcache.c:2831:       spin_lock_nested(&target->d_lock, 3);
> fs/dcache.c:3121:       spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
> fs/libfs.c:112:                 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
> fs/libfs.c:341:         spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
> fs/notify/fsnotify.c:129:                       spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
> 
> Most of these would be well-expressed by DENTRY_D_LOCK_CHILD.
> 
> The exception is __d_move() where I think we should actually name the
> different lock classes instead of using a bare '2' and '3'.  Something
> like this, perhaps:
> 
Thanks for looking into this, do you mind if I replace your patch with 
the first two patches in the patchset?

Yu Kuai
Matthew Wilcox Nov. 30, 2019, 7:36 p.m. UTC | #3
On Sat, Nov 30, 2019 at 03:53:10PM +0800, yukuai (C) wrote:
> On 2019/11/30 11:43, Matthew Wilcox wrote:
> > On Sat, Nov 30, 2019 at 10:02:23AM +0800, yu kuai wrote:
> > > However, a single 'DENTRY_D_LOCK_NESTED' may not be enough if more than
> > > two dentry are involed. So, add in 'DENTRY_D_LOCK_NESTED_TWICE'.
> > 
> > No.  These need meaningful names.  Indeed, I think D_LOCK_NESTED is
> > a terrible name.
> > 
> > The exception is __d_move() where I think we should actually name the
> > different lock classes instead of using a bare '2' and '3'.  Something
> > like this, perhaps:
> 
> Thanks for looking into this, do you mind if I replace your patch with the
> first two patches in the patchset?

That's fine by me, but I think we should wait for Al to give his approval
before submitting a new version.

I'm also not entirely content with the explanation I wrote last night.
Maybe this instead ...

 /*
- * dentry->d_lock spinlock nesting subclasses:
+ * dentry->d_lock spinlock nesting subclasses.  Always taken in increasing
+ * order although some subclasses may be skipped.  If one dentry is the
+ * ancestor of another, then the ancestor's d_lock is taken before the
+ * descendent.  If NORMAL and PARENT_2 do not have a hierarchical relationship
+ * then you must hold the s_vfs_rename_mutex to prevent another thread taking
+ * the locks in the opposite order, or NORMAL and PARENT_2 becoming
+ * hierarchical through a rename operation.
  *
  * 0: normal
- * 1: nested
+ * 1: either a descendent of "normal" or a cousin.
+ * 2: child of the "normal" dentry
+ * 3: child of the "parent2" dentry
  */
 enum dentry_d_lock_class
 {
-       DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */
-       DENTRY_D_LOCK_NESTED
+       DENTRY_D_LOCK_NORMAL,   /* implicitly used by plain spin_lock() APIs */
+       DENTRY_D_LOCK_PARENT_2, /* not an ancestor of normal */
+       DENTRY_D_LOCK_CHILD,    /* nests under parent's lock */
+       DENTRY_D_LOCK_CHILD_2,  /* PARENT_2's child */
 };
Al Viro Dec. 8, 2019, 7:11 p.m. UTC | #4
On Sat, Nov 30, 2019 at 11:36:15AM -0800, Matthew Wilcox wrote:
> On Sat, Nov 30, 2019 at 03:53:10PM +0800, yukuai (C) wrote:
> > On 2019/11/30 11:43, Matthew Wilcox wrote:
> > > On Sat, Nov 30, 2019 at 10:02:23AM +0800, yu kuai wrote:
> > > > However, a single 'DENTRY_D_LOCK_NESTED' may not be enough if more than
> > > > two dentry are involed. So, add in 'DENTRY_D_LOCK_NESTED_TWICE'.
> > > 
> > > No.  These need meaningful names.  Indeed, I think D_LOCK_NESTED is
> > > a terrible name.
> > > 
> > > The exception is __d_move() where I think we should actually name the
> > > different lock classes instead of using a bare '2' and '3'.  Something
> > > like this, perhaps:
> > 
> > Thanks for looking into this, do you mind if I replace your patch with the
> > first two patches in the patchset?
> 
> That's fine by me, but I think we should wait for Al to give his approval
> before submitting a new version.

IMO this is a wrong approach.  It's papering over a confused code in
debugfs recursive removal and it would be better to get rid of _that_,
rather than try and slap bandaids on it.

I suspect that the following would be a better way to deal with it; it adds
a new primitive and converts debugfs and tracefs to that.  There are
followups converting other such places, still not finished.

commit 7e9c8a08889bf42bbe64e80e456d2eca824e5db2
Author: Al Viro <viro@zeniv.linux.org.uk>
Date:   Mon Nov 18 09:43:10 2019 -0500

    simple_recursive_removal(): kernel-side rm -rf for ramfs-style filesystems
    
    two requirements: no file creations in IS_DEADDIR and no cross-directory
    renames whatsoever.
    
    Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 042b688ed124..da936c54d879 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -309,7 +309,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
 		parent = debugfs_mount->mnt_root;
 
 	inode_lock(d_inode(parent));
-	dentry = lookup_one_len(name, parent, strlen(name));
+	if (unlikely(IS_DEADDIR(d_inode(parent))))
+		dentry = ERR_PTR(-ENOENT);
+	else
+		dentry = lookup_one_len(name, parent, strlen(name));
 	if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
 		if (d_is_dir(dentry))
 			pr_err("Directory '%s' with parent '%s' already present!\n",
@@ -657,62 +660,15 @@ static void __debugfs_file_removed(struct dentry *dentry)
 		wait_for_completion(&fsd->active_users_drained);
 }
 
-static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
-{
-	int ret = 0;
-
-	if (simple_positive(dentry)) {
-		dget(dentry);
-		if (d_is_dir(dentry)) {
-			ret = simple_rmdir(d_inode(parent), dentry);
-			if (!ret)
-				fsnotify_rmdir(d_inode(parent), dentry);
-		} else {
-			simple_unlink(d_inode(parent), dentry);
-			fsnotify_unlink(d_inode(parent), dentry);
-		}
-		if (!ret)
-			d_delete(dentry);
-		if (d_is_reg(dentry))
-			__debugfs_file_removed(dentry);
-		dput(dentry);
-	}
-	return ret;
-}
-
-/**
- * debugfs_remove - removes a file or directory from the debugfs filesystem
- * @dentry: a pointer to a the dentry of the file or directory to be
- *          removed.  If this parameter is NULL or an error value, nothing
- *          will be done.
- *
- * This function removes a file or directory in debugfs that was previously
- * created with a call to another debugfs function (like
- * debugfs_create_file() or variants thereof.)
- *
- * This function is required to be called in order for the file to be
- * removed, no automatic cleanup of files will happen when a module is
- * removed, you are responsible here.
- */
-void debugfs_remove(struct dentry *dentry)
+static void remove_one(struct dentry *victim)
 {
-	struct dentry *parent;
-	int ret;
-
-	if (IS_ERR_OR_NULL(dentry))
-		return;
-
-	parent = dentry->d_parent;
-	inode_lock(d_inode(parent));
-	ret = __debugfs_remove(dentry, parent);
-	inode_unlock(d_inode(parent));
-	if (!ret)
-		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+        if (d_is_reg(victim))
+		__debugfs_file_removed(victim);
+	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 }
-EXPORT_SYMBOL_GPL(debugfs_remove);
 
 /**
- * debugfs_remove_recursive - recursively removes a directory
+ * debugfs_remove - recursively removes a directory
  * @dentry: a pointer to a the dentry of the directory to be removed.  If this
  *          parameter is NULL or an error value, nothing will be done.
  *
@@ -724,65 +680,16 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
  * removed, no automatic cleanup of files will happen when a module is
  * removed, you are responsible here.
  */
-void debugfs_remove_recursive(struct dentry *dentry)
+void debugfs_remove(struct dentry *dentry)
 {
-	struct dentry *child, *parent;
-
 	if (IS_ERR_OR_NULL(dentry))
 		return;
 
-	parent = dentry;
- down:
-	inode_lock(d_inode(parent));
- loop:
-	/*
-	 * The parent->d_subdirs is protected by the d_lock. Outside that
-	 * lock, the child can be unlinked and set to be freed which can
-	 * use the d_u.d_child as the rcu head and corrupt this list.
-	 */
-	spin_lock(&parent->d_lock);
-	list_for_each_entry(child, &parent->d_subdirs, d_child) {
-		if (!simple_positive(child))
-			continue;
-
-		/* perhaps simple_empty(child) makes more sense */
-		if (!list_empty(&child->d_subdirs)) {
-			spin_unlock(&parent->d_lock);
-			inode_unlock(d_inode(parent));
-			parent = child;
-			goto down;
-		}
-
-		spin_unlock(&parent->d_lock);
-
-		if (!__debugfs_remove(child, parent))
-			simple_release_fs(&debugfs_mount, &debugfs_mount_count);
-
-		/*
-		 * The parent->d_lock protects agaist child from unlinking
-		 * from d_subdirs. When releasing the parent->d_lock we can
-		 * no longer trust that the next pointer is valid.
-		 * Restart the loop. We'll skip this one with the
-		 * simple_positive() check.
-		 */
-		goto loop;
-	}
-	spin_unlock(&parent->d_lock);
-
-	inode_unlock(d_inode(parent));
-	child = parent;
-	parent = parent->d_parent;
-	inode_lock(d_inode(parent));
-
-	if (child != dentry)
-		/* go up */
-		goto loop;
-
-	if (!__debugfs_remove(child, parent))
-		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
-	inode_unlock(d_inode(parent));
+	simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count);
+	simple_recursive_removal(dentry, remove_one);
+	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 }
-EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
+EXPORT_SYMBOL_GPL(debugfs_remove);
 
 /**
  * debugfs_rename - rename a file/directory in the debugfs filesystem
diff --git a/fs/libfs.c b/fs/libfs.c
index 540611b99b9a..b67003a948ed 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -19,6 +19,7 @@
 #include <linux/buffer_head.h> /* sync_mapping_buffers */
 #include <linux/fs_context.h>
 #include <linux/pseudo_fs.h>
+#include <linux/fsnotify.h>
 
 #include <linux/uaccess.h>
 
@@ -239,6 +240,75 @@ const struct inode_operations simple_dir_inode_operations = {
 };
 EXPORT_SYMBOL(simple_dir_inode_operations);
 
+static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
+{
+	struct dentry *child = NULL;
+	struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs;
+
+	spin_lock(&parent->d_lock);
+	while ((p = p->next) != &parent->d_subdirs) {
+		struct dentry *d = container_of(p, struct dentry, d_child);
+		if (simple_positive(d)) {
+			spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+			if (simple_positive(d))
+				child = dget_dlock(d);
+			spin_unlock(&d->d_lock);
+			if (likely(child))
+				break;
+		}
+	}
+	spin_unlock(&parent->d_lock);
+	dput(prev);
+	return child;
+}
+
+void simple_recursive_removal(struct dentry *dentry,
+                              void (*callback)(struct dentry *))
+{
+	struct dentry *this = dentry;
+	while (true) {
+		struct dentry *victim = NULL, *child;
+		struct inode *inode = this->d_inode;
+
+		inode_lock(inode);
+		if (d_is_dir(this))
+			inode->i_flags |= S_DEAD;
+		while ((child = find_next_child(this, victim)) == NULL) {
+			// kill and ascend
+			// update metadata while it's still locked
+			inode->i_ctime = current_time(inode);
+			clear_nlink(inode);
+			inode_unlock(inode);
+			victim = this;
+			this = this->d_parent;
+			inode = this->d_inode;
+			inode_lock(inode);
+			if (simple_positive(victim)) {
+				d_invalidate(victim);	// avoid lost mounts
+				if (d_is_dir(victim))
+					fsnotify_rmdir(inode, victim);
+				else
+					fsnotify_unlink(inode, victim);
+				if (callback)
+					callback(victim);
+				dput(victim);		// unpin it
+			}
+			if (victim == dentry) {
+				inode->i_ctime = inode->i_mtime =
+					current_time(inode);
+				if (d_is_dir(dentry))
+					drop_nlink(inode);
+				inode_unlock(inode);
+				dput(dentry);
+				return;
+			}
+		}
+		inode_unlock(inode);
+		this = child;
+	}
+}
+EXPORT_SYMBOL(simple_recursive_removal);
+
 static const struct super_operations simple_super_operations = {
 	.statfs		= simple_statfs,
 };
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index eeeae0475da9..2a16c0eb97e4 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -329,7 +329,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
 		parent = tracefs_mount->mnt_root;
 
 	inode_lock(parent->d_inode);
-	dentry = lookup_one_len(name, parent, strlen(name));
+	if (unlikely(IS_DEADDIR(parent->d_inode)))
+		dentry = ERR_PTR(-ENOENT);
+	else
+		dentry = lookup_one_len(name, parent, strlen(name));
 	if (!IS_ERR(dentry) && dentry->d_inode) {
 		dput(dentry);
 		dentry = ERR_PTR(-EEXIST);
@@ -495,122 +498,27 @@ __init struct dentry *tracefs_create_instance_dir(const char *name,
 	return dentry;
 }
 
-static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
+static void remove_one(struct dentry *victim)
 {
-	int ret = 0;
-
-	if (simple_positive(dentry)) {
-		if (dentry->d_inode) {
-			dget(dentry);
-			switch (dentry->d_inode->i_mode & S_IFMT) {
-			case S_IFDIR:
-				ret = simple_rmdir(parent->d_inode, dentry);
-				if (!ret)
-					fsnotify_rmdir(parent->d_inode, dentry);
-				break;
-			default:
-				simple_unlink(parent->d_inode, dentry);
-				fsnotify_unlink(parent->d_inode, dentry);
-				break;
-			}
-			if (!ret)
-				d_delete(dentry);
-			dput(dentry);
-		}
-	}
-	return ret;
-}
-
-/**
- * tracefs_remove - removes a file or directory from the tracefs filesystem
- * @dentry: a pointer to a the dentry of the file or directory to be
- *          removed.
- *
- * This function removes a file or directory in tracefs that was previously
- * created with a call to another tracefs function (like
- * tracefs_create_file() or variants thereof.)
- */
-void tracefs_remove(struct dentry *dentry)
-{
-	struct dentry *parent;
-	int ret;
-
-	if (IS_ERR_OR_NULL(dentry))
-		return;
-
-	parent = dentry->d_parent;
-	inode_lock(parent->d_inode);
-	ret = __tracefs_remove(dentry, parent);
-	inode_unlock(parent->d_inode);
-	if (!ret)
-		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+	simple_release_fs(&tracefs_mount, &tracefs_mount_count);
 }
 
 /**
- * tracefs_remove_recursive - recursively removes a directory
+ * tracefs_remove - recursively removes a directory
  * @dentry: a pointer to a the dentry of the directory to be removed.
  *
  * This function recursively removes a directory tree in tracefs that
  * was previously created with a call to another tracefs function
  * (like tracefs_create_file() or variants thereof.)
  */
-void tracefs_remove_recursive(struct dentry *dentry)
+void tracefs_remove(struct dentry *dentry)
 {
-	struct dentry *child, *parent;
-
 	if (IS_ERR_OR_NULL(dentry))
 		return;
 
-	parent = dentry;
- down:
-	inode_lock(parent->d_inode);
- loop:
-	/*
-	 * The parent->d_subdirs is protected by the d_lock. Outside that
-	 * lock, the child can be unlinked and set to be freed which can
-	 * use the d_u.d_child as the rcu head and corrupt this list.
-	 */
-	spin_lock(&parent->d_lock);
-	list_for_each_entry(child, &parent->d_subdirs, d_child) {
-		if (!simple_positive(child))
-			continue;
-
-		/* perhaps simple_empty(child) makes more sense */
-		if (!list_empty(&child->d_subdirs)) {
-			spin_unlock(&parent->d_lock);
-			inode_unlock(parent->d_inode);
-			parent = child;
-			goto down;
-		}
-
-		spin_unlock(&parent->d_lock);
-
-		if (!__tracefs_remove(child, parent))
-			simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-
-		/*
-		 * The parent->d_lock protects agaist child from unlinking
-		 * from d_subdirs. When releasing the parent->d_lock we can
-		 * no longer trust that the next pointer is valid.
-		 * Restart the loop. We'll skip this one with the
-		 * simple_positive() check.
-		 */
-		goto loop;
-	}
-	spin_unlock(&parent->d_lock);
-
-	inode_unlock(parent->d_inode);
-	child = parent;
-	parent = parent->d_parent;
-	inode_lock(parent->d_inode);
-
-	if (child != dentry)
-		/* go up */
-		goto loop;
-
-	if (!__tracefs_remove(child, parent))
-		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-	inode_unlock(parent->d_inode);
+	simple_pin_fs(&trace_fs_type, &tracefs_mount, &tracefs_mount_count);
+	simple_recursive_removal(dentry, remove_one);
+	simple_release_fs(&tracefs_mount, &tracefs_mount_count);
 }
 
 /**
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 58424eb3b329..0a817d763f0f 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -82,7 +82,7 @@ struct dentry *debugfs_create_automount(const char *name,
 					void *data);
 
 void debugfs_remove(struct dentry *dentry);
-void debugfs_remove_recursive(struct dentry *dentry);
+#define debugfs_remove_recursive debugfs_remove
 
 const struct file_operations *debugfs_real_fops(const struct file *filp);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 997a530ff4e9..73ffc8654987 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3242,6 +3242,8 @@ extern int simple_unlink(struct inode *, struct dentry *);
 extern int simple_rmdir(struct inode *, struct dentry *);
 extern int simple_rename(struct inode *, struct dentry *,
 			 struct inode *, struct dentry *, unsigned int);
+extern void simple_recursive_removal(struct dentry *,
+                              void (*callback)(struct dentry *));
 extern int noop_fsync(struct file *, loff_t, loff_t, int);
 extern int noop_set_page_dirty(struct page *page);
 extern void noop_invalidatepage(struct page *page, unsigned int offset,
diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
index 88d279c1b863..99912445974c 100644
--- a/include/linux/tracefs.h
+++ b/include/linux/tracefs.h
@@ -28,7 +28,6 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
 struct dentry *tracefs_create_dir(const char *name, struct dentry *parent);
 
 void tracefs_remove(struct dentry *dentry);
-void tracefs_remove_recursive(struct dentry *dentry);
 
 struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent,
 					   int (*mkdir)(const char *name),
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 563e80f9006a..88d94dc3ed37 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8366,7 +8366,7 @@ struct trace_array *trace_array_create(const char *name)
 
 	ret = event_trace_add_tracer(tr->dir, tr);
 	if (ret) {
-		tracefs_remove_recursive(tr->dir);
+		tracefs_remove(tr->dir);
 		goto out_free_tr;
 	}
 
@@ -8422,7 +8422,7 @@ static int __remove_instance(struct trace_array *tr)
 	event_trace_del_tracer(tr);
 	ftrace_clear_pids(tr);
 	ftrace_destroy_function_files(tr);
-	tracefs_remove_recursive(tr->dir);
+	tracefs_remove(tr->dir);
 	free_trace_buffers(tr);
 
 	for (i = 0; i < tr->nr_topts; i++) {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 648930823b57..25bb3e8fb170 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -696,7 +696,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
 		return;
 
 	if (!--dir->nr_events) {
-		tracefs_remove_recursive(dir->entry);
+		tracefs_remove(dir->entry);
 		list_del(&dir->list);
 		__put_system_dir(dir);
 	}
@@ -715,7 +715,7 @@ static void remove_event_file_dir(struct trace_event_file *file)
 		}
 		spin_unlock(&dir->d_lock);
 
-		tracefs_remove_recursive(dir);
+		tracefs_remove(dir);
 	}
 
 	list_del(&file->list);
@@ -3032,7 +3032,7 @@ int event_trace_del_tracer(struct trace_array *tr)
 
 	down_write(&trace_event_sem);
 	__trace_remove_event_dirs(tr);
-	tracefs_remove_recursive(tr->event_dir);
+	tracefs_remove(tr->event_dir);
 	up_write(&trace_event_sem);
 
 	tr->event_dir = NULL;
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index fa95139445b2..fa45a031848c 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -551,7 +551,7 @@ static int init_tracefs(void)
 	return 0;
 
  err:
-	tracefs_remove_recursive(top_dir);
+	tracefs_remove(top_dir);
 	return -ENOMEM;
 }
David Hildenbrand Dec. 11, 2019, 3:55 p.m. UTC | #5
On 08.12.19 20:11, Al Viro wrote:
> On Sat, Nov 30, 2019 at 11:36:15AM -0800, Matthew Wilcox wrote:
>> On Sat, Nov 30, 2019 at 03:53:10PM +0800, yukuai (C) wrote:
>>> On 2019/11/30 11:43, Matthew Wilcox wrote:
>>>> On Sat, Nov 30, 2019 at 10:02:23AM +0800, yu kuai wrote:
>>>>> However, a single 'DENTRY_D_LOCK_NESTED' may not be enough if more than
>>>>> two dentry are involed. So, add in 'DENTRY_D_LOCK_NESTED_TWICE'.
>>>>
>>>> No.  These need meaningful names.  Indeed, I think D_LOCK_NESTED is
>>>> a terrible name.
>>>>
>>>> The exception is __d_move() where I think we should actually name the
>>>> different lock classes instead of using a bare '2' and '3'.  Something
>>>> like this, perhaps:
>>>
>>> Thanks for looking into this, do you mind if I replace your patch with the
>>> first two patches in the patchset?
>>
>> That's fine by me, but I think we should wait for Al to give his approval
>> before submitting a new version.
> 
> IMO this is a wrong approach.  It's papering over a confused code in
> debugfs recursive removal and it would be better to get rid of _that_,
> rather than try and slap bandaids on it.
> 
> I suspect that the following would be a better way to deal with it; it adds
> a new primitive and converts debugfs and tracefs to that.  There are
> followups converting other such places, still not finished.
> 
> commit 7e9c8a08889bf42bbe64e80e456d2eca824e5db2
> Author: Al Viro <viro@zeniv.linux.org.uk>
> Date:   Mon Nov 18 09:43:10 2019 -0500
> 
>     simple_recursive_removal(): kernel-side rm -rf for ramfs-style filesystems
>     
>     two requirements: no file creations in IS_DEADDIR and no cross-directory
>     renames whatsoever.
>     
>     Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> 
> diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
> index 042b688ed124..da936c54d879 100644
> --- a/fs/debugfs/inode.c
> +++ b/fs/debugfs/inode.c
> @@ -309,7 +309,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
>  		parent = debugfs_mount->mnt_root;
>  
>  	inode_lock(d_inode(parent));
> -	dentry = lookup_one_len(name, parent, strlen(name));
> +	if (unlikely(IS_DEADDIR(d_inode(parent))))
> +		dentry = ERR_PTR(-ENOENT);
> +	else
> +		dentry = lookup_one_len(name, parent, strlen(name));
>  	if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
>  		if (d_is_dir(dentry))
>  			pr_err("Directory '%s' with parent '%s' already present!\n",
> @@ -657,62 +660,15 @@ static void __debugfs_file_removed(struct dentry *dentry)
>  		wait_for_completion(&fsd->active_users_drained);
>  }
>  
> -static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
> -{
> -	int ret = 0;
> -
> -	if (simple_positive(dentry)) {
> -		dget(dentry);
> -		if (d_is_dir(dentry)) {
> -			ret = simple_rmdir(d_inode(parent), dentry);
> -			if (!ret)
> -				fsnotify_rmdir(d_inode(parent), dentry);
> -		} else {
> -			simple_unlink(d_inode(parent), dentry);
> -			fsnotify_unlink(d_inode(parent), dentry);
> -		}
> -		if (!ret)
> -			d_delete(dentry);
> -		if (d_is_reg(dentry))
> -			__debugfs_file_removed(dentry);
> -		dput(dentry);
> -	}
> -	return ret;
> -}
> -
> -/**
> - * debugfs_remove - removes a file or directory from the debugfs filesystem
> - * @dentry: a pointer to a the dentry of the file or directory to be
> - *          removed.  If this parameter is NULL or an error value, nothing
> - *          will be done.
> - *
> - * This function removes a file or directory in debugfs that was previously
> - * created with a call to another debugfs function (like
> - * debugfs_create_file() or variants thereof.)
> - *
> - * This function is required to be called in order for the file to be
> - * removed, no automatic cleanup of files will happen when a module is
> - * removed, you are responsible here.
> - */
> -void debugfs_remove(struct dentry *dentry)
> +static void remove_one(struct dentry *victim)
>  {
> -	struct dentry *parent;
> -	int ret;
> -
> -	if (IS_ERR_OR_NULL(dentry))
> -		return;
> -
> -	parent = dentry->d_parent;
> -	inode_lock(d_inode(parent));
> -	ret = __debugfs_remove(dentry, parent);
> -	inode_unlock(d_inode(parent));
> -	if (!ret)
> -		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
> +        if (d_is_reg(victim))
> +		__debugfs_file_removed(victim);
> +	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
>  }
> -EXPORT_SYMBOL_GPL(debugfs_remove);
>  
>  /**
> - * debugfs_remove_recursive - recursively removes a directory
> + * debugfs_remove - recursively removes a directory
>   * @dentry: a pointer to a the dentry of the directory to be removed.  If this
>   *          parameter is NULL or an error value, nothing will be done.
>   *
> @@ -724,65 +680,16 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
>   * removed, no automatic cleanup of files will happen when a module is
>   * removed, you are responsible here.
>   */
> -void debugfs_remove_recursive(struct dentry *dentry)
> +void debugfs_remove(struct dentry *dentry)
>  {
> -	struct dentry *child, *parent;
> -
>  	if (IS_ERR_OR_NULL(dentry))
>  		return;
>  
> -	parent = dentry;
> - down:
> -	inode_lock(d_inode(parent));
> - loop:
> -	/*
> -	 * The parent->d_subdirs is protected by the d_lock. Outside that
> -	 * lock, the child can be unlinked and set to be freed which can
> -	 * use the d_u.d_child as the rcu head and corrupt this list.
> -	 */
> -	spin_lock(&parent->d_lock);
> -	list_for_each_entry(child, &parent->d_subdirs, d_child) {
> -		if (!simple_positive(child))
> -			continue;
> -
> -		/* perhaps simple_empty(child) makes more sense */
> -		if (!list_empty(&child->d_subdirs)) {
> -			spin_unlock(&parent->d_lock);
> -			inode_unlock(d_inode(parent));
> -			parent = child;
> -			goto down;
> -		}
> -
> -		spin_unlock(&parent->d_lock);
> -
> -		if (!__debugfs_remove(child, parent))
> -			simple_release_fs(&debugfs_mount, &debugfs_mount_count);
> -
> -		/*
> -		 * The parent->d_lock protects agaist child from unlinking
> -		 * from d_subdirs. When releasing the parent->d_lock we can
> -		 * no longer trust that the next pointer is valid.
> -		 * Restart the loop. We'll skip this one with the
> -		 * simple_positive() check.
> -		 */
> -		goto loop;
> -	}
> -	spin_unlock(&parent->d_lock);
> -
> -	inode_unlock(d_inode(parent));
> -	child = parent;
> -	parent = parent->d_parent;
> -	inode_lock(d_inode(parent));
> -
> -	if (child != dentry)
> -		/* go up */
> -		goto loop;
> -
> -	if (!__debugfs_remove(child, parent))
> -		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
> -	inode_unlock(d_inode(parent));
> +	simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count);
> +	simple_recursive_removal(dentry, remove_one);
> +	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
>  }
> -EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
> +EXPORT_SYMBOL_GPL(debugfs_remove);
>  
>  /**
>   * debugfs_rename - rename a file/directory in the debugfs filesystem
> diff --git a/fs/libfs.c b/fs/libfs.c
> index 540611b99b9a..b67003a948ed 100644
> --- a/fs/libfs.c
> +++ b/fs/libfs.c
> @@ -19,6 +19,7 @@
>  #include <linux/buffer_head.h> /* sync_mapping_buffers */
>  #include <linux/fs_context.h>
>  #include <linux/pseudo_fs.h>
> +#include <linux/fsnotify.h>
>  
>  #include <linux/uaccess.h>
>  
> @@ -239,6 +240,75 @@ const struct inode_operations simple_dir_inode_operations = {
>  };
>  EXPORT_SYMBOL(simple_dir_inode_operations);
>  
> +static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
> +{
> +	struct dentry *child = NULL;
> +	struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs;
> +
> +	spin_lock(&parent->d_lock);
> +	while ((p = p->next) != &parent->d_subdirs) {
> +		struct dentry *d = container_of(p, struct dentry, d_child);
> +		if (simple_positive(d)) {
> +			spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
> +			if (simple_positive(d))
> +				child = dget_dlock(d);
> +			spin_unlock(&d->d_lock);
> +			if (likely(child))
> +				break;
> +		}
> +	}
> +	spin_unlock(&parent->d_lock);
> +	dput(prev);
> +	return child;
> +}
> +
> +void simple_recursive_removal(struct dentry *dentry,
> +                              void (*callback)(struct dentry *))
> +{
> +	struct dentry *this = dentry;
> +	while (true) {
> +		struct dentry *victim = NULL, *child;
> +		struct inode *inode = this->d_inode;
> +
> +		inode_lock(inode);
> +		if (d_is_dir(this))
> +			inode->i_flags |= S_DEAD;
> +		while ((child = find_next_child(this, victim)) == NULL) {
> +			// kill and ascend
> +			// update metadata while it's still locked
> +			inode->i_ctime = current_time(inode);
> +			clear_nlink(inode);
> +			inode_unlock(inode);
> +			victim = this;
> +			this = this->d_parent;
> +			inode = this->d_inode;
> +			inode_lock(inode);
> +			if (simple_positive(victim)) {
> +				d_invalidate(victim);	// avoid lost mounts
> +				if (d_is_dir(victim))
> +					fsnotify_rmdir(inode, victim);
> +				else
> +					fsnotify_unlink(inode, victim);
> +				if (callback)
> +					callback(victim);
> +				dput(victim);		// unpin it
> +			}
> +			if (victim == dentry) {
> +				inode->i_ctime = inode->i_mtime =
> +					current_time(inode);
> +				if (d_is_dir(dentry))
> +					drop_nlink(inode);
> +				inode_unlock(inode);
> +				dput(dentry);
> +				return;
> +			}
> +		}
> +		inode_unlock(inode);
> +		this = child;
> +	}
> +}
> +EXPORT_SYMBOL(simple_recursive_removal);
> +
>  static const struct super_operations simple_super_operations = {
>  	.statfs		= simple_statfs,
>  };
> diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
> index eeeae0475da9..2a16c0eb97e4 100644
> --- a/fs/tracefs/inode.c
> +++ b/fs/tracefs/inode.c
> @@ -329,7 +329,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
>  		parent = tracefs_mount->mnt_root;
>  
>  	inode_lock(parent->d_inode);
> -	dentry = lookup_one_len(name, parent, strlen(name));
> +	if (unlikely(IS_DEADDIR(parent->d_inode)))
> +		dentry = ERR_PTR(-ENOENT);
> +	else
> +		dentry = lookup_one_len(name, parent, strlen(name));
>  	if (!IS_ERR(dentry) && dentry->d_inode) {
>  		dput(dentry);
>  		dentry = ERR_PTR(-EEXIST);
> @@ -495,122 +498,27 @@ __init struct dentry *tracefs_create_instance_dir(const char *name,
>  	return dentry;
>  }
>  
> -static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
> +static void remove_one(struct dentry *victim)
>  {
> -	int ret = 0;
> -
> -	if (simple_positive(dentry)) {
> -		if (dentry->d_inode) {
> -			dget(dentry);
> -			switch (dentry->d_inode->i_mode & S_IFMT) {
> -			case S_IFDIR:
> -				ret = simple_rmdir(parent->d_inode, dentry);
> -				if (!ret)
> -					fsnotify_rmdir(parent->d_inode, dentry);
> -				break;
> -			default:
> -				simple_unlink(parent->d_inode, dentry);
> -				fsnotify_unlink(parent->d_inode, dentry);
> -				break;
> -			}
> -			if (!ret)
> -				d_delete(dentry);
> -			dput(dentry);
> -		}
> -	}
> -	return ret;
> -}
> -
> -/**
> - * tracefs_remove - removes a file or directory from the tracefs filesystem
> - * @dentry: a pointer to a the dentry of the file or directory to be
> - *          removed.
> - *
> - * This function removes a file or directory in tracefs that was previously
> - * created with a call to another tracefs function (like
> - * tracefs_create_file() or variants thereof.)
> - */
> -void tracefs_remove(struct dentry *dentry)
> -{
> -	struct dentry *parent;
> -	int ret;
> -
> -	if (IS_ERR_OR_NULL(dentry))
> -		return;
> -
> -	parent = dentry->d_parent;
> -	inode_lock(parent->d_inode);
> -	ret = __tracefs_remove(dentry, parent);
> -	inode_unlock(parent->d_inode);
> -	if (!ret)
> -		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
> +	simple_release_fs(&tracefs_mount, &tracefs_mount_count);
>  }
>  
>  /**
> - * tracefs_remove_recursive - recursively removes a directory
> + * tracefs_remove - recursively removes a directory
>   * @dentry: a pointer to a the dentry of the directory to be removed.
>   *
>   * This function recursively removes a directory tree in tracefs that
>   * was previously created with a call to another tracefs function
>   * (like tracefs_create_file() or variants thereof.)
>   */
> -void tracefs_remove_recursive(struct dentry *dentry)
> +void tracefs_remove(struct dentry *dentry)
>  {
> -	struct dentry *child, *parent;
> -
>  	if (IS_ERR_OR_NULL(dentry))
>  		return;
>  
> -	parent = dentry;
> - down:
> -	inode_lock(parent->d_inode);
> - loop:
> -	/*
> -	 * The parent->d_subdirs is protected by the d_lock. Outside that
> -	 * lock, the child can be unlinked and set to be freed which can
> -	 * use the d_u.d_child as the rcu head and corrupt this list.
> -	 */
> -	spin_lock(&parent->d_lock);
> -	list_for_each_entry(child, &parent->d_subdirs, d_child) {
> -		if (!simple_positive(child))
> -			continue;
> -
> -		/* perhaps simple_empty(child) makes more sense */
> -		if (!list_empty(&child->d_subdirs)) {
> -			spin_unlock(&parent->d_lock);
> -			inode_unlock(parent->d_inode);
> -			parent = child;
> -			goto down;
> -		}
> -
> -		spin_unlock(&parent->d_lock);
> -
> -		if (!__tracefs_remove(child, parent))
> -			simple_release_fs(&tracefs_mount, &tracefs_mount_count);
> -
> -		/*
> -		 * The parent->d_lock protects agaist child from unlinking
> -		 * from d_subdirs. When releasing the parent->d_lock we can
> -		 * no longer trust that the next pointer is valid.
> -		 * Restart the loop. We'll skip this one with the
> -		 * simple_positive() check.
> -		 */
> -		goto loop;
> -	}
> -	spin_unlock(&parent->d_lock);
> -
> -	inode_unlock(parent->d_inode);
> -	child = parent;
> -	parent = parent->d_parent;
> -	inode_lock(parent->d_inode);
> -
> -	if (child != dentry)
> -		/* go up */
> -		goto loop;
> -
> -	if (!__tracefs_remove(child, parent))
> -		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
> -	inode_unlock(parent->d_inode);
> +	simple_pin_fs(&trace_fs_type, &tracefs_mount, &tracefs_mount_count);
> +	simple_recursive_removal(dentry, remove_one);
> +	simple_release_fs(&tracefs_mount, &tracefs_mount_count);
>  }
>  
>  /**
> diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
> index 58424eb3b329..0a817d763f0f 100644
> --- a/include/linux/debugfs.h
> +++ b/include/linux/debugfs.h
> @@ -82,7 +82,7 @@ struct dentry *debugfs_create_automount(const char *name,
>  					void *data);
>  
>  void debugfs_remove(struct dentry *dentry);
> -void debugfs_remove_recursive(struct dentry *dentry);
> +#define debugfs_remove_recursive debugfs_remove
>  
>  const struct file_operations *debugfs_real_fops(const struct file *filp);
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 997a530ff4e9..73ffc8654987 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -3242,6 +3242,8 @@ extern int simple_unlink(struct inode *, struct dentry *);
>  extern int simple_rmdir(struct inode *, struct dentry *);
>  extern int simple_rename(struct inode *, struct dentry *,
>  			 struct inode *, struct dentry *, unsigned int);
> +extern void simple_recursive_removal(struct dentry *,
> +                              void (*callback)(struct dentry *));
>  extern int noop_fsync(struct file *, loff_t, loff_t, int);
>  extern int noop_set_page_dirty(struct page *page);
>  extern void noop_invalidatepage(struct page *page, unsigned int offset,
> diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
> index 88d279c1b863..99912445974c 100644
> --- a/include/linux/tracefs.h
> +++ b/include/linux/tracefs.h
> @@ -28,7 +28,6 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
>  struct dentry *tracefs_create_dir(const char *name, struct dentry *parent);
>  
>  void tracefs_remove(struct dentry *dentry);
> -void tracefs_remove_recursive(struct dentry *dentry);
>  
>  struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent,
>  					   int (*mkdir)(const char *name),
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 563e80f9006a..88d94dc3ed37 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -8366,7 +8366,7 @@ struct trace_array *trace_array_create(const char *name)
>  
>  	ret = event_trace_add_tracer(tr->dir, tr);
>  	if (ret) {
> -		tracefs_remove_recursive(tr->dir);
> +		tracefs_remove(tr->dir);
>  		goto out_free_tr;
>  	}
>  
> @@ -8422,7 +8422,7 @@ static int __remove_instance(struct trace_array *tr)
>  	event_trace_del_tracer(tr);
>  	ftrace_clear_pids(tr);
>  	ftrace_destroy_function_files(tr);
> -	tracefs_remove_recursive(tr->dir);
> +	tracefs_remove(tr->dir);
>  	free_trace_buffers(tr);
>  
>  	for (i = 0; i < tr->nr_topts; i++) {
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 648930823b57..25bb3e8fb170 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -696,7 +696,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
>  		return;
>  
>  	if (!--dir->nr_events) {
> -		tracefs_remove_recursive(dir->entry);
> +		tracefs_remove(dir->entry);
>  		list_del(&dir->list);
>  		__put_system_dir(dir);
>  	}
> @@ -715,7 +715,7 @@ static void remove_event_file_dir(struct trace_event_file *file)
>  		}
>  		spin_unlock(&dir->d_lock);
>  
> -		tracefs_remove_recursive(dir);
> +		tracefs_remove(dir);
>  	}
>  
>  	list_del(&file->list);
> @@ -3032,7 +3032,7 @@ int event_trace_del_tracer(struct trace_array *tr)
>  
>  	down_write(&trace_event_sem);
>  	__trace_remove_event_dirs(tr);
> -	tracefs_remove_recursive(tr->event_dir);
> +	tracefs_remove(tr->event_dir);
>  	up_write(&trace_event_sem);
>  
>  	tr->event_dir = NULL;
> diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
> index fa95139445b2..fa45a031848c 100644
> --- a/kernel/trace/trace_hwlat.c
> +++ b/kernel/trace/trace_hwlat.c
> @@ -551,7 +551,7 @@ static int init_tracefs(void)
>  	return 0;
>  
>   err:
> -	tracefs_remove_recursive(top_dir);
> +	tracefs_remove(top_dir);
>  	return -ENOMEM;
>  }
>  
> 

The patch in linux-next

commit 653f0d05be0948e7610bb786e6570bb6c48a4e75 (HEAD, refs/bisect/bad)
Author: Al Viro <viro@zeniv.linux.org.uk>
Date:   Mon Nov 18 09:43:10 2019 -0500

    simple_recursive_removal(): kernel-side rm -rf for ramfs-style
filesystems

    two requirements: no file creations in IS_DEADDIR and no cross-directory
    renames whatsoever.

    Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>


Makes my simple QEMU setup crash when booting


[    4.571181] list_del corruption. prev->next should be
ffff8b75df3408d0, but was ffff8b75df340d50
[    4.572064] ------------[ cut here ]------------
[    4.572448] kernel BUG at lib/list_debug.c:51!
[    4.572838] invalid opcode: 0000 [#1] SMP NOPTI
[    4.573235] CPU: 0 PID: 479 Comm: systemd-udevd Not tainted
5.5.0-rc1+ #14
[    4.573827] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu4
[    4.574782] RIP: 0010:__list_del_entry_valid.cold+0x31/0x55
[    4.575252] Code: 0d 3d a8 e8 14 dd bd ff 0f 0b 48 c7 c7 00 0e 3d a8
e8 06 dd bd ff 0f 0b 48 89 f2 48 89 fe 48 c7b
[    4.576829] RSP: 0018:ffffaef9401ebd30 EFLAGS: 00010246
[    4.577283] RAX: 0000000000000054 RBX: ffff8b75df3416c0 RCX:
0000000000000000
[    4.577879] RDX: 0000000000000000 RSI: ffff8b757fa1a248 RDI:
ffff8b757fa1a248
[    4.578479] RBP: ffff8b75df3407e0 R08: 0000000000000000 R09:
0000000000000000
[    4.579055] R10: 0000000000000000 R11: 0000000000000000 R12:
ffff8b75df340860
[    4.579660] R13: 0000000000000000 R14: ffff8b75df3416c0 R15:
ffff8b75d6bda620
[    4.580257] FS:  00007f3016d08940(0000) GS:ffff8b757fa00000(0000)
knlGS:0000000000000000
[    4.580941] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    4.581425] CR2: 00005555eedf9cf3 CR3: 0000000197fa2000 CR4:
00000000000006f0
[    4.582034] Call Trace:
[    4.582256]  __dentry_kill+0x86/0x190
[    4.582577]  ? dput+0x20/0x460
[    4.582839]  dput+0x2a6/0x460
[    4.583100]  debugfs_remove+0x40/0x60
[    4.583403]  blk_mq_debugfs_unregister_sched+0x15/0x30
[    4.583825]  blk_mq_exit_sched+0x6b/0xa0
[    4.584154]  __elevator_exit+0x32/0x50
[    4.584460]  elevator_switch_mq+0x63/0x170
[    4.584801]  elevator_switch+0x33/0x70
[    4.585114]  elv_iosched_store+0x135/0x1b0
[    4.585450]  queue_attr_store+0x47/0x70
[    4.585779]  kernfs_fop_write+0xdc/0x1c0
[    4.586128]  vfs_write+0xdb/0x1d0
[    4.586423]  ksys_write+0x65/0xe0
[    4.586716]  do_syscall_64+0x5c/0xa0
[    4.587029]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[    4.587472] RIP: 0033:0x7f3017d4a467
[    4.587782] Code: 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00
00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 854
[    4.589353] RSP: 002b:00007ffc7fa4f518 EFLAGS: 00000246 ORIG_RAX:
0000000000000001
[    4.589994] RAX: ffffffffffffffda RBX: 0000000000000004 RCX:
00007f3017d4a467
[    4.590605] RDX: 0000000000000004 RSI: 00007ffc7fa4f600 RDI:
000000000000000f
[    4.591212] RBP: 00007ffc7fa4f600 R08: fefefefefefefeff R09:
ffffffff00000000
[    4.591820] R10: 0000000000000000 R11: 0000000000000246 R12:
0000000000000004
[    4.592423] R13: 000055cdeaffe190 R14: 0000000000000004 R15:
00007f3017e1b700
[    4.593032] Modules linked in:
[    4.593307] ---[ end trace 42f66ce1e6e1c1fe ]---
[    4.593694] RIP: 0010:__list_del_entry_valid.cold+0x31/0x55
[    4.594173] Code: 0d 3d a8 e8 14 dd bd ff 0f 0b 48 c7 c7 00 0e 3d a8
e8 06 dd bd ff 0f 0b 48 89 f2 48 89 fe 48 c7b
[    4.595756] RSP: 0018:ffffaef9401ebd30 EFLAGS: 00010246
[    4.596205] RAX: 0000000000000054 RBX: ffff8b75df3416c0 RCX:
0000000000000000
[    4.596818] RDX: 0000000000000000 RSI: ffff8b757fa1a248 RDI:
ffff8b757fa1a248
[    4.597423] RBP: ffff8b75df3407e0 R08: 0000000000000000 R09:
0000000000000000
[    4.598038] R10: 0000000000000000 R11: 0000000000000000 R12:
ffff8b75df340860
[    4.598640] R13: 0000000000000000 R14: ffff8b75df3416c0 R15:
ffff8b75d6bda620
[    4.599241] FS:  00007f3016d08940(0000) GS:ffff8b757fa00000(0000)
knlGS:0000000000000000
[    4.599936] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    4.600415] CR2: 00005555eedf9cf3 CR3: 0000000197fa2000 CR4:
00000000000006f0
[    4.601034] BUG: sleeping function called from invalid context at
include/linux/percpu-rwsem.h:38
[    4.601789] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid:
479, name: systemd-udevd
[    4.602505] INFO: lockdep is turned off.
[    4.602837] CPU: 0 PID: 479 Comm: systemd-udevd Tainted: G      D
       5.5.0-rc1+ #14
[    4.603549] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu4
[    4.604520] Call Trace:
[    4.604736]  dump_stack+0x8f/0xd0
[    4.605020]  ___might_sleep.cold+0xb3/0xc3
[    4.605374]  exit_signals+0x30/0x2d0
[    4.605689]  do_exit+0xb4/0xc40
[    4.605961]  ? ksys_write+0x65/0xe0
[    4.606256]  rewind_stack_do_exit+0x17/0x20
[    4.606624] note: systemd-udevd[479] exited with preempt_count 2
[    4.611186] list_del corruption. prev->next should be
ffff8b75df3489f0, but was ffff8b75df3480f0
[    4.611972] ------------[ cut here ]------------
[    4.612371] kernel BUG at lib/list_debug.c:51!
[    4.612783] invalid opcode: 0000 [#2] SMP NOPTI
[    4.613161] CPU: 0 PID: 511 Comm: systemd-udevd Tainted: G      D W
       5.5.0-rc1+ #14
[    4.613875] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu4
[    4.614842] RIP: 0010:__list_del_entry_valid.cold+0x31/0x55
[    4.615309] Code: 0d 3d a8 e8 14 dd bd ff 0f 0b 48 c7 c7 00 0e 3d a8
e8 06 dd bd ff 0f 0b 48 89 f2 48 89 fe 48 c7b
[    4.616880] RSP: 0018:ffffaef9402a3d30 EFLAGS: 00010246
[    4.617327] RAX: 0000000000000054 RBX: ffff8b75df347240 RCX:
0000000000000000
[    4.617930] RDX: 0000000000000000 RSI: ffff8b757fa1a248 RDI:
ffff8b757fa1a248
[    4.618541] RBP: ffff8b75df348900 R08: 0000000000000000 R09:
0000000000000001
[    4.619140] R10: 0000000000000000 R11: 0000000000000000 R12:
ffff8b75df348980
[    4.619743] R13: 0000000000000000 R14: ffff8b75df347240 R15:
ffff8b75d6a25020
[    4.620349] FS:  00007f3016d08940(0000) GS:ffff8b757fa00000(0000)
knlGS:0000000000000000
[    4.621030] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    4.621508] CR2: 000055cdeb042e48 CR3: 000000019e202000 CR4:
00000000000006f0
[    4.622099] Call Trace:
[    4.622322]  __dentry_kill+0x86/0x190
[    4.622645]  ? dput+0x20/0x460
[    4.622911]  dput+0x2a6/0x460
[    4.623170]  debugfs_remove+0x40/0x60
[    4.623495]  blk_mq_debugfs_unregister_sched+0x15/0x30
[    4.623929]  blk_mq_exit_sched+0x6b/0xa0
[    4.624264]  __elevator_exit+0x32/0x50
[    4.624593]  elevator_switch_mq+0x63/0x170
[    4.624945]  elevator_switch+0x33/0x70
[    4.625268]  elv_iosched_store+0x135/0x1b0
[    4.625619]  queue_attr_store+0x47/0x70
[    4.625951]  kernfs_fop_write+0xdc/0x1c0
[    4.626289]  vfs_write+0xdb/0x1d0
[    4.626583]  ksys_write+0x65/0xe0
[    4.626870]  do_syscall_64+0x5c/0xa0
[    4.627180]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[    4.627616] RIP: 0033:0x7f3017d4a467
[    4.627925] Code: 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00
00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 854
[    4.629499] RSP: 002b:00007ffc7fa4f578 EFLAGS: 00000246 ORIG_RAX:
0000000000000001
[    4.630136] RAX: ffffffffffffffda RBX: 0000000000000004 RCX:
00007f3017d4a467
[    4.630735] RDX: 0000000000000004 RSI: 00007ffc7fa4f660 RDI:
000000000000000f
[    4.631334] RBP: 00007ffc7fa4f660 R08: fefefefefefefeff R09:
ffffffff00000000
[    4.631940] R10: 0000000000000000 R11: 0000000000000246 R12:
0000000000000004
[    4.632540] R13: 000055cdeaffe190 R14: 0000000000000004 R15:
00007f3017e1b700
[    4.633141] Modules linked in:
[    4.633414] ---[ end trace 42f66ce1e6e1c1ff ]---
[    4.633814] RIP: 0010:__list_del_entry_valid.cold+0x31/0x55
[    4.634289] Code: 0d 3d a8 e8 14 dd bd ff 0f 0b 48 c7 c7 00 0e 3d a8
e8 06 dd bd ff 0f 0b 48 89 f2 48 89 fe 48 c7b
[    4.635881] RSP: 0018:ffffaef9401ebd30 EFLAGS: 00010246
[    4.636329] RAX: 0000000000000054 RBX: ffff8b75df3416c0 RCX:
0000000000000000
[    4.636940] RDX: 0000000000000000 RSI: ffff8b757fa1a248 RDI:
ffff8b757fa1a248
[    4.637544] RBP: ffff8b75df3407e0 R08: 0000000000000000 R09:
0000000000000000
[    4.638149] R10: 0000000000000000 R11: 0000000000000000 R12:
ffff8b75df340860
[    4.638750] R13: 0000000000000000 R14: ffff8b75df3416c0 R15:
ffff8b75d6bda620
[    4.639360] FS:  00007f3016d08940(0000) GS:ffff8b757fa00000(0000)
knlGS:0000000000000000
[    4.640047] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    4.640537] CR2: 000055cdeb042e48 CR3: 000000019e202000 CR4:
00000000000006f0
[    4.641128] note: systemd-udevd[511] exited with preempt_count 2


Reverting that commit makes it work again. How does that untested and
unreviewed patch end up in linux-next? Took me 30min to bisect.
Al Viro Dec. 11, 2019, 6:46 p.m. UTC | #6
On Wed, Dec 11, 2019 at 04:55:56PM +0100, David Hildenbrand wrote:

[snip]

> The patch in linux-next
> 
> commit 653f0d05be0948e7610bb786e6570bb6c48a4e75 (HEAD, refs/bisect/bad)

... is no longer there.  commit a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672
is; could you check if it fixes your reproducer?
David Hildenbrand Dec. 11, 2019, 7:18 p.m. UTC | #7
On 11.12.19 19:46, Al Viro wrote:
> On Wed, Dec 11, 2019 at 04:55:56PM +0100, David Hildenbrand wrote:
> 
> [snip]
> 
>> The patch in linux-next
>>
>> commit 653f0d05be0948e7610bb786e6570bb6c48a4e75 (HEAD, refs/bisect/bad)
> 
> ... is no longer there.  commit a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672
> is; could you check if it fixes your reproducer?
> 

desktop: ~/git/linux memory_holes $ git fetch next
desktop: ~/git/linux memory_holes $ git show
a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672
fatal: bad object a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672

I'll go hunt for that commit :) ... guess it will show up in -next soon.
David Hildenbrand Dec. 11, 2019, 7:27 p.m. UTC | #8
On 11.12.19 20:18, David Hildenbrand wrote:
> On 11.12.19 19:46, Al Viro wrote:
>> On Wed, Dec 11, 2019 at 04:55:56PM +0100, David Hildenbrand wrote:
>>
>> [snip]
>>
>>> The patch in linux-next
>>>
>>> commit 653f0d05be0948e7610bb786e6570bb6c48a4e75 (HEAD, refs/bisect/bad)
>>
>> ... is no longer there.  commit a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672
>> is; could you check if it fixes your reproducer?
>>
> 
> desktop: ~/git/linux memory_holes $ git fetch next
> desktop: ~/git/linux memory_holes $ git show
> a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672
> fatal: bad object a3d1e7eb5abe3aa1095bc75d1a6760d3809bd672
> 
> I'll go hunt for that commit :) ... guess it will show up in -next soon.
> 

Found it on vfs.git - seems to do its job in my setup (booting Fedora 31
with custom kernel in QEMU).
diff mbox series

Patch

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 10090f11ab95..830405b401ce 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -129,7 +129,16 @@  struct dentry {
 enum dentry_d_lock_class
 {
 	DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */
-	DENTRY_D_LOCK_NESTED
+	/* 
+	 * used by spin_lock_nested when two d_lock need to be taken
+	 * at the same time
+	 */
+	DENTRY_D_LOCK_NESTED,
+	/*
+	 * used by spin_lock_nested when three d_lock need to be taken
+	 * at the same time
+	 */
+	DENTRY_D_LOCK_NESTED_TWICE
 };
 
 struct dentry_operations {