
[RFC] simplifying fast_dput(), dentry_kill() et.al.

Message ID 20231030003759.GW800259@ZenIV (mailing list archive)
State New, archived
Series [RFC] simplifying fast_dput(), dentry_kill() et.al.

Commit Message

Al Viro Oct. 30, 2023, 12:37 a.m. UTC
Back in 2015 when fast_dput() got introduced, I'd been worried
about ->d_delete() being exposed to dentries with zero refcount.
To quote my reply to Linus back then,

"The only potential nastiness I can see here is that filesystem with
->d_delete() always returning 1 might be surprised by encountering
a hashed dentry with zero d_count.  I can't recall anything actually
sensitive to that, and there might very well be no such examples,
but in principle it might be a problem.  Might be a good idea to check
DCACHE_OP_DELETE before anything else..."

Looking at that again, that check was not a good idea.  Sure, ->d_delete()
instances could, in theory, check d_count (as BUG_ON(d_count(dentry) != 1)
or something equally useful) or, worse, drop and regain ->d_lock.
The latter would be rather hard to pull off safely, but it is not
impossible.  The thing is, none of the in-tree instances do anything of
that sort and I don't see any valid reasons why anyone would want to.

And getting rid of that would, AFAICS, allow for much simpler rules
around __dentry_kill() and friends - we could hold rcu_read_lock
over the places where dentry_kill() drops/regains ->d_lock and
that would allow
	* fast_dput() always decrementing refcount
	* retain_dentry() never modifying it
	* __dentry_kill() always called with refcount 0 (currently
it gets 1 from dentry_kill() and 0 in all other cases)

Does anybody see any problems with something along the lines of the
(untested) patch below?  It would need to be carved up (and accompanied
by "thou shalt not play silly buggers with ->d_lockref in your
->d_delete() instances" in D/f/porting), obviously, but I would really
like to get saner rules around refcount manipulations in there - as
it is, trying to document them gets very annoying.
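
A condensed sketch of what the dput() slow path looks like under those
rules (just an illustration of the three points above; see the diff
further down for the real thing):

	rcu_read_lock();
	if (likely(fast_dput(dentry))) {	/* refcount dropped, dentry stays */
		rcu_read_unlock();
		return;
	}
	/* here ->d_lock is held and ->d_lockref.count is 0 */
	if (likely(retain_dentry(dentry))) {	/* does not touch the refcount */
		spin_unlock(&dentry->d_lock);
		rcu_read_unlock();
		return;
	}
	/* rcu_read_lock() stays held across the lock juggling in dentry_kill() */
	dentry = dentry_kill(dentry);		/* __dentry_kill() sees refcount 0 */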

Comments?

Comments

Al Viro Oct. 30, 2023, 9:53 p.m. UTC | #1
On Mon, Oct 30, 2023 at 12:37:59AM +0000, Al Viro wrote:
> 	Back in 2015 when fast_dput() got introduced, I'd been worried
> about ->d_delete() being exposed to dentries with zero refcount.
> To quote my reply to Linus back then,
> 
> "The only potential nastiness I can see here is that filesystem with
> ->d_delete() always returning 1 might be surprised by encountering
> a hashed dentry with zero d_count.  I can't recall anything actually
> sensitive to that, and there might very well be no such examples,
> but in principle it might be a problem.  Might be a good idea to check
> DCACHE_OP_DELETE before anything else..."
> 
> Looking at that again, that check was not a good idea.  Sure, ->d_delete()
> instances could, in theory, check d_count (as BUG_ON(d_count(dentry) != 1)
> or something equally useful) or, worse, drop and regain ->d_lock.
> The latter would be rather hard to pull off safely, but it is not
> impossible.  The thing is, none of the in-tree instances do anything of
> that sort and I don't see any valid reasons why anyone would want to.
> 
> And getting rid of that would, AFAICS, allow for much simpler rules
> around __dentry_kill() and friends - we could hold rcu_read_lock
> over the places where dentry_kill() drops/regains ->d_lock and
> that would allow
> 	* fast_dput() always decrementing refcount
> 	* retain_dentry() never modifying it
> 	* __dentry_kill() always called with refcount 0 (currently
> it gets 1 from dentry_kill() and 0 in all other cases)
> 
> Does anybody see any problems with something along the lines of the
> (untested) patch below?  It would need to be carved up (and accompanied
> by "thou shalt not play silly buggers with ->d_lockref in your
> ->d_delete() instances" in D/f/porting), obviously, but I would really
> like to get saner rules around refcount manipulations in there - as
> it is, trying to document them gets very annoying.
> 
> Comments?

After fixing a couple of brainos, it seems to work.  See below:

diff --git a/fs/dcache.c b/fs/dcache.c
index 9f471fdb768b..5e975a013508 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -680,7 +680,6 @@ static inline bool retain_dentry(struct dentry *dentry)
 		return false;
 
 	/* retain; LRU fodder */
-	dentry->d_lockref.count--;
 	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
 		d_lru_add(dentry);
 	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
@@ -709,7 +708,7 @@ EXPORT_SYMBOL(d_mark_dontcache);
  * Returns dentry requiring refcount drop, or NULL if we're done.
  */
 static struct dentry *dentry_kill(struct dentry *dentry)
-	__releases(dentry->d_lock)
+	__releases(dentry->d_lock) __releases(rcu)
 {
 	struct inode *inode = dentry->d_inode;
 	struct dentry *parent = NULL;
@@ -730,6 +729,7 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 			goto slow_positive;
 		}
 	}
+	rcu_read_unlock();
 	__dentry_kill(dentry);
 	return parent;
 
@@ -739,9 +739,8 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 	spin_lock(&dentry->d_lock);
 	parent = lock_parent(dentry);
 got_locks:
-	if (unlikely(dentry->d_lockref.count != 1)) {
-		dentry->d_lockref.count--;
-	} else if (likely(!retain_dentry(dentry))) {
+	rcu_read_unlock();
+	if (likely(dentry->d_lockref.count == 0 && !retain_dentry(dentry))) {
 		__dentry_kill(dentry);
 		return parent;
 	}
@@ -768,15 +767,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	unsigned int d_flags;
 
 	/*
-	 * If we have a d_op->d_delete() operation, we sould not
-	 * let the dentry count go to zero, so use "put_or_lock".
-	 */
-	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
-		return lockref_put_or_lock(&dentry->d_lockref);
-
-	/*
-	 * .. otherwise, we can try to just decrement the
-	 * lockref optimistically.
+	 * try to decrement the lockref optimistically.
 	 */
 	ret = lockref_put_return(&dentry->d_lockref);
 
@@ -787,8 +778,12 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	if (unlikely(ret < 0)) {
 		spin_lock(&dentry->d_lock);
-		if (dentry->d_lockref.count > 1) {
-			dentry->d_lockref.count--;
+		if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
+			spin_unlock(&dentry->d_lock);
+			return true;
+		}
+		dentry->d_lockref.count--;
+		if (dentry->d_lockref.count) {
 			spin_unlock(&dentry->d_lock);
 			return true;
 		}
@@ -830,7 +825,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	smp_rmb();
 	d_flags = READ_ONCE(dentry->d_flags);
-	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
+	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE |
 			DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
 
 	/* Nothing to do? Dropping the reference was all we needed? */
@@ -854,13 +849,6 @@ static inline bool fast_dput(struct dentry *dentry)
 		spin_unlock(&dentry->d_lock);
 		return true;
 	}
-
-	/*
-	 * Re-get the reference we optimistically dropped. We hold the
-	 * lock, and we just tested that it was zero, so we can just
-	 * set it to 1.
-	 */
-	dentry->d_lockref.count = 1;
 	return false;
 }
 
@@ -903,10 +891,9 @@ void dput(struct dentry *dentry)
 		}
 
 		/* Slow case: now with the dentry lock held */
-		rcu_read_unlock();
-
 		if (likely(retain_dentry(dentry))) {
 			spin_unlock(&dentry->d_lock);
+			rcu_read_unlock();
 			return;
 		}
 
@@ -918,14 +905,10 @@ EXPORT_SYMBOL(dput);
 static void __dput_to_list(struct dentry *dentry, struct list_head *list)
 __must_hold(&dentry->d_lock)
 {
-	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
-		/* let the owner of the list it's on deal with it */
-		--dentry->d_lockref.count;
-	} else {
+	if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
 		if (dentry->d_flags & DCACHE_LRU_LIST)
 			d_lru_del(dentry);
-		if (!--dentry->d_lockref.count)
-			d_shrink_add(dentry, list);
+		d_shrink_add(dentry, list);
 	}
 }
 
@@ -1191,7 +1174,7 @@ void shrink_dentry_list(struct list_head *list)
 		rcu_read_unlock();
 		d_shrink_del(dentry);
 		parent = dentry->d_parent;
-		if (parent != dentry)
+		if (parent != dentry && !--parent->d_lockref.count)
 			__dput_to_list(parent, list);
 		__dentry_kill(dentry);
 	}
@@ -1638,7 +1621,8 @@ void shrink_dcache_parent(struct dentry *parent)
 			} else {
 				rcu_read_unlock();
 				parent = data.victim->d_parent;
-				if (parent != data.victim)
+				if (parent != data.victim &&
+				    !--parent->d_lockref.count)
 					__dput_to_list(parent, &data.dispose);
 				__dentry_kill(data.victim);
 			}
Linus Torvalds Oct. 30, 2023, 10:18 p.m. UTC | #2
On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> After fixing a couple of brainos, it seems to work.

This all makes me unnaturally nervous, probably because it's overly
subtle, and I have lost the context for some of the rules.

I like the patch, because honestly, our current logic for fast_dput()
is nasty, and I agree with you that the existence of d_op->d_delete()
shouldn't change the locking logic.

At the same time, I just worry. That whole lockref_put_return() thing
has horrific semantics, and this is the only case that uses it, and I
wish we didn't need it.

[ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/
too, and that looks completely bogus, since it doesn't check the
return value! ]

At the same time, that whole fast_dput() is one of the more critical
places, and we definitely don't want to take the lock just because the
ref goes down to zero (and we still leave it around).

End result: I *think* that patch is an improvement, but this code just
makes me unreasonably nervous.

               Linus
Al Viro Oct. 31, 2023, 12:18 a.m. UTC | #3
On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote:
> On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > After fixing a couple of brainos, it seems to work.
> 
> This all makes me unnaturally nervous, probably because it;s overly
> subtle, and I have lost the context for some of the rules.

A bit of context: I started to look at the possibility of refcount overflows.
Writing the current rules for dentry refcounting and lifetime down was the
obvious first step, and that immediately turned into an awful mess.

It is overly subtle.  Even more so when you throw the shrink lists into
the mix - shrink_lock_dentry() got too smart for its own good, and that
leads to really awful correctness proofs.  The next thing in the series
is getting rid of the "it had been moved around, so somebody had clearly
been taking/dropping references and we can just evict it from the
shrink list and be done with that" crap - the things get much simpler
if the rules become
	* call it under rcu_read_lock, with dentry locked
	* if returned true
		dentry, parent, inode locked, refcount is zero.
	* if returned false
		dentry locked, refcount is non-zero.
It used to be that way, but removal of trylock loops had turned that
into something much more subtle.  Restoring the old semantics without
trylocks on the slow path is doable and it makes analysis much simpler.

BTW, just how aggressive do we want to be with d_lru_del()?

We obviously do not do that on non-final dput, even if we have
a dentry with positive refcount in LRU list.  But when we hit e.g.
shrink_dcache_parent(), all dentries in the subtree get d_lru_del(),
whether they are busy or not.  I'm not sure it's a good idea...

Sure, we want __dentry_kill() to remove the victim from LRU and
we want the same done to anything moved to a shrink list.
Having LRU scanners ({prune,shrink}_dcache_sb()) do that to
dentries with positive refcount also makes sense.  Do we really need
the other cases?
Al Viro Oct. 31, 2023, 1:53 a.m. UTC | #4
On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote:
> On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote:
> > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > After fixing a couple of brainos, it seems to work.
> > 
> > This all makes me unnaturally nervous, probably because it;s overly
> > subtle, and I have lost the context for some of the rules.
> 
> A bit of context: I started to look at the possibility of refcount overflows.
> Writing the current rules for dentry refcounting and lifetime down was the
> obvious first step, and that immediately turned into an awful mess.
> 
> It is overly subtle.  Even more so when you throw the shrink lists into
> the mix - shrink_lock_dentry() got too smart for its own good, and that
> leads to really awful correctness proofs.  The next thing in the series
> is getting rid of the "it had been moved around, so somebody had clearly
> been taking/dropping references and we can just evict it from the
> shrink list and be done with that" crap - the things get much simpler
> if the rules become
> 	* call it under rcu_read_lock, with dentry locked
> 	* if returned true
> 		dentry, parent, inode locked, refcount is zero.
> 	* if returned false
> 		dentry locked, refcount is non-zero.
> It used to be that way, but removal of trylock loops had turned that
> into something much more subtle.  Restoring the old semantics without
> trylocks on the slow path is doable and it makes analysis much simpler.

It's also a perfect match to what we want in dentry_kill(), actually.
And looking into that has caught another place too subtle for its own
good:
        if (!IS_ROOT(dentry)) {
                parent = dentry->d_parent;
                if (unlikely(!spin_trylock(&parent->d_lock))) {
                        parent = __lock_parent(dentry);
                        if (likely(inode || !dentry->d_inode))
                                goto got_locks;
                        /* negative that became positive */
                        if (parent)
                                spin_unlock(&parent->d_lock);
                        inode = dentry->d_inode;
                        goto slow_positive;
                }
        }
        __dentry_kill(dentry);
        return parent;

slow_positive:
        spin_unlock(&dentry->d_lock);
        spin_lock(&inode->i_lock);
        spin_lock(&dentry->d_lock);
        parent = lock_parent(dentry);
got_locks:

That code (in dentry_kill()) relies upon the assumption that
positive dentry couldn't have become negative under us while
__lock_parent() had it unlocked.  Which is only true because
we have a positive refcount here.

IOW, the patch is broken as posted upthread.  It's really
not hard to fix, fortunately, and what we end up with in dentry_kill()
looks a lot better that way -

static struct dentry *dentry_kill(struct dentry *dentry)
        __releases(dentry->d_lock) __releases(rcu)
{
        struct dentry *parent = NULL;
	if (likely(shrink_lock_dentry(dentry))) {
		if (!IS_ROOT(dentry))
			parent = dentry->d_parent;
		rcu_read_unlock();
		__dentry_kill(dentry);
	} else {
		rcu_read_unlock();
		spin_unlock(&dentry->d_lock);
	}
	return parent;
}

Carving that series up will be interesting, though...
Gao Xiang Oct. 31, 2023, 2:25 a.m. UTC | #5
Hi Linus,

On 2023/10/31 06:18, Linus Torvalds wrote:
> On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
>>
>> After fixing a couple of brainos, it seems to work.
> 
> This all makes me unnaturally nervous, probably because it;s overly
> subtle, and I have lost the context for some of the rules.
> 
> I like the patch, because honestly, our current logic for dput_fast()
> is nasty, andI agree with you that the existence of d_op->d_delete()
> shouldn't change the locking logic.
> 
> At the same time, I just worry. That whole lockref_put_return() thing
> has horrific semantics, and this is the only case that uses it, and I
> wish we didn't need it.
> 
> [ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/
> too, and that looks completely bogus, since it doesn't check the
> return value! ]

  74 struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
  75                                                struct erofs_workgroup *grp)
  76 {
  77         struct erofs_sb_info *const sbi = EROFS_SB(sb);
  78         struct erofs_workgroup *pre;
  79
  80         /*
  81          * Bump up before making this visible to others for the XArray in order
  82          * to avoid potential UAF without serialized by xa_lock.
  83          */
  84         lockref_get(&grp->lockref);
  85
  86 repeat:
  87         xa_lock(&sbi->managed_pslots);
  88         pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
  89                            NULL, grp, GFP_NOFS);
  90         if (pre) {
  91                 if (xa_is_err(pre)) {
  92                         pre = ERR_PTR(xa_err(pre));
  93                 } else if (!erofs_workgroup_get(pre)) {
  94                         /* try to legitimize the current in-tree one */
  95                         xa_unlock(&sbi->managed_pslots);
  96                         cond_resched();
  97                         goto repeat;
  98                 }
  99                 lockref_put_return(&grp->lockref);

This line just decreases the reference count bumped up at line 84
(and it will always succeed).

Since a previous one was found at line 88, the old one will be used
(and returned) instead of the new one, and the newly allocated one
will be freed in the caller.

Hopefully it explains the use case here.

100                 grp = pre;
101         }
102         xa_unlock(&sbi->managed_pslots);
103         return grp;
104 }

Thanks,
Gao Xiang
Gao Xiang Oct. 31, 2023, 2:29 a.m. UTC | #6
On 2023/10/31 10:25, Gao Xiang wrote:
> Hi Linus,
> 
> On 2023/10/31 06:18, Linus Torvalds wrote:
>> On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
>>>
>>> After fixing a couple of brainos, it seems to work.
>>
>> This all makes me unnaturally nervous, probably because it;s overly
>> subtle, and I have lost the context for some of the rules.
>>
>> I like the patch, because honestly, our current logic for dput_fast()
>> is nasty, andI agree with you that the existence of d_op->d_delete()
>> shouldn't change the locking logic.
>>
>> At the same time, I just worry. That whole lockref_put_return() thing
>> has horrific semantics, and this is the only case that uses it, and I
>> wish we didn't need it.
>>
>> [ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/
>> too, and that looks completely bogus, since it doesn't check the
>> return value! ]
> 
>   74 struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
>   75                                                struct erofs_workgroup *grp)
>   76 {
>   77         struct erofs_sb_info *const sbi = EROFS_SB(sb);
>   78         struct erofs_workgroup *pre;
>   79
>   80         /*
>   81          * Bump up before making this visible to others for the XArray in order
>   82          * to avoid potential UAF without serialized by xa_lock.
>   83          */
>   84         lockref_get(&grp->lockref);
>   85
>   86 repeat:
>   87         xa_lock(&sbi->managed_pslots);
>   88         pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
>   89                            NULL, grp, GFP_NOFS);
>   90         if (pre) {
>   91                 if (xa_is_err(pre)) {
>   92                         pre = ERR_PTR(xa_err(pre));
>   93                 } else if (!erofs_workgroup_get(pre)) {
>   94                         /* try to legitimize the current in-tree one */
>   95                         xa_unlock(&sbi->managed_pslots);
>   96                         cond_resched();
>   97                         goto repeat;
>   98                 }
>   99                 lockref_put_return(&grp->lockref);
> 
> This line it just decreases the reference count just bumpped up at the
> line 84 (and it will always succeed).

To add a bit more: since it's a newly allocated one that hasn't been
populated yet, it won't be locked by anyone else.

> 
> Since it finds a previous one at line 88, so the old one will be used
> (and be returned) instead of the new one and the new allocated one
> will be freed in the caller.
> 
> Hopefully it explains the use case here.
> 
> 100                 grp = pre;
> 101         }
> 102         xa_unlock(&sbi->managed_pslots);
> 103         return grp;
> 104 }
> 
> Thanks,
> Gao Xiang
>
Linus Torvalds Oct. 31, 2023, 3:02 a.m. UTC | #7
On Mon, 30 Oct 2023 at 16:25, Gao Xiang <hsiangkao@linux.alibaba.com> wrote:
>
> >
> > [ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/
> > too, and that looks completely bogus, since it doesn't check the
> > return value! ]
>
>   74 struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
>   75                                                struct erofs_workgroup *grp)
>   76 {
>   77         struct erofs_sb_info *const sbi = EROFS_SB(sb);
>   78         struct erofs_workgroup *pre;
>   79
>   80         /*
>   81          * Bump up before making this visible to others for the XArray in order
>   82          * to avoid potential UAF without serialized by xa_lock.
>   83          */
>   84         lockref_get(&grp->lockref);
>   85
>   86 repeat:
>   87         xa_lock(&sbi->managed_pslots);
>   88         pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
>   89                            NULL, grp, GFP_NOFS);
>   90         if (pre) {
>   91                 if (xa_is_err(pre)) {
>   92                         pre = ERR_PTR(xa_err(pre));
>   93                 } else if (!erofs_workgroup_get(pre)) {
>   94                         /* try to legitimize the current in-tree one */
>   95                         xa_unlock(&sbi->managed_pslots);
>   96                         cond_resched();
>   97                         goto repeat;
>   98                 }
>   99                 lockref_put_return(&grp->lockref);
>
> This line it just decreases the reference count just bumpped up at the
> line 84 (and it will always succeed).

You have two possible scenarios:

 - it doesn't always succeed, because somebody else has the lock on
the grp->lockref right now, or because lockref doesn't do any
optimized cases at all

 - nobody else can access grp->lockref at the same time, so the lock
is pointless, so you shouldn't be using lockref in the first place,
and certainly not lockref_put_return

IOW, I don't see how lockref_put_return() could possibly *ever* be the
right thing to do.

The thing is, lockref_put_return() is fundamentally designed to be
something that can fail.

In  fact, in some situations it will *always* fail. Check this out:

#define USE_CMPXCHG_LOCKREF \
        (IS_ENABLED(CONFIG_ARCH_USE_CMPXCHG_LOCKREF) && \
         IS_ENABLED(CONFIG_SMP) && SPINLOCK_SIZE <= 4)
...
#if USE_CMPXCHG_LOCKREF
...
#else

#define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)

#endif
...
int lockref_put_return(struct lockref *lockref)
{
        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 0)
                        return -1;
        ,
                return new.count;
        );
        return -1;
}

look, if USE_CMPXCHG_LOCKREF is false (on UP, or if spinlock are big
because of spinlock debugging, or whatever), lockref_put_return() will
*always* fail, expecting the caller to deal with that failure.

So doing a lockref_put_return() without dealing with the failure case
is FUNDAMENTALLY BROKEN.

Yes, it's an odd function. It's a function that is literally designed
for that dcache use-case, where we have a fast-path and a slow path,
and the "lockref_put_return() fails" is the slow-path that needs to
take the spinlock and do it carefully.

You *cannot* use that function without failure handling. Really.
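
For reference, the minimal failure-tolerant shape is a fast path with an
explicit slow path under the spinlock - roughly what lockref_put_or_lock()
gives you, open-coded (a sketch with a made-up "obj", not erofs-specific):

	int ret = lockref_put_return(&obj->lockref);

	if (unlikely(ret < 0)) {
		/* fast path refused: no cmpxchg, contended, or count already <= 0 */
		spin_lock(&obj->lockref.lock);
		if (obj->lockref.count > 1) {
			obj->lockref.count--;
			spin_unlock(&obj->lockref.lock);
			return;
		}
		/* last reference - slow-path teardown under the lock goes here */
		...
	}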

                     Linus
Gao Xiang Oct. 31, 2023, 3:13 a.m. UTC | #8
On 2023/10/31 11:02, Linus Torvalds wrote:
> On Mon, 30 Oct 2023 at 16:25, Gao Xiang <hsiangkao@linux.alibaba.com> wrote:
>>
>>>
>>> [ Looks around. Oh. Except we have lockref_put_return() in fs/erofs/
>>> too, and that looks completely bogus, since it doesn't check the
>>> return value! ]
>>
>>    74 struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
>>    75                                                struct erofs_workgroup *grp)
>>    76 {
>>    77         struct erofs_sb_info *const sbi = EROFS_SB(sb);
>>    78         struct erofs_workgroup *pre;
>>    79
>>    80         /*
>>    81          * Bump up before making this visible to others for the XArray in order
>>    82          * to avoid potential UAF without serialized by xa_lock.
>>    83          */
>>    84         lockref_get(&grp->lockref);
>>    85
>>    86 repeat:
>>    87         xa_lock(&sbi->managed_pslots);
>>    88         pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
>>    89                            NULL, grp, GFP_NOFS);
>>    90         if (pre) {
>>    91                 if (xa_is_err(pre)) {
>>    92                         pre = ERR_PTR(xa_err(pre));
>>    93                 } else if (!erofs_workgroup_get(pre)) {
>>    94                         /* try to legitimize the current in-tree one */
>>    95                         xa_unlock(&sbi->managed_pslots);
>>    96                         cond_resched();
>>    97                         goto repeat;
>>    98                 }
>>    99                 lockref_put_return(&grp->lockref);
>>
>> This line it just decreases the reference count just bumpped up at the
>> line 84 (and it will always succeed).
> 
> You have two possible scenarios:
> 
>   - it doesn't always succeed, because somebody else has the lock on
> the grp->lockref right now, or because lockref doesn't do any
> optimized cases at all
> 
>   - nobody else can access grp->lockref at the same time, so the lock
> is pointless, so you shouldn't be using lockref in the first place,
> and certainly not lockref_put_return

Yeah, the second case is the real use case here.

> 
> IOW, I don't see how lockref_put_return() could possibly *ever* be the
> right thing to do.
> 
> The thing is, lockref_put_return() is fundamentally designed to be
> something that can fail.
> 
> In  fact, in some situations it will *always* fail. Check this out:
> 
> #define USE_CMPXCHG_LOCKREF \
>          (IS_ENABLED(CONFIG_ARCH_USE_CMPXCHG_LOCKREF) && \
>           IS_ENABLED(CONFIG_SMP) && SPINLOCK_SIZE <= 4)
> ...
> #if USE_CMPXCHG_LOCKREF
> ...
> #else
> 
> #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)
> 
> #endif
> ...
> int lockref_put_return(struct lockref *lockref)
> {
>          CMPXCHG_LOOP(
>                  new.count--;
>                  if (old.count <= 0)
>                          return -1;
>          ,
>                  return new.count;
>          );
>          return -1;
> }
> 
> look, if USE_CMPXCHG_LOCKREF is false (on UP, or if spinlock are big
> because of spinlock debugging, or whatever), lockref_put_return() will
> *always* fail, expecting the caller to deal with that failure.
> 
> So doing a lockref_put_return() without dealing with the failure case
> is FUNDAMENTALLY BROKEN.

Yeah, thanks for pointing that out, I get it.  I think this really needs
to be cleaned up since we don't actually care about locking here (as I
said, the new group never actually got populated into the XArray).
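
E.g. (just a sketch; the actual cleanup may well end up looking different)
dropping the speculative reference under the lockref's own spinlock would
always succeed here:

	/* undo the lockref_get() from line 84; no failure case to handle */
	spin_lock(&grp->lockref.lock);
	grp->lockref.count--;
	spin_unlock(&grp->lockref.lock);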

> 
> Yes, it's an odd function. It's a function that is literally designed
> for that dcache use-case, where we have a fast-path and a slow path,
> and the "lockref_put_return() fails" is the slow-path that needs to
> take the spinlock and do it carefully.
> 
> You *cannot* use that function without failure handling. Really.

I will fix+cleanup this path later and send upstream.  Thanks for the
heads up.

Thanks,
Gao Xiang

> 
>                       Linus
Al Viro Oct. 31, 2023, 3:26 a.m. UTC | #9
On Mon, Oct 30, 2023 at 05:02:32PM -1000, Linus Torvalds wrote:

> look, if USE_CMPXCHG_LOCKREF is false (on UP, or if spinlock are big
> because of spinlock debugging, or whatever), lockref_put_return() will
> *always* fail, expecting the caller to deal with that failure.
> 
> So doing a lockref_put_return() without dealing with the failure case
> is FUNDAMENTALLY BROKEN.
> 
> Yes, it's an odd function. It's a function that is literally designed
> for that dcache use-case, where we have a fast-path and a slow path,
> and the "lockref_put_return() fails" is the slow-path that needs to
> take the spinlock and do it carefully.
> 
> You *cannot* use that function without failure handling. Really.

Put another way, it's a fastpath-only thing.  Not sure how much use
it would be to slap __must_check on it; a warning along the lines
of "DON'T USE UNLESS YOU HAVE READ <archive link>" in lockref.h
might be useful.

BTW, is there any reason for -128 for marking them dead?  Looks like
-1 wouldn't be any worse...
Linus Torvalds Oct. 31, 2023, 3:41 a.m. UTC | #10
On Mon, 30 Oct 2023 at 17:26, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> BTW, is there any reason for -128 for marking them dead?  Looks like
> -1 wouldn't be any worse...

It's *much* too long ago, but I have this dim memory of simply wanting
to make sure that "dead" was clearly separate from "underflow".

                  Linus
Al Viro Oct. 31, 2023, 6:12 a.m. UTC | #11
On Tue, Oct 31, 2023 at 01:53:51AM +0000, Al Viro wrote:

> Carving that series up will be interesting, though...

I think I have a sane carve-up; will post if it survives testing.

Cumulative diff follows:

diff --git a/fs/dcache.c b/fs/dcache.c
index 9f471fdb768b..213026d5b033 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -625,8 +625,6 @@ static void __dentry_kill(struct dentry *dentry)
 static struct dentry *__lock_parent(struct dentry *dentry)
 {
 	struct dentry *parent;
-	rcu_read_lock();
-	spin_unlock(&dentry->d_lock);
 again:
 	parent = READ_ONCE(dentry->d_parent);
 	spin_lock(&parent->d_lock);
@@ -642,7 +640,6 @@ static struct dentry *__lock_parent(struct dentry *dentry)
 		spin_unlock(&parent->d_lock);
 		goto again;
 	}
-	rcu_read_unlock();
 	if (parent != dentry)
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 	else
@@ -657,7 +654,64 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
 		return NULL;
 	if (likely(spin_trylock(&parent->d_lock)))
 		return parent;
-	return __lock_parent(dentry);
+	rcu_read_lock();
+	spin_unlock(&dentry->d_lock);
+	parent = __lock_parent(dentry);
+	rcu_read_unlock();
+	return parent;
+}
+
+/*
+ * Lock a dentry for feeding it to __dentry_kill().
+ * Called under rcu_read_lock() and dentry->d_lock; the former
+ * guarantees that nothing we access will be freed under us.
+ * Note that dentry is *not* protected from concurrent dentry_kill(),
+ * d_delete(), etc.
+ *
+ * Return false if dentry is busy.  Otherwise, return true and have
+ * that dentry's inode and parent both locked.
+ */
+
+static bool lock_for_kill(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct dentry *parent = dentry->d_parent;
+
+	if (unlikely(dentry->d_lockref.count))
+		return false;
+
+	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+		goto slow;
+	if (dentry == parent)
+		return true;
+	if (likely(spin_trylock(&parent->d_lock)))
+		return true;
+
+	if (inode)
+		spin_unlock(&inode->i_lock);
+slow:
+	spin_unlock(&dentry->d_lock);
+
+	for (;;) {
+		if (inode)
+			spin_lock(&inode->i_lock);
+		parent = __lock_parent(dentry);
+		if (likely(inode == dentry->d_inode))
+			break;
+		if (inode)
+			spin_unlock(&inode->i_lock);
+		inode = dentry->d_inode;
+		spin_unlock(&dentry->d_lock);
+		if (parent)
+			spin_unlock(&parent->d_lock);
+	}
+	if (likely(!dentry->d_lockref.count))
+		return true;
+	if (inode)
+		spin_unlock(&inode->i_lock);
+	if (parent)
+		spin_unlock(&parent->d_lock);
+	return false;
 }
 
 static inline bool retain_dentry(struct dentry *dentry)
@@ -680,7 +734,6 @@ static inline bool retain_dentry(struct dentry *dentry)
 		return false;
 
 	/* retain; LRU fodder */
-	dentry->d_lockref.count--;
 	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
 		d_lru_add(dentry);
 	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
@@ -703,61 +756,12 @@ void d_mark_dontcache(struct inode *inode)
 }
 EXPORT_SYMBOL(d_mark_dontcache);
 
-/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static struct dentry *dentry_kill(struct dentry *dentry)
-	__releases(dentry->d_lock)
-{
-	struct inode *inode = dentry->d_inode;
-	struct dentry *parent = NULL;
-
-	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
-		goto slow_positive;
-
-	if (!IS_ROOT(dentry)) {
-		parent = dentry->d_parent;
-		if (unlikely(!spin_trylock(&parent->d_lock))) {
-			parent = __lock_parent(dentry);
-			if (likely(inode || !dentry->d_inode))
-				goto got_locks;
-			/* negative that became positive */
-			if (parent)
-				spin_unlock(&parent->d_lock);
-			inode = dentry->d_inode;
-			goto slow_positive;
-		}
-	}
-	__dentry_kill(dentry);
-	return parent;
-
-slow_positive:
-	spin_unlock(&dentry->d_lock);
-	spin_lock(&inode->i_lock);
-	spin_lock(&dentry->d_lock);
-	parent = lock_parent(dentry);
-got_locks:
-	if (unlikely(dentry->d_lockref.count != 1)) {
-		dentry->d_lockref.count--;
-	} else if (likely(!retain_dentry(dentry))) {
-		__dentry_kill(dentry);
-		return parent;
-	}
-	/* we are keeping it, after all */
-	if (inode)
-		spin_unlock(&inode->i_lock);
-	if (parent)
-		spin_unlock(&parent->d_lock);
-	spin_unlock(&dentry->d_lock);
-	return NULL;
-}
-
 /*
  * Try to do a lockless dput(), and return whether that was successful.
  *
  * If unsuccessful, we return false, having already taken the dentry lock.
+ * In that case refcount is guaranteed to be zero and we have already
+ * decided that it's not worth keeping around.
  *
  * The caller needs to hold the RCU read lock, so that the dentry is
  * guaranteed to stay around even if the refcount goes down to zero!
@@ -768,15 +772,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	unsigned int d_flags;
 
 	/*
-	 * If we have a d_op->d_delete() operation, we sould not
-	 * let the dentry count go to zero, so use "put_or_lock".
-	 */
-	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
-		return lockref_put_or_lock(&dentry->d_lockref);
-
-	/*
-	 * .. otherwise, we can try to just decrement the
-	 * lockref optimistically.
+	 * try to decrement the lockref optimistically.
 	 */
 	ret = lockref_put_return(&dentry->d_lockref);
 
@@ -787,12 +783,12 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	if (unlikely(ret < 0)) {
 		spin_lock(&dentry->d_lock);
-		if (dentry->d_lockref.count > 1) {
-			dentry->d_lockref.count--;
+		if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
 			spin_unlock(&dentry->d_lock);
 			return true;
 		}
-		return false;
+		dentry->d_lockref.count--;
+		goto locked;
 	}
 
 	/*
@@ -830,7 +826,7 @@ static inline bool fast_dput(struct dentry *dentry)
 	 */
 	smp_rmb();
 	d_flags = READ_ONCE(dentry->d_flags);
-	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
+	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE |
 			DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
 
 	/* Nothing to do? Dropping the reference was all we needed? */
@@ -850,17 +846,11 @@ static inline bool fast_dput(struct dentry *dentry)
 	 * else could have killed it and marked it dead. Either way, we
 	 * don't need to do anything else.
 	 */
-	if (dentry->d_lockref.count) {
+locked:
+	if (dentry->d_lockref.count || retain_dentry(dentry)) {
 		spin_unlock(&dentry->d_lock);
 		return true;
 	}
-
-	/*
-	 * Re-get the reference we optimistically dropped. We hold the
-	 * lock, and we just tested that it was zero, so we can just
-	 * set it to 1.
-	 */
-	dentry->d_lockref.count = 1;
 	return false;
 }
 
@@ -903,29 +893,29 @@ void dput(struct dentry *dentry)
 		}
 
 		/* Slow case: now with the dentry lock held */
-		rcu_read_unlock();
-
-		if (likely(retain_dentry(dentry))) {
+		if (likely(lock_for_kill(dentry))) {
+			struct dentry *parent = dentry->d_parent;
+			rcu_read_unlock();
+			__dentry_kill(dentry);
+			if (dentry == parent)
+				return;
+			dentry = parent;
+		} else {
+			rcu_read_unlock();
 			spin_unlock(&dentry->d_lock);
 			return;
 		}
-
-		dentry = dentry_kill(dentry);
 	}
 }
 EXPORT_SYMBOL(dput);
 
-static void __dput_to_list(struct dentry *dentry, struct list_head *list)
+static void to_shrink_list(struct dentry *dentry, struct list_head *list)
 __must_hold(&dentry->d_lock)
 {
-	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
-		/* let the owner of the list it's on deal with it */
-		--dentry->d_lockref.count;
-	} else {
+	if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
 		if (dentry->d_flags & DCACHE_LRU_LIST)
 			d_lru_del(dentry);
-		if (!--dentry->d_lockref.count)
-			d_shrink_add(dentry, list);
+		d_shrink_add(dentry, list);
 	}
 }
 
@@ -937,8 +927,7 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
 		return;
 	}
 	rcu_read_unlock();
-	if (!retain_dentry(dentry))
-		__dput_to_list(dentry, list);
+	to_shrink_list(dentry, list);
 	spin_unlock(&dentry->d_lock);
 }
 
@@ -1117,58 +1106,6 @@ void d_prune_aliases(struct inode *inode)
 }
 EXPORT_SYMBOL(d_prune_aliases);
 
-/*
- * Lock a dentry from shrink list.
- * Called under rcu_read_lock() and dentry->d_lock; the former
- * guarantees that nothing we access will be freed under us.
- * Note that dentry is *not* protected from concurrent dentry_kill(),
- * d_delete(), etc.
- *
- * Return false if dentry has been disrupted or grabbed, leaving
- * the caller to kick it off-list.  Otherwise, return true and have
- * that dentry's inode and parent both locked.
- */
-static bool shrink_lock_dentry(struct dentry *dentry)
-{
-	struct inode *inode;
-	struct dentry *parent;
-
-	if (dentry->d_lockref.count)
-		return false;
-
-	inode = dentry->d_inode;
-	if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
-		spin_unlock(&dentry->d_lock);
-		spin_lock(&inode->i_lock);
-		spin_lock(&dentry->d_lock);
-		if (unlikely(dentry->d_lockref.count))
-			goto out;
-		/* changed inode means that somebody had grabbed it */
-		if (unlikely(inode != dentry->d_inode))
-			goto out;
-	}
-
-	parent = dentry->d_parent;
-	if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock)))
-		return true;
-
-	spin_unlock(&dentry->d_lock);
-	spin_lock(&parent->d_lock);
-	if (unlikely(parent != dentry->d_parent)) {
-		spin_unlock(&parent->d_lock);
-		spin_lock(&dentry->d_lock);
-		goto out;
-	}
-	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-	if (likely(!dentry->d_lockref.count))
-		return true;
-	spin_unlock(&parent->d_lock);
-out:
-	if (inode)
-		spin_unlock(&inode->i_lock);
-	return false;
-}
-
 void shrink_dentry_list(struct list_head *list)
 {
 	while (!list_empty(list)) {
@@ -1177,12 +1114,11 @@ void shrink_dentry_list(struct list_head *list)
 		dentry = list_entry(list->prev, struct dentry, d_lru);
 		spin_lock(&dentry->d_lock);
 		rcu_read_lock();
-		if (!shrink_lock_dentry(dentry)) {
+		if (!lock_for_kill(dentry)) {
 			bool can_free = false;
 			rcu_read_unlock();
 			d_shrink_del(dentry);
-			if (dentry->d_lockref.count < 0)
-				can_free = dentry->d_flags & DCACHE_MAY_FREE;
+			can_free = dentry->d_flags & DCACHE_MAY_FREE;
 			spin_unlock(&dentry->d_lock);
 			if (can_free)
 				dentry_free(dentry);
@@ -1191,8 +1127,8 @@ void shrink_dentry_list(struct list_head *list)
 		rcu_read_unlock();
 		d_shrink_del(dentry);
 		parent = dentry->d_parent;
-		if (parent != dentry)
-			__dput_to_list(parent, list);
+		if (parent != dentry && !--parent->d_lockref.count)
+			to_shrink_list(parent, list);
 		__dentry_kill(dentry);
 	}
 }
@@ -1632,14 +1568,15 @@ void shrink_dcache_parent(struct dentry *parent)
 		if (data.victim) {
 			struct dentry *parent;
 			spin_lock(&data.victim->d_lock);
-			if (!shrink_lock_dentry(data.victim)) {
+			if (!lock_for_kill(data.victim)) {
 				spin_unlock(&data.victim->d_lock);
 				rcu_read_unlock();
 			} else {
 				rcu_read_unlock();
 				parent = data.victim->d_parent;
-				if (parent != data.victim)
-					__dput_to_list(parent, &data.dispose);
+				if (parent != data.victim &&
+				    !--parent->d_lockref.count)
+					to_shrink_list(parent, &data.dispose);
 				__dentry_kill(data.victim);
 			}
 		}
Al Viro Nov. 1, 2023, 2:22 a.m. UTC | #12
[NFS folks Cc'd]

On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote:
> On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote:
> > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > After fixing a couple of brainos, it seems to work.
> > 
> > This all makes me unnaturally nervous, probably because it;s overly
> > subtle, and I have lost the context for some of the rules.
> 
> A bit of context: I started to look at the possibility of refcount overflows.
> Writing the current rules for dentry refcounting and lifetime down was the
> obvious first step, and that immediately turned into an awful mess.
> 
> It is overly subtle.  Even more so when you throw the shrink lists into
> the mix - shrink_lock_dentry() got too smart for its own good, and that
> leads to really awful correctness proofs.

... and for another example of subtle shit, consider DCACHE_NORCU.  Recall
c0eb027e5aef "vfs: don't do RCU lookup of empty pathnames" and note that
it relies upon never getting results of alloc_file_pseudo() with directory
inode anywhere near descriptor tables.

Back then I basically went "fine, nobody would ever use alloc_file_pseudo()
for that anyway", but... there's a call in __nfs42_ssc_open() that doesn't
have any obvious protection against ending up with directory inode.
That does not end up anywhere near descriptor tables, as far as I can tell,
fortunately.

Unfortunately, it is quite capable of fucking the things up in different
ways, even if it's promptly closed.  d_instantiate() on directory inode
is a really bad thing; a plenty of places expect to have only one alias
for those, and would be very unhappy with that kind of crap without any
RCU considerations.

I'm pretty sure that this NFS code really does not want to use that for
directories; the simplest solution would be to refuse alloc_file_pseudo()
for directory inodes.  NFS folks - do you have a problem with the
following patch?

======
Make sure we never feed a directory to alloc_file_pseudo()

That would be broken in a lot of ways, from UAF in pathwalk if
that thing ever gets into descriptor tables, to royally screwing
every place that relies upon the lack of aliases for directory
inodes (i.e. quite a bit of VFS).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/fs/file_table.c b/fs/file_table.c
index ee21b3da9d08..5331a696896e 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -326,6 +326,9 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
 	struct path path;
 	struct file *file;
 
+	if (WARN_ON_ONCE(S_ISDIR(inode->i_mode)))
+		return ERR_PTR(-EISDIR);
+
 	path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
 	if (!path.dentry)
 		return ERR_PTR(-ENOMEM);
Al Viro Nov. 1, 2023, 6:18 a.m. UTC | #13
On Tue, Oct 31, 2023 at 06:12:26AM +0000, Al Viro wrote:
> On Tue, Oct 31, 2023 at 01:53:51AM +0000, Al Viro wrote:
> 
> > Carving that series up will be interesting, though...
> 
> I think I have a sane carve-up; will post if it survives testing.

OK, current variant survives local testing.  Lives in
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #work.dcache

Shortlog:
      fast_dput(): having ->d_delete() is not reason to delay refcount decrement
      fast_dput(): handle underflows gracefully
      fast_dput(): new rules for refcount
      __dput_to_list(): do decrement of refcount in the caller
      retain_dentry(): lift decrement of ->d_count into callers
      __dentry_kill(): get consistent rules for ->d_count
      dentry_kill(): don't bother with retain_dentry() on slow path
      Call retain_dentry() with refcount 0
      fold the call of retain_dentry() into fast_dput()
      don't try to cut corners in shrink_lock_dentry()
      fold dentry_kill() into dput()
      get rid of __dget()
      shrink_dentry_list(): no need to check that dentry refcount is marked dead
      to_shrink_list(): call only if refcount is 0
      switch select_collect{,2}() to use of to_shrink_list()

Diffstat:
 fs/dcache.c | 268 ++++++++++++++++++++++--------------------------------------
 1 file changed, 96 insertions(+), 172 deletions(-)

Individual patches in followups.  Review and testing would be welcome,
and it's obviously next cycle fodder.  Massage of refcounting is in the
first 11 commits, last 4 are followups.
Benjamin Coddington Nov. 1, 2023, 2:29 p.m. UTC | #14
On 31 Oct 2023, at 22:22, Al Viro wrote:

> [NFS folks Cc'd]
>
> On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote:
>> On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote:
>>> On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
>>>>
>>>> After fixing a couple of brainos, it seems to work.
>>>
>>> This all makes me unnaturally nervous, probably because it;s overly
>>> subtle, and I have lost the context for some of the rules.
>>
>> A bit of context: I started to look at the possibility of refcount overflows.
>> Writing the current rules for dentry refcounting and lifetime down was the
>> obvious first step, and that immediately turned into an awful mess.
>>
>> It is overly subtle.  Even more so when you throw the shrink lists into
>> the mix - shrink_lock_dentry() got too smart for its own good, and that
>> leads to really awful correctness proofs.
>
> ... and for another example of subtle shit, consider DCACHE_NORCU.  Recall
> c0eb027e5aef "vfs: don't do RCU lookup of empty pathnames" and note that
> it relies upon never getting results of alloc_file_pseudo() with directory
> inode anywhere near descriptor tables.
>
> Back then I basically went "fine, nobody would ever use alloc_file_pseudo()
> for that anyway", but... there's a call in __nfs42_ssc_open() that doesn't
> have any obvious protection against ending up with directory inode.
> That does not end up anywhere near descriptor tables, as far as I can tell,
> fortunately.
>
> Unfortunately, it is quite capable of fucking the things up in different
> ways, even if it's promptly closed.  d_instantiate() on directory inode
> is a really bad thing; a plenty of places expect to have only one alias
> for those, and would be very unhappy with that kind of crap without any
> RCU considerations.
>
> I'm pretty sure that this NFS code really does not want to use that for
> directories; the simplest solution would be to refuse alloc_file_pseudo()
> for directory inodes.  NFS folks - do you have a problem with the
> following patch?

It would be a protocol violation to use COPY on a directory:

https://www.rfc-editor.org/rfc/rfc7862.html#section-15.2.3

   Both SAVED_FH and CURRENT_FH must be regular files.  If either
   SAVED_FH or CURRENT_FH is not a regular file, the operation MUST fail
   and return NFS4ERR_WRONG_TYPE.

so nfsd4_verify_copy() does S_ISREG() checks before alloc_file_pseudo().

Ben
Al Viro Nov. 5, 2023, 7:54 p.m. UTC | #15
On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote:
> On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote:
> > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > After fixing a couple of brainos, it seems to work.
> > 
> > This all makes me unnaturally nervous, probably because it;s overly
> > subtle, and I have lost the context for some of the rules.
> 
> A bit of context: I started to look at the possibility of refcount overflows.
> Writing the current rules for dentry refcounting and lifetime down was the
> obvious first step, and that immediately turned into an awful mess.
> 
> It is overly subtle.

	Another piece of too subtle shite: ordering of ->d_iput() of child
and __dentry_kill() of parent.  As it is, in some cases it is possible for
the latter to happen before the former.  It is *not* possible in the cases
when in-tree ->d_iput() instances actually look at the parent (all of those
are due to sillyrename stuff), but the proof is convoluted and very brittle.

	The origin of that mess is in the interaction of shrink_dcache_for_umount()
with shrink_dentry_list().  What we want to avoid is a directory looking like
it's busy since shrink_dcache_for_umount() doesn't see any children to account
for positive refcount of parent.  The kinda-sorta solution we use is to decrement
the parent's refcount *before* __dentry_kill() of child and put said parent
into a shrink list.  That makes shrink_dcache_for_umount() do the right thing,
but it's possible to end up with the parent freed before the child has
been dealt with; that scenario is non-obvious, and rather hard to hit,
but it's not impossible.

	dput() does no such thing - it does not decrement the parent's
refcount until the child had been taken care of.  That's fine, as far
as shrink_dcache_for_umount() is concerned - this is not a false positive;
with slightly different timing shrink_dcache_for_umount() would've reported
the child as being busy.  IOW, there should be no overlap between dput()
in one thread and shrink_dcache_for_umount() in another.  Unfortunately,
memory eviction *can* come in the middle of shrink_dcache_for_umount().

	Life would be much simpler if shrink_dentry_list() did not have
to pull that kind of trick and used the same ordering as dput() does.
IMO there's a reasonably cheap way to achieve that:

	* have shrink_dcache_for_umount() mark the superblock (either in
->s_flags or inside the ->s_dentry_lru itself) and have the logic
in retain_dentry() that does insertion into the LRU list check ->d_sb for that
mark, treating its presence as "do not retain".
	* after marking the superblock, shrink_dcache_for_umount() is guaranteed
that nothing new will be added to the LRU list in question.  Have it call
shrink_dcache_sb() to drain that LRU.
	* Now shrink_dentry_list() in one thread hitting a dentry on
a superblock going through shrink_dcache_for_umount() in another thread is
always a bug and reporting busy dentries is the right thing to do.
So we can switch shrink_dentry_list() to the same "drop reference to parent
only after the child had been killed" ordering as we have in dput().

	IMO that removes a fairly nasty trap for ->d_iput() and ->d_delete()
instances.  As for the overhead, the relevant fragment of retain_dentry() is
	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
		d_lru_add(dentry);
	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
		dentry->d_flags |= DCACHE_REFERENCED;
	return true;
That would become
	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) {
		if (unlikely(dentry->d_sb is marked))
			return false;
		d_lru_add(dentry);
	} else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
		dentry->d_flags |= DCACHE_REFERENCED;
	return true;
Note that d_lru_add() will hit ->d_sb->s_dentry_lru, so we are not
adding memory traffic here; the else if part doesn't need to be
touched - we only need to prevent insertions into LRU.
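
The marking side would be a one-liner in shrink_dcache_for_umount() - e.g.
(a sketch; the flag name is made up, and the mark could just as well live
in ->s_dentry_lru itself):

	void shrink_dcache_for_umount(struct super_block *sb)
	{
		...
		sb->s_flags |= SB_NO_NEW_LRU;	/* hypothetical mark: retain_dentry() stops adding to LRU */
		shrink_dcache_sb(sb);		/* drain whatever is already on the LRU */
		/* then proceed to kill the tree as it does now */
		...
	}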

	Comments?
Al Viro Nov. 5, 2023, 9:59 p.m. UTC | #16
On Sun, Nov 05, 2023 at 07:54:16PM +0000, Al Viro wrote:

> 	* have shrink_dcache_for_umount() mark the superblock (either in
> ->s_flags or inside the ->s_dentry_lru itself) and have the logics
> in retain_dentry() that does insertion into LRU list check ->d_sb for that
> mark, treating its presence as "do not retain".

	BTW, no barriers, etc. are needed for those - once we are into
shrink_dcache_for_umount() there must be no dput() calls anywhere on
that filesystem.
Al Viro Nov. 6, 2023, 5:53 a.m. UTC | #17
On Sun, Nov 05, 2023 at 07:54:16PM +0000, Al Viro wrote:
> On Tue, Oct 31, 2023 at 12:18:48AM +0000, Al Viro wrote:
> > On Mon, Oct 30, 2023 at 12:18:28PM -1000, Linus Torvalds wrote:
> > > On Mon, 30 Oct 2023 at 11:53, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > > >
> > > > After fixing a couple of brainos, it seems to work.
> > > 
> > > This all makes me unnaturally nervous, probably because it;s overly
> > > subtle, and I have lost the context for some of the rules.
> > 
> > A bit of context: I started to look at the possibility of refcount overflows.
> > Writing the current rules for dentry refcounting and lifetime down was the
> > obvious first step, and that immediately turned into an awful mess.
> > 
> > It is overly subtle.
> 
> 	Another piece of too subtle shite: ordering of ->d_iput() of child
> and __dentry_kill() of parent.  As it is, in some cases it is possible for
> the latter to happen before the former.  It is *not* possible in the cases
> when in-tree ->d_iput() instances actually look at the parent (all of those
> are due to sillyrename stuff), but the proof is convoluted and very brittle.
> 
> 	The origin of that mess is in the interaction of shrink_dcache_for_umount()
> with shrink_dentry_list().  What we want to avoid is a directory looking like
> it's busy since shrink_dcache_for_umount() doesn't see any children to account
> for positive refcount of parent.  The kinda-sorta solution we use is to decrement
> the parent's refcount *before* __dentry_kill() of child and put said parent
> into a shrink list.  That makes shrink_dcache_for_umount() do the right thing,
> but it's possible to end up with parent freed before the child is done with;
> scenario is non-obvious, and rather hard to hit, but it's not impossible.

D'oh...  We actually don't need to worry about eviction on memory pressure at that
point; unregister_shrinker() is done early enough to prevent that.

So shrink_dcache_for_umount() does not need to worry about shrink lists use
in prune_dcache_sb().

For the use in namespace_unlock(), it is guaranteed that all dentries involved
either have a matching mount in the list of mounts to be dropped (and are thus
protected from simultaneous fs shutdown) or have a matching mount pinned by the
caller.

The use in mntput_no_expire() has the same guarantee - all dentries involved are
on the superblock of the mount we are going to drop after the call of
shrink_dentry_list().

All other users also either have an active reference to superblock or are done
by ->kill_sb() synchronously (and thus can't race with shrink_dcache_for_umount())
or are done async, but flushed and/or waited for by foofs_kill_sb() before
they get to shrink_dcache_for_umount().

IOW, I'd been too paranoid in "Teach shrink_dcache_parent() to cope with
mixed-filesystem shrink lists" - the real requirements are milder; in-tree
users didn't need these games with parent.  Dcache side of Tobin's Slab
Movable Objects patches needed those, though...

AFAICS, there are 3 options:
	1) leave the current weirdness with ->d_iput() on child vs __dentry_kill()
on parent.  Document the requirement to ->d_iput() (and ->d_release()) to cope
with that, promise that in case of sillyrename the ordering will be there and
write down the proof of that.  No code changes, rather revolting docs to
write, trouble waiting to happen in ->d_iput().
	2) require that shrink_dentry_list() should never overlap with
shrink_dcache_for_umount() on any of the filesystems represented in the
shrink list, guarantee that parent won't get to __dentry_kill() before
the child gets through __dentry_kill() completely and accept that resurrecting
SMO stuff will require more work.  Smallish patch, tolerable docs, probably
the best option at the moment.
	3) bite the bullet and get shrink_dentry_list() to coexist with
shrink_dcache_for_umount(), with sane ordering of ->d_iput() vs. parent's
__dentry_kill().  Doable, but AFAICS it will take a counter of children
currently being killed in the parent dentry.  shrink_dentry_list() would
bump that on parent, __dentry_kill() the victim, then relock the parent
and decrement that counter along with the main refcount.  That would allow
the shrink_dcache_for_umount() to cope with that crap.  No requirements
for shrink_dentry_kill() callers that way, sane environment for ->d_iput(),
no obstacles for SMO stuff.  OTOH, we need to get space for additional
counter in struct dentry; again, doable (->d_subdirs/->d_child can be
converted to hlist, saving us a pointer in each dentry), but... I'd
leave that option alone until something that needs it would show up
(e.g. if/when Tobin resurrects his patchset).

	My preference would be (2) for the coming cycle + prototype of
a patch doing (3) on top of that for the future.

Completely untested diff for (2) (on top of #work.dcache, sans the
documentation update) below:

diff --git a/fs/dcache.c b/fs/dcache.c
index ccf41c5ee804..c978207f3fc4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1086,10 +1086,27 @@ void d_prune_aliases(struct inode *inode)
 }
 EXPORT_SYMBOL(d_prune_aliases);
 
+static inline void shrink_kill(struct dentry *victim, struct list_head *list)
+{
+	struct dentry *parent = victim->d_parent;
+
+	__dentry_kill(victim);
+
+	if (parent == victim || lockref_put_or_lock(&parent->d_lockref))
+		return;
+
+	if (!WARN_ON_ONCE(parent->d_lockref.count != 1)) {
+		parent->d_lockref.count = 0;
+		to_shrink_list(parent, list);
+	}
+	spin_unlock(&parent->d_lock);
+}
+
+
 void shrink_dentry_list(struct list_head *list)
 {
 	while (!list_empty(list)) {
-		struct dentry *dentry, *parent;
+		struct dentry *dentry;
 
 		dentry = list_entry(list->prev, struct dentry, d_lru);
 		spin_lock(&dentry->d_lock);
@@ -1106,10 +1123,7 @@ void shrink_dentry_list(struct list_head *list)
 		}
 		rcu_read_unlock();
 		d_shrink_del(dentry);
-		parent = dentry->d_parent;
-		if (parent != dentry && !--parent->d_lockref.count)
-			to_shrink_list(parent, list);
-		__dentry_kill(dentry);
+		shrink_kill(dentry, list);
 	}
 }
 
@@ -1537,19 +1551,14 @@ void shrink_dcache_parent(struct dentry *parent)
 			break;
 		data.victim = NULL;
 		d_walk(parent, &data, select_collect2);
-		if (data.victim) {
-			struct dentry *parent;
+		if (data.victim) { // rcu_read_lock held - see select_collect2()
 			spin_lock(&data.victim->d_lock);
 			if (!lock_for_kill(data.victim)) {
 				spin_unlock(&data.victim->d_lock);
 				rcu_read_unlock();
 			} else {
 				rcu_read_unlock();
-				parent = data.victim->d_parent;
-				if (parent != data.victim &&
-				    !--parent->d_lockref.count)
-					to_shrink_list(parent, &data.dispose);
-				__dentry_kill(data.victim);
+				shrink_kill(data.victim, &data.dispose);
 			}
 		}
 		if (!list_empty(&data.dispose))
Al Viro Nov. 7, 2023, 2:08 a.m. UTC | #18
On Mon, Nov 06, 2023 at 05:53:53AM +0000, Al Viro wrote:

> AFAICS, there are 3 options:
> 	1) leave the current weirdness with ->d_iput() on child vs __dentry_kill()
> on parent.  Document the requirement to ->d_iput() (and ->d_release()) to cope
> with that, promise that in case of sillyrename the ordering will be there and
> write down the proof of that.  No code changes, rather revolting docs to
> write, trouble waiting to happen in ->d_iput().
> 	2) require that shrink_dentry_list() should never overlap with
> shrink_dcache_for_umount() on any of the filesystems represented in the
> shrink list, guarantee that parent won't get to __dentry_kill() before
> the child gets through __dentry_kill() completely and accept that resurrecting
> SMO stuff will require more work.  Smallish patch, tolerable docs, probably
> the best option at the moment.
> 	3) bite the bullet and get shrink_dentry_list() to coexist with
> shrink_dcache_for_umount(), with sane ordering of ->d_iput() vs. parent's
> __dentry_kill().  Doable, but AFAICS it will take a counter of children
> currently being killed in the parent dentry.  shrink_dentry_list() would
> bump that on parent, __dentry_kill() the victim, then relock the parent
> and decrement that counter along with the main refcount.  That would allow
> the shrink_dcache_for_umount() to cope with that crap.  No requirements
> for shrink_dentry_list() callers that way, sane environment for ->d_iput(),
> no obstacles for SMO stuff.  OTOH, we need to get space for additional
> counter in struct dentry; again, doable (->d_subdirs/->d_child can be
> converted to hlist, saving us a pointer in each dentry), but... I'd
> leave that option alone until something that needs it would show up
> (e.g. if/when Tobin resurrects his patchset).

	4) instead of having __dentry_kill() called with dentry, parent
and inode locked and doing
	->d_prune
	unhash
	remove from list of children
	unlock parent
	detach from inode
	unlock dentry and inode
	drop inode
	->d_release
	relock dentry
	if on shrink list, mark as ready to free 
	unlock dentry
	if was not on shrink list, free it
go for calling it with just dentry and inode locked and do the following
(a rough C sketch of the tail of that sequence follows the list):
	->d_prune
	unhash
	detach from inode
	unlock dentry and inode
	drop inode
	->d_release
	lock parent (if any, as usual)
	lock dentry
	remove from list of children
	if on shrink list, mark as ready to free
	unlock dentry
	if was on shrink list, free it
	decrement parent's refcount (again, if there was a parent)
	if refcount is still positive - unlock parent and return NULL
	otherwise return parent
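
	A rough sketch of what the tail of __dentry_kill() might look like
under (4) - not a patch, just to illustrate the "remove from the list of
children late, hand back the locked parent" convention.  It assumes
struct dentry *parent = NULL and bool can_free = true declared earlier in
the function; dentry_free(), DCACHE_MAY_FREE etc. are used the way the
current code uses them:

	/* ->d_iput()/->d_release() are done; dentry is negative and unhashed,
	 * so ->d_parent can't change under us anymore */
	if (!IS_ROOT(dentry)) {
		parent = dentry->d_parent;
		spin_lock(&parent->d_lock);
	}
	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
	/* remove from the list of children; the real thing would need to
	 * use dentry_unlist() here to keep cursors happy */
	list_del(&dentry->d_child);
	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
		dentry->d_flags |= DCACHE_MAY_FREE;	/* shrink list owner frees it */
		can_free = false;
	}
	spin_unlock(&dentry->d_lock);
	if (can_free)
		dentry_free(dentry);
	if (!parent)
		return NULL;
	if (--parent->d_lockref.count) {	/* parent still pinned */
		spin_unlock(&parent->d_lock);
		return NULL;
	}
	return parent;	/* locked, refcount driven to 0 */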

What changes:
	* caller needs milder locking environment; lock_for_kill() gets simpler.
	  Note that only positive dentries can be moved, so inside __dentry_kill()
	  we need no retry loops, etc. - ->d_parent is stable by the point we decide
	  to remove from the list of children.
	* code that iterates through the list of children (not much of it)
	  needs to cope with seeing negative unhashed dentries with
	  refcount marked dead.  Most of it will need no changes at all.
	* ->d_prune() instances are called without parent's ->d_lock; just
	  the victim's one.  Might require changes to out-of-tree filesystems.
	* dput() turns into
	if (!dentry)
		return;
	rcu_read_lock()
	if (fast_dput(dentry)) {
		rcu_read_unlock();
		return;
	}
	while (lock_for_kill(dentry)) { // not bothering with the parent
		rcu_read_unlock();
		dentry = __dentry_kill(dentry);
		if (!dentry)
			return;
		if (retain_dentry(dentry)) {
			spin_unlock(&dentry->d_lock);
			return;
		}
		rcu_read_lock();
	}
	spin_unlock(&dentry->d_lock);
	rcu_read_unlock();
since there's no point trying to avoid locking the parents - we need
to grab those locks at some point anyway, just to remove a child from
the list of children, and that way we return from __dentry_kill() with
that lock held.
	* shrink_dentry_list() eviction of parents happens thus:
	do {
		rcu_read_unlock();
		victim = __dentry_kill(victim);
		rcu_read_lock();
	} while (victim && lock_for_kill(victim));
	rcu_read_unlock();
	if (victim)
		spin_unlock(&victim->d_lock);
	* sane order of ->d_iput() on child vs. __dentry_kill() on parent.
	* shrink_dcache_for_umount() does the right thing even if it
overlaps shrink_dentry_list().

	If that works, it's probably the best variant...
Al Viro Nov. 9, 2023, 6:19 a.m. UTC | #19
The series below is the fallout of trying to document the dentry
refcounting and life cycle - basically, getting rid of the bits that
had been too subtle and ugly to write them up.

	Results so far:
* -136LoC (-170LoC not counting the additions in D/f/porting ;-)
* considerably simpler locking for __dentry_kill()
* fast_dput() is pretty much "dput() sans killing the dentry": returns true if
we are done, false if the dentry needs killing (in which case the dentry will
be left locked and refcount is known to be 0).
* retain_dentry() not messing with refcounting - called with refcount 0
and ->d_lock held, returns whether we want the dentry retained in cache.
* rules for shrink lists are much simpler now - to_shrink_list() puts
a locked dentry with zero refcount into a shrink list; no need to guarantee
that the filesystem containing that dentry won't get shut down before we get
to the eventual shrink_dentry_list() - it will do the right thing.
* ->d_iput() and ->d_release() no longer have weird corner cases where they
could get called with the parent already killed.  That happened to be avoided
in the cases where in-kernel instances would bother to work with the parent,
but it used to be very brittle and convoluted.  Now it's "the parent is kept
pinned until __dentry_kill() of the child is done".
* a bunch of other subtle shit is gone (e.g. the logic in shrink_lock_dentry()
had rather subtle corner cases, with convoluted reasons why they wouldn't break
things - except that in some cases they would, etc.)

	This stuff lives in
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #work.dcache2
individual patches are in followups.  Help with reviewing and testing would
be very welcome - it seems to survive the local beating, but it definitely
needs more.

Shortlog:
Al Viro (22):
      struct dentry: get rid of randomize_layout idiocy
      switch nfsd_client_rmdir() to use of simple_recursive_removal()
      coda_flag_children(): cope with dentries turning negative
      dentry: switch the lists of children to hlist
      centralize killing dentry from shrink list
      get rid of __dget()
      shrink_dentry_list(): no need to check that dentry refcount is marked dead
      fast_dput(): having ->d_delete() is not reason to delay refcount decrement
      fast_dput(): handle underflows gracefully
      fast_dput(): new rules for refcount
      __dput_to_list(): do decrement of refcount in the callers
      Make retain_dentry() neutral with respect to refcounting
      __dentry_kill(): get consistent rules for victim's refcount
      dentry_kill(): don't bother with retain_dentry() on slow path
      Call retain_dentry() with refcount 0
      fold the call of retain_dentry() into fast_dput()
      don't try to cut corners in shrink_lock_dentry()
      fold dentry_kill() into dput()
      to_shrink_list(): call only if refcount is 0
      switch select_collect{,2}() to use of to_shrink_list()
      d_prune_aliases(): use a shrink list
      __dentry_kill(): new locking scheme

Diffstat:
 Documentation/filesystems/porting.rst     |  34 +++
 arch/powerpc/platforms/cell/spufs/inode.c |   5 +-
 fs/afs/dynroot.c                          |   5 +-
 fs/autofs/expire.c                        |   7 +-
 fs/ceph/dir.c                             |   2 +-
 fs/ceph/mds_client.c                      |   2 +-
 fs/coda/cache.c                           |   9 +-
 fs/dcache.c                               | 409 ++++++++++--------------------
 fs/libfs.c                                |  45 ++--
 fs/nfsd/nfsctl.c                          |  70 +----
 fs/notify/fsnotify.c                      |   2 +-
 fs/tracefs/inode.c                        |  34 +--
 include/linux/dcache.h                    |  22 +-
 13 files changed, 255 insertions(+), 391 deletions(-)

Patch description follows:

	Part 1 - preparations

01/22) struct dentry: get rid of randomize_layout idiocy.
	This is beyond ridiculous.  There is a reason why that thing
is cacheline-aligned...

02/22) nfsd_client_rmdir() and its guts open-code simple_recursive_removal();
converting to calling that cleans things up in there *and* reduces the
number of places where we touch the list of children, which simplifies the
work later in the series.

03/22) more fun caught while looking at the places that go through the
lists of children: coda_flag_children() assumes that ->d_lock on the parent
is enough to prevent children from going negative.  Ain't so...

04/22) switch the lists of children to hlist.  We never bother with
accessing the list tail and using hlist saves us a pointer per each
dentry.  Besides, it ends up more readable that way.  Fields used to hold
the lists got renamed - d_children/d_sib instead of d_subdirs/d_child.
Yes, any out-of-tree code that works with the lists of children gets
loudly broken; not hard to fix.
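
	For illustration, walking the children then looks like this (just a
sketch using the new field names mentioned above; hlist_for_each_entry() is
the standard helper):

	struct dentry *child;

	spin_lock(&parent->d_lock);
	hlist_for_each_entry(child, &parent->d_children, d_sib) {
		/* ... work with the child; after the later patches this may
		 * also see negative and half-killed entries - see 22/22 ... */
	}
	spin_unlock(&parent->d_lock);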

05/22) centralize killing dentry from shrink list
There are identical pieces of code in shrink_dentry_list() and
shrink_dcache_for_umount(); they would require identical massage through
the series below, unifying them into an inlined helper reduces the amount
of noise.

06/22) get rid of __dget()
fold into the sole remaining caller

07/22) shrink_dentry_list(): no need to check that dentry refcount is
marked dead.  We won't see DCACHE_MAY_FREE on anything that is *not*
dead and checking d_flags is just as cheap as checking refcount.
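
	I.e. the "couldn't lock it, somebody else is killing it" branch of
shrink_dentry_list() can look something like this (a sketch, assuming the
existing shape of that loop; shrink_lock_dentry(), d_shrink_del() and
dentry_free() are the helpers it already uses):

		if (!shrink_lock_dentry(dentry)) {
			bool can_free;
			rcu_read_unlock();
			d_shrink_del(dentry);
			/* no need to check that refcount is marked dead:
			 * DCACHE_MAY_FREE is only ever set on dead dentries */
			can_free = dentry->d_flags & DCACHE_MAY_FREE;
			spin_unlock(&dentry->d_lock);
			if (can_free)
				dentry_free(dentry);
			continue;
		}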

	Part 2 - massage of dput() and friends

08/22) fast_dput(): having ->d_delete() is not reason to delay refcount
decrement.
	->d_delete() is a way for filesystem to tell that dentry is not
worth keeping cached.  It is not guaranteed to be called every time a dentry
has refcount drop down to zero; it is not guaranteed to be called before
dentry gets evicted.  In other words, it is not suitable for any kind
of keeping track of dentry state.
	None of the in-tree filesystems attempt to use it that way,
fortunately.
	So the contortions done by fast_dput() (as well as dentry_kill())
are not warranted.  fast_dput() certainly should treat having ->d_delete()
instance as "can't assume we'll be keeping it", but that's not different
from the way we treat e.g. DCACHE_DONTCACHE (which is rather similar
to having ->d_delete() return true when called).

09/22) fast_dput(): handle underflows gracefully.
	If refcount is less than 1, we should just warn, unlock
dentry and return true, so that the caller doesn't try to do anything
else.
	Taking care of that leaves the rest of the "lockref_put_return() has
failed" case equivalent to "decrement refcount and rejoin the normal
slow path after the point where we grab ->d_lock".
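
	A sketch of the shape that takes (the "locked" label stands for the
point where the normal slow path proceeds with ->d_lock held; names are as
in the current fast_dput()):

	ret = lockref_put_return(&dentry->d_lockref);
	if (unlikely(ret < 0)) {	/* cmpxchg-based fast path not available */
		spin_lock(&dentry->d_lock);
		if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
			/* underflow - warn, unlock, tell the caller it's done */
			spin_unlock(&dentry->d_lock);
			return true;
		}
		dentry->d_lockref.count--;
		goto locked;	/* rejoin the normal slow path */
	}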

10/22) fast_dput(): new rules for refcount.
	Currently the "need caller to do more work" path in fast_dput()
has refcount decremented, then, with ->d_lock held and refcount verified
to have reached 0 fast_dput() forcibly resets the refcount to 1.
	Move that resetting refcount to 1 into the callers; later in
the series it will be massaged out of existence.

11/22) __dput_to_list(): do decrement of refcount in the callers
... and rename it to to_shrink_list(), seeing that it no longer drops
any references.

12/22) make retain_dentry() neutral with respect to refcounting.
It used to decrement refcount if and only if it returned true.
Lift those decrements into the callers.

13/22) __dentry_kill(): get consistent rules for victim's refcount
	Currently we call it with refcount equal to 1 when called from
dentry_kill(); all other callers have it equal to 0.
	Make it always be called with zero refcount; at this step we just
decrement it before the calls in dentry_kill().  That is safe, since
all places that care about the value of refcount either do that under
->d_lock or hold a reference to dentry in question.  Either is sufficient
to prevent observing a dentry immediately prior to __dentry_kill()
getting called from dentry_kill().

14/22) dentry_kill(): don't bother with retain_dentry() on the slow path
	We have already checked it and the dentry did not look worth
keeping.  The only hard obstacle to evicting a dentry is a non-zero
refcount; everything else is advisory - e.g. memory pressure could evict
any dentry found with refcount zero.  On the slow path in dentry_kill()
we had dropped and regained ->d_lock; we must recheck the refcount,
but everything else is not worth bothering with.
	Note that a filesystem cannot count upon ->d_delete() being
called for a dentry - not even once.  Again, memory pressure (as well as
d_prune_aliases(), or attempted rmdir() of an ancestor, or...) will not
call ->d_delete() at all.
	So from the correctness point of view we are fine doing the
check only once.  And it makes things simpler down the road.
	The doctor said "To the morgue", so to the morgue it is!

15/22) Call retain_dentry() with refcount 0.
	Instead of bumping it from 0 to 1, calling retain_dentry(),
then decrementing it back to 0 (with ->d_lock held all the way through),
just leave refcount at 0 through all of that.
	It will have a visible effect for ->d_delete() - now it can
be called with refcount 0 instead of 1 and it can no longer play silly
buggers with dropping/regaining ->d_lock.  Not that any in-tree instances
tried to (it's pretty hard to get right).
	Any out-of-tree ones will have to adjust (assuming they need
any changes).
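
	For illustration, an instance of the kind that stays perfectly fine
(a made-up example, not taken from any filesystem; the point is only that
it must not peek at the refcount or play with ->d_lock):

static int example_d_delete(const struct dentry *dentry)
{
	/* called under ->d_lock; with this series the refcount it would
	 * observe is 0 rather than 1 - which it has no business looking
	 * at anyway */
	return 1;	/* never keep in cache */
}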

16/22) fold the call of retain_dentry() into fast_dput()
	Calls of retain_dentry() happen immediately after getting false
from fast_dput(), and getting true from retain_dentry() is treated the
same way fast_dput() treats a non-zero refcount - unlock the dentry and
bugger off.
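
	I.e. the slow-path tail of fast_dput() ends up looking roughly like
this (a sketch):

	spin_lock(&dentry->d_lock);
	/* somebody grabbed a reference in the meantime, or the dentry is
	 * worth retaining in cache - either way the caller is done */
	if (dentry->d_lockref.count || retain_dentry(dentry)) {
		spin_unlock(&dentry->d_lock);
		return true;
	}
	/* locked, refcount 0, not worth retaining - caller gets to kill it */
	return false;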

17/22) don't try to cut corners in shrink_lock_dentry().
	That is to say, do *not* treat the ->d_inode or ->d_parent
changes as "it's hard, return false; somebody must have grabbed it,
so even if it has zero refcount, we don't need to bother killing it -
final dput() from whoever grabbed it would've done everything".
	First of all, that is not guaranteed.  It might have been dropped
by shrink_kill() handling of victim's parent, which would've found it
already on a shrink list (ours) and decided that they don't need to put
it on their shrink list.
	What's more, dentry_kill() is doing pretty much the same thing,
cutting its own set of corners (it assumes that dentry can't go from
positive to negative, so its inode can change but only once and only in
one direction).
	Doing that right allows us to get rid of that not-quite-duplication
and removes the only reason for re-incrementing the refcount before the call
of dentry_kill().
	The replacement is called lock_for_kill(); it is called under
rcu_read_lock and with ->d_lock held.  If it returns false, the dentry has
non-zero refcount and the same locks are held.  If it returns true, the
dentry has zero refcount and all locks required by __dentry_kill() are taken.
	Part of __lock_parent() had been lifted into lock_parent() to
allow its reuse.  Now it's called with rcu_read_lock already held and
the dentry already unlocked.
	Note that this is not the final change - locking requirements
for __dentry_kill() are going to change later in the series and the set
of locks taken by lock_for_kill() will be adjusted.

18/22) fold dentry_kill() into dput().
	Not worth keeping separate.

19/22) to_shrink_list(): call only if refcount is 0
	The only thing it does if refcount is not zero is d_lru_del();
no point, IMO, seeing that plain dput() does nothing of that sort...
Note that 2 of 3 current callers are guaranteed that refcount is 0.

20/22) switch select_collect{,2}() to use of to_shrink_list()
Same note about d_lru_del() as in (19/22).

21/22) d_prune_aliases(): use a shrink list
	Instead of dropping aliases one by one, restarting, etc., just
collect them into a shrink list and kill them off in one pass.
	We don't really need the restarts - one alias can't pin another
(directory has only one alias, and couldn't be its own ancestor anyway),
so collecting everything that is not busy and taking it out would
take care of everything evictable that had been there as we entered
the function.  And new aliases added while we'd been dropping old ones
could just as easily have appeared right as we return to the caller...
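
	Roughly what the resulting function might look like (a sketch):

void d_prune_aliases(struct inode *inode)
{
	LIST_HEAD(dispose);
	struct dentry *dentry;

	spin_lock(&inode->i_lock);
	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
		spin_lock(&dentry->d_lock);
		if (!dentry->d_lockref.count)
			to_shrink_list(dentry, &dispose);
		spin_unlock(&dentry->d_lock);
	}
	spin_unlock(&inode->i_lock);
	shrink_dentry_list(&dispose);
}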

22/22) __dentry_kill(): new locking scheme
	Currently we enter __dentry_kill() with parent (along with the
victim dentry and victim's inode) held locked.  Then we
	mark dentry refcount as dead
	call ->d_prune()
	remove dentry from hash
	remove it from the parent's list of children
	unlock the parent; we don't need it from that point on
	detach dentry from inode,
	unlock dentry and drop the inode (via ->d_iput())
	call ->d_release()
	regain the lock on dentry
	check if it's on a shrink list (in which case freeing its empty
	  husk has to be left to shrink_dentry_list()) or not (in which
	  case we can free it ourselves).  In the former case, mark it
	  as an empty husk, so that shrink_dentry_list() would know it
	  can free the sucker.
	drop the lock on dentry
... and usually the caller proceeds to drop a reference on the parent,
possibly retaking the lock on it.
	That is painful for a bunch of reasons, starting with the need
to take locks out of order, but not limited to that - the parent of a
positive dentry can change if we drop the dentry's ->d_lock, so getting
these locks has to be done with care.
	Moreover, as soon as the dentry is out of the parent's list of
children, shrink_dcache_for_umount() won't see it anymore, making it
appear as if the parent is inexplicably busy.  We do work around that
by having shrink_dentry_list() decrement the parent's refcount first and
put it on a shrink list to be evicted once we are done with __dentry_kill()
of the child, but that may in some cases lead to ->d_iput() on the child
being called after the parent got killed.  That doesn't happen in the cases
where in-tree ->d_iput() instances might want to look at the parent, but
it's brittle as hell.
	Solution: do the removal from the parent's list of children at the
very end of __dentry_kill().  As a result, the callers do not need to
lock the parent, and by the time we really need the parent locked, the
dentry is negative and is guaranteed not to be moved around.
	It does mean that ->d_prune() will be called with parent not
locked.  It also means that we might see dentries in process of being torn
down while going through the parent's list of children; those dentries
will be unhashed, negative and with refcount marked dead.  In practice,
that's enough for in-tree code that looks through the list of children
to do the right thing as-is.  Out-of-tree code might need to be adjusted.
	Calling conventions: __dentry_kill(dentry) is called with
dentry->d_lock held, along with ->i_lock of its inode (if any).
It either returns the parent (locked, with refcount decremented to 0)
or NULL (if there'd been no parent or if refcount decrement for parent
hadn't reached 0).
	lock_for_kill() is adjusted for new requirements - it doesn't
touch the parent's ->d_lock at all.
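
	Something along these lines (a sketch of the final form; the only
lock it may still need to acquire, besides the already-held ->d_lock, is
the inode's ->i_lock):

static bool lock_for_kill(struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;

	if (unlikely(dentry->d_lockref.count))
		return false;

	if (!inode || likely(spin_trylock(&inode->i_lock)))
		return true;

	do {
		spin_unlock(&dentry->d_lock);
		spin_lock(&inode->i_lock);
		spin_lock(&dentry->d_lock);
		if (likely(inode == dentry->d_inode))
			break;
		/* went from positive to negative (and possibly back to
		 * positive with a different inode) while we were unlocked */
		spin_unlock(&inode->i_lock);
		inode = dentry->d_inode;
	} while (inode);
	if (likely(!dentry->d_lockref.count))
		return true;
	if (inode)
		spin_unlock(&inode->i_lock);
	return false;
}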
	Callers adjusted.  Note that for dput() we don't need to
bother with fast_dput() for the parent - we just need to check
retain_dentry() for it, since its ->d_lock has been held since the
moment __dentry_kill() took it to remove the victim from the
list of children.
	The kludge with early decrement of parent's refcount in
shrink_dentry_list() is no longer needed - shrink_dcache_for_umount()
sees the half-killed dentries in the list of children for as long as
they are pinning the parent.  They are easily recognized and accounted
for by select_collect(), so we know we are not done yet.
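
	I.e. the relevant check in select_collect() becomes something like
(a sketch):

	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
		data->found++;
	} else if (!dentry->d_lockref.count) {
		to_shrink_list(dentry, &data->dispose);
		data->found++;
	} else if (dentry->d_lockref.count < 0) {
		/* half-killed: still on the list of children, still pinning
		 * the parent; __dentry_kill() will take it off in the end */
		data->found++;
	}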
	As a result, we always have the expected ordering
for ->d_iput()/->d_release() vs. __dentry_kill() of the parent, no
exceptions.  Moreover, the current rules for shrink lists (one must make
sure that shrink_dcache_for_umount() won't happen while any dentries
from the superblock in question are on any shrink lists) are gone -
shrink_dcache_for_umount() will do the right thing in all cases, taking
such dentries out.  Their empty husks (memory occupied by struct dentry
itself + its external name, if any) will remain on the shrink lists,
but they are no obstacles to filesystem shutdown.  And such husks will
get freed as soon as shrink_dentry_list() of the list they are on gets
to them.
diff mbox series

Patch

diff --git a/fs/dcache.c b/fs/dcache.c
index 9f471fdb768b..af0e067f6982 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -680,7 +680,6 @@  static inline bool retain_dentry(struct dentry *dentry)
 		return false;
 
 	/* retain; LRU fodder */
-	dentry->d_lockref.count--;
 	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
 		d_lru_add(dentry);
 	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
@@ -709,7 +708,7 @@  EXPORT_SYMBOL(d_mark_dontcache);
  * Returns dentry requiring refcount drop, or NULL if we're done.
  */
 static struct dentry *dentry_kill(struct dentry *dentry)
-	__releases(dentry->d_lock)
+	__releases(dentry->d_lock) __releases(rcu)
 {
 	struct inode *inode = dentry->d_inode;
 	struct dentry *parent = NULL;
@@ -730,6 +729,7 @@  static struct dentry *dentry_kill(struct dentry *dentry)
 			goto slow_positive;
 		}
 	}
+	rcu_read_unlock();
 	__dentry_kill(dentry);
 	return parent;
 
@@ -739,9 +739,8 @@  static struct dentry *dentry_kill(struct dentry *dentry)
 	spin_lock(&dentry->d_lock);
 	parent = lock_parent(dentry);
 got_locks:
-	if (unlikely(dentry->d_lockref.count != 1)) {
-		dentry->d_lockref.count--;
-	} else if (likely(!retain_dentry(dentry))) {
+	if (likely(dentry->d_lockref.count == 0 && !retain_dentry(dentry))) {
+		rcu_read_unlock();
 		__dentry_kill(dentry);
 		return parent;
 	}
@@ -751,6 +750,7 @@  static struct dentry *dentry_kill(struct dentry *dentry)
 	if (parent)
 		spin_unlock(&parent->d_lock);
 	spin_unlock(&dentry->d_lock);
+	rcu_read_unlock();
 	return NULL;
 }
 
@@ -768,15 +768,7 @@  static inline bool fast_dput(struct dentry *dentry)
 	unsigned int d_flags;
 
 	/*
-	 * If we have a d_op->d_delete() operation, we sould not
-	 * let the dentry count go to zero, so use "put_or_lock".
-	 */
-	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
-		return lockref_put_or_lock(&dentry->d_lockref);
-
-	/*
-	 * .. otherwise, we can try to just decrement the
-	 * lockref optimistically.
+	 * decrement the lockref optimistically.
 	 */
 	ret = lockref_put_return(&dentry->d_lockref);
 
@@ -830,7 +822,7 @@  static inline bool fast_dput(struct dentry *dentry)
 	 */
 	smp_rmb();
 	d_flags = READ_ONCE(dentry->d_flags);
-	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
+	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_OP_DELETE |
 			DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
 
 	/* Nothing to do? Dropping the reference was all we needed? */
@@ -855,12 +847,6 @@  static inline bool fast_dput(struct dentry *dentry)
 		return true;
 	}
 
-	/*
-	 * Re-get the reference we optimistically dropped. We hold the
-	 * lock, and we just tested that it was zero, so we can just
-	 * set it to 1.
-	 */
-	dentry->d_lockref.count = 1;
 	return false;
 }
 
@@ -903,10 +889,9 @@  void dput(struct dentry *dentry)
 		}
 
 		/* Slow case: now with the dentry lock held */
-		rcu_read_unlock();
-
 		if (likely(retain_dentry(dentry))) {
 			spin_unlock(&dentry->d_lock);
+			rcu_read_unlock();
 			return;
 		}
 
@@ -918,14 +903,10 @@  EXPORT_SYMBOL(dput);
 static void __dput_to_list(struct dentry *dentry, struct list_head *list)
 __must_hold(&dentry->d_lock)
 {
-	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
-		/* let the owner of the list it's on deal with it */
-		--dentry->d_lockref.count;
-	} else {
+	if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
 		if (dentry->d_flags & DCACHE_LRU_LIST)
 			d_lru_del(dentry);
-		if (!--dentry->d_lockref.count)
-			d_shrink_add(dentry, list);
+		d_shrink_add(dentry, list);
 	}
 }
 
@@ -1191,7 +1172,7 @@  void shrink_dentry_list(struct list_head *list)
 		rcu_read_unlock();
 		d_shrink_del(dentry);
 		parent = dentry->d_parent;
-		if (parent != dentry)
+		if (parent != dentry && !--parent->d_lockref.count)
 			__dput_to_list(parent, list);
 		__dentry_kill(dentry);
 	}
@@ -1638,7 +1619,8 @@  void shrink_dcache_parent(struct dentry *parent)
 			} else {
 				rcu_read_unlock();
 				parent = data.victim->d_parent;
-				if (parent != data.victim)
+				if (parent != data.victim &&
+				    !--parent->d_lockref.count)
 					__dput_to_list(parent, &data.dispose);
 				__dentry_kill(data.victim);
 			}