[1/2] Re introduces a DCACHE_DENTRY_UNHASHED flag
diff mbox

Message ID 148056588886.28852.15338791339456402291.stgit@alexeys-macbook-pro.local
State New
Headers show

Commit Message

Alexey Lyashkov Dec. 1, 2016, 4:18 a.m. UTC
rehash process protected with d_seq and d_lock locks, but VFS have access to the
d_hashed field without any locks sometimes. It produce errors with get cwd
operations or fsnotify may report an unlink event sometimes. d_seq lock isn’t
used to protect due possibility to sleep with holding locks, and d_lock is too
hot to check a simple state, lets return a DCACHE_DENTRY_UNHASHED flag to
ability to check a unhashed state without locks held.

#include <pthread.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
#include <stdio.h>
#include <errno.h>

void *thread_main(void *unused)
{
        int rc;
        for (;;) {
                rc = rename("/tmp/t-a", "/tmp/t-b");
                assert(rc == 0);
                rc = rename("/tmp/t-b", "/tmp/t-a");
                assert(rc == 0);
        }

        return NULL;
}

int main(void) {
        int rc, i;
        pthread_t ptt;

        rmdir("/tmp/t-a");
        rmdir("/tmp/t-b");

        rc = mkdir("/tmp/t-a", 0666);
        assert(rc == 0);

        rc = chdir("/tmp/t-a");
        assert(rc == 0);

        rc = pthread_create(&ptt, NULL, thread_main, NULL);
        assert(rc == 0);

        for (i=0; ;i++) {
                char buf[100], *b;
                b = getcwd(buf, sizeof(buf));
                if (b == NULL) {
                        printf("getcwd failed on iter %d with %d\n", i, errno);
                        break;
                }
        }

        return 0;
}

Reported-by: Andrew Perepechko <anserper@yandex.ru>
Signed-off-by: Alexey Lyashkov <alexey.lyashkov@gmail.com>
---
 arch/powerpc/platforms/cell/spufs/inode.c          |    2 +-
 drivers/infiniband/hw/qib/qib_fs.c                 |    2 +-
 .../staging/lustre/lustre/llite/llite_internal.h   |    2 +-
 fs/configfs/inode.c                                |    2 +-
 fs/dcache.c                                        |   24 +++++++++++++-------
 fs/nfs/dir.c                                       |    2 +-
 include/linux/dcache.h                             |    6 +++--
 7 files changed, 25 insertions(+), 15 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Amir Goldstein Dec. 1, 2016, 8:15 a.m. UTC | #1
On Thu, Dec 1, 2016 at 6:18 AM, Alexey Lyashkov
<alexey.lyashkov@gmail.com> wrote:
> rehash process protected with d_seq and d_lock locks, but VFS have access to the
> d_hashed field without any locks sometimes. It produce errors with get cwd
> operations or fsnotify may report an unlink event sometimes. d_seq lock isn’t
> used to protect due possibility to sleep with holding locks, and d_lock is too
> hot to check a simple state, lets return a DCACHE_DENTRY_UNHASHED flag to
> ability to check a unhashed state without locks held.
>
>

If I am counting correctly, you modified all call sites of __d_drop() except
for those in d_move() and d_delete(). Right?
I suggest that you keep all those call sites in tact and only modify the
2 call sites with changed behavior.
Also, best give the new helper a name that is a bit more meaningful to
understand the changed behavior (maybe __d_unhash(), do_d_drop()).
Then you may add a comment to explain why you need to use the
modified behavior when you use it. Why do you need it in d_delete()?

Incidentally, I just wrote a helper I needed to find out if a dentry I got from
exportfs_decode_fh() is "lookable" as opposed to a "connected zombie".
So your patch is certainly usable for my use case.
Can anyone say if my helper is correct and whether is usable for anyone
else?

/* Check if p1 is connected with a chain of hashed dentries to p2 */
static bool ovl_is_lookable(struct dentry *p1, struct dentry *p2)
{
        struct dentry *p;

        for (p = p2; !IS_ROOT(p); p = p->d_parent) {
                /*
                 * FIXME: there is a small window inside d_move()
                 * where moved entry is unhashed and rehashed and
                 * this can fail.
                 */
                if (d_unhashed(p))
                        return false;
                if (p->d_parent == p1)
                        return true;
        }
        return false;
}


[...]

> @@ -326,11 +327,12 @@ extern struct dentry *dget_parent(struct dentry *dentry);
>   *     @dentry: entry to check
>   *
>   *     Returns true if the dentry passed is not currently hashed.
> + *     hlist_bl_unhashed can't be used as race with d_move().
>   */
>

remove this newline while at it.

>  static inline int d_unhashed(const struct dentry *dentry)
>  {
> -       return hlist_bl_unhashed(&dentry->d_hash);
> +       return dentry->d_flags & DCACHE_DENTRY_UNHASHED;
>  }
>


Thanks,
Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexey Lyashkov Dec. 1, 2016, 10:03 a.m. UTC | #2
Amir, 

comments inline.

> 1 dec 2016 г., в 11:15, Amir Goldstein <amir73il@gmail.com> wrote:
> 
> On Thu, Dec 1, 2016 at 6:18 AM, Alexey Lyashkov
> <alexey.lyashkov@gmail.com> wrote:
>> rehash process protected with d_seq and d_lock locks, but VFS have access to the
>> d_hashed field without any locks sometimes. It produce errors with get cwd
>> operations or fsnotify may report an unlink event sometimes. d_seq lock isn’t
>> used to protect due possibility to sleep with holding locks, and d_lock is too
>> hot to check a simple state, lets return a DCACHE_DENTRY_UNHASHED flag to
>> ability to check a unhashed state without locks held.
>> 
>> 
> 
> If I am counting correctly, you modified all call sites of __d_drop() except
> for those in d_move() and d_delete(). Right?
correct. 


> I suggest that you keep all those call sites in tact and only modify the
> 2 call sites with changed behavior.
> Also, best give the new helper a name that is a bit more meaningful to
> understand the changed behavior (maybe __d_unhash(), do_d_drop()).
i may agree with rename a __d_drop to something else, but export symbol rename will help to find any places need to be changed.
v3 patch was created from kbuild robot messages, which hides without it, a specially on platform other than x86 used for development.

> Then you may add a comment to explain why you need to use the
> modified behavior when you use it. Why do you need it in d_delete()?
d_delete remove a dentry from a tree completely, but d_move just change a hash point. So if we want to know a file unlinked

> 
> Incidentally, I just wrote a helper I needed to find out if a dentry I got from
> exportfs_decode_fh() is "lookable" as opposed to a "connected zombie".
> So your patch is certainly usable for my use case.
NFS a area with big problems in dcache. it use d_drop to avoid an assertion in d_rehash, but it open a window when dentry out of tree, so several operations may fail.
I think we need something better in that case like an take a d_lock for whole or allow to rehash dentry without unlinking from tree (i think it better). 


> Can anyone say if my helper is correct and whether is usable for anyone
> else?
may you explain how it should be called? 

as i see comments
        /*
         * Inform d_walk() and shrink_dentry_list() that we are no longer
         * attached to the dentry tree
         */
        dentry->d_flags |= DCACHE_DENTRY_KILLED;

it’s better than d_unhashed...

> 
> /* Check if p1 is connected with a chain of hashed dentries to p2 */
> static bool ovl_is_lookable(struct dentry *p1, struct dentry *p2)
> {
>        struct dentry *p;
> 
>        for (p = p2; !IS_ROOT(p); p = p->d_parent) {
>                /*
>                 * FIXME: there is a small window inside d_move()
>                 * where moved entry is unhashed and rehashed and
>                 * this can fail.
>                 */
>                if (d_unhashed(p))
>                        return false;
>                if (p->d_parent == p1)
>                        return true;
>        }
>        return false;
> }
> 
> 
> [...]
> 
>> @@ -326,11 +327,12 @@ extern struct dentry *dget_parent(struct dentry *dentry);
>>  *     @dentry: entry to check
>>  *
>>  *     Returns true if the dentry passed is not currently hashed.
>> + *     hlist_bl_unhashed can't be used as race with d_move().
>>  */
>> 
> 
> remove this newline while at it.
> 
>> static inline int d_unhashed(const struct dentry *dentry)
>> {
>> -       return hlist_bl_unhashed(&dentry->d_hash);
>> +       return dentry->d_flags & DCACHE_DENTRY_UNHASHED;
>> }
>> 
> 
> 
> Thanks,
> Amir.

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Dec. 1, 2016, 11:23 a.m. UTC | #3
On Thu, Dec 1, 2016 at 12:03 PM, Alexey Lyashkov
<alexey.lyashkov@gmail.com> wrote:
...
>
>
>> I suggest that you keep all those call sites in tact and only modify the
>> 2 call sites with changed behavior.
>> Also, best give the new helper a name that is a bit more meaningful to
>> understand the changed behavior (maybe __d_unhash(), do_d_drop()).
> i may agree with rename a __d_drop to something else, but export symbol
> rename will help to find any places need to be changed.
> v3 patch was created from kbuild robot messages, which hides without it,
> a specially on platform other than x86 used for development.
>

You have not convinced me. The way I see it you created a variant of __d_drop()
that is specific for using in rename for temporary unhash before rehash.
But it's up to me to decide if the practice on replacing all call site
is valid or not.

>> Then you may add a comment to explain why you need to use the
>> modified behavior when you use it. Why do you need it in d_delete()?
> d_delete remove a dentry from a tree completely, but d_move just change a
> hash point. So if we want to know a file unlinked

I don't follow. Why wouldn't we want to set the UNHASHED flag on d_delete()?


>>
>> Incidentally, I just wrote a helper I needed to find out if a dentry I got from
>> exportfs_decode_fh() is "lookable" as opposed to a "connected zombie".
>> So your patch is certainly usable for my use case.
> NFS a area with big problems in dcache. it use d_drop to avoid an assertion in
> d_rehash,
> but it open a window when dentry out of tree, so several operations may fail.
> I think we need something better in that case like an take a d_lock for whole
> or allow to rehash dentry without unlinking from tree (i think it better).
>
>
>> Can anyone say if my helper is correct and whether is usable for anyone
>> else?
> may you explain how it should be called?

For my use case I am trying to figure out if a file handle is stale
in a stronger sense then what exportfs_decode_fh() already provides.
exportfs_decode_fh() returns ESTALE is the inode in question is no longer
in the file system or no longer listed in the parent directory, but if
the handle
represents an open and unlinked file/dir, it is not considered stale.

I am using the helper to filter out the open and unlinked file/dir case.

>
> as i see comments
>         /*
>          * Inform d_walk() and shrink_dentry_list() that we are no longer
>          * attached to the dentry tree
>          */
>         dentry->d_flags |= DCACHE_DENTRY_KILLED;
>
> it’s better than d_unhashed...
>

Sorry. I did not properly define the terms "lookable" and "connected zombie".
"lookable" := there exists a path for which lookup(path) will return this dentry
"connected zombie" := dentry is "connected" with parent chain to root, but
one or more of the dentries in the chain have been unlinked, but they still
have non zero refcount and therefore not yet killed.



>>
>> /* Check if p1 is connected with a chain of hashed dentries to p2 */
>> static bool ovl_is_lookable(struct dentry *p1, struct dentry *p2)
>> {
>>        struct dentry *p;
>>
>>        for (p = p2; !IS_ROOT(p); p = p->d_parent) {
>>                /*
>>                 * FIXME: there is a small window inside d_move()
>>                 * where moved entry is unhashed and rehashed and
>>                 * this can fail.
>>                 */
>>                if (d_unhashed(p))
>>                        return false;
>>                if (p->d_parent == p1)
>>                        return true;
>>        }
>>        return false;
>> }
>>
>>
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Oleg Drokin Dec. 8, 2016, 2:32 a.m. UTC | #4
On Nov 30, 2016, at 11:18 PM, Alexey Lyashkov wrote:

> rehash process protected with d_seq and d_lock locks, but VFS have access to the
> d_hashed field without any locks sometimes. It produce errors with get cwd
> operations or fsnotify may report an unlink event sometimes. d_seq lock isn’t
> used to protect due possibility to sleep with holding locks, and d_lock is too
> hot to check a simple state, lets return a DCACHE_DENTRY_UNHASHED flag to
> ability to check a unhashed state without locks held.

While Lustre-wise the patch is ok, I guess,
I wonder if the underlying idea is sound?
Lockless access is inherently racy, so if you replace the list check with just
a bit check, it's still racy, just the window is a bit smaller, no?
Or are you only concerned about d_move's very nonatomic unhash/rehash?

> #include <pthread.h>
> #include <sys/stat.h>
> #include <stdlib.h>
> #include <assert.h>
> #include <unistd.h>
> #include <stdio.h>
> #include <errno.h>
> 
> void *thread_main(void *unused)
> {
>        int rc;
>        for (;;) {
>                rc = rename("/tmp/t-a", "/tmp/t-b");
>                assert(rc == 0);
>                rc = rename("/tmp/t-b", "/tmp/t-a");
>                assert(rc == 0);
>        }
> 
>        return NULL;
> }
> 
> int main(void) {
>        int rc, i;
>        pthread_t ptt;
> 
>        rmdir("/tmp/t-a");
>        rmdir("/tmp/t-b");
> 
>        rc = mkdir("/tmp/t-a", 0666);
>        assert(rc == 0);
> 
>        rc = chdir("/tmp/t-a");
>        assert(rc == 0);
> 
>        rc = pthread_create(&ptt, NULL, thread_main, NULL);
>        assert(rc == 0);
> 
>        for (i=0; ;i++) {
>                char buf[100], *b;
>                b = getcwd(buf, sizeof(buf));
>                if (b == NULL) {
>                        printf("getcwd failed on iter %d with %d\n", i, errno);
>                        break;
>                }
>        }
> 
>        return 0;
> }
> 
> Reported-by: Andrew Perepechko <anserper@yandex.ru>
> Signed-off-by: Alexey Lyashkov <alexey.lyashkov@gmail.com>
> ---
> arch/powerpc/platforms/cell/spufs/inode.c          |    2 +-
> drivers/infiniband/hw/qib/qib_fs.c                 |    2 +-
> .../staging/lustre/lustre/llite/llite_internal.h   |    2 +-
> fs/configfs/inode.c                                |    2 +-
> fs/dcache.c                                        |   24 +++++++++++++-------
> fs/nfs/dir.c                                       |    2 +-
> include/linux/dcache.h                             |    6 +++--
> 7 files changed, 25 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
> index 5364d4a..cc8623d 100644
> --- a/arch/powerpc/platforms/cell/spufs/inode.c
> +++ b/arch/powerpc/platforms/cell/spufs/inode.c
> @@ -168,7 +168,7 @@ static void spufs_prune_dir(struct dentry *dir)
> 		spin_lock(&dentry->d_lock);
> 		if (simple_positive(dentry)) {
> 			dget_dlock(dentry);
> -			__d_drop(dentry);
> +			_d_drop(dentry);
> 			spin_unlock(&dentry->d_lock);
> 			simple_unlink(d_inode(dir), dentry);
> 			/* XXX: what was dcache_lock protecting here? Other
> diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
> index f1e66ef..9b4c0e7 100644
> --- a/drivers/infiniband/hw/qib/qib_fs.c
> +++ b/drivers/infiniband/hw/qib/qib_fs.c
> @@ -440,7 +440,7 @@ static int remove_file(struct dentry *parent, char *name)
> 
> 	spin_lock(&tmp->d_lock);
> 	if (simple_positive(tmp)) {
> -		__d_drop(tmp);
> +		_d_drop(tmp);
> 		spin_unlock(&tmp->d_lock);
> 		simple_unlink(d_inode(parent), tmp);
> 	} else {
> diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
> index 4bc5512..f230505 100644
> --- a/drivers/staging/lustre/lustre/llite/llite_internal.h
> +++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
> @@ -1353,7 +1353,7 @@ static inline void d_lustre_invalidate(struct dentry *dentry, int nested)
> 	 * it and busy inodes would be reported.
> 	 */
> 	if (d_count(dentry) == 0 && !(dentry->d_flags & DCACHE_DISCONNECTED))
> -		__d_drop(dentry);
> +		_d_drop(dentry);
> 	spin_unlock(&dentry->d_lock);
> }
> 
> diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
> index ad718e5..9f531f7 100644
> --- a/fs/configfs/inode.c
> +++ b/fs/configfs/inode.c
> @@ -248,7 +248,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
> 		spin_lock(&dentry->d_lock);
> 		if (simple_positive(dentry)) {
> 			dget_dlock(dentry);
> -			__d_drop(dentry);
> +			_d_drop(dentry);
> 			spin_unlock(&dentry->d_lock);
> 			simple_unlink(d_inode(parent), dentry);
> 		} else
> diff --git a/fs/dcache.c b/fs/dcache.c
> index 5c7cc95..0863841 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -438,7 +438,7 @@ static void dentry_lru_add(struct dentry *dentry)
>  */
> void __d_drop(struct dentry *dentry)
> {
> -	if (!d_unhashed(dentry)) {
> +	if (!hlist_bl_unhashed(&dentry->d_hash)) {
> 		struct hlist_bl_head *b;
> 		/*
> 		 * Hashed dentries are normally on the dentry hashtable,
> @@ -458,12 +458,18 @@ void __d_drop(struct dentry *dentry)
> 		write_seqcount_invalidate(&dentry->d_seq);
> 	}
> }
> -EXPORT_SYMBOL(__d_drop);
> +
> +void _d_drop(struct dentry *dentry)
> +{
> +	dentry->d_flags |= DCACHE_DENTRY_UNHASHED;
> +	__d_drop(dentry);
> +}
> +EXPORT_SYMBOL(_d_drop);
> 
> void d_drop(struct dentry *dentry)
> {
> 	spin_lock(&dentry->d_lock);
> -	__d_drop(dentry);
> +	_d_drop(dentry);
> 	spin_unlock(&dentry->d_lock);
> }
> EXPORT_SYMBOL(d_drop);
> @@ -530,7 +536,7 @@ static void __dentry_kill(struct dentry *dentry)
> 			d_lru_del(dentry);
> 	}
> 	/* if it was on the hash then remove it */
> -	__d_drop(dentry);
> +	_d_drop(dentry);
> 	dentry_unlist(dentry, parent);
> 	if (parent)
> 		spin_unlock(&parent->d_lock);
> @@ -1486,7 +1492,7 @@ static void check_and_drop(void *_data)
> 	struct detach_data *data = _data;
> 
> 	if (!data->mountpoint && !data->select.found)
> -		__d_drop(data->select.start);
> +		_d_drop(data->select.start);
> }
> 
> /**
> @@ -1601,7 +1607,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
> 	dentry->d_name.name = dname;
> 
> 	dentry->d_lockref.count = 1;
> -	dentry->d_flags = 0;
> +	dentry->d_flags = DCACHE_DENTRY_UNHASHED;
> 	spin_lock_init(&dentry->d_lock);
> 	seqcount_init(&dentry->d_seq);
> 	dentry->d_inode = NULL;
> @@ -1926,6 +1932,7 @@ static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
> 	spin_lock(&tmp->d_lock);
> 	__d_set_inode_and_type(tmp, inode, add_flags);
> 	hlist_add_head(&tmp->d_u.d_alias, &inode->i_dentry);
> +	tmp->d_flags &= ~DCACHE_DENTRY_UNHASHED;
> 	hlist_bl_lock(&tmp->d_sb->s_anon);
> 	hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
> 	hlist_bl_unlock(&tmp->d_sb->s_anon);
> @@ -2331,7 +2338,7 @@ void d_delete(struct dentry * dentry)
> 	}
> 
> 	if (!d_unhashed(dentry))
> -		__d_drop(dentry);
> +		_d_drop(dentry);
> 
> 	spin_unlock(&dentry->d_lock);
> 
> @@ -2342,7 +2349,8 @@ EXPORT_SYMBOL(d_delete);
> static void __d_rehash(struct dentry *entry)
> {
> 	struct hlist_bl_head *b = d_hash(entry->d_name.hash);
> -	BUG_ON(!d_unhashed(entry));
> +	BUG_ON(!hlist_bl_unhashed(&entry->d_hash));
> +	entry->d_flags &= ~DCACHE_DENTRY_UNHASHED;
> 	hlist_bl_lock(b);
> 	hlist_bl_add_head_rcu(&entry->d_hash, b);
> 	hlist_bl_unlock(b);
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index 5f1af4c..a1911bb 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -1891,7 +1891,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
> 		goto out;
> 	}
> 	if (!d_unhashed(dentry)) {
> -		__d_drop(dentry);
> +		_d_drop(dentry);
> 		need_rehash = 1;
> 	}
> 	spin_unlock(&dentry->d_lock);
> diff --git a/include/linux/dcache.h b/include/linux/dcache.h
> index 5beed7b..6b0b3d7 100644
> --- a/include/linux/dcache.h
> +++ b/include/linux/dcache.h
> @@ -213,6 +213,7 @@ struct dentry_operations {
> #define DCACHE_PAR_LOOKUP		0x10000000 /* being looked up (with parent locked shared) */
> #define DCACHE_DENTRY_CURSOR		0x20000000
> 
> +#define DCACHE_DENTRY_UNHASHED		0x40000000 /* dentry unhashed from tree with unlink */
> extern seqlock_t rename_lock;
> 
> /*
> @@ -221,7 +222,7 @@ extern seqlock_t rename_lock;
> extern void d_instantiate(struct dentry *, struct inode *);
> extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
> extern int d_instantiate_no_diralias(struct dentry *, struct inode *);
> -extern void __d_drop(struct dentry *dentry);
> +extern void _d_drop(struct dentry *dentry);
> extern void d_drop(struct dentry *dentry);
> extern void d_delete(struct dentry *);
> extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op);
> @@ -326,11 +327,12 @@ extern struct dentry *dget_parent(struct dentry *dentry);
>  *	@dentry: entry to check
>  *
>  *	Returns true if the dentry passed is not currently hashed.
> + *	hlist_bl_unhashed can't be used as race with d_move().

s/as/due to a/

>  */
> 
> static inline int d_unhashed(const struct dentry *dentry)
> {
> -	return hlist_bl_unhashed(&dentry->d_hash);
> +	return dentry->d_flags & DCACHE_DENTRY_UNHASHED;
> }
> 
> static inline int d_unlinked(const struct dentry *dentry)
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexey Lyashkov Dec. 8, 2016, 5:16 a.m. UTC | #5
comments inline,

> 8 дек. 2016 г., в 5:32, Oleg Drokin <green@linuxhacker.ru> написал(а):
> 
> 
> On Nov 30, 2016, at 11:18 PM, Alexey Lyashkov wrote:
> 
>> rehash process protected with d_seq and d_lock locks, but VFS have access to the
>> d_hashed field without any locks sometimes. It produce errors with get cwd
>> operations or fsnotify may report an unlink event sometimes. d_seq lock isn’t
>> used to protect due possibility to sleep with holding locks, and d_lock is too
>> hot to check a simple state, lets return a DCACHE_DENTRY_UNHASHED flag to
>> ability to check a unhashed state without locks held.
> 
> While Lustre-wise the patch is ok, I guess,
> I wonder if the underlying idea is sound?
> Lockless access is inherently racy, so if you replace the list check with just
> a bit check, it's still racy, just the window is a bit smaller, no?
> Or are you only concerned about d_move's very nonatomic unhash/rehash?
> 
You are right.
I concerned about d_move and non atomic rehash, but it not a decrease a race window.
hashed flag live cycle it - dentry unhashed while don’t connected to the tree, but hashed until it dentry is valid (file isn’t unlinked) or dentry killed via low memory conditional and new lookup is needs. 
I guess, you talk about - how it will affect a lustre? it bug was discovered in lustre workstation previously.
Lustre uses a d_move as part of lookup process to reuse an unprotected dentries to connect into dentry tree.
get_cwd was confused in that time as threat a unhashed dentry as unlinked, who is correct from d_drop use case.

Alexey--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Dec. 13, 2016, 1:44 a.m. UTC | #6
On Thu, Dec 01, 2016 at 07:18:26AM +0300, Alexey Lyashkov wrote:
> rehash process protected with d_seq and d_lock locks, but VFS have access to the
> d_hashed field without any locks sometimes. It produce errors with get cwd
> operations or fsnotify may report an unlink event sometimes. d_seq lock isn’t
> used to protect due possibility to sleep with holding locks, and d_lock is too
> hot to check a simple state, lets return a DCACHE_DENTRY_UNHASHED flag to
> ability to check a unhashed state without locks held.

>   *	Returns true if the dentry passed is not currently hashed.
> + *	hlist_bl_unhashed can't be used as race with d_move().
>   */
>   
>  static inline int d_unhashed(const struct dentry *dentry)
>  {
> -	return hlist_bl_unhashed(&dentry->d_hash);
> +	return dentry->d_flags & DCACHE_DENTRY_UNHASHED;
>  }

The real problem here is the unsafe use of d_unlinked().  You are papering
over that, and doing so in the wrong place.  Note that most of the callers
of d_unhashed() are either under ->d_lock on it or on parent or with
parent locked by ->i_rwsem, or in places where false positive is fine
since it only pushes us to slow path.  Excluding those, we are left with

fs/autofs4/expire.c:226:        if (!simple_positive(top))
fs/autofs4/expire.c:537:                if (d_unhashed(dentry))
fs/ceph/dir.c:649:              BUG_ON(!d_unhashed(dentry));
fs/ceph/inode.c:1074:   if (!d_unhashed(dn))
fs/ceph/inode.c:1322:                           if (have_lease && d_unhashed(dn))
fs/cifs/inode.c:945:            if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
fs/coredump.c:727:              if (d_unhashed(cprm.file->f_path.dentry))
fs/dcache.c:3202:       if (d_unlinked(path->dentry)) {
fs/dcache.c:3366:       if (d_unlinked(dentry)) {
fs/dcache.c:3423:       if (!d_unlinked(pwd.dentry)) {
fs/debugfs/file.c:77:   if (d_unlinked(dentry))
fs/namespace.c:3154:    if (d_unlinked(new.dentry))
fs/namespace.c:738:                     if (d_unlinked(dentry))
fs/notify/inotify/inotify_fsnotify.c:85:                if (d_unlinked(path->dentry))
security/apparmor/file.c:263:   if (d_unlinked(dentry) && d_backing_inode(dentry)->i_nlink == 0)
security/apparmor/path.c:149:   if (d_unlinked(path->dentry) && d_is_positive(path->dentry) &&

VFS part of that consists of
	* d_path() and friends, including getcwd().  Might bloody well turn
those into
	if (unlikely(d_unhashed(dentry))) {
		/* recheck under d_lock */
		spin_lock(&dentry->d_lock);
		if (d_unhashed(dentry))
			...
	* mount(2)/pivot_root(2) vs. rename(2) - userland race; after all,
had you called it just a bit later, you *would* have gotten that -ENOENT.
Nothing the kernel can actually do here.
	* do_coredump() bailing out on races with somebody moving the
coredump to be around.  Cry me a river...
	* idiotify playing with d_unlinked(); same story as with
d_path() et.al.

autofs ones are, AFAICS, harmless - you might spin a bit more (if that), but
no worse than that.  If they can be triggered at all, that is.
No idea about the ceph ones (i.e. whether they can race with d_move()
like that, whether it really cares, etc.).  debugfs looks like it can't race
with d_move() at all.  cifs... no idea, really.  Looks like we _might_
have an odd behaviour from server not spotted in some cases we would've
spotted it otherwise.  Not sure if we care.  Apparmor... you'll need to
ask apparmor folks.

NAK in that form.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 5364d4a..cc8623d 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -168,7 +168,7 @@  static void spufs_prune_dir(struct dentry *dir)
 		spin_lock(&dentry->d_lock);
 		if (simple_positive(dentry)) {
 			dget_dlock(dentry);
-			__d_drop(dentry);
+			_d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
 			simple_unlink(d_inode(dir), dentry);
 			/* XXX: what was dcache_lock protecting here? Other
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index f1e66ef..9b4c0e7 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -440,7 +440,7 @@  static int remove_file(struct dentry *parent, char *name)
 
 	spin_lock(&tmp->d_lock);
 	if (simple_positive(tmp)) {
-		__d_drop(tmp);
+		_d_drop(tmp);
 		spin_unlock(&tmp->d_lock);
 		simple_unlink(d_inode(parent), tmp);
 	} else {
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
index 4bc5512..f230505 100644
--- a/drivers/staging/lustre/lustre/llite/llite_internal.h
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -1353,7 +1353,7 @@  static inline void d_lustre_invalidate(struct dentry *dentry, int nested)
 	 * it and busy inodes would be reported.
 	 */
 	if (d_count(dentry) == 0 && !(dentry->d_flags & DCACHE_DISCONNECTED))
-		__d_drop(dentry);
+		_d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
 }
 
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index ad718e5..9f531f7 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -248,7 +248,7 @@  void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
 		spin_lock(&dentry->d_lock);
 		if (simple_positive(dentry)) {
 			dget_dlock(dentry);
-			__d_drop(dentry);
+			_d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
 			simple_unlink(d_inode(parent), dentry);
 		} else
diff --git a/fs/dcache.c b/fs/dcache.c
index 5c7cc95..0863841 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -438,7 +438,7 @@  static void dentry_lru_add(struct dentry *dentry)
  */
 void __d_drop(struct dentry *dentry)
 {
-	if (!d_unhashed(dentry)) {
+	if (!hlist_bl_unhashed(&dentry->d_hash)) {
 		struct hlist_bl_head *b;
 		/*
 		 * Hashed dentries are normally on the dentry hashtable,
@@ -458,12 +458,18 @@  void __d_drop(struct dentry *dentry)
 		write_seqcount_invalidate(&dentry->d_seq);
 	}
 }
-EXPORT_SYMBOL(__d_drop);
+
+void _d_drop(struct dentry *dentry)
+{
+	dentry->d_flags |= DCACHE_DENTRY_UNHASHED;
+	__d_drop(dentry);
+}
+EXPORT_SYMBOL(_d_drop);
 
 void d_drop(struct dentry *dentry)
 {
 	spin_lock(&dentry->d_lock);
-	__d_drop(dentry);
+	_d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
 }
 EXPORT_SYMBOL(d_drop);
@@ -530,7 +536,7 @@  static void __dentry_kill(struct dentry *dentry)
 			d_lru_del(dentry);
 	}
 	/* if it was on the hash then remove it */
-	__d_drop(dentry);
+	_d_drop(dentry);
 	dentry_unlist(dentry, parent);
 	if (parent)
 		spin_unlock(&parent->d_lock);
@@ -1486,7 +1492,7 @@  static void check_and_drop(void *_data)
 	struct detach_data *data = _data;
 
 	if (!data->mountpoint && !data->select.found)
-		__d_drop(data->select.start);
+		_d_drop(data->select.start);
 }
 
 /**
@@ -1601,7 +1607,7 @@  struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	dentry->d_name.name = dname;
 
 	dentry->d_lockref.count = 1;
-	dentry->d_flags = 0;
+	dentry->d_flags = DCACHE_DENTRY_UNHASHED;
 	spin_lock_init(&dentry->d_lock);
 	seqcount_init(&dentry->d_seq);
 	dentry->d_inode = NULL;
@@ -1926,6 +1932,7 @@  static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
 	spin_lock(&tmp->d_lock);
 	__d_set_inode_and_type(tmp, inode, add_flags);
 	hlist_add_head(&tmp->d_u.d_alias, &inode->i_dentry);
+	tmp->d_flags &= ~DCACHE_DENTRY_UNHASHED;
 	hlist_bl_lock(&tmp->d_sb->s_anon);
 	hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
 	hlist_bl_unlock(&tmp->d_sb->s_anon);
@@ -2331,7 +2338,7 @@  void d_delete(struct dentry * dentry)
 	}
 
 	if (!d_unhashed(dentry))
-		__d_drop(dentry);
+		_d_drop(dentry);
 
 	spin_unlock(&dentry->d_lock);
 
@@ -2342,7 +2349,8 @@  EXPORT_SYMBOL(d_delete);
 static void __d_rehash(struct dentry *entry)
 {
 	struct hlist_bl_head *b = d_hash(entry->d_name.hash);
-	BUG_ON(!d_unhashed(entry));
+	BUG_ON(!hlist_bl_unhashed(&entry->d_hash));
+	entry->d_flags &= ~DCACHE_DENTRY_UNHASHED;
 	hlist_bl_lock(b);
 	hlist_bl_add_head_rcu(&entry->d_hash, b);
 	hlist_bl_unlock(b);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5f1af4c..a1911bb 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1891,7 +1891,7 @@  int nfs_unlink(struct inode *dir, struct dentry *dentry)
 		goto out;
 	}
 	if (!d_unhashed(dentry)) {
-		__d_drop(dentry);
+		_d_drop(dentry);
 		need_rehash = 1;
 	}
 	spin_unlock(&dentry->d_lock);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 5beed7b..6b0b3d7 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -213,6 +213,7 @@  struct dentry_operations {
 #define DCACHE_PAR_LOOKUP		0x10000000 /* being looked up (with parent locked shared) */
 #define DCACHE_DENTRY_CURSOR		0x20000000
 
+#define DCACHE_DENTRY_UNHASHED		0x40000000 /* dentry unhashed from tree with unlink */
 extern seqlock_t rename_lock;
 
 /*
@@ -221,7 +222,7 @@  extern seqlock_t rename_lock;
 extern void d_instantiate(struct dentry *, struct inode *);
 extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
 extern int d_instantiate_no_diralias(struct dentry *, struct inode *);
-extern void __d_drop(struct dentry *dentry);
+extern void _d_drop(struct dentry *dentry);
 extern void d_drop(struct dentry *dentry);
 extern void d_delete(struct dentry *);
 extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op);
@@ -326,11 +327,12 @@  extern struct dentry *dget_parent(struct dentry *dentry);
  *	@dentry: entry to check
  *
  *	Returns true if the dentry passed is not currently hashed.
+ *	hlist_bl_unhashed can't be used as race with d_move().
  */
  
 static inline int d_unhashed(const struct dentry *dentry)
 {
-	return hlist_bl_unhashed(&dentry->d_hash);
+	return dentry->d_flags & DCACHE_DENTRY_UNHASHED;
 }
 
 static inline int d_unlinked(const struct dentry *dentry)