Message ID | 20250206054504.2950516-9-neilb@suse.de (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | RFC: Allow concurrent and async changes in a directory | expand |
On Thu, Feb 06, 2025 at 04:42:45PM +1100, NeilBrown wrote: > lookup_and_lock() combines locking the directory and performing a lookup > prior to a change to the directory. > Abstracting this prepares for changing the locking requirements. > > done_lookup_and_lock() provides the inverse of putting the dentry and > unlocking. > > For "silly_rename" we will need to lookup_and_lock() in a directory that > is already locked. For this purpose we add LOOKUP_PARENT_LOCKED. > > Like lookup_len_qstr(), lookup_and_lock() returns -ENOENT if > LOOKUP_CREATE was NOT given and the name cannot be found,, and returns > -EEXIST if LOOKUP_EXCL WAS given and the name CAN be found. > > These functions replace all uses of lookup_one_qstr() in namei.c > except for those used for rename. > > The name might seem backwards as the lock happens before the lookup. > A future patch will change this so that only a shared lock is taken > before the lookup, and an exclusive lock on the dentry is taken after a > successful lookup. So the order "lookup" then "lock" will make sense. > > This functionality is exported as lookup_and_lock_one() which takes a > name and len rather than a qstr. > > Signed-off-by: NeilBrown <neilb@suse.de> > --- > fs/namei.c | 102 ++++++++++++++++++++++++++++-------------- > include/linux/namei.h | 15 ++++++- > 2 files changed, 83 insertions(+), 34 deletions(-) > > diff --git a/fs/namei.c b/fs/namei.c > index 69610047f6c6..3c0feca081a2 100644 > --- a/fs/namei.c > +++ b/fs/namei.c > @@ -1715,6 +1715,41 @@ struct dentry *lookup_one_qstr(const struct qstr *name, > } > EXPORT_SYMBOL(lookup_one_qstr); > > +static struct dentry *lookup_and_lock_nested(const struct qstr *last, > + struct dentry *base, > + unsigned int lookup_flags, > + unsigned int subclass) > +{ > + struct dentry *dentry; > + > + if (!(lookup_flags & LOOKUP_PARENT_LOCKED)) > + inode_lock_nested(base->d_inode, subclass); > + > + dentry = lookup_one_qstr(last, base, lookup_flags); > + if (IS_ERR(dentry) && !(lookup_flags & LOOKUP_PARENT_LOCKED)) { > + inode_unlock(base->d_inode); Nit: The indentation here is wrong and the {} aren't common practice. > + } > + return dentry; > +} > + > +static struct dentry *lookup_and_lock(const struct qstr *last, > + struct dentry *base, > + unsigned int lookup_flags) > +{ > + return lookup_and_lock_nested(last, base, lookup_flags, > + I_MUTEX_PARENT); > +} > + > +void done_lookup_and_lock(struct dentry *base, struct dentry *dentry, > + unsigned int lookup_flags) Did you mean done_lookup_and_unlock()? > +{ > + d_lookup_done(dentry); > + dput(dentry); > + if (!(lookup_flags & LOOKUP_PARENT_LOCKED)) > + inode_unlock(base->d_inode); > +} > +EXPORT_SYMBOL(done_lookup_and_lock); > + > /** > * lookup_fast - do fast lockless (but racy) lookup of a dentry > * @nd: current nameidata > @@ -2754,12 +2789,9 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct > path_put(path); > return ERR_PTR(-EINVAL); > } > - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); > - d = lookup_one_qstr(&last, path->dentry, 0); > - if (IS_ERR(d)) { > - inode_unlock(path->dentry->d_inode); > + d = lookup_and_lock(&last, path->dentry, 0); > + if (IS_ERR(d)) > path_put(path); > - } > return d; > } > > @@ -3053,6 +3085,22 @@ struct dentry *lookup_positive_unlocked(const char *name, > } > EXPORT_SYMBOL(lookup_positive_unlocked); > > +struct dentry *lookup_and_lock_one(struct mnt_idmap *idmap, > + const char *name, int len, struct dentry *base, > + unsigned int lookup_flags) > +{ > + struct qstr this; > + int err; > + > + if (!idmap) > + idmap = &nop_mnt_idmap; The callers should pass nop_mnt_idmap. That's how every function that takes this argument works. This is a lot more explicit than magically fixing this up in the function. > + err = lookup_one_common(idmap, name, base, len, &this); > + if (err) > + return ERR_PTR(err); > + return lookup_and_lock(&this, base, lookup_flags); > +} > +EXPORT_SYMBOL(lookup_and_lock_one); > + > #ifdef CONFIG_UNIX98_PTYS > int path_pts(struct path *path) > { > @@ -4071,7 +4119,6 @@ static struct dentry *filename_create(int dfd, struct filename *name, > unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; > unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; > int type; > - int err2; > int error; > > error = filename_parentat(dfd, name, reval_flag, path, &last, &type); > @@ -4083,36 +4130,30 @@ static struct dentry *filename_create(int dfd, struct filename *name, > * (foo/., foo/.., /////) > */ > if (unlikely(type != LAST_NORM)) > - goto out; > + goto put; > > /* don't fail immediately if it's r/o, at least try to report other errors */ > - err2 = mnt_want_write(path->mnt); > + error = mnt_want_write(path->mnt); > /* > * Do the final lookup. Suppress 'create' if there is a trailing > * '/', and a directory wasn't requested. > */ > if (last.name[last.len] && !want_dir) > create_flags &= ~LOOKUP_CREATE; > - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); > - dentry = lookup_one_qstr(&last, path->dentry, > - reval_flag | create_flags); > + dentry = lookup_and_lock(&last, path->dentry, reval_flag | create_flags); > if (IS_ERR(dentry)) > - goto unlock; > + goto drop; > > - if (unlikely(err2)) { > - error = err2; > + if (unlikely(error)) > goto fail; > - } > return dentry; > fail: > - d_lookup_done(dentry); > - dput(dentry); > + done_lookup_and_lock(path->dentry, dentry, reval_flag | create_flags); > dentry = ERR_PTR(error); > -unlock: > - inode_unlock(path->dentry->d_inode); > - if (!err2) > +drop: > + if (!error) > mnt_drop_write(path->mnt); > -out: > +put: > path_put(path); > return dentry; > } > @@ -4130,14 +4171,13 @@ EXPORT_SYMBOL(kern_path_create); > > void done_path_create(struct path *path, struct dentry *dentry) > { > - dput(dentry); > - inode_unlock(path->dentry->d_inode); > + done_lookup_and_lock(path->dentry, dentry, LOOKUP_CREATE); > mnt_drop_write(path->mnt); > path_put(path); > } > EXPORT_SYMBOL(done_path_create); > > -inline struct dentry *user_path_create(int dfd, const char __user *pathname, > +struct dentry *user_path_create(int dfd, const char __user *pathname, > struct path *path, unsigned int lookup_flags) > { > struct filename *filename = getname(pathname); > @@ -4510,19 +4550,18 @@ int do_rmdir(int dfd, struct filename *name) > if (error) > goto exit2; > > - inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); > - dentry = lookup_one_qstr(&last, path.dentry, lookup_flags); > + dentry = lookup_and_lock(&last, path.dentry, lookup_flags); > error = PTR_ERR(dentry); > if (IS_ERR(dentry)) > goto exit3; > + > error = security_path_rmdir(&path, dentry); > if (error) > goto exit4; > error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); > exit4: > - dput(dentry); > + done_lookup_and_lock(path.dentry, dentry, lookup_flags); > exit3: > - inode_unlock(path.dentry->d_inode); > mnt_drop_write(path.mnt); > exit2: > path_put(&path); > @@ -4639,11 +4678,9 @@ int do_unlinkat(int dfd, struct filename *name) > if (error) > goto exit2; > retry_deleg: > - inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); > - dentry = lookup_one_qstr(&last, path.dentry, lookup_flags); > + dentry = lookup_and_lock(&last, path.dentry, lookup_flags); > error = PTR_ERR(dentry); > if (!IS_ERR(dentry)) { > - > /* Why not before? Because we want correct error value */ > if (last.name[last.len]) > goto slashes; > @@ -4655,9 +4692,8 @@ int do_unlinkat(int dfd, struct filename *name) > error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, > dentry, &delegated_inode); > exit3: > - dput(dentry); > + done_lookup_and_lock(path.dentry, dentry, lookup_flags); > } > - inode_unlock(path.dentry->d_inode); > if (inode) > iput(inode); /* truncate the inode here */ > inode = NULL; > diff --git a/include/linux/namei.h b/include/linux/namei.h > index 0d81e571a159..76c587a5ec3a 100644 > --- a/include/linux/namei.h > +++ b/include/linux/namei.h > @@ -29,7 +29,11 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; > #define LOOKUP_RCU BIT(8) /* RCU pathwalk mode; semi-internal */ > #define LOOKUP_CACHED BIT(9) /* Only do cached lookup */ > #define LOOKUP_PARENT BIT(10) /* Looking up final parent in path */ > -/* 5 spare bits for pathwalk */ > +#define LOOKUP_PARENT_LOCKED BIT(11) /* filesystem sets this for nested > + * "lookup_and_lock_one" when it knows > + * parent is sufficiently locked. > + */ > +/* 4 spare bits for pathwalk */ > > /* These tell filesystem methods that we are dealing with the final component... */ > #define LOOKUP_OPEN BIT(16) /* ... in open */ > @@ -82,6 +86,15 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, > struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, > const char *name, > struct dentry *base, int len); > +struct dentry *lookup_and_lock_one(struct mnt_idmap *idmap, > + const char *name, int len, struct dentry *base, > + unsigned int lookup_flags); > +struct dentry *__lookup_and_lock_one(struct mnt_idmap *idmap, > + const char *name, int len, struct dentry *base, > + unsigned int lookup_flags); > +void done_lookup_and_lock(struct dentry *base, struct dentry *dentry, > + unsigned int lookup_flags); > +void __done_lookup_and_lock(struct dentry *dentry); > > extern int follow_down_one(struct path *); > extern int follow_down(struct path *path, unsigned int flags); > -- > 2.47.1 >
On Fri, 07 Feb 2025, Christian Brauner wrote: > On Thu, Feb 06, 2025 at 04:42:45PM +1100, NeilBrown wrote: > > lookup_and_lock() combines locking the directory and performing a lookup > > prior to a change to the directory. > > Abstracting this prepares for changing the locking requirements. > > > > done_lookup_and_lock() provides the inverse of putting the dentry and > > unlocking. > > > > For "silly_rename" we will need to lookup_and_lock() in a directory that > > is already locked. For this purpose we add LOOKUP_PARENT_LOCKED. > > > > Like lookup_len_qstr(), lookup_and_lock() returns -ENOENT if > > LOOKUP_CREATE was NOT given and the name cannot be found,, and returns > > -EEXIST if LOOKUP_EXCL WAS given and the name CAN be found. > > > > These functions replace all uses of lookup_one_qstr() in namei.c > > except for those used for rename. > > > > The name might seem backwards as the lock happens before the lookup. > > A future patch will change this so that only a shared lock is taken > > before the lookup, and an exclusive lock on the dentry is taken after a > > successful lookup. So the order "lookup" then "lock" will make sense. > > > > This functionality is exported as lookup_and_lock_one() which takes a > > name and len rather than a qstr. > > > > Signed-off-by: NeilBrown <neilb@suse.de> > > --- > > fs/namei.c | 102 ++++++++++++++++++++++++++++-------------- > > include/linux/namei.h | 15 ++++++- > > 2 files changed, 83 insertions(+), 34 deletions(-) > > > > diff --git a/fs/namei.c b/fs/namei.c > > index 69610047f6c6..3c0feca081a2 100644 > > --- a/fs/namei.c > > +++ b/fs/namei.c > > @@ -1715,6 +1715,41 @@ struct dentry *lookup_one_qstr(const struct qstr *name, > > } > > EXPORT_SYMBOL(lookup_one_qstr); > > > > +static struct dentry *lookup_and_lock_nested(const struct qstr *last, > > + struct dentry *base, > > + unsigned int lookup_flags, > > + unsigned int subclass) > > +{ > > + struct dentry *dentry; > > + > > + if (!(lookup_flags & LOOKUP_PARENT_LOCKED)) > > + inode_lock_nested(base->d_inode, subclass); > > + > > + dentry = lookup_one_qstr(last, base, lookup_flags); > > + if (IS_ERR(dentry) && !(lookup_flags & LOOKUP_PARENT_LOCKED)) { > > + inode_unlock(base->d_inode); > > Nit: The indentation here is wrong and the {} aren't common practice. Thanks. > > > + } > > + return dentry; > > +} > > + > > +static struct dentry *lookup_and_lock(const struct qstr *last, > > + struct dentry *base, > > + unsigned int lookup_flags) > > +{ > > + return lookup_and_lock_nested(last, base, lookup_flags, > > + I_MUTEX_PARENT); > > +} > > + > > +void done_lookup_and_lock(struct dentry *base, struct dentry *dentry, > > + unsigned int lookup_flags) > > Did you mean done_lookup_and_unlock()? No. The thing that we are done with is "lookup_and_lock()". This matches "done_path_create()" which doesn't create anything. On the other hand we have d_lookup_done() which puts _done at the end. Or end_name_hash(). ->write_end(), finish_automount() I guess I could accept done_lookup_and_unlock() if you prefer that. > > > +{ > > + d_lookup_done(dentry); > > + dput(dentry); > > + if (!(lookup_flags & LOOKUP_PARENT_LOCKED)) > > + inode_unlock(base->d_inode); > > +} > > +EXPORT_SYMBOL(done_lookup_and_lock); > > + > > /** > > * lookup_fast - do fast lockless (but racy) lookup of a dentry > > * @nd: current nameidata > > @@ -2754,12 +2789,9 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct > > path_put(path); > > return ERR_PTR(-EINVAL); > > } > > - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); > > - d = lookup_one_qstr(&last, path->dentry, 0); > > - if (IS_ERR(d)) { > > - inode_unlock(path->dentry->d_inode); > > + d = lookup_and_lock(&last, path->dentry, 0); > > + if (IS_ERR(d)) > > path_put(path); > > - } > > return d; > > } > > > > @@ -3053,6 +3085,22 @@ struct dentry *lookup_positive_unlocked(const char *name, > > } > > EXPORT_SYMBOL(lookup_positive_unlocked); > > > > +struct dentry *lookup_and_lock_one(struct mnt_idmap *idmap, > > + const char *name, int len, struct dentry *base, > > + unsigned int lookup_flags) > > +{ > > + struct qstr this; > > + int err; > > + > > + if (!idmap) > > + idmap = &nop_mnt_idmap; > > The callers should pass nop_mnt_idmap. That's how every function that > takes this argument works. This is a lot more explicit than magically > fixing this up in the function. OK. Thanks, NeilBrown
On Thu, Feb 06, 2025 at 04:42:45PM +1100, NeilBrown wrote: > lookup_and_lock() combines locking the directory and performing a lookup > prior to a change to the directory. > Abstracting this prepares for changing the locking requirements. > > done_lookup_and_lock() provides the inverse of putting the dentry and > unlocking. > > For "silly_rename" we will need to lookup_and_lock() in a directory that > is already locked. For this purpose we add LOOKUP_PARENT_LOCKED. Ewww... I do realize that such things might appear in intermediate stages of locking massage, but they'd better be _GONE_ by the end of it. Conditional locking of that sort is really asking for trouble. If nothing else, better split the function in two variants and document the differences; that kind of stuff really does not belong in arguments. If you need it to exist through the series, that is - if not, you should just leave lookup_one_qstr() for the "locked" case from the very beginning. > This functionality is exported as lookup_and_lock_one() which takes a > name and len rather than a qstr. ... for the sake of ...?
On Fri, Feb 07, 2025 at 08:22:35PM +0000, Al Viro wrote: > On Thu, Feb 06, 2025 at 04:42:45PM +1100, NeilBrown wrote: > > lookup_and_lock() combines locking the directory and performing a lookup > > prior to a change to the directory. > > Abstracting this prepares for changing the locking requirements. > > > > done_lookup_and_lock() provides the inverse of putting the dentry and > > unlocking. > > > > For "silly_rename" we will need to lookup_and_lock() in a directory that > > is already locked. For this purpose we add LOOKUP_PARENT_LOCKED. > > Ewww... I do realize that such things might appear in intermediate > stages of locking massage, but they'd better be _GONE_ by the end of it. > Conditional locking of that sort is really asking for trouble. > > If nothing else, better split the function in two variants and document > the differences; that kind of stuff really does not belong in arguments. > If you need it to exist through the series, that is - if not, you should > just leave lookup_one_qstr() for the "locked" case from the very beginning. The same, BTW, applies to more than LOOKUP_PARENT_LOCKED part. One general observation: if the locking behaviour of a function depends upon the flags passed to it, it's going to cause massive headache afterwards. If you need to bother with data flow analysis to tell what given call will do, expect trouble. If anything, I would rather have separate lookup_for_removal(), etc., each with its locking effects explicitly spelled out. Incidentally, looking through that I would say that your "VFS: filename_create(): fix incorrect intent" is not the right solution. If we hit that condition (no LOOKUP_DIRECTORY and last component ending with slash), we are going to fail anyway, the only question is which error to return. Rules: * if the last component lookup fails, return the error from lookup * if it yields positive, return -EEXIST * if it yields negative, return -ENOENT Correct? So how about this: diff --git a/fs/namei.c b/fs/namei.c index 3ab9440c5b93..6189e54f767a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4054,13 +4054,13 @@ static struct dentry *filename_create(int dfd, struct filename *name, struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; bool want_dir = lookup_flags & LOOKUP_DIRECTORY; - unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; - unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; int err2; int error; - error = filename_parentat(dfd, name, reval_flag, path, &last, &type); + lookup_flags &= LOOKUP_REVAL; + + error = filename_parentat(dfd, name, lookup_flags, path, &last, &type); if (error) return ERR_PTR(error); @@ -4070,18 +4070,28 @@ static struct dentry *filename_create(int dfd, struct filename *name, */ if (unlikely(type != LAST_NORM)) goto out; + /* + * mkdir foo/bar/ is OK, but for anything else a slash in the end + * is always an error; the only question is which one. + */ + if (unlikely(last.name[last.len] && !want_dir)) { + dentry = lookup_dcache(&last, path->dentry, lookup_flags); + if (!dentry) + dentry = lookup_slow(&last, path->dentry, lookup_flags); + if (!IS_ERR(dentry)) { + error = d_is_positive(dentry) ? -EEXIST : -ENOENT; + dput(dentry); + dentry = ERR_PTR(error); + } + goto out; + } /* don't fail immediately if it's r/o, at least try to report other errors */ err2 = mnt_want_write(path->mnt); - /* - * Do the final lookup. Suppress 'create' if there is a trailing - * '/', and a directory wasn't requested. - */ - if (last.name[last.len] && !want_dir) - create_flags = 0; + /* do the final lookup */ inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path->dentry, - reval_flag | create_flags); + lookup_flags | LOOKUP_CREATE | LOOKUP_EXCL); if (IS_ERR(dentry)) goto unlock; @@ -4089,16 +4099,6 @@ static struct dentry *filename_create(int dfd, struct filename *name, if (d_is_positive(dentry)) goto fail; - /* - * Special case - lookup gave negative, but... we had foo/bar/ - * From the vfs_mknod() POV we just have a negative dentry - - * all is fine. Let's be bastards - you had / on the end, you've - * been asking for (non-existent) directory. -ENOENT for you. - */ - if (unlikely(!create_flags)) { - error = -ENOENT; - goto fail; - } if (unlikely(err2)) { error = err2; goto fail;
On Sat, 08 Feb 2025, Al Viro wrote: > On Thu, Feb 06, 2025 at 04:42:45PM +1100, NeilBrown wrote: > > lookup_and_lock() combines locking the directory and performing a lookup > > prior to a change to the directory. > > Abstracting this prepares for changing the locking requirements. > > > > done_lookup_and_lock() provides the inverse of putting the dentry and > > unlocking. > > > > For "silly_rename" we will need to lookup_and_lock() in a directory that > > is already locked. For this purpose we add LOOKUP_PARENT_LOCKED. > > Ewww... I do realize that such things might appear in intermediate > stages of locking massage, but they'd better be _GONE_ by the end of it. > Conditional locking of that sort is really asking for trouble. > > If nothing else, better split the function in two variants and document > the differences; that kind of stuff really does not belong in arguments. > If you need it to exist through the series, that is - if not, you should > just leave lookup_one_qstr() for the "locked" case from the very beginning. That's what I did at first, but then when I realised I had to pass the lookup flags around everywhere.... Will revert. > > > This functionality is exported as lookup_and_lock_one() which takes a > > name and len rather than a qstr. > > ... for the sake of ...? nfsd. Thanks, NeilBrown
On Sun, 09 Feb 2025, Al Viro wrote: > On Fri, Feb 07, 2025 at 08:22:35PM +0000, Al Viro wrote: > > On Thu, Feb 06, 2025 at 04:42:45PM +1100, NeilBrown wrote: > > > lookup_and_lock() combines locking the directory and performing a lookup > > > prior to a change to the directory. > > > Abstracting this prepares for changing the locking requirements. > > > > > > done_lookup_and_lock() provides the inverse of putting the dentry and > > > unlocking. > > > > > > For "silly_rename" we will need to lookup_and_lock() in a directory that > > > is already locked. For this purpose we add LOOKUP_PARENT_LOCKED. > > > > Ewww... I do realize that such things might appear in intermediate > > stages of locking massage, but they'd better be _GONE_ by the end of it. > > Conditional locking of that sort is really asking for trouble. > > > > If nothing else, better split the function in two variants and document > > the differences; that kind of stuff really does not belong in arguments. > > If you need it to exist through the series, that is - if not, you should > > just leave lookup_one_qstr() for the "locked" case from the very beginning. > > The same, BTW, applies to more than LOOKUP_PARENT_LOCKED part. > > One general observation: if the locking behaviour of a function depends > upon the flags passed to it, it's going to cause massive headache afterwards. > > If you need to bother with data flow analysis to tell what given call will > do, expect trouble. > > If anything, I would rather have separate lookup_for_removal(), etc., each > with its locking effects explicitly spelled out. Incidentally, looking lookup_for_removal() etc would only be temporarily needed. Eventually (I hope) we would get to a place where all filesystems support all operations with only a shared lock. When we get there, lookup_for_remove() and lookup_for_create() would be identical again. And the difference wouldn't be that one takes a shared lock and the other takes an exclusive lock. It would be that one takes a shared or exclusive lock based on flag X stored somewhere (inode, inode_operations, ...) while the other takes a shared or exclusive lock based on flag Y. It would be nice to be able to accelerate that and push the locking down into the filesystems call at once as Linus suggested last time: https://lore.kernel.org/all/CAHk-=whz69y=98udgGB5ujH6bapYuapwfHS2esWaFrKEoi9-Ow@mail.gmail.com/ That would require either adding a new rwsem to each inode, possibly in the filesystem-private part of the inode, or changing VFS to not lock the inode at all. The first would be unwelcome by fs developers I expect, the second would be a serious challenge. I started thinking about and quickly decided I had enough challenges already. So I think we need some way for the VFS to determine and select the lock type requires by the filesystem. Christian suggested a flag in inode_operations and think that is a good idea. I originally suggested a flag in the superblock, but Linus suggested different operations might want different locking (same email linked above). But I don't think we can get away without having conditional locking (like we already do in open_last_lookup() depending on O_CREAT). Thanks, NeilBrown
On Wed, Feb 12, 2025 at 04:22:16PM +1100, NeilBrown wrote: > lookup_for_removal() etc would only be temporarily needed. Eventually > (I hope) we would get to a place where all filesystems support all > operations with only a shared lock. When we get there, > lookup_for_remove() and lookup_for_create() would be identical again. > > And the difference wouldn't be that one takes a shared lock and the > other takes an exclusive lock. It would be that one takes a shared or > exclusive lock based on flag X stored somewhere (inode, inode_operations, > ...) while the other takes a shared or exclusive lock based on flag Y. > > It would be nice to be able to accelerate that and push the locking down > into the filesystems call at once as Linus suggested last time: > > https://lore.kernel.org/all/CAHk-=whz69y=98udgGB5ujH6bapYuapwfHS2esWaFrKEoi9-Ow@mail.gmail.com/ > > That would require either adding a new rwsem to each inode, possibly in > the filesystem-private part of the inode, or changing VFS to not lock > the inode at all. The first would be unwelcome by fs developers I > expect, the second would be a serious challenge. I started thinking > about and quickly decided I had enough challenges already. I think it's the wrong way to go. Your "in-update" state does make sense, but it doesn't go far enough and it's not really about parallel anything - it's simply "this dentry is nailed down <here> with <this> name for now". And _that_ is really useful, provided that it's reliable. What we need to avoid is d_drop()/d_rehash() windows, when that "operated upon" dentry ceases to be visible. Currently we can do that, provided that parent is held exclusive. Any lookup will hit dcache miss and proceed to lookup_slow() path, which will block on attempt to get the parent shared. As soon as you switch to holding parent shared, that pattern becomes a source of problems. And if we deal with that, there's not much reason to nest this dentry lock inside ->i_rwsem. Then ->i_rwsem would become easy to push inside the methods. Right now the fundamental problem with your locking is that you get dentry locks sandwiched between ->i_rwsem on parents and that on children. We can try to be clever with how we acquire them (have ->d_parent rechecked before going to sleep, etc.), but that's rather brittle. _IF_ we push them outside of ->i_rwsem, the role of ->i_rwsem would shrink to protecting (1) the directory internal representation, (2) emptiness checks and (3) link counts. What goes away is "we are holding it exclusive, so anything that comes here with dcache miss won't get around to doing anything until we unlock".
On Wed, Feb 12, 2025 at 03:51:32PM +0000, Al Viro wrote: > And _that_ is really useful, provided that it's reliable. What we > need to avoid is d_drop()/d_rehash() windows, when that "operated > upon" dentry ceases to be visible. ... which is easier to do these days - NFS doesn't do it anymore (AFS still does, though). There's also a bit of magical mystery shite in exfat_lookup()... IIRC, we used to have something similar in VFAT as well, and it had been bloody bogus... Actually, this one is worse - this /* * Unhashed alias is able to exist because of revalidate() * called by lookup_fast. You can easily make this status * by calling create and lookup concurrently * In such case, we reuse an alias instead of new dentry */ in there is utter nonsense - exfat_d_revalidate() never tells you to drop positive dentries, to start with. Check for disconnected stuff is also bogus (reasoning in "vfat: simplify checks in vfat_lookup()" applies), d_drop(dentry) is pointless (->lookup() argument is not hashed), for directories we don't give a rat's arse whether it's hashed or not (d_splice_alias() will DTRT) and for non-directories the next case in there (d_move() and return alias) will work, hashed or unhashed. Now, the case of alias dentry being locked is interesting (both for exfat and vfat)...
diff --git a/fs/namei.c b/fs/namei.c index 69610047f6c6..3c0feca081a2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1715,6 +1715,41 @@ struct dentry *lookup_one_qstr(const struct qstr *name, } EXPORT_SYMBOL(lookup_one_qstr); +static struct dentry *lookup_and_lock_nested(const struct qstr *last, + struct dentry *base, + unsigned int lookup_flags, + unsigned int subclass) +{ + struct dentry *dentry; + + if (!(lookup_flags & LOOKUP_PARENT_LOCKED)) + inode_lock_nested(base->d_inode, subclass); + + dentry = lookup_one_qstr(last, base, lookup_flags); + if (IS_ERR(dentry) && !(lookup_flags & LOOKUP_PARENT_LOCKED)) { + inode_unlock(base->d_inode); + } + return dentry; +} + +static struct dentry *lookup_and_lock(const struct qstr *last, + struct dentry *base, + unsigned int lookup_flags) +{ + return lookup_and_lock_nested(last, base, lookup_flags, + I_MUTEX_PARENT); +} + +void done_lookup_and_lock(struct dentry *base, struct dentry *dentry, + unsigned int lookup_flags) +{ + d_lookup_done(dentry); + dput(dentry); + if (!(lookup_flags & LOOKUP_PARENT_LOCKED)) + inode_unlock(base->d_inode); +} +EXPORT_SYMBOL(done_lookup_and_lock); + /** * lookup_fast - do fast lockless (but racy) lookup of a dentry * @nd: current nameidata @@ -2754,12 +2789,9 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path_put(path); return ERR_PTR(-EINVAL); } - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr(&last, path->dentry, 0); - if (IS_ERR(d)) { - inode_unlock(path->dentry->d_inode); + d = lookup_and_lock(&last, path->dentry, 0); + if (IS_ERR(d)) path_put(path); - } return d; } @@ -3053,6 +3085,22 @@ struct dentry *lookup_positive_unlocked(const char *name, } EXPORT_SYMBOL(lookup_positive_unlocked); +struct dentry *lookup_and_lock_one(struct mnt_idmap *idmap, + const char *name, int len, struct dentry *base, + unsigned int lookup_flags) +{ + struct qstr this; + int err; + + if (!idmap) + idmap = &nop_mnt_idmap; + err = lookup_one_common(idmap, name, base, len, &this); + if (err) + return ERR_PTR(err); + return lookup_and_lock(&this, base, lookup_flags); +} +EXPORT_SYMBOL(lookup_and_lock_one); + #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { @@ -4071,7 +4119,6 @@ static struct dentry *filename_create(int dfd, struct filename *name, unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; - int err2; int error; error = filename_parentat(dfd, name, reval_flag, path, &last, &type); @@ -4083,36 +4130,30 @@ static struct dentry *filename_create(int dfd, struct filename *name, * (foo/., foo/.., /////) */ if (unlikely(type != LAST_NORM)) - goto out; + goto put; /* don't fail immediately if it's r/o, at least try to report other errors */ - err2 = mnt_want_write(path->mnt); + error = mnt_want_write(path->mnt); /* * Do the final lookup. Suppress 'create' if there is a trailing * '/', and a directory wasn't requested. */ if (last.name[last.len] && !want_dir) create_flags &= ~LOOKUP_CREATE; - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - dentry = lookup_one_qstr(&last, path->dentry, - reval_flag | create_flags); + dentry = lookup_and_lock(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) - goto unlock; + goto drop; - if (unlikely(err2)) { - error = err2; + if (unlikely(error)) goto fail; - } return dentry; fail: - d_lookup_done(dentry); - dput(dentry); + done_lookup_and_lock(path->dentry, dentry, reval_flag | create_flags); dentry = ERR_PTR(error); -unlock: - inode_unlock(path->dentry->d_inode); - if (!err2) +drop: + if (!error) mnt_drop_write(path->mnt); -out: +put: path_put(path); return dentry; } @@ -4130,14 +4171,13 @@ EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { - dput(dentry); - inode_unlock(path->dentry->d_inode); + done_lookup_and_lock(path->dentry, dentry, LOOKUP_CREATE); mnt_drop_write(path->mnt); path_put(path); } EXPORT_SYMBOL(done_path_create); -inline struct dentry *user_path_create(int dfd, const char __user *pathname, +struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); @@ -4510,19 +4550,18 @@ int do_rmdir(int dfd, struct filename *name) if (error) goto exit2; - inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = lookup_one_qstr(&last, path.dentry, lookup_flags); + dentry = lookup_and_lock(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; + error = security_path_rmdir(&path, dentry); if (error) goto exit4; error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); exit4: - dput(dentry); + done_lookup_and_lock(path.dentry, dentry, lookup_flags); exit3: - inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit2: path_put(&path); @@ -4639,11 +4678,9 @@ int do_unlinkat(int dfd, struct filename *name) if (error) goto exit2; retry_deleg: - inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = lookup_one_qstr(&last, path.dentry, lookup_flags); + dentry = lookup_and_lock(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { - /* Why not before? Because we want correct error value */ if (last.name[last.len]) goto slashes; @@ -4655,9 +4692,8 @@ int do_unlinkat(int dfd, struct filename *name) error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, &delegated_inode); exit3: - dput(dentry); + done_lookup_and_lock(path.dentry, dentry, lookup_flags); } - inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; diff --git a/include/linux/namei.h b/include/linux/namei.h index 0d81e571a159..76c587a5ec3a 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -29,7 +29,11 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; #define LOOKUP_RCU BIT(8) /* RCU pathwalk mode; semi-internal */ #define LOOKUP_CACHED BIT(9) /* Only do cached lookup */ #define LOOKUP_PARENT BIT(10) /* Looking up final parent in path */ -/* 5 spare bits for pathwalk */ +#define LOOKUP_PARENT_LOCKED BIT(11) /* filesystem sets this for nested + * "lookup_and_lock_one" when it knows + * parent is sufficiently locked. + */ +/* 4 spare bits for pathwalk */ /* These tell filesystem methods that we are dealing with the final component... */ #define LOOKUP_OPEN BIT(16) /* ... in open */ @@ -82,6 +86,15 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len); +struct dentry *lookup_and_lock_one(struct mnt_idmap *idmap, + const char *name, int len, struct dentry *base, + unsigned int lookup_flags); +struct dentry *__lookup_and_lock_one(struct mnt_idmap *idmap, + const char *name, int len, struct dentry *base, + unsigned int lookup_flags); +void done_lookup_and_lock(struct dentry *base, struct dentry *dentry, + unsigned int lookup_flags); +void __done_lookup_and_lock(struct dentry *dentry); extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags);
lookup_and_lock() combines locking the directory and performing a lookup prior to a change to the directory. Abstracting this prepares for changing the locking requirements. done_lookup_and_lock() provides the inverse of putting the dentry and unlocking. For "silly_rename" we will need to lookup_and_lock() in a directory that is already locked. For this purpose we add LOOKUP_PARENT_LOCKED. Like lookup_len_qstr(), lookup_and_lock() returns -ENOENT if LOOKUP_CREATE was NOT given and the name cannot be found,, and returns -EEXIST if LOOKUP_EXCL WAS given and the name CAN be found. These functions replace all uses of lookup_one_qstr() in namei.c except for those used for rename. The name might seem backwards as the lock happens before the lookup. A future patch will change this so that only a shared lock is taken before the lookup, and an exclusive lock on the dentry is taken after a successful lookup. So the order "lookup" then "lock" will make sense. This functionality is exported as lookup_and_lock_one() which takes a name and len rather than a qstr. Signed-off-by: NeilBrown <neilb@suse.de> --- fs/namei.c | 102 ++++++++++++++++++++++++++++-------------- include/linux/namei.h | 15 ++++++- 2 files changed, 83 insertions(+), 34 deletions(-)