diff mbox

[POC/RFC] overlayfs: constant inode numbers

Message ID 20161125212934.GB2622@veci.piliscsaba.szeredi.hu (mailing list archive)
State New, archived
Headers show

Commit Message

Miklos Szeredi Nov. 25, 2016, 9:29 p.m. UTC
Here's a really preliminary patch to allow inode numbers to be constant across
copy ups and be consistent between st_ino and d_ino.

It only works if underlying lower and upper dirs are all on the same filesystem
(so there's only a single inode namespace to deal with).

Performance of readdir is probably dismal, definitely needs improving.

Readdir of overlay root doesn't get ino of "." right.  This probably needs
special casing.

And it doesn't yet deal with the hard link copy-up issue, which is a somewhat
separate problem.

Thanks,
Miklos

---
 fs/overlayfs/copy_up.c   |   16 +++++++--
 fs/overlayfs/dir.c       |    9 +----
 fs/overlayfs/inode.c     |   45 +++++++++++++++++++++++++--
 fs/overlayfs/namei.c     |   17 +++++++++-
 fs/overlayfs/overlayfs.h |    9 ++++-
 fs/overlayfs/ovl_entry.h |    1 
 fs/overlayfs/readdir.c   |   78 +++++++++++++++++++++++++++++++++++++----------
 fs/overlayfs/util.c      |   15 +++++++++
 8 files changed, 160 insertions(+), 30 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Amir Goldstein Nov. 28, 2016, 9:10 a.m. UTC | #1
[Re-adding CC list]

On Sat, Nov 26, 2016 at 10:14 AM, Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Friday, November 25, 2016, Miklos Szeredi <miklos@szeredi.hu> wrote:
>>
>> Here's a really preliminary patch to allow inode numbers to be constant
>> across
>> copy ups and be consistent between st_ino and d_ino.
>>
>> It only works if underlying lower and upper dirs are all on the same
>> filesystem
>> (so there's only a single inode namespace to deal with).
>>
>> Performance of readdir is probably dismal, definitely needs improving.
>>
>> Readdir of overlay root doesn't get ino of "." right.  This probably needs
>> special casing.
>>
>> And it doesn't yet deal with the hard link copy-up issue, which is a
>> somewhat
>> separate problem.
>>
>> Thanks,
>> Miklos
>>
>> ---
>>  fs/overlayfs/copy_up.c   |   16 +++++++--
>>  fs/overlayfs/dir.c       |    9 +----
>>  fs/overlayfs/inode.c     |   45 +++++++++++++++++++++++++--
>>  fs/overlayfs/namei.c     |   17 +++++++++-
>>  fs/overlayfs/overlayfs.h |    9 ++++-
>>  fs/overlayfs/ovl_entry.h |    1
>>  fs/overlayfs/readdir.c   |   78
>> +++++++++++++++++++++++++++++++++++++----------
>>  fs/overlayfs/util.c      |   15 +++++++++
>>  8 files changed, 160 insertions(+), 30 deletions(-)
>>
>> --- a/fs/overlayfs/copy_up.c
>> +++ b/fs/overlayfs/copy_up.c
>> @@ -177,6 +177,7 @@ int ovl_set_attr(struct dentry *upperden
>>  {
>>         int err = 0;
>>
>> +       inode_lock(upperdentry->d_inode);
>>         if (!S_ISLNK(stat->mode)) {
>>                 struct iattr attr = {
>>                         .ia_valid = ATTR_MODE,
>> @@ -194,6 +195,15 @@ int ovl_set_attr(struct dentry *upperden
>>         }
>>         if (!err)
>>                 ovl_set_timestamps(upperdentry, stat);
>> +       inode_unlock(upperdentry->d_inode);
>> +
>> +       if (!err) {
>> +               char buf[17];
>> +
>> +               snprintf(buf, sizeof(buf), "%llx", stat->ino);
>> +               err = ovl_do_setxattr(upperdentry, OVL_XATTR_INO,
>> +                                     buf, strlen(buf), 0);
>> +       }
>>
>>         return err;
>>  }
>> @@ -258,12 +268,12 @@ static int ovl_copy_up_locked(struct den
>>         if (err)
>>                 goto out_cleanup;
>>
>> -       inode_lock(newdentry->d_inode);
>>         err = ovl_set_attr(newdentry, stat);
>> -       inode_unlock(newdentry->d_inode);
>>         if (err)
>>                 goto out_cleanup;
>>
>> +       ovl_dentry_set_ino(dentry, stat->ino);
>> +
>
>

Shouldn't we propagate stored ino all the way
from lowest layer to preserve ino across layer recycling?

Specifically, shouldn't ino of merged dir expose the lower most ino?

>
>>
>>         err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
>>         if (err)
>>                 goto out_cleanup;
>> @@ -379,7 +389,7 @@ int ovl_copy_up(struct dentry *dentry)
>>                 }
>>
>>                 ovl_path_lower(next, &lowerpath);
>> -               err = vfs_getattr(&lowerpath, &stat);
>> +               err = ovl_getattr_int(next, &lowerpath, &stat);
>>                 if (!err)
>>                         err = ovl_copy_up_one(parent, next, &lowerpath,
>> &stat);
>>
>> --- a/fs/overlayfs/overlayfs.h
>> +++ b/fs/overlayfs/overlayfs.h
>> @@ -20,6 +20,7 @@ enum ovl_path_type {
>>  #define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay."
>>  #define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque"
>>  #define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect"
>> +#define OVL_XATTR_INO OVL_XATTR_PREFIX "ino"
>>
>>  #define OVL_ISUPPER_MASK 1UL
>>
>> @@ -161,6 +162,8 @@ bool ovl_redirect_dir(struct super_block
>>  void ovl_clear_redirect_dir(struct super_block *sb);
>>  const char *ovl_dentry_get_redirect(struct dentry *dentry);
>>  void ovl_dentry_set_redirect(struct dentry *dentry, const char
>> *redirect);
>> +u64 ovl_dentry_get_ino(struct dentry *dentry);
>> +void ovl_dentry_set_ino(struct dentry *dentry, u64 ino);
>>  void ovl_dentry_update(struct dentry *dentry, struct dentry
>> *upperdentry);
>>  void ovl_inode_init(struct inode *inode, struct inode *realinode,
>>                     bool is_upper);
>> @@ -171,9 +174,10 @@ bool ovl_is_whiteout(struct dentry *dent
>>  struct file *ovl_path_open(struct path *path, int flags);
>>
>>  /* namei.c */
>> -int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
>> +int ovl_path_next(int idx, struct dentry *dentry, struct path *path, int
>> *idxp);
>>  struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
>> unsigned int flags);
>>  bool ovl_lower_positive(struct dentry *dentry);
>> +struct dentry *ovl_dentry_at_idx(struct dentry *dentry, int idx);
>>
>>  /* readdir.c */
>>  extern const struct file_operations ovl_dir_operations;
>> @@ -196,6 +200,9 @@ struct posix_acl *ovl_get_acl(struct ino
>>  int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int
>> file_flags);
>>  int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
>>  bool ovl_is_private_xattr(const char *name);
>> +int ovl_getattr_int(struct dentry *dentry, struct path *realpath,
>> +                   struct kstat *stat);
>> +void ovl_get_ino(struct dentry *realdentry, u64 *ino);
>>
>>  struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t
>> rdev);
>>  struct inode *ovl_get_inode(struct super_block *sb, struct inode
>> *realinode);
>> --- a/fs/overlayfs/inode.c
>> +++ b/fs/overlayfs/inode.c
>> @@ -15,6 +15,7 @@
>>  #include <linux/module.h>
>>  #include <linux/file.h>
>>  #include <linux/hashtable.h>
>> +#include <linux/ratelimit.h>
>>  #include "overlayfs.h"
>>
>>  static int ovl_copy_up_truncate(struct dentry *dentry)
>> @@ -33,7 +34,7 @@ static int ovl_copy_up_truncate(struct d
>>         ovl_path_lower(dentry, &lowerpath);
>>
>>         old_cred = ovl_override_creds(dentry->d_sb);
>> -       err = vfs_getattr(&lowerpath, &stat);
>> +       err = ovl_getattr_int(dentry, &lowerpath, &stat);
>>         if (!err) {
>>                 stat.size = 0;
>>                 err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
>> @@ -88,6 +89,46 @@ int ovl_setattr(struct dentry *dentry, s
>>         return err;
>>  }
>>
>> +void ovl_get_ino(struct dentry *realdentry, u64 *ino)
>> +{
>> +       int err;
>> +       char buf[18];
>> +
>> +       err = vfs_getxattr(realdentry, OVL_XATTR_INO,
>> +                          buf, sizeof(buf) - 1);
>> +       if (err < 0) {
>> +               if (err != -ENODATA && err != -EOPNOTSUPP)
>> +                       pr_warn_ratelimited("overlay: failed to get ino
>> (%i)\n", err);
>> +       } else {
>> +               buf[err] = '\0';
>> +               err = kstrtoull(buf, 16, ino);
>> +               if (err)
>> +                       pr_warn("overlay: invalid ino (%s)\n", buf);
>> +       }
>> +}
>> +
>> +int ovl_getattr_int(struct dentry *dentry, struct path *realpath,
>> +                   struct kstat *stat)
>> +{
>> +       int err;
>> +       u64 ino;
>> +
>> +       err = vfs_getattr(realpath, stat);
>> +       if (err)
>> +               return err;
>> +
>> +       ino = ovl_dentry_get_ino(dentry);
>> +       if (!ino) {
>> +               ino = stat->ino;
>> +               ovl_get_ino(realpath->dentry, &ino);
>> +               ovl_dentry_set_ino(dentry, ino);
>> +       }
>> +       stat->dev = dentry->d_sb->s_dev;
>> +       stat->ino = ino;
>> +
>> +       return 0;
>> +}
>> +
>>  static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
>>                          struct kstat *stat)
>>  {
>> @@ -97,7 +138,7 @@ static int ovl_getattr(struct vfsmount *
>>
>>         ovl_path_real(dentry, &realpath);
>>         old_cred = ovl_override_creds(dentry->d_sb);
>> -       err = vfs_getattr(&realpath, stat);
>> +       err = ovl_getattr_int(dentry, &realpath, stat);
>>         revert_creds(old_cred);
>>         return err;
>>  }
>> --- a/fs/overlayfs/ovl_entry.h
>> +++ b/fs/overlayfs/ovl_entry.h
>> @@ -36,6 +36,7 @@ struct ovl_entry {
>>         union {
>>                 struct {
>>                         u64 version;
>> +                       u64 ino;
>>                         const char *redirect;
>>                         bool opaque;
>>                 };
>> --- a/fs/overlayfs/util.c
>> +++ b/fs/overlayfs/util.c
>> @@ -205,6 +205,21 @@ void ovl_dentry_set_redirect(struct dent
>>         oe->redirect = redirect;
>>  }
>>
>> +u64 ovl_dentry_get_ino(struct dentry *dentry)
>> +{
>> +       struct ovl_entry *oe = dentry->d_fsdata;
>> +
>> +       return oe->ino;
>> +}
>> +
>> +void ovl_dentry_set_ino(struct dentry *dentry, u64 ino)
>> +{
>> +       struct ovl_entry *oe = dentry->d_fsdata;
>> +
>> +       WARN_ON(oe->ino && oe->ino != ino);
>> +       oe->ino = ino;
>> +}
>> +
>>  void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
>>  {
>>         struct ovl_entry *oe = dentry->d_fsdata;
>> --- a/fs/overlayfs/dir.c
>> +++ b/fs/overlayfs/dir.c
>> @@ -143,14 +143,11 @@ static int ovl_dir_getattr(struct vfsmou
>>
>>         type = ovl_path_real(dentry, &realpath);
>>         old_cred = ovl_override_creds(dentry->d_sb);
>> -       err = vfs_getattr(&realpath, stat);
>> +       err = ovl_getattr_int(dentry, &realpath, stat);
>>         revert_creds(old_cred);
>>         if (err)
>>                 return err;
>>
>> -       stat->dev = dentry->d_sb->s_dev;
>> -       stat->ino = dentry->d_inode->i_ino;
>> -
>>         /*
>>          * It's probably not worth it to count subdirs to get the
>>          * correct link count.  nlink=1 seems to pacify 'find' and
>> @@ -250,7 +247,7 @@ static struct dentry *ovl_clear_empty(st
>>                 goto out;
>>
>>         ovl_path_upper(dentry, &upperpath);
>> -       err = vfs_getattr(&upperpath, &stat);
>> +       err = ovl_getattr_int(dentry, &upperpath, &stat);
>>         if (err)
>>                 goto out_unlock;
>>
>> @@ -278,9 +275,7 @@ static struct dentry *ovl_clear_empty(st
>>         if (err)
>>                 goto out_cleanup;
>>
>> -       inode_lock(opaquedir->d_inode);
>>         err = ovl_set_attr(opaquedir, &stat);
>> -       inode_unlock(opaquedir->d_inode);
>>         if (err)
>>                 goto out_cleanup;
>>
>> --- a/fs/overlayfs/namei.c
>> +++ b/fs/overlayfs/namei.c
>> @@ -186,23 +186,36 @@ static int ovl_lookup_layer(struct dentr
>>   * Returns next layer in stack starting from top.
>>   * Returns -1 if this is the last layer.
>>   */
>> -int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
>> +int ovl_path_next(int idx, struct dentry *dentry, struct path *path, int
>> *idxp)
>>  {
>>         struct ovl_entry *oe = dentry->d_fsdata;
>>
>>         BUG_ON(idx < 0);
>>         if (idx == 0) {
>>                 ovl_path_upper(dentry, path);
>> -               if (path->dentry)
>> +               if (path->dentry) {
>> +                       *idxp = 0;
>>                         return oe->numlower ? 1 : -1;
>> +               }
>>                 idx++;
>>         }
>>         BUG_ON(idx > oe->numlower);
>> +       *idxp = idx;
>>         *path = oe->lowerstack[idx - 1];
>>
>>         return (idx < oe->numlower) ? idx + 1 : -1;
>>  }
>>
>> +struct dentry *ovl_dentry_at_idx(struct dentry *dentry, int idx)
>> +{
>> +       struct ovl_entry *oe = dentry->d_fsdata;
>> +
>> +       if (idx == 0)
>> +               return ovl_upperdentry_dereference(oe);
>> +       else
>> +               return oe->lowerstack[idx - 1].dentry;
>> +}
>> +
>>  struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
>>                           unsigned int flags)
>>  {
>> --- a/fs/overlayfs/readdir.c
>> +++ b/fs/overlayfs/readdir.c
>> @@ -20,10 +20,12 @@
>>  struct ovl_cache_entry {
>>         unsigned int len;
>>         unsigned int type;
>> +       u64 orig_ino;
>>         u64 ino;
>>         struct list_head l_node;
>>         struct rb_node node;
>>         struct ovl_cache_entry *next_maybe_whiteout;
>> +       int idx;
>>         bool is_whiteout;
>>         char name[];
>>  };
>> @@ -36,13 +38,14 @@ struct ovl_dir_cache {
>>
>>  struct ovl_readdir_data {
>>         struct dir_context ctx;
>> -       struct dentry *dentry;
>> +       bool check_whiteouts;
>>         bool is_lowest;
>>         struct rb_root root;
>>         struct list_head *list;
>>         struct list_head middle;
>>         struct ovl_cache_entry *first_maybe_whiteout;
>>         int count;
>> +       int idx;
>>         int err;
>>         bool d_type_supported;
>>  };
>> @@ -97,8 +100,10 @@ static struct ovl_cache_entry *ovl_cache
>>         p->name[len] = '\0';
>>         p->len = len;
>>         p->type = d_type;
>> -       p->ino = ino;
>> +       p->orig_ino = ino;
>> +       p->ino = 0;
>>         p->is_whiteout = false;
>> +       p->idx = rdd->idx;
>>
>>         if (d_type == DT_CHR) {
>>                 p->next_maybe_whiteout = rdd->first_maybe_whiteout;
>> @@ -206,9 +211,6 @@ static int ovl_check_whiteouts(struct de
>>         int err;
>>         struct ovl_cache_entry *p;
>>         struct dentry *dentry;
>> -       const struct cred *old_cred;
>> -
>> -       old_cred = ovl_override_creds(rdd->dentry->d_sb);
>>
>>         err = down_write_killable(&dir->d_inode->i_rwsem);
>>         if (!err) {
>> @@ -223,7 +225,6 @@ static int ovl_check_whiteouts(struct de
>>                 }
>>                 inode_unlock(dir->d_inode);
>>         }
>> -       revert_creds(old_cred);
>>
>>         return err;
>>  }
>> @@ -248,7 +249,7 @@ static inline int ovl_dir_read(struct pa
>>                         err = rdd->err;
>>         } while (!err && rdd->count);
>>
>> -       if (!err && rdd->first_maybe_whiteout && rdd->dentry)
>> +       if (!err && rdd->first_maybe_whiteout && rdd->check_whiteouts)
>>                 err = ovl_check_whiteouts(realpath->dentry, rdd);
>>
>>         fput(realfile);
>> @@ -268,7 +269,7 @@ static void ovl_dir_reset(struct file *f
>>                 od->cache = NULL;
>>                 od->cursor = NULL;
>>         }
>> -       WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
>> +//     WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
>>         if (od->is_real && OVL_TYPE_MERGE(type))
>>                 od->is_real = false;
>>  }
>> @@ -279,7 +280,7 @@ static int ovl_dir_read_merged(struct de
>>         struct path realpath;
>>         struct ovl_readdir_data rdd = {
>>                 .ctx.actor = ovl_fill_merge,
>> -               .dentry = dentry,
>> +               .check_whiteouts = true,
>>                 .list = list,
>>                 .root = RB_ROOT,
>>                 .is_lowest = false,
>> @@ -287,7 +288,7 @@ static int ovl_dir_read_merged(struct de
>>         int idx, next;
>>
>>         for (idx = 0; idx != -1; idx = next) {
>> -               next = ovl_path_next(idx, dentry, &realpath);
>> +               next = ovl_path_next(idx, dentry, &realpath, &rdd.idx);
>>
>>                 if (next != -1) {
>>                         err = ovl_dir_read(&realpath, &rdd);
>> @@ -353,11 +354,45 @@ static struct ovl_dir_cache *ovl_cache_g
>>         return cache;
>>  }
>>
>> +static int ovl_cache_entry_update_ino(struct dentry *dir,
>> +                                     struct ovl_cache_entry *p)
>> +
>> +{
>> +       struct dentry *this;
>> +       struct dentry *base = ovl_dentry_at_idx(dir, p->idx);
>> +
>> +       if (p->name[0] == '.') {
>> +               if (p->len == 1) {
>> +                       this = dget(base);
>> +                       goto get;
>> +               }
>> +               if (p->len == 2 && p->name[1] == '.') {
>> +                       /* ♫ we shall not be moved */
>> +                       this = dget(ovl_dentry_real(dir->d_parent));
>> +                       goto get;
>> +               }
>> +       }
>> +       this = lookup_one_len_unlocked(p->name, base, p->len);
>> +       if (IS_ERR(this)) {
>> +               pr_err("overlay: failed to look up (%s) for ino (%i)\n",
>> +                      p->name, (int) PTR_ERR(this));
>> +               return -EIO;
>> +       }
>>
>> +       get:
>> +       p->ino = p->orig_ino;
>> +       ovl_get_ino(this, &p->ino);

I may be way off here, but why do you need to lookup entry and get ino
from xattr at all? Wouldn't it be easier to not add the entry to the list if
it was copied up and rely on the fact that it will be added to list in iter
of lower layer with original ino with no extra effort?

For that matter, I like the fact that every copied up entry will be explicitly
marked with OVL_XATTR_INO. In a way, it is the opposite of
OVL_XATTR_OPAQUE and if the former becomes a standard, the latter
will become redundant. Arguably, it is preferred to mark the copy ups
as special case rather then the pure upper files, which can then stay 'pure'.

>>
>> +       dput(this);
>> +
>> +       return 0;
>> +}
>> +
>>  static int ovl_iterate(struct file *file, struct dir_context *ctx)
>>  {
>>         struct ovl_dir_file *od = file->private_data;
>>         struct dentry *dentry = file->f_path.dentry;
>>         struct ovl_cache_entry *p;
>> +       const struct cred *old_cred;
>> +       int err;
>>
>>         if (!ctx->pos)
>>                 ovl_dir_reset(file);
>> @@ -365,12 +400,14 @@ static int ovl_iterate(struct file *file
>>         if (od->is_real)
>>                 return iterate_dir(od->realfile, ctx);
>>
>> +       old_cred = ovl_override_creds(dentry->d_sb);
>>         if (!od->cache) {
>>                 struct ovl_dir_cache *cache;
>>
>>                 cache = ovl_cache_get(dentry);
>> +               err = PTR_ERR(cache);
>>                 if (IS_ERR(cache))
>> -                       return PTR_ERR(cache);
>> +                       goto out;
>>
>>                 od->cache = cache;
>>                 ovl_seek_cursor(od, ctx->pos);
>> @@ -378,13 +415,23 @@ static int ovl_iterate(struct file *file
>>
>>         while (od->cursor != &od->cache->entries) {
>>                 p = list_entry(od->cursor, struct ovl_cache_entry,
>> l_node);
>> -               if (!p->is_whiteout)
>> +               if (!p->is_whiteout) {
>> +                       if (!p->ino) {
>> +                               err = ovl_cache_entry_update_ino(dentry,
>> p);
>> +                               if (err)
>> +                                       goto out;
>> +                       }
>>                         if (!dir_emit(ctx, p->name, p->len, p->ino,
>> p->type))
>>                                 break;
>> +               }
>>                 od->cursor = p->l_node.next;
>>                 ctx->pos++;
>>         }
>> -       return 0;
>> +       err = 0;
>> +out:
>> +       revert_creds(old_cred);
>> +
>> +       return err;
>>  }
>>
>>  static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int
>> origin)
>> @@ -502,7 +549,8 @@ static int ovl_dir_open(struct inode *in
>>                 return PTR_ERR(realfile);
>>         }
>>         od->realfile = realfile;
>> -       od->is_real = !OVL_TYPE_MERGE(type);
>> +//     od->is_real = !OVL_TYPE_MERGE(type);
>> +       od->is_real = false;


A major change of framing would be to treat regular file entries as merged
if they have been ever copied up and opaque only if they are pure upper.
Same as dirs.

This would also allow keeping ino stable across rename/redirect of regular
files. Not sure if any programs rely on that??
I did not examine the blast radius of this sort of change and have no idea
whether stable ino across rename benefits are worth while, but perhaps
it can help reduce code complexity by combining some special cases
for files vs. dirs to the same code.

When you feel this POC is testing grade, please make it available also
via a topic branch, preferably on top of overlayfs-rorw.

Thanks,
Amir.


>>
>>         od->is_upper = OVL_TYPE_UPPER(type);
>>         file->private_data = od;
>>
>> @@ -615,7 +663,7 @@ static void ovl_workdir_cleanup_recurse(
>>         struct ovl_cache_entry *p;
>>         struct ovl_readdir_data rdd = {
>>                 .ctx.actor = ovl_fill_merge,
>> -               .dentry = NULL,
>> +               .check_whiteouts = false,
>>                 .list = &list,
>>                 .root = RB_ROOT,
>>                 .is_lowest = false,
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-unionfs"
>> in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Miklos Szeredi Nov. 28, 2016, 10:35 a.m. UTC | #2
On Mon, Nov 28, 2016 at 10:10 AM, Amir Goldstein <amir73il@gmail.com> wrote:
>>> @@ -258,12 +268,12 @@ static int ovl_copy_up_locked(struct den
>>>         if (err)
>>>                 goto out_cleanup;
>>>
>>> -       inode_lock(newdentry->d_inode);
>>>         err = ovl_set_attr(newdentry, stat);
>>> -       inode_unlock(newdentry->d_inode);
>>>         if (err)
>>>                 goto out_cleanup;
>>>
>>> +       ovl_dentry_set_ino(dentry, stat->ino);
>>> +
>>
>>
>
> Shouldn't we propagate stored ino all the way
> from lowest layer to preserve ino across layer recycling?

Since OVL_XATTR_INO is set every time we copy-up, layer recycling
should work fine.

Exception is overlay root, where there's no copy-up, so no
preservation of ino.  Not sure what the right solution there is.

>
> Specifically, shouldn't ino of merged dir expose the lower most ino?

It should.


>>> @@ -353,11 +354,45 @@ static struct ovl_dir_cache *ovl_cache_g
>>>         return cache;
>>>  }
>>>
>>> +static int ovl_cache_entry_update_ino(struct dentry *dir,
>>> +                                     struct ovl_cache_entry *p)
>>> +
>>> +{
>>> +       struct dentry *this;
>>> +       struct dentry *base = ovl_dentry_at_idx(dir, p->idx);
>>> +
>>> +       if (p->name[0] == '.') {
>>> +               if (p->len == 1) {
>>> +                       this = dget(base);
>>> +                       goto get;
>>> +               }
>>> +               if (p->len == 2 && p->name[1] == '.') {
>>> +                       /* ♫ we shall not be moved */
>>> +                       this = dget(ovl_dentry_real(dir->d_parent));
>>> +                       goto get;
>>> +               }
>>> +       }
>>> +       this = lookup_one_len_unlocked(p->name, base, p->len);
>>> +       if (IS_ERR(this)) {
>>> +               pr_err("overlay: failed to look up (%s) for ino (%i)\n",
>>> +                      p->name, (int) PTR_ERR(this));
>>> +               return -EIO;
>>> +       }
>>>
>>> +       get:
>>> +       p->ino = p->orig_ino;
>>> +       ovl_get_ino(this, &p->ino);
>
> I may be way off here, but why do you need to lookup entry and get ino
> from xattr at all? Wouldn't it be easier to not add the entry to the list if
> it was copied up and rely on the fact that it will be added to list in iter
> of lower layer with original ino with no extra effort?

What about renamed entries?  What about opaque ones?

I do hope we can optimize directory reading, because doing lookup and
getxattr for all entries is going to hurt...

> For that matter, I like the fact that every copied up entry will be explicitly
> marked with OVL_XATTR_INO. In a way, it is the opposite of
> OVL_XATTR_OPAQUE and if the former becomes a standard, the latter
> will become redundant. Arguably, it is preferred to mark the copy ups
> as special case rather then the pure upper files, which can then stay 'pure'.

Maybe.


>>> @@ -502,7 +549,8 @@ static int ovl_dir_open(struct inode *in
>>>                 return PTR_ERR(realfile);
>>>         }
>>>         od->realfile = realfile;
>>> -       od->is_real = !OVL_TYPE_MERGE(type);
>>> +//     od->is_real = !OVL_TYPE_MERGE(type);
>>> +       od->is_real = false;
>
>
> A major change of framing would be to treat regular file entries as merged
> if they have been ever copied up and opaque only if they are pure upper.
> Same as dirs.
>
> This would also allow keeping ino stable across rename/redirect of regular
> files. Not sure if any programs rely on that??

We do keep ino stable across rename.  We don't keep ino stable across
copy-up.  That's what this patch is trying to address.

You are saying that we should have redirects for non-dir and drop
OVL_XATTR_INO?  That's another option, but it doesn't look like it
would simplify things...

Thanks for the revirew.

Pushed patch to #overlayfs-constino (needs work but it's worth testing).

Miklos
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Nov. 28, 2016, 11:56 a.m. UTC | #3
On Mon, Nov 28, 2016 at 12:35 PM, Miklos Szeredi <miklos@szeredi.hu> wrote:
> On Mon, Nov 28, 2016 at 10:10 AM, Amir Goldstein <amir73il@gmail.com> wrote:
>>>> @@ -258,12 +268,12 @@ static int ovl_copy_up_locked(struct den
>>>>         if (err)
>>>>                 goto out_cleanup;
>>>>
>>>> -       inode_lock(newdentry->d_inode);
>>>>         err = ovl_set_attr(newdentry, stat);
>>>> -       inode_unlock(newdentry->d_inode);
>>>>         if (err)
>>>>                 goto out_cleanup;
>>>>
>>>> +       ovl_dentry_set_ino(dentry, stat->ino);
>>>> +
>>>
>>>
>>
>> Shouldn't we propagate stored ino all the way
>> from lowest layer to preserve ino across layer recycling?
>
> Since OVL_XATTR_INO is set every time we copy-up, layer recycling
> should work fine.
>

Right. I knew it should be there somewhere, but miss-read the copy up code.

> Exception is overlay root, where there's no copy-up, so no
> preservation of ino.  Not sure what the right solution there is.
>
>>
>> Specifically, shouldn't ino of merged dir expose the lower most ino?
>
> It should.
>
>
>>>> @@ -353,11 +354,45 @@ static struct ovl_dir_cache *ovl_cache_g
>>>>         return cache;
>>>>  }
>>>>
>>>> +static int ovl_cache_entry_update_ino(struct dentry *dir,
>>>> +                                     struct ovl_cache_entry *p)
>>>> +
>>>> +{
>>>> +       struct dentry *this;
>>>> +       struct dentry *base = ovl_dentry_at_idx(dir, p->idx);
>>>> +
>>>> +       if (p->name[0] == '.') {
>>>> +               if (p->len == 1) {
>>>> +                       this = dget(base);
>>>> +                       goto get;
>>>> +               }
>>>> +               if (p->len == 2 && p->name[1] == '.') {
>>>> +                       /* ♫ we shall not be moved */
>>>> +                       this = dget(ovl_dentry_real(dir->d_parent));
>>>> +                       goto get;
>>>> +               }
>>>> +       }
>>>> +       this = lookup_one_len_unlocked(p->name, base, p->len);
>>>> +       if (IS_ERR(this)) {
>>>> +               pr_err("overlay: failed to look up (%s) for ino (%i)\n",
>>>> +                      p->name, (int) PTR_ERR(this));
>>>> +               return -EIO;
>>>> +       }
>>>>
>>>> +       get:
>>>> +       p->ino = p->orig_ino;
>>>> +       ovl_get_ino(this, &p->ino);
>>
>> I may be way off here, but why do you need to lookup entry and get ino
>> from xattr at all? Wouldn't it be easier to not add the entry to the list if
>> it was copied up and rely on the fact that it will be added to list in iter
>> of lower layer with original ino with no extra effort?
>
> What about renamed entries?

Right. I completely missed out on the rename case..

> What about opaque ones?

That's exactly the point of OVL_XATTR_INO IFF !OVL_XATTR_OPAQUE
If you have OVL_XATTR_INO means entry cannot be opaque, so it should
be safe to skip it

>
> I do hope we can optimize directory reading, because doing lookup and
> getxattr for all entries is going to hurt...
>

Possibly silly question:
Do you know if programs really rely of d_ino from getdents to be sane/non-zero?
And what are the implications of overlayfs readdir not exporting the real d_ino?

For example, findutils seems to be ok with zero d_ino and just uses non-zero
d_ino for optimization:

commit 2bf001636e66789560ef1d2509c117f78b6cd06f
Author: James Youngman <jay@gnu.org>
Date:   Sat Mar 7 20:16:49 2009 +0000

    Optimise away calls to stat if all we need is the inode number (bug #24342).


>> For that matter, I like the fact that every copied up entry will be explicitly
>> marked with OVL_XATTR_INO. In a way, it is the opposite of
>> OVL_XATTR_OPAQUE and if the former becomes a standard, the latter
>> will become redundant. Arguably, it is preferred to mark the copy ups
>> as special case rather then the pure upper files, which can then stay 'pure'.
>
> Maybe.
>
>
>>>> @@ -502,7 +549,8 @@ static int ovl_dir_open(struct inode *in
>>>>                 return PTR_ERR(realfile);
>>>>         }
>>>>         od->realfile = realfile;
>>>> -       od->is_real = !OVL_TYPE_MERGE(type);
>>>> +//     od->is_real = !OVL_TYPE_MERGE(type);
>>>> +       od->is_real = false;
>>
>>
>> A major change of framing would be to treat regular file entries as merged
>> if they have been ever copied up and opaque only if they are pure upper.
>> Same as dirs.
>>
>> This would also allow keeping ino stable across rename/redirect of regular
>> files. Not sure if any programs rely on that??
>
> We do keep ino stable across rename.  We don't keep ino stable across
> copy-up.  That's what this patch is trying to address.
>
> You are saying that we should have redirects for non-dir and drop
> OVL_XATTR_INO?  That's another option, but it doesn't look like it
> would simplify things...
>

Well, not sure if you noticed my redirect_fh (rediect by file handle) work.
If differs from redirect by path in 2 major ways:
1. Like OVL_XATTR_INO, redirect is set on copy up (but only for dirs)
2. Lookup is much simpler (and most likely faster) then full path lookup

It would be trivial to set oe->ino of merged dir from lower most entry in
ovl_lookup().

So while I cannot justify non-dir redirect in favor of OVL_XATTR_INO,
I do see a big value for redirect by file handle for directories, which can
provide the non-readdir part of stable directory inode as by-product.

> Thanks for the revirew.
>
> Pushed patch to #overlayfs-constino (needs work but it's worth testing).
>

Thanks. I'll get to testing this later this week and will do my best to draft a
quick xfstest for this as well.

Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Nov. 28, 2016, 6:02 p.m. UTC | #4
On Mon, Nov 28, 2016 at 1:56 PM, Amir Goldstein <amir73il@gmail.com> wrote:
> On Mon, Nov 28, 2016 at 12:35 PM, Miklos Szeredi <miklos@szeredi.hu> wrote:
>> On Mon, Nov 28, 2016 at 10:10 AM, Amir Goldstein <amir73il@gmail.com> wrote:
>>>>> @@ -258,12 +268,12 @@ static int ovl_copy_up_locked(struct den
>>>>>         if (err)
>>>>>                 goto out_cleanup;
>>>>>
>>>>> -       inode_lock(newdentry->d_inode);
>>>>>         err = ovl_set_attr(newdentry, stat);
>>>>> -       inode_unlock(newdentry->d_inode);
>>>>>         if (err)
>>>>>                 goto out_cleanup;
>>>>>
>>>>> +       ovl_dentry_set_ino(dentry, stat->ino);
>>>>> +
>>>>
>>>>
>>>
>>> Shouldn't we propagate stored ino all the way
>>> from lowest layer to preserve ino across layer recycling?
>>
>> Since OVL_XATTR_INO is set every time we copy-up, layer recycling
>> should work fine.
>>
>
> Right. I knew it should be there somewhere, but miss-read the copy up code.
>
>> Exception is overlay root, where there's no copy-up, so no
>> preservation of ino.  Not sure what the right solution there is.
>>
>>>
>>> Specifically, shouldn't ino of merged dir expose the lower most ino?
>>
>> It should.
>>
>>
>>>>> @@ -353,11 +354,45 @@ static struct ovl_dir_cache *ovl_cache_g
>>>>>         return cache;
>>>>>  }
>>>>>
>>>>> +static int ovl_cache_entry_update_ino(struct dentry *dir,
>>>>> +                                     struct ovl_cache_entry *p)
>>>>> +
>>>>> +{
>>>>> +       struct dentry *this;
>>>>> +       struct dentry *base = ovl_dentry_at_idx(dir, p->idx);
>>>>> +
>>>>> +       if (p->name[0] == '.') {
>>>>> +               if (p->len == 1) {
>>>>> +                       this = dget(base);
>>>>> +                       goto get;
>>>>> +               }
>>>>> +               if (p->len == 2 && p->name[1] == '.') {
>>>>> +                       /* ♫ we shall not be moved */

I don't think music notes are allowed in comments outside the audio
subsystem ;-)

>>>>> +                       this = dget(ovl_dentry_real(dir->d_parent));
>>>>> +                       goto get;
>>>>> +               }
>>>>> +       }
>>>>> +       this = lookup_one_len_unlocked(p->name, base, p->len);
>>>>> +       if (IS_ERR(this)) {
>>>>> +               pr_err("overlay: failed to look up (%s) for ino (%i)\n",
>>>>> +                      p->name, (int) PTR_ERR(this));
>>>>> +               return -EIO;
>>>>> +       }
>>>>>
>>>>> +       get:

indentation of goto label.

>>>>> +       p->ino = p->orig_ino;
>>>>> +       ovl_get_ino(this, &p->ino);
>>>
>>> I may be way off here, but why do you need to lookup entry and get ino
>>> from xattr at all? Wouldn't it be easier to not add the entry to the list if
>>> it was copied up and rely on the fact that it will be added to list in iter
>>> of lower layer with original ino with no extra effort?
>>
>> What about renamed entries?
>
> Right. I completely missed out on the rename case..
>
>> What about opaque ones?
>
> That's exactly the point of OVL_XATTR_INO IFF !OVL_XATTR_OPAQUE
> If you have OVL_XATTR_INO means entry cannot be opaque, so it should
> be safe to skip it
>
>>
>> I do hope we can optimize directory reading, because doing lookup and
>> getxattr for all entries is going to hurt...
>>
>
> Possibly silly question:
> Do you know if programs really rely of d_ino from getdents to be sane/non-zero?
> And what are the implications of overlayfs readdir not exporting the real d_ino?
>
> For example, findutils seems to be ok with zero d_ino and just uses non-zero
> d_ino for optimization:
>
> commit 2bf001636e66789560ef1d2509c117f78b6cd06f
> Author: James Youngman <jay@gnu.org>
> Date:   Sat Mar 7 20:16:49 2009 +0000
>
>     Optimise away calls to stat if all we need is the inode number (bug #24342).
>
>
>>> For that matter, I like the fact that every copied up entry will be explicitly
>>> marked with OVL_XATTR_INO. In a way, it is the opposite of
>>> OVL_XATTR_OPAQUE and if the former becomes a standard, the latter
>>> will become redundant. Arguably, it is preferred to mark the copy ups
>>> as special case rather then the pure upper files, which can then stay 'pure'.
>>
>> Maybe.
>>
>>
>>>>> @@ -502,7 +549,8 @@ static int ovl_dir_open(struct inode *in
>>>>>                 return PTR_ERR(realfile);
>>>>>         }
>>>>>         od->realfile = realfile;
>>>>> -       od->is_real = !OVL_TYPE_MERGE(type);
>>>>> +//     od->is_real = !OVL_TYPE_MERGE(type);
>>>>> +       od->is_real = false;
>>>
>>>
>>> A major change of framing would be to treat regular file entries as merged
>>> if they have been ever copied up and opaque only if they are pure upper.
>>> Same as dirs.
>>>
>>> This would also allow keeping ino stable across rename/redirect of regular
>>> files. Not sure if any programs rely on that??
>>
>> We do keep ino stable across rename.  We don't keep ino stable across
>> copy-up.  That's what this patch is trying to address.
>>
>> You are saying that we should have redirects for non-dir and drop
>> OVL_XATTR_INO?  That's another option, but it doesn't look like it
>> would simplify things...
>>
>
> Well, not sure if you noticed my redirect_fh (rediect by file handle) work.
> If differs from redirect by path in 2 major ways:
> 1. Like OVL_XATTR_INO, redirect is set on copy up (but only for dirs)
> 2. Lookup is much simpler (and most likely faster) then full path lookup
>
> It would be trivial to set oe->ino of merged dir from lower most entry in
> ovl_lookup().
>
> So while I cannot justify non-dir redirect in favor of OVL_XATTR_INO,
> I do see a big value for redirect by file handle for directories, which can
> provide the non-readdir part of stable directory inode as by-product.
>
>> Thanks for the revirew.
>>
>> Pushed patch to #overlayfs-constino (needs work but it's worth testing).
>>
>
> Thanks. I'll get to testing this later this week and will do my best to draft a
> quick xfstest for this as well.
>

So far so good, passed xfstest/pjdfstests/unionmount (--ov=0/10).
The stable st_dev change actually broke some assumptions made by
unionmount-testsuite so test needed some fixes to check_layer()
fixes available at:
https://github.com/amir73il/unionmount-testsuite.git #overlayfs-devel

commit f4d2bee (Test lower/upper on the same underlying fs) has
instructions for running unionmount tests over same underlaying fs.

Tested-by: Amir Goldstein <amir73il@gmail.com>
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Miklos Szeredi Nov. 29, 2016, 10:16 a.m. UTC | #5
On Mon, Nov 28, 2016 at 12:56 PM, Amir Goldstein <amir73il@gmail.com> wrote:
> On Mon, Nov 28, 2016 at 12:35 PM, Miklos Szeredi <miklos@szeredi.hu> wrote:
>> On Mon, Nov 28, 2016 at 10:10 AM, Amir Goldstein <amir73il@gmail.com> wrote:

[...]

>>> I may be way off here, but why do you need to lookup entry and get ino
>>> from xattr at all? Wouldn't it be easier to not add the entry to the list if
>>> it was copied up and rely on the fact that it will be added to list in iter
>>> of lower layer with original ino with no extra effort?
>>
>> What about renamed entries?
>
> Right. I completely missed out on the rename case..
>
>> What about opaque ones?
>
> That's exactly the point of OVL_XATTR_INO IFF !OVL_XATTR_OPAQUE
> If you have OVL_XATTR_INO means entry cannot be opaque, so it should
> be safe to skip it

So that means if OVL_XATTR_OPAQUE is set then we don't need to check
OVL_XATTR_INO and if OVL_XATTR_INO is set we don't need to check
OVL_XATTR_OPAQUE.   But that doesn't help optimize readdir, because we
really want to check neither.


>> I do hope we can optimize directory reading, because doing lookup and
>> getxattr for all entries is going to hurt...
>>
>
> Possibly silly question:
> Do you know if programs really rely of d_ino from getdents to be sane/non-zero?
> And what are the implications of overlayfs readdir not exporting the real d_ino?

I don't know of any precedent, so it's a big unknown.

>> We do keep ino stable across rename.  We don't keep ino stable across
>> copy-up.  That's what this patch is trying to address.
>>
>> You are saying that we should have redirects for non-dir and drop
>> OVL_XATTR_INO?  That's another option, but it doesn't look like it
>> would simplify things...

There *is* actually a case where both 'opaque' and 'ino' make sense:
when an empty merged dir is exchanged for an empty opaque one in
ovl_clear_empty().

> Well, not sure if you noticed my redirect_fh (rediect by file handle) work.
> If differs from redirect by path in 2 major ways:
> 1. Like OVL_XATTR_INO, redirect is set on copy up (but only for dirs)
> 2. Lookup is much simpler (and most likely faster) then full path lookup
>
> It would be trivial to set oe->ino of merged dir from lower most entry in
> ovl_lookup().

Okay.  In fact always using the handle looks like a better option
overall.  File handle should be unique for the lifetime of the
filesystem, while inode numbers may be reused.

Biggest drawback of the file handle based redirects is exactly that:
makes backing up the overlay basically impossible, since file handles
won't work after a backup + restore.  But as an optimization, in
addition to path based redirects it would work fine and provide a good
way to get the stable ino.

Thanks,
Miklos
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Nov. 29, 2016, 11:34 a.m. UTC | #6
On Tue, Nov 29, 2016 at 12:16 PM, Miklos Szeredi <miklos@szeredi.hu> wrote:
> On Mon, Nov 28, 2016 at 12:56 PM, Amir Goldstein <amir73il@gmail.com> wrote:
>
> [...]
>
>
>> Well, not sure if you noticed my redirect_fh (rediect by file handle) work.
>> If differs from redirect by path in 2 major ways:
>> 1. Like OVL_XATTR_INO, redirect is set on copy up (but only for dirs)
>> 2. Lookup is much simpler (and most likely faster) then full path lookup
>>
>> It would be trivial to set oe->ino of merged dir from lower most entry in
>> ovl_lookup().
>
> Okay.  In fact always using the handle looks like a better option
> overall.  File handle should be unique for the lifetime of the
> filesystem, while inode numbers may be reused.
>
> Biggest drawback of the file handle based redirects is exactly that:
> makes backing up the overlay basically impossible, since file handles
> won't work after a backup + restore.  But as an optimization, in
> addition to path based redirects it would work fine and provide a good
> way to get the stable ino.
>

Not sure that I understand what you are suggesting, but I would be happy
to make the needed adjustments to redirect_fh per your request if you clarify
what you mean. From what I understand:

1. If redirect_dir=fh (and supported by layers), store lower handle
    on dir copy up in new xattr OVL_XATTR_FH
2. In ovl_rename(), set OVL_XATTR_REDIRECT regardless of OVL_XATTR_FH
3. In ovl_lookup_single(), carry both d.redirct and d.redirect_fh to next layer
4. In ovl_lookup_layer(), lookup by handle first then by path

Mind you that unlike OVL_XATTR_INO, OVL_XATTR_FH points to *next* inode
rather then *lowest* inode, so it does not really improve anything wrt
getting the
stable inode. Stable inode of merged dir is available after lookup in
oe->lowerstack[oe->numlower - 1].dentry->d_inode->i_ino
regardless of whether lookup was by handle or by path or no redirect at all, so
not sure what you meant by "... and provide a good way to get the stable ino."
Either I managed to confuse you, or I am missing something?

Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Nov. 29, 2016, 12:03 p.m. UTC | #7
On Tue, Nov 29, 2016 at 1:34 PM, Amir Goldstein <amir73il@gmail.com> wrote:
> On Tue, Nov 29, 2016 at 12:16 PM, Miklos Szeredi <miklos@szeredi.hu> wrote:
>> On Mon, Nov 28, 2016 at 12:56 PM, Amir Goldstein <amir73il@gmail.com> wrote:
>>
>> [...]
>>
>>
>>> Well, not sure if you noticed my redirect_fh (rediect by file handle) work.
>>> If differs from redirect by path in 2 major ways:
>>> 1. Like OVL_XATTR_INO, redirect is set on copy up (but only for dirs)
>>> 2. Lookup is much simpler (and most likely faster) then full path lookup
>>>
>>> It would be trivial to set oe->ino of merged dir from lower most entry in
>>> ovl_lookup().
>>
>> Okay.  In fact always using the handle looks like a better option
>> overall.  File handle should be unique for the lifetime of the
>> filesystem, while inode numbers may be reused.
>>
>> Biggest drawback of the file handle based redirects is exactly that:
>> makes backing up the overlay basically impossible, since file handles
>> won't work after a backup + restore.  But as an optimization, in
>> addition to path based redirects it would work fine and provide a good
>> way to get the stable ino.
>>
>
> Not sure that I understand what you are suggesting, but I would be happy
> to make the needed adjustments to redirect_fh per your request if you clarify
> what you mean. From what I understand:
>
> 1. If redirect_dir=fh (and supported by layers), store lower handle
>     on dir copy up in new xattr OVL_XATTR_FH
> 2. In ovl_rename(), set OVL_XATTR_REDIRECT regardless of OVL_XATTR_FH
> 3. In ovl_lookup_single(), carry both d.redirct and d.redirect_fh to next layer
> 4. In ovl_lookup_layer(), lookup by handle first then by path
>
> Mind you that unlike OVL_XATTR_INO, OVL_XATTR_FH points to *next* inode
> rather then *lowest* inode, so it does not really improve anything wrt
> getting the
> stable inode. Stable inode of merged dir is available after lookup in
> oe->lowerstack[oe->numlower - 1].dentry->d_inode->i_ino
> regardless of whether lookup was by handle or by path or no redirect at all, so
> not sure what you meant by "... and provide a good way to get the stable ino."
> Either I managed to confuse you, or I am missing something?
>

Perhaps you meant for non-dir:

5. If redirect_dir=fh, *propagate* lowest-handle on non-dir copy up
6. In ovl_lookup() of non-dir, decode lowest-handle to set oe->ino

This is a good way to avoid stale OVL_XATTR_INO after backup+restore.

Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Miklos Szeredi Nov. 29, 2016, 9:49 p.m. UTC | #8
On Tue, Nov 29, 2016 at 1:03 PM, Amir Goldstein <amir73il@gmail.com> wrote:
> On Tue, Nov 29, 2016 at 1:34 PM, Amir Goldstein <amir73il@gmail.com> wrote:
>> On Tue, Nov 29, 2016 at 12:16 PM, Miklos Szeredi <miklos@szeredi.hu> wrote:
>>> On Mon, Nov 28, 2016 at 12:56 PM, Amir Goldstein <amir73il@gmail.com> wrote:

>> Not sure that I understand what you are suggesting, but I would be happy
>> to make the needed adjustments to redirect_fh per your request if you clarify
>> what you mean. From what I understand:
>>
>> 1. If redirect_dir=fh (and supported by layers), store lower handle
>>     on dir copy up in new xattr OVL_XATTR_FH
>> 2. In ovl_rename(), set OVL_XATTR_REDIRECT regardless of OVL_XATTR_FH
>> 3. In ovl_lookup_single(), carry both d.redirct and d.redirect_fh to next layer
>> 4. In ovl_lookup_layer(), lookup by handle first then by path
>>
>> Mind you that unlike OVL_XATTR_INO, OVL_XATTR_FH points to *next* inode
>> rather then *lowest* inode, so it does not really improve anything wrt
>> getting the
>> stable inode. Stable inode of merged dir is available after lookup in
>> oe->lowerstack[oe->numlower - 1].dentry->d_inode->i_ino
>> regardless of whether lookup was by handle or by path or no redirect at all, so
>> not sure what you meant by "... and provide a good way to get the stable ino."
>> Either I managed to confuse you, or I am missing something?

I meant that we can unify OVL_XATTR_INO  with "redirect/fh"
functionality and get something good out of it.

> Perhaps you meant for non-dir:
>
> 5. If redirect_dir=fh, *propagate* lowest-handle on non-dir copy up
> 6. In ovl_lookup() of non-dir, decode lowest-handle to set oe->ino

Yes.

OVL_XATTR_FH would be safe to ignore, so this is back and forward
compatible..  And the cost is probably not prohitive, since copy ups
should be relatively rare.

After a backup + restore it is not expected that we get back the old
inode numbers so it's fine to ignore the stale file handles.

The following issues are left:

 - performance of readdir;
 - what to do if not all layers are on the same fs;
 - hard link copy ups.

Thanks,
Miklos
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Nov. 30, 2016, 3:05 p.m. UTC | #9
On Tue, Nov 29, 2016 at 11:49 PM, Miklos Szeredi <miklos@szeredi.hu> wrote:
> On Tue, Nov 29, 2016 at 1:03 PM, Amir Goldstein <amir73il@gmail.com> wrote:
...
> I meant that we can unify OVL_XATTR_INO  with "redirect/fh"
> functionality and get something good out of it.
>
>> Perhaps you meant for non-dir:
>>
>> 5. If redirect_dir=fh, *propagate* lowest-handle on non-dir copy up
>> 6. In ovl_lookup() of non-dir, decode lowest-handle to set oe->ino
>
> Yes.
>
> OVL_XATTR_FH would be safe to ignore, so this is back and forward
> compatible..  And the cost is probably not prohitive, since copy ups
> should be relatively rare.
>
> After a backup + restore it is not expected that we get back the old
> inode numbers so it's fine to ignore the stale file handles.
>

FYI, there are 2 interesting corner case of "semi stale" handles:
- Copy of layers to same fs (without deleting old layers)
- Old layers are deleted but an old deleted file is still open

I have handled both these cases in the last version of redirect_fh
that I pushed yesterday, but not 100% sure that I handled them
correctly.

Anyway, I will get to work on adjusting redirect_fh for use by
stable inodes.

> The following issues are left:
>
>  - performance of readdir;

Here is one very simple optimization for WIP:
@@ -157,6 +157,8 @@ static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
                list_move_tail(&p->l_node, &rdd->middle);
        } else {
                p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
+               if (p)
+                       p->ino = ino;

For non-lowets entry, we can provide mount option 'readdir_ino'.
With readdir_ino, readdir pays a penalty of getxattr for any non-lowest
entry (either OVL_XATTR_FH or OVL_XATTR_INO).
Without readdir_ino, readdir will get d_ino = 0, in which case, at least
`find <path> --inum=<n>` does the right thing (fallback to fstat for
this dirent).

>  - what to do if not all layers are on the same fs;

Same as what I did for redirect_fh - turn the feature off.
We can also export this state in /proc/mounts options and maybe allow to
explicitly turn off stable inodes, but I don't think that we should, because
there shouldn't be a program which relies on inode numbers NOT being stable.

>  - hard link copy ups.
>

I'll start by setting up a TODO Wiki page and writing xfstests for all those
issues. Maybe even track them on github..

Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Nov. 30, 2016, 4:36 p.m. UTC | #10
On Wed, Nov 30, 2016 at 5:05 PM, Amir Goldstein <amir73il@gmail.com> wrote:
...
> For non-lowets entry, we can provide mount option 'readdir_ino'.
> With readdir_ino, readdir pays a penalty of getxattr for any non-lowest
> entry (either OVL_XATTR_FH or OVL_XATTR_INO).
> Without readdir_ino, readdir will get d_ino = 0, in which case, at least
> `find <path> --inum=<n>` does the right thing (fallback to fstat for
> this dirent).
>

Well, I was wrong about findutils. The -inum filter seems to fail to find
the file if d_ino is set to 0.
That's at least the case for findutils 4.4.2 that I have on debian and
4.7.0-git that I built myself.
Now I have a simple command line to use for the xfstest...

Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Dec. 5, 2016, 2:05 p.m. UTC | #11
On Tue, Nov 29, 2016 at 11:49 PM, Miklos Szeredi <miklos@szeredi.hu> wrote:
> On Tue, Nov 29, 2016 at 1:03 PM, Amir Goldstein <amir73il@gmail.com> wrote:
>> On Tue, Nov 29, 2016 at 1:34 PM, Amir Goldstein <amir73il@gmail.com> wrote:

...
>>> Not sure that I understand what you are suggesting, but I would be happy
>>> to make the needed adjustments to redirect_fh per your request if you clarify
>>> what you mean. From what I understand:
>>>
>>> 1. If redirect_dir=fh (and supported by layers), store lower handle
>>>     on dir copy up in new xattr OVL_XATTR_FH
>>> 2. In ovl_rename(), set OVL_XATTR_REDIRECT regardless of OVL_XATTR_FH
>>> 3. In ovl_lookup_single(), carry both d.redirct and d.redirect_fh to next layer
>>> 4. In ovl_lookup_layer(), lookup by handle first then by path
>>>

Miklos,

FYI, this part of the work is done and pushed to #redirect_fh branch
on my github.
It is tested on same and not same fs and it is used as the base of
current #ovl_snapshot
branch and tested along with it.

>>> not sure what you meant by "... and provide a good way to get the stable ino."
>>> Either I managed to confuse you, or I am missing something?
>
> I meant that we can unify OVL_XATTR_INO  with "redirect/fh"
> functionality and get something good out of it.
>
>> Perhaps you meant for non-dir:
>>
>> 5. If redirect_dir=fh, *propagate* lowest-handle on non-dir copy up
>> 6. In ovl_lookup() of non-dir, decode lowest-handle to set oe->ino
>
> Yes.
>
> OVL_XATTR_FH would be safe to ignore, so this is back and forward
> compatible..  And the cost is probably not prohitive, since copy ups
> should be relatively rare.
>

I am still trying to figure out the best way of "getting something good"
out of this merger, because changing ovl_getattr() to use redirect_fh
instead of stored ino is not such a big win.

More thought are welcome.

Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amir Goldstein Dec. 5, 2016, 8:01 p.m. UTC | #12
[linux-kernel removed]

On Wed, Nov 30, 2016 at 5:05 PM, Amir Goldstein <amir73il@gmail.com> wrote:
> On Tue, Nov 29, 2016 at 11:49 PM, Miklos Szeredi <miklos@szeredi.hu> wrote:
...
>
>> The following issues are left:
>>
>>  - performance of readdir;
>
> Here is one very simple optimization for WIP:
> @@ -157,6 +157,8 @@ static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
>                 list_move_tail(&p->l_node, &rdd->middle);
>         } else {
>                 p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
> +               if (p)
> +                       p->ino = ino;
>

Beyond that simple optimization, when file/dirs are created over whiteout,
we can mark the parent directory "remap_dino" and same when file/dir are
moved into a merged directory.

In every layer readdir would check "remap_dino" and if NOT set, it can skip
reading ino from xattr. All read entries would be marked as 'candidate ino'
and if another candidate of the same name is found in lower dir, it will update
p->ino.

>
>>  - hard link copy ups.
>>
>
> I'll start by setting up a TODO Wiki page and writing xfstests for all those

Wiki is up:
https://github.com/amir73il/overlayfs/wiki/Non-standard-behavior

Note that I have made the following observation about hard link copy up.
Not sure how much sense it makes:
"It is possible to imagine how solving the stable file handle goal could
be used to solve the broken hardlink issue - if there is a method to locate
a lower file handle that has been copied up by inode number/generation,
the same method can be used to check, when opening a lower file, if the
same inode has been already copied up for another alias".

Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -177,6 +177,7 @@  int ovl_set_attr(struct dentry *upperden
 {
 	int err = 0;
 
+	inode_lock(upperdentry->d_inode);
 	if (!S_ISLNK(stat->mode)) {
 		struct iattr attr = {
 			.ia_valid = ATTR_MODE,
@@ -194,6 +195,15 @@  int ovl_set_attr(struct dentry *upperden
 	}
 	if (!err)
 		ovl_set_timestamps(upperdentry, stat);
+	inode_unlock(upperdentry->d_inode);
+
+	if (!err) {
+		char buf[17];
+
+		snprintf(buf, sizeof(buf), "%llx", stat->ino);
+		err = ovl_do_setxattr(upperdentry, OVL_XATTR_INO,
+				      buf, strlen(buf), 0);
+	}
 
 	return err;
 }
@@ -258,12 +268,12 @@  static int ovl_copy_up_locked(struct den
 	if (err)
 		goto out_cleanup;
 
-	inode_lock(newdentry->d_inode);
 	err = ovl_set_attr(newdentry, stat);
-	inode_unlock(newdentry->d_inode);
 	if (err)
 		goto out_cleanup;
 
+	ovl_dentry_set_ino(dentry, stat->ino);
+
 	err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
 	if (err)
 		goto out_cleanup;
@@ -379,7 +389,7 @@  int ovl_copy_up(struct dentry *dentry)
 		}
 
 		ovl_path_lower(next, &lowerpath);
-		err = vfs_getattr(&lowerpath, &stat);
+		err = ovl_getattr_int(next, &lowerpath, &stat);
 		if (!err)
 			err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
 
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -20,6 +20,7 @@  enum ovl_path_type {
 #define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay."
 #define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque"
 #define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect"
+#define OVL_XATTR_INO OVL_XATTR_PREFIX "ino"
 
 #define OVL_ISUPPER_MASK 1UL
 
@@ -161,6 +162,8 @@  bool ovl_redirect_dir(struct super_block
 void ovl_clear_redirect_dir(struct super_block *sb);
 const char *ovl_dentry_get_redirect(struct dentry *dentry);
 void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
+u64 ovl_dentry_get_ino(struct dentry *dentry);
+void ovl_dentry_set_ino(struct dentry *dentry, u64 ino);
 void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
 void ovl_inode_init(struct inode *inode, struct inode *realinode,
 		    bool is_upper);
@@ -171,9 +174,10 @@  bool ovl_is_whiteout(struct dentry *dent
 struct file *ovl_path_open(struct path *path, int flags);
 
 /* namei.c */
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path, int *idxp);
 struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags);
 bool ovl_lower_positive(struct dentry *dentry);
+struct dentry *ovl_dentry_at_idx(struct dentry *dentry, int idx);
 
 /* readdir.c */
 extern const struct file_operations ovl_dir_operations;
@@ -196,6 +200,9 @@  struct posix_acl *ovl_get_acl(struct ino
 int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
 int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
 bool ovl_is_private_xattr(const char *name);
+int ovl_getattr_int(struct dentry *dentry, struct path *realpath,
+		    struct kstat *stat);
+void ovl_get_ino(struct dentry *realdentry, u64 *ino);
 
 struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
 struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode);
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -15,6 +15,7 @@ 
 #include <linux/module.h>
 #include <linux/file.h>
 #include <linux/hashtable.h>
+#include <linux/ratelimit.h>
 #include "overlayfs.h"
 
 static int ovl_copy_up_truncate(struct dentry *dentry)
@@ -33,7 +34,7 @@  static int ovl_copy_up_truncate(struct d
 	ovl_path_lower(dentry, &lowerpath);
 
 	old_cred = ovl_override_creds(dentry->d_sb);
-	err = vfs_getattr(&lowerpath, &stat);
+	err = ovl_getattr_int(dentry, &lowerpath, &stat);
 	if (!err) {
 		stat.size = 0;
 		err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
@@ -88,6 +89,46 @@  int ovl_setattr(struct dentry *dentry, s
 	return err;
 }
 
+void ovl_get_ino(struct dentry *realdentry, u64 *ino)
+{
+	int err;
+	char buf[18];
+
+	err = vfs_getxattr(realdentry, OVL_XATTR_INO,
+			   buf, sizeof(buf) - 1);
+	if (err < 0) {
+		if (err != -ENODATA && err != -EOPNOTSUPP)
+			pr_warn_ratelimited("overlay: failed to get ino (%i)\n", err);
+	} else {
+		buf[err] = '\0';
+		err = kstrtoull(buf, 16, ino);
+		if (err)
+			pr_warn("overlay: invalid ino (%s)\n", buf);
+	}
+}
+
+int ovl_getattr_int(struct dentry *dentry, struct path *realpath,
+		    struct kstat *stat)
+{
+	int err;
+	u64 ino;
+
+	err = vfs_getattr(realpath, stat);
+	if (err)
+		return err;
+
+	ino = ovl_dentry_get_ino(dentry);
+	if (!ino) {
+		ino = stat->ino;
+		ovl_get_ino(realpath->dentry, &ino);
+		ovl_dentry_set_ino(dentry, ino);
+	}
+	stat->dev = dentry->d_sb->s_dev;
+	stat->ino = ino;
+
+	return 0;
+}
+
 static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
 			 struct kstat *stat)
 {
@@ -97,7 +138,7 @@  static int ovl_getattr(struct vfsmount *
 
 	ovl_path_real(dentry, &realpath);
 	old_cred = ovl_override_creds(dentry->d_sb);
-	err = vfs_getattr(&realpath, stat);
+	err = ovl_getattr_int(dentry, &realpath, stat);
 	revert_creds(old_cred);
 	return err;
 }
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -36,6 +36,7 @@  struct ovl_entry {
 	union {
 		struct {
 			u64 version;
+			u64 ino;
 			const char *redirect;
 			bool opaque;
 		};
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -205,6 +205,21 @@  void ovl_dentry_set_redirect(struct dent
 	oe->redirect = redirect;
 }
 
+u64 ovl_dentry_get_ino(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	return oe->ino;
+}
+
+void ovl_dentry_set_ino(struct dentry *dentry, u64 ino)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	WARN_ON(oe->ino && oe->ino != ino);
+	oe->ino = ino;
+}
+
 void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
 {
 	struct ovl_entry *oe = dentry->d_fsdata;
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -143,14 +143,11 @@  static int ovl_dir_getattr(struct vfsmou
 
 	type = ovl_path_real(dentry, &realpath);
 	old_cred = ovl_override_creds(dentry->d_sb);
-	err = vfs_getattr(&realpath, stat);
+	err = ovl_getattr_int(dentry, &realpath, stat);
 	revert_creds(old_cred);
 	if (err)
 		return err;
 
-	stat->dev = dentry->d_sb->s_dev;
-	stat->ino = dentry->d_inode->i_ino;
-
 	/*
 	 * It's probably not worth it to count subdirs to get the
 	 * correct link count.  nlink=1 seems to pacify 'find' and
@@ -250,7 +247,7 @@  static struct dentry *ovl_clear_empty(st
 		goto out;
 
 	ovl_path_upper(dentry, &upperpath);
-	err = vfs_getattr(&upperpath, &stat);
+	err = ovl_getattr_int(dentry, &upperpath, &stat);
 	if (err)
 		goto out_unlock;
 
@@ -278,9 +275,7 @@  static struct dentry *ovl_clear_empty(st
 	if (err)
 		goto out_cleanup;
 
-	inode_lock(opaquedir->d_inode);
 	err = ovl_set_attr(opaquedir, &stat);
-	inode_unlock(opaquedir->d_inode);
 	if (err)
 		goto out_cleanup;
 
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -186,23 +186,36 @@  static int ovl_lookup_layer(struct dentr
  * Returns next layer in stack starting from top.
  * Returns -1 if this is the last layer.
  */
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path, int *idxp)
 {
 	struct ovl_entry *oe = dentry->d_fsdata;
 
 	BUG_ON(idx < 0);
 	if (idx == 0) {
 		ovl_path_upper(dentry, path);
-		if (path->dentry)
+		if (path->dentry) {
+			*idxp = 0;
 			return oe->numlower ? 1 : -1;
+		}
 		idx++;
 	}
 	BUG_ON(idx > oe->numlower);
+	*idxp = idx;
 	*path = oe->lowerstack[idx - 1];
 
 	return (idx < oe->numlower) ? idx + 1 : -1;
 }
 
+struct dentry *ovl_dentry_at_idx(struct dentry *dentry, int idx)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	if (idx == 0)
+		return ovl_upperdentry_dereference(oe);
+	else
+		return oe->lowerstack[idx - 1].dentry;
+}
+
 struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 			  unsigned int flags)
 {
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -20,10 +20,12 @@ 
 struct ovl_cache_entry {
 	unsigned int len;
 	unsigned int type;
+	u64 orig_ino;
 	u64 ino;
 	struct list_head l_node;
 	struct rb_node node;
 	struct ovl_cache_entry *next_maybe_whiteout;
+	int idx;
 	bool is_whiteout;
 	char name[];
 };
@@ -36,13 +38,14 @@  struct ovl_dir_cache {
 
 struct ovl_readdir_data {
 	struct dir_context ctx;
-	struct dentry *dentry;
+	bool check_whiteouts;
 	bool is_lowest;
 	struct rb_root root;
 	struct list_head *list;
 	struct list_head middle;
 	struct ovl_cache_entry *first_maybe_whiteout;
 	int count;
+	int idx;
 	int err;
 	bool d_type_supported;
 };
@@ -97,8 +100,10 @@  static struct ovl_cache_entry *ovl_cache
 	p->name[len] = '\0';
 	p->len = len;
 	p->type = d_type;
-	p->ino = ino;
+	p->orig_ino = ino;
+	p->ino = 0;
 	p->is_whiteout = false;
+	p->idx = rdd->idx;
 
 	if (d_type == DT_CHR) {
 		p->next_maybe_whiteout = rdd->first_maybe_whiteout;
@@ -206,9 +211,6 @@  static int ovl_check_whiteouts(struct de
 	int err;
 	struct ovl_cache_entry *p;
 	struct dentry *dentry;
-	const struct cred *old_cred;
-
-	old_cred = ovl_override_creds(rdd->dentry->d_sb);
 
 	err = down_write_killable(&dir->d_inode->i_rwsem);
 	if (!err) {
@@ -223,7 +225,6 @@  static int ovl_check_whiteouts(struct de
 		}
 		inode_unlock(dir->d_inode);
 	}
-	revert_creds(old_cred);
 
 	return err;
 }
@@ -248,7 +249,7 @@  static inline int ovl_dir_read(struct pa
 			err = rdd->err;
 	} while (!err && rdd->count);
 
-	if (!err && rdd->first_maybe_whiteout && rdd->dentry)
+	if (!err && rdd->first_maybe_whiteout && rdd->check_whiteouts)
 		err = ovl_check_whiteouts(realpath->dentry, rdd);
 
 	fput(realfile);
@@ -268,7 +269,7 @@  static void ovl_dir_reset(struct file *f
 		od->cache = NULL;
 		od->cursor = NULL;
 	}
-	WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
+//	WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
 	if (od->is_real && OVL_TYPE_MERGE(type))
 		od->is_real = false;
 }
@@ -279,7 +280,7 @@  static int ovl_dir_read_merged(struct de
 	struct path realpath;
 	struct ovl_readdir_data rdd = {
 		.ctx.actor = ovl_fill_merge,
-		.dentry = dentry,
+		.check_whiteouts = true,
 		.list = list,
 		.root = RB_ROOT,
 		.is_lowest = false,
@@ -287,7 +288,7 @@  static int ovl_dir_read_merged(struct de
 	int idx, next;
 
 	for (idx = 0; idx != -1; idx = next) {
-		next = ovl_path_next(idx, dentry, &realpath);
+		next = ovl_path_next(idx, dentry, &realpath, &rdd.idx);
 
 		if (next != -1) {
 			err = ovl_dir_read(&realpath, &rdd);
@@ -353,11 +354,45 @@  static struct ovl_dir_cache *ovl_cache_g
 	return cache;
 }
 
+static int ovl_cache_entry_update_ino(struct dentry *dir,
+				      struct ovl_cache_entry *p)
+
+{
+	struct dentry *this;
+	struct dentry *base = ovl_dentry_at_idx(dir, p->idx);
+
+	if (p->name[0] == '.') {
+		if (p->len == 1) {
+			this = dget(base);
+			goto get;
+		}
+		if (p->len == 2 && p->name[1] == '.') {
+			/* ♫ we shall not be moved */
+			this = dget(ovl_dentry_real(dir->d_parent));
+			goto get;
+		}
+	}
+	this = lookup_one_len_unlocked(p->name, base, p->len);
+	if (IS_ERR(this)) {
+		pr_err("overlay: failed to look up (%s) for ino (%i)\n",
+		       p->name, (int) PTR_ERR(this));
+		return -EIO;
+	}
+	get:
+	p->ino = p->orig_ino;
+	ovl_get_ino(this, &p->ino);
+	dput(this);
+
+	return 0;
+}
+
 static int ovl_iterate(struct file *file, struct dir_context *ctx)
 {
 	struct ovl_dir_file *od = file->private_data;
 	struct dentry *dentry = file->f_path.dentry;
 	struct ovl_cache_entry *p;
+	const struct cred *old_cred;
+	int err;
 
 	if (!ctx->pos)
 		ovl_dir_reset(file);
@@ -365,12 +400,14 @@  static int ovl_iterate(struct file *file
 	if (od->is_real)
 		return iterate_dir(od->realfile, ctx);
 
+	old_cred = ovl_override_creds(dentry->d_sb);
 	if (!od->cache) {
 		struct ovl_dir_cache *cache;
 
 		cache = ovl_cache_get(dentry);
+		err = PTR_ERR(cache);
 		if (IS_ERR(cache))
-			return PTR_ERR(cache);
+			goto out;
 
 		od->cache = cache;
 		ovl_seek_cursor(od, ctx->pos);
@@ -378,13 +415,23 @@  static int ovl_iterate(struct file *file
 
 	while (od->cursor != &od->cache->entries) {
 		p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
-		if (!p->is_whiteout)
+		if (!p->is_whiteout) {
+			if (!p->ino) {
+				err = ovl_cache_entry_update_ino(dentry, p);
+				if (err)
+					goto out;
+			}
 			if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
 				break;
+		}
 		od->cursor = p->l_node.next;
 		ctx->pos++;
 	}
-	return 0;
+	err = 0;
+out:
+	revert_creds(old_cred);
+
+	return err;
 }
 
 static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
@@ -502,7 +549,8 @@  static int ovl_dir_open(struct inode *in
 		return PTR_ERR(realfile);
 	}
 	od->realfile = realfile;
-	od->is_real = !OVL_TYPE_MERGE(type);
+//	od->is_real = !OVL_TYPE_MERGE(type);
+	od->is_real = false;
 	od->is_upper = OVL_TYPE_UPPER(type);
 	file->private_data = od;
 
@@ -615,7 +663,7 @@  static void ovl_workdir_cleanup_recurse(
 	struct ovl_cache_entry *p;
 	struct ovl_readdir_data rdd = {
 		.ctx.actor = ovl_fill_merge,
-		.dentry = NULL,
+		.check_whiteouts = false,
 		.list = &list,
 		.root = RB_ROOT,
 		.is_lowest = false,