diff mbox series

[RFC] getting misc stats/attributes via xattr API

Message ID YnEeuw6fd1A8usjj@miu.piliscsaba.redhat.com (mailing list archive)
State New
Headers show
Series [RFC] getting misc stats/attributes via xattr API | expand

Commit Message

Miklos Szeredi May 3, 2022, 12:23 p.m. UTC
This is a simplification of the getvalues(2) prototype and moving it to the
getxattr(2) interface, as suggested by Dave.

The patch itself just adds the possibility to retrieve a single line of
/proc/$$/mountinfo (which was the basic requirement from which the fsinfo
patchset grew out of).

But this should be able to serve Amir's per-sb iostats, as well as a host of
other cases where some statistic needs to be retrieved from some object.  Note:
a filesystem object often represents other kinds of objects (such as processes
in /proc) so this is not limited to fs attributes.

This also opens up the interface to setting attributes via setxattr(2).

After some pondering I made the namespace so:

: - root
bar - an attribute
foo: - a folder (can contain attributes and/or folders)

The contents of a folder is represented by a null separated list of names.

Examples:

$ getfattr -etext -n ":" .
# file: .
:="mnt:\000mntns:"

$ getfattr -etext -n ":mnt:" .
# file: .
:mnt:="info"

$ getfattr -etext -n ":mnt:info" .
# file: .
:mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"

$ getfattr -etext -n ":mntns:" .
# file: .
:mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"

$ getfattr -etext -n ":mntns:28:" .
# file: .
:mntns:28:="info"

Comments?

Thanks,
Miklos

---
 fs/Makefile            |    2 
 fs/mount.h             |    8 +
 fs/namespace.c         |   15 ++-
 fs/pnode.h             |    2 
 fs/proc_namespace.c    |   15 ++-
 fs/values.c            |  242 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xattr.c             |   16 ++-
 include/linux/values.h |   11 ++
 8 files changed, 295 insertions(+), 16 deletions(-)

Comments

Amir Goldstein May 3, 2022, 2:39 p.m. UTC | #1
On Tue, May 3, 2022 at 3:23 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> This is a simplification of the getvalues(2) prototype and moving it to the
> getxattr(2) interface, as suggested by Dave.
>
> The patch itself just adds the possibility to retrieve a single line of
> /proc/$$/mountinfo (which was the basic requirement from which the fsinfo
> patchset grew out of).
>
> But this should be able to serve Amir's per-sb iostats, as well as a host of
> other cases where some statistic needs to be retrieved from some object.  Note:
> a filesystem object often represents other kinds of objects (such as processes
> in /proc) so this is not limited to fs attributes.
>
> This also opens up the interface to setting attributes via setxattr(2).
>
> After some pondering I made the namespace so:
>
> : - root
> bar - an attribute
> foo: - a folder (can contain attributes and/or folders)
>
> The contents of a folder is represented by a null separated list of names.
>
> Examples:
>
> $ getfattr -etext -n ":" .
> # file: .
> :="mnt:\000mntns:"
>
> $ getfattr -etext -n ":mnt:" .
> # file: .
> :mnt:="info"
>
> $ getfattr -etext -n ":mnt:info" .
> # file: .
> :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"
>
> $ getfattr -etext -n ":mntns:" .
> # file: .
> :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"
>
> $ getfattr -etext -n ":mntns:28:" .
> # file: .
> :mntns:28:="info"
>
> Comments?
>

I like that :)

It should be noted that while this API mandates text keys,
it does not mandate text values, so for example, sb iostats could be
exported as text or as binary struct, or as individual text/binary records or
all of the above.

We can relive this sort of discussion for every property that we add. Fun!

Folks interested in this discussion are welcome to join the Zoom
discussion on LSFMM tomorrow, Wed at 11AM PDT:
https://zoom.us/j/99394450657?pwd=ZHE2TzdXV2MzWE9yVnpLYzJNZDBuUT09

Folks who did not get an invite and would like to participate, please
email me in private.

Thanks,
Amir.


>
> ---
>  fs/Makefile            |    2
>  fs/mount.h             |    8 +
>  fs/namespace.c         |   15 ++-
>  fs/pnode.h             |    2
>  fs/proc_namespace.c    |   15 ++-
>  fs/values.c            |  242 +++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xattr.c             |   16 ++-
>  include/linux/values.h |   11 ++
>  8 files changed, 295 insertions(+), 16 deletions(-)
>
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -16,7 +16,7 @@ obj-y :=      open.o read_write.o file_table.
>                 pnode.o splice.o sync.o utimes.o d_path.o \
>                 stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
>                 fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
> -               kernel_read_file.o remap_range.o
> +               kernel_read_file.o remap_range.o values.o
>
>  ifeq ($(CONFIG_BLOCK),y)
>  obj-y +=       buffer.o direct-io.o mpage.o
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -148,3 +148,11 @@ static inline bool is_anon_ns(struct mnt
>  }
>
>  extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
> +
> +struct mount *mnt_list_next(struct mnt_namespace *ns, struct list_head *p);
> +extern void namespace_lock_read(void);
> +extern void namespace_unlock_read(void);
> +extern int show_mountinfo_root(struct seq_file *m, struct vfsmount *mnt,
> +                              struct path *root);
> +extern bool is_path_reachable(struct mount *, struct dentry *,
> +                             const struct path *root);
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1332,9 +1332,7 @@ struct vfsmount *mnt_clone_internal(cons
>         return &p->mnt;
>  }
>
> -#ifdef CONFIG_PROC_FS
> -static struct mount *mnt_list_next(struct mnt_namespace *ns,
> -                                  struct list_head *p)
> +struct mount *mnt_list_next(struct mnt_namespace *ns, struct list_head *p)
>  {
>         struct mount *mnt, *ret = NULL;
>
> @@ -1351,6 +1349,7 @@ static struct mount *mnt_list_next(struc
>         return ret;
>  }
>
> +#ifdef CONFIG_PROC_FS
>  /* iterator; we want it to have access to namespace_sem, thus here... */
>  static void *m_start(struct seq_file *m, loff_t *pos)
>  {
> @@ -1507,6 +1506,16 @@ static inline void namespace_lock(void)
>         down_write(&namespace_sem);
>  }
>
> +void namespace_lock_read(void)
> +{
> +       down_read(&namespace_sem);
> +}
> +
> +void namespace_unlock_read(void)
> +{
> +       up_read(&namespace_sem);
> +}
> +
>  enum umount_tree_flags {
>         UMOUNT_SYNC = 1,
>         UMOUNT_PROPAGATE = 2,
> --- a/fs/pnode.h
> +++ b/fs/pnode.h
> @@ -50,7 +50,5 @@ void mnt_set_mountpoint(struct mount *,
>  void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
>                            struct mount *mnt);
>  struct mount *copy_tree(struct mount *, struct dentry *, int);
> -bool is_path_reachable(struct mount *, struct dentry *,
> -                        const struct path *root);
>  int count_mounts(struct mnt_namespace *ns, struct mount *mnt);
>  #endif /* _LINUX_PNODE_H */
> --- a/fs/proc_namespace.c
> +++ b/fs/proc_namespace.c
> @@ -132,9 +132,9 @@ static int show_vfsmnt(struct seq_file *
>         return err;
>  }
>
> -static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
> +int show_mountinfo_root(struct seq_file *m, struct vfsmount *mnt,
> +                       struct path *root)
>  {
> -       struct proc_mounts *p = m->private;
>         struct mount *r = real_mount(mnt);
>         struct super_block *sb = mnt->mnt_sb;
>         struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
> @@ -152,7 +152,7 @@ static int show_mountinfo(struct seq_fil
>         seq_putc(m, ' ');
>
>         /* mountpoints outside of chroot jail will give SEQ_SKIP on this */
> -       err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\");
> +       err = seq_path_root(m, &mnt_path, root, " \t\n\\");
>         if (err)
>                 goto out;
>
> @@ -164,7 +164,7 @@ static int show_mountinfo(struct seq_fil
>                 seq_printf(m, " shared:%i", r->mnt_group_id);
>         if (IS_MNT_SLAVE(r)) {
>                 int master = r->mnt_master->mnt_group_id;
> -               int dom = get_dominating_id(r, &p->root);
> +               int dom = get_dominating_id(r, root);
>                 seq_printf(m, " master:%i", master);
>                 if (dom && dom != master)
>                         seq_printf(m, " propagate_from:%i", dom);
> @@ -194,6 +194,13 @@ static int show_mountinfo(struct seq_fil
>         return err;
>  }
>
> +static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
> +{
> +       struct proc_mounts *p = m->private;
> +
> +       return show_mountinfo_root(m, mnt, &p->root);
> +}
> +
>  static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
>  {
>         struct proc_mounts *p = m->private;
> --- /dev/null
> +++ b/fs/values.c
> @@ -0,0 +1,242 @@
> +#include <linux/values.h>
> +#include <linux/fs_struct.h>
> +#include <linux/seq_file.h>
> +#include <linux/nsproxy.h>
> +#include "../lib/kstrtox.h"
> +#include "mount.h"
> +
> +struct val_string {
> +       const char *str;
> +       size_t len;
> +};
> +
> +struct val_iter {
> +       struct val_string name;
> +       struct seq_file seq;
> +       int error;
> +};
> +
> +struct val_desc {
> +       struct val_string name;
> +       union {
> +               u64 idx;
> +               int (*get)(struct val_iter *vi, const struct path *path);
> +       };
> +};
> +
> +#define VAL_STRING(x) { .str = x, .len = sizeof(x) - 1 }
> +#define VD_NAME(x) .name = VAL_STRING(x)
> +
> +static int val_err(struct val_iter *vi, int err)
> +{
> +       vi->error = err;
> +       return 0;
> +}
> +
> +static int val_end_seq(struct val_iter *vi)
> +{
> +       if (vi->seq.count == vi->seq.size)
> +               return -EOVERFLOW;
> +
> +       return 0;
> +}
> +
> +static inline void val_string_skip(struct val_string *s, size_t count)
> +{
> +       WARN_ON(s->len < count);
> +       s->str += count;
> +       s->len -= count;
> +}
> +
> +static bool val_string_prefix(const struct val_string *p,
> +                             const struct val_string *s)
> +{
> +       return s->len >= p->len && !memcmp(s->str, p->str, p->len);
> +}
> +
> +static struct val_desc *val_lookup(struct val_iter *vi, struct val_desc *vd)
> +{
> +       for (; vd->name.len; vd++) {
> +               if (val_string_prefix(&vd->name, &vi->name)) {
> +                       val_string_skip(&vi->name, vd->name.len);
> +                       break;
> +               }
> +       }
> +       return vd;
> +}
> +
> +static int val_get_group(struct val_iter *vi, struct val_desc *vd)
> +{
> +       for (; vd->name.len; vd++)
> +               seq_write(&vi->seq, vd->name.str, vd->name.len + 1);
> +
> +       return val_end_seq(vi);
> +}
> +
> +enum {
> +       VAL_MNT_INFO,
> +};
> +
> +static struct val_desc val_mnt_group[] = {
> +       { VD_NAME("info"),              .idx = VAL_MNT_INFO             },
> +       { }
> +};
> +
> +static int val_mnt_show(struct val_iter *vi, struct vfsmount *mnt)
> +{
> +       struct val_desc *vd = val_lookup(vi, val_mnt_group);
> +       struct path root;
> +
> +       if (!vd->name.str)
> +               return val_err(vi, -ENOENT);
> +
> +       switch(vd->idx) {
> +       case VAL_MNT_INFO:
> +               get_fs_root(current->fs, &root);
> +               show_mountinfo_root(&vi->seq, mnt, &root);
> +               path_put(&root);
> +               break;
> +       }
> +
> +       return 0;
> +}
> +
> +static int val_mnt_get(struct val_iter *vi, const struct path *path)
> +{
> +       int err;
> +
> +       if (!vi->name.len)
> +               return val_get_group(vi, val_mnt_group);
> +
> +       namespace_lock_read();
> +       err = val_mnt_show(vi, path->mnt);
> +       namespace_unlock_read();
> +
> +       return err;
> +}
> +
> +/* called with namespace_sem held for read */
> +static struct vfsmount *mnt_lookup_by_id(struct mnt_namespace *ns,
> +                                        struct path *root, int id)
> +{
> +       struct mount *m;
> +
> +       for (m = mnt_list_next(ns, &ns->list); m; m = mnt_list_next(ns, &m->mnt_list)) {
> +               if (m->mnt_id == id) {
> +                       if (is_path_reachable(m, m->mnt.mnt_root, root))
> +                               return mntget(&m->mnt);
> +                       else
> +                               return NULL;
> +               }
> +       }
> +       return NULL;
> +}
> +
> +static void seq_mnt_list(struct seq_file *seq, struct mnt_namespace *ns,
> +                        struct path *root)
> +{
> +       struct mount *m;
> +
> +       namespace_lock_read();
> +       for (m = mnt_list_next(ns, &ns->list); m; m = mnt_list_next(ns, &m->mnt_list)) {
> +               if (is_path_reachable(m, m->mnt.mnt_root, root)) {
> +                       seq_printf(seq, "%i:", m->mnt_id);
> +                       seq_putc(seq, '\0');
> +               }
> +       }
> +       namespace_unlock_read();
> +}
> +
> +static int val_mntns_get(struct val_iter *vi, const struct path *path)
> +{
> +       struct mnt_namespace *mnt_ns = current->nsproxy->mnt_ns;
> +       struct vfsmount *mnt;
> +       struct path root;
> +       unsigned long long mnt_id;
> +       unsigned int end;
> +       int err;
> +
> +       if (!vi->name.len) {
> +               get_fs_root(current->fs, &root);
> +               seq_mnt_list(&vi->seq, mnt_ns, &root);
> +               path_put(&root);
> +               return val_end_seq(vi);
> +       }
> +
> +       end = _parse_integer(vi->name.str, 10, &mnt_id);
> +       if (end & KSTRTOX_OVERFLOW)
> +               return val_err(vi, -ENOENT);
> +       if (vi->name.str[end] != VAL_SEP)
> +               return val_err(vi, -ENOENT);
> +       val_string_skip(&vi->name, end + 1);
> +
> +       namespace_lock_read();
> +       get_fs_root(current->fs, &root);
> +       mnt = mnt_lookup_by_id(mnt_ns, &root, mnt_id);
> +       path_put(&root);
> +       if (!mnt) {
> +               namespace_unlock_read();
> +               return val_err(vi, -ENOENT);
> +       }
> +       if (vi->name.len)
> +               err = val_mnt_show(vi, mnt);
> +       else
> +               err = val_get_group(vi, val_mnt_group);
> +
> +       namespace_unlock_read();
> +       mntput(mnt);
> +
> +       return err;
> +}
> +
> +
> +
> +static struct val_desc val_toplevel_group[] = {
> +       { VD_NAME("mnt:"),      .get = val_mnt_get,     },
> +       { VD_NAME("mntns:"),    .get = val_mntns_get,   },
> +       { },
> +};
> +
> +static int getvalues(struct val_iter *vi, const struct path *path)
> +{
> +       struct val_desc *vd;
> +       int err;
> +
> +       if (!vi->name.len)
> +               return val_get_group(vi, val_toplevel_group);
> +
> +       vd = val_lookup(vi, val_toplevel_group);
> +       if (!vd->name.len)
> +               err = val_err(vi, -ENOENT);
> +       else
> +               err = vd->get(vi, path);
> +
> +       return err ?: vi->error;
> +}
> +
> +ssize_t val_getxattr(struct path *path, const char *name, size_t namelen,
> +                    void __user *value, size_t size)
> +{
> +       int err;
> +       char val[1024];
> +       struct val_iter vi = {
> +               .name = { .str = name, .len = namelen },
> +               .seq = { .buf = val, .size = min(sizeof(val), size) },
> +       };
> +
> +       if (!size)
> +               return sizeof(val);
> +
> +       val_string_skip(&vi.name, 1);
> +
> +       err = getvalues(&vi, path);
> +       if (err < 0)
> +               return err;
> +
> +       WARN_ON(vi.seq.count > size);
> +       if (copy_to_user(value, vi.seq.buf, vi.seq.count))
> +               return -EFAULT;
> +
> +       return vi.seq.count;
> +}
> +
> --- a/fs/xattr.c
> +++ b/fs/xattr.c
> @@ -22,6 +22,7 @@
>  #include <linux/audit.h>
>  #include <linux/vmalloc.h>
>  #include <linux/posix_acl_xattr.h>
> +#include <linux/values.h>
>
>  #include <linux/uaccess.h>
>
> @@ -643,12 +644,13 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, cons
>   * Extended attribute GET operations
>   */
>  static ssize_t
> -getxattr(struct user_namespace *mnt_userns, struct dentry *d,
> -        const char __user *name, void __user *value, size_t size)
> +getxattr(struct path *path, const char __user *name,
> +        void __user *value, size_t size)
>  {
>         ssize_t error;
>         void *kvalue = NULL;
>         char kname[XATTR_NAME_MAX + 1];
> +       struct user_namespace *mnt_userns = mnt_user_ns(path->mnt);
>
>         error = strncpy_from_user(kname, name, sizeof(kname));
>         if (error == 0 || error == sizeof(kname))
> @@ -656,6 +658,9 @@ getxattr(struct user_namespace *mnt_user
>         if (error < 0)
>                 return error;
>
> +       if (kname[0] == VAL_SEP)
> +               return val_getxattr(path, kname, error, value, size);
> +
>         if (size) {
>                 if (size > XATTR_SIZE_MAX)
>                         size = XATTR_SIZE_MAX;
> @@ -664,7 +669,7 @@ getxattr(struct user_namespace *mnt_user
>                         return -ENOMEM;
>         }
>
> -       error = vfs_getxattr(mnt_userns, d, kname, kvalue, size);
> +       error = vfs_getxattr(mnt_userns, path->dentry, kname, kvalue, size);
>         if (error > 0) {
>                 if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
>                     (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
> @@ -693,7 +698,7 @@ static ssize_t path_getxattr(const char
>         error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
>         if (error)
>                 return error;
> -       error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size);
> +       error = getxattr(&path, name, value, size);
>         path_put(&path);
>         if (retry_estale(error, lookup_flags)) {
>                 lookup_flags |= LOOKUP_REVAL;
> @@ -723,8 +728,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, cons
>         if (!f.file)
>                 return error;
>         audit_file(f.file);
> -       error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry,
> -                        name, value, size);
> +       error = getxattr(&f.file->f_path, name, value, size);
>         fdput(f);
>         return error;
>  }
> --- /dev/null
> +++ b/include/linux/values.h
> @@ -0,0 +1,11 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include <linux/types.h>
> +
> +#define VAL_SEP ':'
> +
> +struct path;
> +
> +ssize_t val_getxattr(struct path *path, const char *name, size_t namelen,
> +                    void __user *value, size_t size);
> +
Greg KH May 3, 2022, 2:53 p.m. UTC | #2
On Tue, May 03, 2022 at 05:39:46PM +0300, Amir Goldstein wrote:
> On Tue, May 3, 2022 at 3:23 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
> >
> > This is a simplification of the getvalues(2) prototype and moving it to the
> > getxattr(2) interface, as suggested by Dave.
> >
> > The patch itself just adds the possibility to retrieve a single line of
> > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo
> > patchset grew out of).
> >
> > But this should be able to serve Amir's per-sb iostats, as well as a host of
> > other cases where some statistic needs to be retrieved from some object.  Note:
> > a filesystem object often represents other kinds of objects (such as processes
> > in /proc) so this is not limited to fs attributes.
> >
> > This also opens up the interface to setting attributes via setxattr(2).
> >
> > After some pondering I made the namespace so:
> >
> > : - root
> > bar - an attribute
> > foo: - a folder (can contain attributes and/or folders)
> >
> > The contents of a folder is represented by a null separated list of names.
> >
> > Examples:
> >
> > $ getfattr -etext -n ":" .
> > # file: .
> > :="mnt:\000mntns:"
> >
> > $ getfattr -etext -n ":mnt:" .
> > # file: .
> > :mnt:="info"
> >
> > $ getfattr -etext -n ":mnt:info" .
> > # file: .
> > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"
> >
> > $ getfattr -etext -n ":mntns:" .
> > # file: .
> > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"
> >
> > $ getfattr -etext -n ":mntns:28:" .
> > # file: .
> > :mntns:28:="info"
> >
> > Comments?
> >
> 
> I like that :)
> 
> It should be noted that while this API mandates text keys,
> it does not mandate text values, so for example, sb iostats could be
> exported as text or as binary struct, or as individual text/binary records or
> all of the above.

Ugh, no, that would be a total mess.  Don't go exporting random binary
structs depending on the file, that's going to be completely
unmaintainable.  As it is, this is going to be hard enough with random
text fields.

As for this format, it needs to be required to be documented in
Documentation/ABI/ for each entry and key type so that we have a chance
of knowing what is going on and tracking how things are working and
validating stuff.

thanks,

greg k-h
Miklos Szeredi May 3, 2022, 3:04 p.m. UTC | #3
On Tue, 3 May 2022 at 16:53, Greg KH <gregkh@linuxfoundation.org> wrote:
>
> On Tue, May 03, 2022 at 05:39:46PM +0300, Amir Goldstein wrote:

> > It should be noted that while this API mandates text keys,
> > it does not mandate text values, so for example, sb iostats could be
> > exported as text or as binary struct, or as individual text/binary records or
> > all of the above.
>
> Ugh, no, that would be a total mess.  Don't go exporting random binary
> structs depending on the file, that's going to be completely
> unmaintainable.  As it is, this is going to be hard enough with random
> text fields.
>
> As for this format, it needs to be required to be documented in
> Documentation/ABI/ for each entry and key type so that we have a chance
> of knowing what is going on and tracking how things are working and
> validating stuff.

My preference would be a single text value for each key.

Contents of ":mnt:info" contradicts that, but mountinfo has a long
established, well documented format, and nothing prevents exporting
individual attributes with separate names as well (the getvalues(2)
patch did exactly that).

Thanks,
Miklos
Amir Goldstein May 3, 2022, 3:14 p.m. UTC | #4
On Tue, May 3, 2022 at 6:04 PM Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Tue, 3 May 2022 at 16:53, Greg KH <gregkh@linuxfoundation.org> wrote:
> >
> > On Tue, May 03, 2022 at 05:39:46PM +0300, Amir Goldstein wrote:
>
> > > It should be noted that while this API mandates text keys,
> > > it does not mandate text values, so for example, sb iostats could be
> > > exported as text or as binary struct, or as individual text/binary records or
> > > all of the above.
> >
> > Ugh, no, that would be a total mess.  Don't go exporting random binary
> > structs depending on the file, that's going to be completely
> > unmaintainable.  As it is, this is going to be hard enough with random
> > text fields.
> >
> > As for this format, it needs to be required to be documented in
> > Documentation/ABI/ for each entry and key type so that we have a chance
> > of knowing what is going on and tracking how things are working and
> > validating stuff.
>
> My preference would be a single text value for each key.
>
> Contents of ":mnt:info" contradicts that, but mountinfo has a long
> established, well documented format, and nothing prevents exporting
> individual attributes with separate names as well (the getvalues(2)
> patch did exactly that).
>

Right, the fun is that ":mnt:info" and ":mnt:info:" can co-exist.

Thanks,
Amir.
Greg KH May 3, 2022, 4:54 p.m. UTC | #5
On Tue, May 03, 2022 at 05:04:06PM +0200, Miklos Szeredi wrote:
> On Tue, 3 May 2022 at 16:53, Greg KH <gregkh@linuxfoundation.org> wrote:
> >
> > On Tue, May 03, 2022 at 05:39:46PM +0300, Amir Goldstein wrote:
> 
> > > It should be noted that while this API mandates text keys,
> > > it does not mandate text values, so for example, sb iostats could be
> > > exported as text or as binary struct, or as individual text/binary records or
> > > all of the above.
> >
> > Ugh, no, that would be a total mess.  Don't go exporting random binary
> > structs depending on the file, that's going to be completely
> > unmaintainable.  As it is, this is going to be hard enough with random
> > text fields.
> >
> > As for this format, it needs to be required to be documented in
> > Documentation/ABI/ for each entry and key type so that we have a chance
> > of knowing what is going on and tracking how things are working and
> > validating stuff.
> 
> My preference would be a single text value for each key.

Yes!  That's the only sane way to maintain apis, and is why we do that
for sysfs.

If the key isn't present, there's no value, so userspace "knows" this
automatically and parsing this is trivial.

> Contents of ":mnt:info" contradicts that, but mountinfo has a long
> established, well documented format, and nothing prevents exporting
> individual attributes with separate names as well (the getvalues(2)
> patch did exactly that).

I understand, for "legacy" things like this, that's fine, but don't add
new fields or change them over time please, that way just gets us back
to the nightmare of preserving /proc/ file apis.

thanks,

greg k-h
Dave Chinner May 3, 2022, 10:43 p.m. UTC | #6
On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote:
> This is a simplification of the getvalues(2) prototype and moving it to the
> getxattr(2) interface, as suggested by Dave.
> 
> The patch itself just adds the possibility to retrieve a single line of
> /proc/$$/mountinfo (which was the basic requirement from which the fsinfo
> patchset grew out of).
> 
> But this should be able to serve Amir's per-sb iostats, as well as a host of
> other cases where some statistic needs to be retrieved from some object.  Note:
> a filesystem object often represents other kinds of objects (such as processes
> in /proc) so this is not limited to fs attributes.
> 
> This also opens up the interface to setting attributes via setxattr(2).
> 
> After some pondering I made the namespace so:
> 
> : - root
> bar - an attribute
> foo: - a folder (can contain attributes and/or folders)
> 
> The contents of a folder is represented by a null separated list of names.
> 
> Examples:
> 
> $ getfattr -etext -n ":" .
> # file: .
> :="mnt:\000mntns:"
> 
> $ getfattr -etext -n ":mnt:" .
> # file: .
> :mnt:="info"
> 
> $ getfattr -etext -n ":mnt:info" .
> # file: .
> :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"
> 
> $ getfattr -etext -n ":mntns:" .
> # file: .
> :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"
> 
> $ getfattr -etext -n ":mntns:28:" .
> # file: .
> :mntns:28:="info"
> 
> Comments?

I like. :)

> Thanks,
> Miklos
> 
> ---
>  fs/Makefile            |    2 
>  fs/mount.h             |    8 +
>  fs/namespace.c         |   15 ++-
>  fs/pnode.h             |    2 
>  fs/proc_namespace.c    |   15 ++-
>  fs/values.c            |  242 +++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xattr.c             |   16 ++-
>  include/linux/values.h |   11 ++

"values" is a very generic name - probably should end up being
something more descriptive of the functionality is provides,
especially if the header file is going to be dumped in
include/linux/. I don't really have a good suggestion at the moment,
though. 

....

> +
> +enum {
> +	VAL_MNT_INFO,
> +};
> +
> +static struct val_desc val_mnt_group[] = {
> +	{ VD_NAME("info"),		.idx = VAL_MNT_INFO		},
> +	{ }
> +};
....
> +
> +
> +static struct val_desc val_toplevel_group[] = {
> +	{ VD_NAME("mnt:"),	.get = val_mnt_get,	},
> +	{ VD_NAME("mntns:"),	.get = val_mntns_get,	},
> +	{ },
> +};

I know this is an early POC, my main question is how do you
envisiage this table driven structure being extended down from just
the mount into the filesystem so we can expose filesystem specific
information that isn't covered by generic interfaces like statx?

Cheers,

Dave.
Miklos Szeredi May 4, 2022, 7:18 a.m. UTC | #7
On Wed, 4 May 2022 at 00:43, Dave Chinner <david@fromorbit.com> wrote:

> "values" is a very generic name - probably should end up being
> something more descriptive of the functionality is provides,
> especially if the header file is going to be dumped in
> include/linux/. I don't really have a good suggestion at the moment,
> though.

The obvious ones are stat and attr, which are taken already.   Info is
probably too generic as well.

Ideas are welcome.

>
> ....
>
> > +
> > +enum {
> > +     VAL_MNT_INFO,
> > +};
> > +
> > +static struct val_desc val_mnt_group[] = {
> > +     { VD_NAME("info"),              .idx = VAL_MNT_INFO             },
> > +     { }
> > +};
> ....
> > +
> > +
> > +static struct val_desc val_toplevel_group[] = {
> > +     { VD_NAME("mnt:"),      .get = val_mnt_get,     },
> > +     { VD_NAME("mntns:"),    .get = val_mntns_get,   },
> > +     { },
> > +};
>
> I know this is an early POC, my main question is how do you
> envisiage this table driven structure being extended down from just
> the mount into the filesystem so we can expose filesystem specific
> information that isn't covered by generic interfaces like statx?

I was thinking of adding a i_op callback.   The details are a bit
fuzzy, since the vfs and the fs would have to work together when
listing the attributes and possibly also when retrieving the attribute
itself (think mount options).

Thanks,
Miklos
Amir Goldstein May 4, 2022, 2:22 p.m. UTC | #8
On Wed, May 4, 2022 at 10:18 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Wed, 4 May 2022 at 00:43, Dave Chinner <david@fromorbit.com> wrote:
>
> > "values" is a very generic name - probably should end up being
> > something more descriptive of the functionality is provides,
> > especially if the header file is going to be dumped in
> > include/linux/. I don't really have a good suggestion at the moment,
> > though.
>
> The obvious ones are stat and attr, which are taken already.   Info is
> probably too generic as well.
>
> Ideas are welcome.

I was thinking of "properties".

>
> >
> > ....
> >
> > > +
> > > +enum {
> > > +     VAL_MNT_INFO,
> > > +};
> > > +
> > > +static struct val_desc val_mnt_group[] = {
> > > +     { VD_NAME("info"),              .idx = VAL_MNT_INFO             },
> > > +     { }
> > > +};
> > ....
> > > +
> > > +
> > > +static struct val_desc val_toplevel_group[] = {
> > > +     { VD_NAME("mnt:"),      .get = val_mnt_get,     },
> > > +     { VD_NAME("mntns:"),    .get = val_mntns_get,   },
> > > +     { },
> > > +};
> >
> > I know this is an early POC, my main question is how do you
> > envisiage this table driven structure being extended down from just
> > the mount into the filesystem so we can expose filesystem specific
> > information that isn't covered by generic interfaces like statx?
>
> I was thinking of adding a i_op callback.   The details are a bit
> fuzzy, since the vfs and the fs would have to work together when
> listing the attributes and possibly also when retrieving the attribute
> itself (think mount options).
>

No, please do not think mount options :)
Please think of an interface that does not mix vfs and fs properties.

Sure, with mount(2) you can mix fs and vfs options, but with the new interface
they should be clearly separated for set and get if possible.

":mnt:info" key to get the effective mount options (as in /proc/$$/mountinfo)
does not contradict that, because it can be read only.

Thanks,
Amir.
Karel Zak May 5, 2022, 12:30 p.m. UTC | #9
On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote:
> Examples:
> 
> $ getfattr -etext -n ":" .
> # file: .
> :="mnt:\000mntns:"
> 
> $ getfattr -etext -n ":mnt:" .
> # file: .
> :mnt:="info"
> 
> $ getfattr -etext -n ":mnt:info" .
> # file: .
> :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"

Is there a way how to get mountinfo-like entry by mount ID for some
sub-tree? Something like:

 getfattr -etext -n ":mnt:info:21" /

The interface has to be consistent with some notification system (I
see your question about fsnotify/fanotify at linux-fsdevel) and mount
ID seems better than paths due to over-mounts, etc.

> $ getfattr -etext -n ":mntns:" .
> # file: .
> :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"
> 
> $ getfattr -etext -n ":mntns:28:" .
> # file: .
> :mntns:28:="info"

  Karel
Miklos Szeredi May 5, 2022, 1:59 p.m. UTC | #10
On Thu, 5 May 2022 at 14:30, Karel Zak <kzak@redhat.com> wrote:

> Is there a way how to get mountinfo-like entry by mount ID for some
> sub-tree? Something like:
>
>  getfattr -etext -n ":mnt:info:21" /

Yes:

getfattr -etext -n ":mntns:21:info" /

>
> The interface has to be consistent with some notification system (I
> see your question about fsnotify/fanotify at linux-fsdevel) and mount
> ID seems better than paths due to over-mounts, etc.

Right.

If the mount notification doesn't fit into fsnotify well, the original
patch from David could be used.  I think that part was
uncontroversial.

Thanks,
Miklos
Theodore Ts'o May 5, 2022, 11:38 p.m. UTC | #11
On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote:
> 
> : - root
> bar - an attribute
> foo: - a folder (can contain attributes and/or folders)
> 
> The contents of a folder is represented by a null separated list of names.
> 
> Examples:
> 
> $ getfattr -etext -n ":" .
> # file: .
> :="mnt:\000mntns:"

In your example, does it matter what "." is?  It looks like in some
cases, it makes no difference at all, and in other cases, like this,
'.' *does* matter:

> $ getfattr -etext -n ":mnt:info" .
> # file: .
> :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"

Is that right?

> $ getfattr -etext -n ":mntns:" .
> # file: .
> :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"

What is this returning?  All possible mount name spaces?  Or all of
the mount spaces where '.' happens to exist?

Also, using the null character means that we can't really use shell
scripts calling getfattr.  I understand that the problem is that in
some cases, you might want to return a pathname, and NULL is the only
character which is guaranteed not to show up in a pathname.  However,
it makes parsing the returned value in a shell script exciting.

   	 	     	      	       	 - Ted
Amir Goldstein May 6, 2022, 12:06 a.m. UTC | #12
On Fri, May 6, 2022 at 2:38 AM tytso <tytso@mit.edu> wrote:
>
> On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote:
> >
> > : - root
> > bar - an attribute
> > foo: - a folder (can contain attributes and/or folders)
> >
> > The contents of a folder is represented by a null separated list of names.
> >
> > Examples:
> >
> > $ getfattr -etext -n ":" .
> > # file: .
> > :="mnt:\000mntns:"
>
> In your example, does it matter what "." is?  It looks like in some
> cases, it makes no difference at all, and in other cases, like this,
> '.' *does* matter:

It does. If "." was a directory in /proc/ or in ext4 it might have had
more entries.

>
> > $ getfattr -etext -n ":mnt:info" .
> > # file: .
> > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"
>
> Is that right?
>
> > $ getfattr -etext -n ":mntns:" .
> > # file: .
> > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"
>
> What is this returning?  All possible mount name spaces?  Or all of
> the mount spaces where '.' happens to exist?

This confused me too.
It is not returning the mount namespaces, it is returning all the mount ids
in the mount namespace of ".".
":mntns:mounts:" might have been a better choice of key.

Thanks,
Amir.

>
> Also, using the null character means that we can't really use shell
> scripts calling getfattr.  I understand that the problem is that in
> some cases, you might want to return a pathname, and NULL is the only
> character which is guaranteed not to show up in a pathname.  However,
> it makes parsing the returned value in a shell script exciting.
>
>                                          - Ted
Dave Chinner May 7, 2022, 12:32 a.m. UTC | #13
On Thu, May 05, 2022 at 04:38:12PM -0700, tytso wrote:
> On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote:
> > 
> > : - root
> > bar - an attribute
> > foo: - a folder (can contain attributes and/or folders)
> > 
> > The contents of a folder is represented by a null separated list of names.
> > 
> > Examples:
> > 
> > $ getfattr -etext -n ":" .
> > # file: .
> > :="mnt:\000mntns:"
> 
> In your example, does it matter what "." is?  It looks like in some
> cases, it makes no difference at all, and in other cases, like this,
> '.' *does* matter:
> 
> > $ getfattr -etext -n ":mnt:info" .
> > # file: .
> > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"
> 
> Is that right?
> 
> > $ getfattr -etext -n ":mntns:" .
> > # file: .
> > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"
> 
> What is this returning?  All possible mount name spaces?  Or all of
> the mount spaces where '.' happens to exist?
> 
> Also, using the null character means that we can't really use shell
> scripts calling getfattr.

Yeah, it should be returning an attr per namespace, not an attr
whose value contains all the valid namespaces.

i.e. if the next level of the heirachy is 21, 22, 24, .... we should
be seeing a listing of multiple attributes with naming like:

:mntns:21:
:mntns:22:
:mntns:24:
....

rather than an attribute whose value contains the names of the
attrbiutes in the next layer of the heirarchy. Then we can just
pull the namespace we want and feed it directly to:

$ getfattr -n ":mntns:21:"

and we get a list of all the attributes available for that
namespace...

> I understand that the problem is that in
> some cases, you might want to return a pathname, and NULL is the only
> character which is guaranteed not to show up in a pathname.  However,
> it makes parsing the returned value in a shell script exciting.

We shouldn't be returning the names of children in an attribute
value. We have a syscall API for doing this that - listxattr() will
iterate attribute names just like a directory does with readdir()
via listxattr(). IOWs, we should not need to encode the next layer
of child attribute names into the value of the parent attribute - if
we do a listxattr on a parent that has children, return the list of
child names as individual attribute names....

(Yes, I know listxattr null separates the attribute names - it's a
godawful kernel API - but that's not the programmatic interface
we should expose at the shell script level.)

Cheers,

Dave.
Christian Brauner May 9, 2022, 12:48 p.m. UTC | #14
On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote:
> This is a simplification of the getvalues(2) prototype and moving it to the
> getxattr(2) interface, as suggested by Dave.
> 
> The patch itself just adds the possibility to retrieve a single line of
> /proc/$$/mountinfo (which was the basic requirement from which the fsinfo
> patchset grew out of).
> 
> But this should be able to serve Amir's per-sb iostats, as well as a host of
> other cases where some statistic needs to be retrieved from some object.  Note:
> a filesystem object often represents other kinds of objects (such as processes
> in /proc) so this is not limited to fs attributes.
> 
> This also opens up the interface to setting attributes via setxattr(2).
> 
> After some pondering I made the namespace so:
> 
> : - root
> bar - an attribute
> foo: - a folder (can contain attributes and/or folders)
> 
> The contents of a folder is represented by a null separated list of names.
> 
> Examples:
> 
> $ getfattr -etext -n ":" .
> # file: .
> :="mnt:\000mntns:"
> 
> $ getfattr -etext -n ":mnt:" .
> # file: .
> :mnt:="info"
> 
> $ getfattr -etext -n ":mnt:info" .
> # file: .
> :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"

Hey Miklos,

One comment about this. We really need to have this interface support
giving us mount options like "relatime" back in numeric form (I assume
this will be possible.). It is royally annoying having to maintain a
mapping table in userspace just to do:

relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME
ro	 -> MS_RDONLY/MOUNT_ATTR_RDONLY

A library shouldn't be required to use this interface. Conservative
low-level software that keeps its shared library dependencies minimal
will need to be able to use that interface without having to go to an
external library that transforms text-based output to binary form (Which
I'm very sure will need to happen if we go with a text-based
interface.).

> 
> $ getfattr -etext -n ":mntns:" .
> # file: .
> :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"
> 
> $ getfattr -etext -n ":mntns:28:" .
> # file: .
> :mntns:28:="info"
> 
> Comments?

I'm not a fan of text-based APIs and I'm particularly not a fan of the
xattr APIs. But at this point I'm ready to compromise on a lot as long
as it gets us values out of the kernel in some way. :)

I had to use xattrs extensively in various low-level userspace projects
and they continue to be a source of races and memory bugs.

A few initial questions:

* The xattr APIs often require the caller to do sm like (copying some go
  code quickly as I have that lying around):

	for _, x := range split {
		xattr := string(x)
		// Call Getxattr() twice: First, to determine the size of the
		// buffer we need to allocate to store the extended attributes,
		// second, to actually store the extended attributes in the
		// buffer. Also, check if the size of the extended attribute
		// hasn't increased between the two calls.
		pre, err = unix.Getxattr(path, xattr, nil)
		if err != nil || pre < 0 {
			return nil, err
		}

		dest = make([]byte, pre)
		post := 0
		if pre > 0 {
			post, err = unix.Getxattr(path, xattr, dest)
			if err != nil || post < 0 {
				return nil, err
			}
		}

		if post > pre {
			return nil, fmt.Errorf("Extended attribute '%s' size increased from %d to %d during retrieval", xattr, pre, post)
		}

		xattrs[xattr] = string(dest)
	}

  This pattern of requesting the size first by passing empty arguments,
  then allocating the buffer and then passing down that buffer to
  retrieve that value is really annoying to use and error prone (I do
  of course understand why it exists.).

  For real xattrs it's not that bad because we can assume that these
  values don't change often and so the race window between
  getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter. But
  fwiw, the post > pre check doesn't exist for no reason; we do indeed
  hit that race.
  
  In addition, it is costly having to call getxattr() twice. Again, for
  retrieving xattrs it often doesn't matter because it's not a super
  common operation but for mount and other info it might matter.

  Will we have to use the same pattern for mnt and other info as well?
  If so, I worry that the race is way more likely than it is for real
  xattrs.

* Would it be possible to support binary output with this interface?
  I really think users would love to have an interfact where they can
  get a struct with binary info back. I'm not advocating to make the
  whole interface binary but I wouldn't mind having the option to
  support it.
  Especially for some information at least. I'd really love to have a
  way go get a struct mount_info or whatever back that gives me all the
  details about a mount encompassed in a single struct.

  Callers like systemd will have to parse text and will end up
  converting everything from text into binary anyway; especially for
  mount information. So giving them an option for this out of the box
  would be quite good.

  Interfaces like statx aim to be as fast as possible because we exptect
  them to be called quite often. Retrieving mount info is quite costly
  and is done quite often as well. Maybe not for all software but for a
  lot of low-level software. Especially when starting services in
  systemd a lot of mount parsing happens similar when starting
  containers in runtimes.

* If we decide to go forward with this interface - and I think I
  mentioned this in the lsfmm session - could we please at least add a
  new system call? It really feels wrong to retrieve mount and other
  information through the xattr interfaces. They aren't really xattrs.

  Imho, xattrs are a bit like a wonky version of streams already (One of
  the reasons I find them quite unpleasant.). Making mount and other
  information retrievable directly through the getxattr() interface will
  turn them into a full-on streams implementation imho. I'd prefer not
  to do that (Which is another reason I'd prefer at least a separate
  system call.).
Amir Goldstein May 9, 2022, 2:20 p.m. UTC | #15
On Mon, May 9, 2022 at 3:48 PM Christian Brauner <brauner@kernel.org> wrote:
>
> On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote:
> > This is a simplification of the getvalues(2) prototype and moving it to the
> > getxattr(2) interface, as suggested by Dave.
> >
> > The patch itself just adds the possibility to retrieve a single line of
> > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo
> > patchset grew out of).
> >
> > But this should be able to serve Amir's per-sb iostats, as well as a host of
> > other cases where some statistic needs to be retrieved from some object.  Note:
> > a filesystem object often represents other kinds of objects (such as processes
> > in /proc) so this is not limited to fs attributes.
> >
> > This also opens up the interface to setting attributes via setxattr(2).
> >
> > After some pondering I made the namespace so:
> >
> > : - root
> > bar - an attribute
> > foo: - a folder (can contain attributes and/or folders)
> >
> > The contents of a folder is represented by a null separated list of names.
> >
> > Examples:
> >
> > $ getfattr -etext -n ":" .
> > # file: .
> > :="mnt:\000mntns:"
> >
> > $ getfattr -etext -n ":mnt:" .
> > # file: .
> > :mnt:="info"
> >
> > $ getfattr -etext -n ":mnt:info" .
> > # file: .
> > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"
>
> Hey Miklos,
>
> One comment about this. We really need to have this interface support
> giving us mount options like "relatime" back in numeric form (I assume
> this will be possible.). It is royally annoying having to maintain a
> mapping table in userspace just to do:
>
> relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME
> ro       -> MS_RDONLY/MOUNT_ATTR_RDONLY
>
> A library shouldn't be required to use this interface. Conservative
> low-level software that keeps its shared library dependencies minimal
> will need to be able to use that interface without having to go to an
> external library that transforms text-based output to binary form (Which
> I'm very sure will need to happen if we go with a text-based
> interface.).
>

No need for a library.
We can export:

:mnt:attr:flags (in hex format)

> >
> > $ getfattr -etext -n ":mntns:" .
> > # file: .
> > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"
> >
> > $ getfattr -etext -n ":mntns:28:" .
> > # file: .
> > :mntns:28:="info"
> >
> > Comments?
>
> I'm not a fan of text-based APIs and I'm particularly not a fan of the
> xattr APIs. But at this point I'm ready to compromise on a lot as long
> as it gets us values out of the kernel in some way. :)
>
> I had to use xattrs extensively in various low-level userspace projects
> and they continue to be a source of races and memory bugs.
>
> A few initial questions:
>
> * The xattr APIs often require the caller to do sm like (copying some go
>   code quickly as I have that lying around):
>
>         for _, x := range split {
>                 xattr := string(x)
>                 // Call Getxattr() twice: First, to determine the size of the
>                 // buffer we need to allocate to store the extended attributes,
>                 // second, to actually store the extended attributes in the
>                 // buffer. Also, check if the size of the extended attribute
>                 // hasn't increased between the two calls.
>                 pre, err = unix.Getxattr(path, xattr, nil)
>                 if err != nil || pre < 0 {
>                         return nil, err
>                 }
>
>                 dest = make([]byte, pre)
>                 post := 0
>                 if pre > 0 {
>                         post, err = unix.Getxattr(path, xattr, dest)
>                         if err != nil || post < 0 {
>                                 return nil, err
>                         }
>                 }
>
>                 if post > pre {
>                         return nil, fmt.Errorf("Extended attribute '%s' size increased from %d to %d during retrieval", xattr, pre, post)
>                 }
>
>                 xattrs[xattr] = string(dest)
>         }
>
>   This pattern of requesting the size first by passing empty arguments,
>   then allocating the buffer and then passing down that buffer to
>   retrieve that value is really annoying to use and error prone (I do
>   of course understand why it exists.).
>
>   For real xattrs it's not that bad because we can assume that these
>   values don't change often and so the race window between
>   getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter. But
>   fwiw, the post > pre check doesn't exist for no reason; we do indeed
>   hit that race.

It is not really a race, you can do {} while (errno != ERANGE) and there
will be no race.

>
>   In addition, it is costly having to call getxattr() twice. Again, for
>   retrieving xattrs it often doesn't matter because it's not a super
>   common operation but for mount and other info it might matter.
>

samba and many other projects that care about efficiency solved this
a long time ago with an opportunistic buffer - never start with NULL buffer
most values will fit in a 1K buffer.

>   Will we have to use the same pattern for mnt and other info as well?
>   If so, I worry that the race is way more likely than it is for real
>   xattrs.
>
> * Would it be possible to support binary output with this interface?
>   I really think users would love to have an interfact where they can
>   get a struct with binary info back. I'm not advocating to make the
>   whole interface binary but I wouldn't mind having the option to
>   support it.
>   Especially for some information at least. I'd really love to have a
>   way go get a struct mount_info or whatever back that gives me all the
>   details about a mount encompassed in a single struct.
>

I suggested that up thread and Greg has explicitly and loudly
NACKed it - so you will have to take it up with him

>   Callers like systemd will have to parse text and will end up
>   converting everything from text into binary anyway; especially for
>   mount information. So giving them an option for this out of the box
>   would be quite good.
>
>   Interfaces like statx aim to be as fast as possible because we exptect
>   them to be called quite often. Retrieving mount info is quite costly
>   and is done quite often as well. Maybe not for all software but for a
>   lot of low-level software. Especially when starting services in
>   systemd a lot of mount parsing happens similar when starting
>   containers in runtimes.
>

This API is not for *everything*. Obviously it does not replace statx and some
info (like the cifs OFFLINE flag) should be added to statx.
Whether or not mount info needs to get a special treatment like statx
is not proven.
Miklos claims this is a notification issue-
With David Howells' mount notification API, systemd can be pointed at the new
mount that was added/removed/changed and then systemd will rarely need to
parse thousands of mounts info.

> * If we decide to go forward with this interface - and I think I
>   mentioned this in the lsfmm session - could we please at least add a
>   new system call? It really feels wrong to retrieve mount and other
>   information through the xattr interfaces. They aren't really xattrs.
>
>   Imho, xattrs are a bit like a wonky version of streams already (One of
>   the reasons I find them quite unpleasant.). Making mount and other
>   information retrievable directly through the getxattr() interface will
>   turn them into a full-on streams implementation imho. I'd prefer not
>   to do that (Which is another reason I'd prefer at least a separate
>   system call.).

If you are thinking about a read() like interface for xattr or any alternative
data stream then Linus has NACKed it times before.

However, we could add getxattr_multi() as Dave Chinner suggested for
enumerating multiple keys+ (optional) values.
In contrast to listxattr(), getxattr_multi() could allow a "short read"
at least w.r.t the size of the vector.

Thanks,
Amir.
Christian Brauner May 9, 2022, 3:08 p.m. UTC | #16
On Mon, May 09, 2022 at 05:20:50PM +0300, Amir Goldstein wrote:
> On Mon, May 9, 2022 at 3:48 PM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote:
> > > This is a simplification of the getvalues(2) prototype and moving it to the
> > > getxattr(2) interface, as suggested by Dave.
> > >
> > > The patch itself just adds the possibility to retrieve a single line of
> > > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo
> > > patchset grew out of).
> > >
> > > But this should be able to serve Amir's per-sb iostats, as well as a host of
> > > other cases where some statistic needs to be retrieved from some object.  Note:
> > > a filesystem object often represents other kinds of objects (such as processes
> > > in /proc) so this is not limited to fs attributes.
> > >
> > > This also opens up the interface to setting attributes via setxattr(2).
> > >
> > > After some pondering I made the namespace so:
> > >
> > > : - root
> > > bar - an attribute
> > > foo: - a folder (can contain attributes and/or folders)
> > >
> > > The contents of a folder is represented by a null separated list of names.
> > >
> > > Examples:
> > >
> > > $ getfattr -etext -n ":" .
> > > # file: .
> > > :="mnt:\000mntns:"
> > >
> > > $ getfattr -etext -n ":mnt:" .
> > > # file: .
> > > :mnt:="info"
> > >
> > > $ getfattr -etext -n ":mnt:info" .
> > > # file: .
> > > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"
> >
> > Hey Miklos,
> >
> > One comment about this. We really need to have this interface support
> > giving us mount options like "relatime" back in numeric form (I assume
> > this will be possible.). It is royally annoying having to maintain a
> > mapping table in userspace just to do:
> >
> > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME
> > ro       -> MS_RDONLY/MOUNT_ATTR_RDONLY
> >
> > A library shouldn't be required to use this interface. Conservative
> > low-level software that keeps its shared library dependencies minimal
> > will need to be able to use that interface without having to go to an
> > external library that transforms text-based output to binary form (Which
> > I'm very sure will need to happen if we go with a text-based
> > interface.).
> >
> 
> No need for a library.
> We can export:
> 
> :mnt:attr:flags (in hex format)

So a binary attribute or a hex value as a string which we have to parse
in userspace into proper hex?

> 
> > >
> > > $ getfattr -etext -n ":mntns:" .
> > > # file: .
> > > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"
> > >
> > > $ getfattr -etext -n ":mntns:28:" .
> > > # file: .
> > > :mntns:28:="info"
> > >
> > > Comments?
> >
> > I'm not a fan of text-based APIs and I'm particularly not a fan of the
> > xattr APIs. But at this point I'm ready to compromise on a lot as long
> > as it gets us values out of the kernel in some way. :)
> >
> > I had to use xattrs extensively in various low-level userspace projects
> > and they continue to be a source of races and memory bugs.
> >
> > A few initial questions:
> >
> > * The xattr APIs often require the caller to do sm like (copying some go
> >   code quickly as I have that lying around):
> >
> >         for _, x := range split {
> >                 xattr := string(x)
> >                 // Call Getxattr() twice: First, to determine the size of the
> >                 // buffer we need to allocate to store the extended attributes,
> >                 // second, to actually store the extended attributes in the
> >                 // buffer. Also, check if the size of the extended attribute
> >                 // hasn't increased between the two calls.
> >                 pre, err = unix.Getxattr(path, xattr, nil)
> >                 if err != nil || pre < 0 {
> >                         return nil, err
> >                 }
> >
> >                 dest = make([]byte, pre)
> >                 post := 0
> >                 if pre > 0 {
> >                         post, err = unix.Getxattr(path, xattr, dest)
> >                         if err != nil || post < 0 {
> >                                 return nil, err
> >                         }
> >                 }
> >
> >                 if post > pre {
> >                         return nil, fmt.Errorf("Extended attribute '%s' size increased from %d to %d during retrieval", xattr, pre, post)
> >                 }
> >
> >                 xattrs[xattr] = string(dest)
> >         }
> >
> >   This pattern of requesting the size first by passing empty arguments,
> >   then allocating the buffer and then passing down that buffer to
> >   retrieve that value is really annoying to use and error prone (I do
> >   of course understand why it exists.).
> >
> >   For real xattrs it's not that bad because we can assume that these
> >   values don't change often and so the race window between
> >   getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter. But
> >   fwiw, the post > pre check doesn't exist for no reason; we do indeed
> >   hit that race.
> 
> It is not really a race, you can do {} while (errno != ERANGE) and there
> will be no race.

I don't know what your definition of your race is but if a value can
change between two calls and I have to call it in a loop until I get a
consistent value then that's a race. At least I know of no better way of
calling it.

> 
> >
> >   In addition, it is costly having to call getxattr() twice. Again, for
> >   retrieving xattrs it often doesn't matter because it's not a super
> >   common operation but for mount and other info it might matter.
> >
> 
> samba and many other projects that care about efficiency solved this
> a long time ago with an opportunistic buffer - never start with NULL buffer
> most values will fit in a 1K buffer.

I'm glad that it's been solved a long time ago. It's still not a good
property for an interface.

> 
> >   Will we have to use the same pattern for mnt and other info as well?
> >   If so, I worry that the race is way more likely than it is for real
> >   xattrs.
> >
> > * Would it be possible to support binary output with this interface?
> >   I really think users would love to have an interfact where they can
> >   get a struct with binary info back. I'm not advocating to make the
> >   whole interface binary but I wouldn't mind having the option to
> >   support it.
> >   Especially for some information at least. I'd really love to have a
> >   way go get a struct mount_info or whatever back that gives me all the
> >   details about a mount encompassed in a single struct.
> >
> 
> I suggested that up thread and Greg has explicitly and loudly
> NACKed it - so you will have to take it up with him

This is a vfs API and ultimately I would think that if we agree as a
subsystem that it would be desirable to have a way of providing binary
output in the form of well-defined structs in some form then we are free
to do so.

> 
> >   Callers like systemd will have to parse text and will end up
> >   converting everything from text into binary anyway; especially for
> >   mount information. So giving them an option for this out of the box
> >   would be quite good.
> >
> >   Interfaces like statx aim to be as fast as possible because we exptect
> >   them to be called quite often. Retrieving mount info is quite costly
> >   and is done quite often as well. Maybe not for all software but for a
> >   lot of low-level software. Especially when starting services in
> >   systemd a lot of mount parsing happens similar when starting
> >   containers in runtimes.
> >
> 
> This API is not for *everything*. Obviously it does not replace statx and some
> info (like the cifs OFFLINE flag) should be added to statx.

I'm not sure why you bring up this API as a replacement for statx().
That was never part of the discussion. And I didn't think I gave the
impression I was saying this.

This is about mount information and you can't get a lot of meaningful
mount information from statx(). Though tbh, I really think a mountx() or
similar system call wouldn't hurt...

> Whether or not mount info needs to get a special treatment like statx
> is not proven.

Hm, if we take that argument I could also claim that whether or not
mount info needs to be given in textual form is not proven. Iow, I'm not
sure what this is an argument for or against.

In fact, most well-used vfs information providing APIs apart from a few
such as the xattr APIs are based on well-defined structs. And I
personally at least consider that to be a good thing.

But I'm willing to compromise and support the textual thing. But I'd
still like to have the possibility to have some information provided in
binary form. I don't think that needs to be off the table completely.

> Miklos claims this is a notification issue-
> With David Howells' mount notification API, systemd can be pointed at the new
> mount that was added/removed/changed and then systemd will rarely need to
> parse thousands of mounts info.

You seem to be replying to things I didn't say. :)

The notification issue is orthogonal to that and yes, we need that.
I'm just saying that I want the ability in principal for some properties
to be given in binary form in addition to textual form. Performance may
be an aspect of this but that's orthogonal to performance issues when
being notified about mount changes and reacting to it in e.g. a service.

> 
> > * If we decide to go forward with this interface - and I think I
> >   mentioned this in the lsfmm session - could we please at least add a
> >   new system call? It really feels wrong to retrieve mount and other
> >   information through the xattr interfaces. They aren't really xattrs.
> >
> >   Imho, xattrs are a bit like a wonky version of streams already (One of
> >   the reasons I find them quite unpleasant.). Making mount and other
> >   information retrievable directly through the getxattr() interface will
> >   turn them into a full-on streams implementation imho. I'd prefer not
> >   to do that (Which is another reason I'd prefer at least a separate
> >   system call.).
> 
> If you are thinking about a read() like interface for xattr or any alternative
> data stream then Linus has NACKed it times before.

Please don't just make up an argument for me and then counter it. :D

> 
> However, we could add getxattr_multi() as Dave Chinner suggested for
> enumerating multiple keys+ (optional) values.
> In contrast to listxattr(), getxattr_multi() could allow a "short read"
> at least w.r.t the size of the vector.

Having "xattr" in the system call name is just confusing. These are
fundamentally not "real" xattrs and we shouldn't mix semantics. There
should be a clear distinction between traditional xattrs and this vfs
and potentially fs information providing interface.

Just thinking about what the manpage would look like. We would need to
add a paragraph to xattr(7) explaining that in addition to the system.*,
security.*, user.* and other namespaces we now also have a set of
namespaces that function as ways to get information about mounts or
other things instead of information attached to specific inodes.

That's super random imho. If I were to be presented with this manpage
I'd wonder if someone was too lazy to add a proper new system call with
it's own semantics for this and just stuffed it into an existing API
because it provided matching system call arguments. We can add a new
system call. It's not that we're running out of them.

Fwiw, and I'm genuinly _not_ trolling you could call it:
fsinfo(int dfd, const char *path, const char *key, char *value);
Amir Goldstein May 9, 2022, 5:07 p.m. UTC | #17
On Mon, May 9, 2022 at 6:09 PM Christian Brauner <brauner@kernel.org> wrote:
>
> On Mon, May 09, 2022 at 05:20:50PM +0300, Amir Goldstein wrote:
> > On Mon, May 9, 2022 at 3:48 PM Christian Brauner <brauner@kernel.org> wrote:
> > >
> > > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote:
> > > > This is a simplification of the getvalues(2) prototype and moving it to the
> > > > getxattr(2) interface, as suggested by Dave.
> > > >
> > > > The patch itself just adds the possibility to retrieve a single line of
> > > > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo
> > > > patchset grew out of).
> > > >
> > > > But this should be able to serve Amir's per-sb iostats, as well as a host of
> > > > other cases where some statistic needs to be retrieved from some object.  Note:
> > > > a filesystem object often represents other kinds of objects (such as processes
> > > > in /proc) so this is not limited to fs attributes.
> > > >
> > > > This also opens up the interface to setting attributes via setxattr(2).
> > > >
> > > > After some pondering I made the namespace so:
> > > >
> > > > : - root
> > > > bar - an attribute
> > > > foo: - a folder (can contain attributes and/or folders)
> > > >
> > > > The contents of a folder is represented by a null separated list of names.
> > > >
> > > > Examples:
> > > >
> > > > $ getfattr -etext -n ":" .
> > > > # file: .
> > > > :="mnt:\000mntns:"
> > > >
> > > > $ getfattr -etext -n ":mnt:" .
> > > > # file: .
> > > > :mnt:="info"
> > > >
> > > > $ getfattr -etext -n ":mnt:info" .
> > > > # file: .
> > > > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"
> > >
> > > Hey Miklos,
> > >
> > > One comment about this. We really need to have this interface support
> > > giving us mount options like "relatime" back in numeric form (I assume
> > > this will be possible.). It is royally annoying having to maintain a
> > > mapping table in userspace just to do:
> > >
> > > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME
> > > ro       -> MS_RDONLY/MOUNT_ATTR_RDONLY
> > >
> > > A library shouldn't be required to use this interface. Conservative
> > > low-level software that keeps its shared library dependencies minimal
> > > will need to be able to use that interface without having to go to an
> > > external library that transforms text-based output to binary form (Which
> > > I'm very sure will need to happen if we go with a text-based
> > > interface.).
> > >
> >
> > No need for a library.
> > We can export:
> >
> > :mnt:attr:flags (in hex format)
>
> So a binary attribute or a hex value as a string which we have to parse
> in userspace into proper hex?
>

I do see the ugliness in that.
I personally have no objection to binary values in the leaves of the tree.
I don't the main concern that Greg raised was against binary structs,
which end up being an ABI compatibility nightmare.

> >
> > > >
> > > > $ getfattr -etext -n ":mntns:" .
> > > > # file: .
> > > > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"
> > > >
> > > > $ getfattr -etext -n ":mntns:28:" .
> > > > # file: .
> > > > :mntns:28:="info"
> > > >
> > > > Comments?
> > >
> > > I'm not a fan of text-based APIs and I'm particularly not a fan of the
> > > xattr APIs. But at this point I'm ready to compromise on a lot as long
> > > as it gets us values out of the kernel in some way. :)
> > >
> > > I had to use xattrs extensively in various low-level userspace projects
> > > and they continue to be a source of races and memory bugs.
> > >
> > > A few initial questions:
> > >
> > > * The xattr APIs often require the caller to do sm like (copying some go
> > >   code quickly as I have that lying around):
> > >
> > >         for _, x := range split {
> > >                 xattr := string(x)
> > >                 // Call Getxattr() twice: First, to determine the size of the
> > >                 // buffer we need to allocate to store the extended attributes,
> > >                 // second, to actually store the extended attributes in the
> > >                 // buffer. Also, check if the size of the extended attribute
> > >                 // hasn't increased between the two calls.
> > >                 pre, err = unix.Getxattr(path, xattr, nil)
> > >                 if err != nil || pre < 0 {
> > >                         return nil, err
> > >                 }
> > >
> > >                 dest = make([]byte, pre)
> > >                 post := 0
> > >                 if pre > 0 {
> > >                         post, err = unix.Getxattr(path, xattr, dest)
> > >                         if err != nil || post < 0 {
> > >                                 return nil, err
> > >                         }
> > >                 }
> > >
> > >                 if post > pre {
> > >                         return nil, fmt.Errorf("Extended attribute '%s' size increased from %d to %d during retrieval", xattr, pre, post)
> > >                 }
> > >
> > >                 xattrs[xattr] = string(dest)
> > >         }
> > >
> > >   This pattern of requesting the size first by passing empty arguments,
> > >   then allocating the buffer and then passing down that buffer to
> > >   retrieve that value is really annoying to use and error prone (I do
> > >   of course understand why it exists.).
> > >
> > >   For real xattrs it's not that bad because we can assume that these
> > >   values don't change often and so the race window between
> > >   getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter. But
> > >   fwiw, the post > pre check doesn't exist for no reason; we do indeed
> > >   hit that race.
> >
> > It is not really a race, you can do {} while (errno != ERANGE) and there
> > will be no race.
>
> I don't know what your definition of your race is but if a value can
> change between two calls and I have to call it in a loop until I get a
> consistent value then that's a race. At least I know of no better way of
> calling it.
>

For enumerating multiple keys we need a "short read", probably
offset too.
For leaf values that is less likely to be needed - I hope.

> >
> > >
> > >   In addition, it is costly having to call getxattr() twice. Again, for
> > >   retrieving xattrs it often doesn't matter because it's not a super
> > >   common operation but for mount and other info it might matter.
> > >
> >
> > samba and many other projects that care about efficiency solved this
> > a long time ago with an opportunistic buffer - never start with NULL buffer
> > most values will fit in a 1K buffer.
>
> I'm glad that it's been solved a long time ago. It's still not a good
> property for an interface.
>

All right. All I am saying is that the argument that calling getxattr() twice
is costly is not relevant most of the time.

> >
> > >   Will we have to use the same pattern for mnt and other info as well?
> > >   If so, I worry that the race is way more likely than it is for real
> > >   xattrs.
> > >
> > > * Would it be possible to support binary output with this interface?
> > >   I really think users would love to have an interfact where they can
> > >   get a struct with binary info back. I'm not advocating to make the
> > >   whole interface binary but I wouldn't mind having the option to
> > >   support it.
> > >   Especially for some information at least. I'd really love to have a
> > >   way go get a struct mount_info or whatever back that gives me all the
> > >   details about a mount encompassed in a single struct.
> > >
> >
> > I suggested that up thread and Greg has explicitly and loudly
> > NACKed it - so you will have to take it up with him
>
> This is a vfs API and ultimately I would think that if we agree as a
> subsystem that it would be desirable to have a way of providing binary
> output in the form of well-defined structs in some form then we are free
> to do so.
>

Perhaps the way to reconcile between the ABI issue and avoid userspace
parsing is if we export both.
As long as an exported binary blob is also self described as individual
keys, then userspace is both able to understand how to parse the blob
and does not need to parse text in every read of values.

> >
> > >   Callers like systemd will have to parse text and will end up
> > >   converting everything from text into binary anyway; especially for
> > >   mount information. So giving them an option for this out of the box
> > >   would be quite good.
> > >
> > >   Interfaces like statx aim to be as fast as possible because we exptect
> > >   them to be called quite often. Retrieving mount info is quite costly
> > >   and is done quite often as well. Maybe not for all software but for a
> > >   lot of low-level software. Especially when starting services in
> > >   systemd a lot of mount parsing happens similar when starting
> > >   containers in runtimes.
> > >
> >
> > This API is not for *everything*. Obviously it does not replace statx and some
> > info (like the cifs OFFLINE flag) should be added to statx.
>
> I'm not sure why you bring up this API as a replacement for statx().
> That was never part of the discussion. And I didn't think I gave the
> impression I was saying this.
>
> This is about mount information and you can't get a lot of meaningful
> mount information from statx(). Though tbh, I really think a mountx() or
> similar system call wouldn't hurt...

That was my point. This API solves problems - it may not solve all the problems.
If mountx() is needed for some performant application then I think this should
be discussed separately.

>
> > Whether or not mount info needs to get a special treatment like statx
> > is not proven.
>
> Hm, if we take that argument I could also claim that whether or not
> mount info needs to be given in textual form is not proven. Iow, I'm not
> sure what this is an argument for or against.
>
> In fact, most well-used vfs information providing APIs apart from a few
> such as the xattr APIs are based on well-defined structs. And I
> personally at least consider that to be a good thing.
>

It is good for well known information structures.
We want to have something that is flexible and easy to extend
including with fs specific info.

> But I'm willing to compromise and support the textual thing. But I'd
> still like to have the possibility to have some information provided in
> binary form. I don't think that needs to be off the table completely.
>

I am not opposed.

> > Miklos claims this is a notification issue-
> > With David Howells' mount notification API, systemd can be pointed at the new
> > mount that was added/removed/changed and then systemd will rarely need to
> > parse thousands of mounts info.
>
> You seem to be replying to things I didn't say. :)
>

Sorry.
I guess I did not understand why there needs to be a lot of parsing
of mounts when starting systemd services, but I am not a systemd export.

> The notification issue is orthogonal to that and yes, we need that.
> I'm just saying that I want the ability in principal for some properties
> to be given in binary form in addition to textual form. Performance may
> be an aspect of this but that's orthogonal to performance issues when
> being notified about mount changes and reacting to it in e.g. a service.
>
> >
> > > * If we decide to go forward with this interface - and I think I
> > >   mentioned this in the lsfmm session - could we please at least add a
> > >   new system call? It really feels wrong to retrieve mount and other
> > >   information through the xattr interfaces. They aren't really xattrs.
> > >
> > >   Imho, xattrs are a bit like a wonky version of streams already (One of
> > >   the reasons I find them quite unpleasant.). Making mount and other
> > >   information retrievable directly through the getxattr() interface will
> > >   turn them into a full-on streams implementation imho. I'd prefer not
> > >   to do that (Which is another reason I'd prefer at least a separate
> > >   system call.).
> >
> > If you are thinking about a read() like interface for xattr or any alternative
> > data stream then Linus has NACKed it times before.
>
> Please don't just make up an argument for me and then counter it. :D
>

Sorry. should have asked how you envision the new syscall instead of
assuming.

> >
> > However, we could add getxattr_multi() as Dave Chinner suggested for
> > enumerating multiple keys+ (optional) values.
> > In contrast to listxattr(), getxattr_multi() could allow a "short read"
> > at least w.r.t the size of the vector.
>
> Having "xattr" in the system call name is just confusing. These are
> fundamentally not "real" xattrs and we shouldn't mix semantics. There
> should be a clear distinction between traditional xattrs and this vfs
> and potentially fs information providing interface.
>

I know I am not the only one to disagree with that claim.
There is nothing "real" about traditional xattrs in my eyes.
It's just a key/value API.
The name xattr is just as arbitrary as statx and mountx in my eyes.
But I won't shed a tear if people decide to call this something else.

> Just thinking about what the manpage would look like. We would need to
> add a paragraph to xattr(7) explaining that in addition to the system.*,
> security.*, user.* and other namespaces we now also have a set of
> namespaces that function as ways to get information about mounts or
> other things instead of information attached to specific inodes.
>
> That's super random imho. If I were to be presented with this manpage
> I'd wonder if someone was too lazy to add a proper new system call with
> it's own semantics for this and just stuffed it into an existing API
> because it provided matching system call arguments. We can add a new
> system call. It's not that we're running out of them.
>
> Fwiw, and I'm genuinly _not_ trolling you could call it:
> fsinfo(int dfd, const char *path, const char *key, char *value);

And I genuinely don't care how to call this, but considering that we
will need the get/set/list variants my preference would be (as I said
in LSFMM - not sure if people heard me):

getxattrat(dirfd, path, key, value, AT_METAVERSE) ;-)
setxattrat()/listattrat()

This way we can avert the collision with fs that agree to store ":mnt:info"

Thanks,
Amir.
Vivek Goyal May 9, 2022, 9:42 p.m. UTC | #18
On Mon, May 09, 2022 at 05:08:56PM +0200, Christian Brauner wrote:

[..]
> Having "xattr" in the system call name is just confusing. These are
> fundamentally not "real" xattrs and we shouldn't mix semantics. There
> should be a clear distinction between traditional xattrs and this vfs
> and potentially fs information providing interface.
> 
> Just thinking about what the manpage would look like. We would need to
> add a paragraph to xattr(7) explaining that in addition to the system.*,
> security.*, user.* and other namespaces we now also have a set of
> namespaces that function as ways to get information about mounts or
> other things instead of information attached to specific inodes.
> 
> That's super random imho. If I were to be presented with this manpage
> I'd wonder if someone was too lazy to add a proper new system call with
> it's own semantics for this and just stuffed it into an existing API
> because it provided matching system call arguments. We can add a new
> system call. It's not that we're running out of them.

FWIW, I also felt that using xattr API to get some sort of mount info felt
very non-intutive.

Thanks
Vivek
Dave Chinner May 10, 2022, 12:55 a.m. UTC | #19
On Mon, May 09, 2022 at 02:48:15PM +0200, Christian Brauner wrote:
> On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote:
> > This is a simplification of the getvalues(2) prototype and moving it to the
> > getxattr(2) interface, as suggested by Dave.
> > 
> > The patch itself just adds the possibility to retrieve a single line of
> > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo
> > patchset grew out of).
> > 
> > But this should be able to serve Amir's per-sb iostats, as well as a host of
> > other cases where some statistic needs to be retrieved from some object.  Note:
> > a filesystem object often represents other kinds of objects (such as processes
> > in /proc) so this is not limited to fs attributes.
> > 
> > This also opens up the interface to setting attributes via setxattr(2).
> > 
> > After some pondering I made the namespace so:
> > 
> > : - root
> > bar - an attribute
> > foo: - a folder (can contain attributes and/or folders)
> > 
> > The contents of a folder is represented by a null separated list of names.
> > 
> > Examples:
> > 
> > $ getfattr -etext -n ":" .
> > # file: .
> > :="mnt:\000mntns:"
> > 
> > $ getfattr -etext -n ":mnt:" .
> > # file: .
> > :mnt:="info"
> > 
> > $ getfattr -etext -n ":mnt:info" .
> > # file: .
> > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"
> 
> Hey Miklos,
> 
> One comment about this. We really need to have this interface support
> giving us mount options like "relatime" back in numeric form (I assume
> this will be possible.). It is royally annoying having to maintain a
> mapping table in userspace just to do:
> 
> relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME
> ro	 -> MS_RDONLY/MOUNT_ATTR_RDONLY

You're asking for a complete change of output information there.
This has nothing to do with the mechanism for extracting key/value
information from the kernel.

i.e. we need to separate demands for "data I want" from "mechanism
to extract data".

> > $ getfattr -etext -n ":mntns:" .
> > # file: .
> > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"
> > 
> > $ getfattr -etext -n ":mntns:28:" .
> > # file: .
> > :mntns:28:="info"
> > 
> > Comments?
> 
> I'm not a fan of text-based APIs and I'm particularly not a fan of the
> xattr APIs. But at this point I'm ready to compromise on a lot as long
> as it gets us values out of the kernel in some way. :)
> 
> I had to use xattrs extensively in various low-level userspace projects
> and they continue to be a source of races and memory bugs.
> 
> A few initial questions:
> 
> * The xattr APIs often require the caller to do sm like (copying some go
>   code quickly as I have that lying around):
> 
> 	for _, x := range split {
> 		xattr := string(x)
> 		// Call Getxattr() twice: First, to determine the size of the
> 		// buffer we need to allocate to store the extended attributes,
> 		// second, to actually store the extended attributes in the
> 		// buffer. Also, check if the size of the extended attribute
> 		// hasn't increased between the two calls.
> 		pre, err = unix.Getxattr(path, xattr, nil)
> 		if err != nil || pre < 0 {
> 			return nil, err
> 		}
> 
> 		dest = make([]byte, pre)
> 		post := 0
> 		if pre > 0 {
> 			post, err = unix.Getxattr(path, xattr, dest)
> 			if err != nil || post < 0 {
> 				return nil, err
> 			}
> 		}
> 
> 		if post > pre {
> 			return nil, fmt.Errorf("Extended attribute '%s' size increased from %d to %d during retrieval", xattr, pre, post)
> 		}
> 
> 		xattrs[xattr] = string(dest)
> 	}
> 
>   This pattern of requesting the size first by passing empty arguments,
>   then allocating the buffer and then passing down that buffer to
>   retrieve that value is really annoying to use and error prone (I do
>   of course understand why it exists.).

You're doing it wrong.

When you have an attr extraction loop like this, you allocate a
single 64kB buffer for the value, adn then call getattr() with the
buffer and a length of 64kB. Then the call returns both the value
length and the value in one syscall, and then the application can
allocate+copy into an exact sized buffer if it needs to.

Then you use the same 64kB buffer for the next getxattr() call.

I first saw this pattern in code written in the mid 1990s for
Irix, and for basic listxattr/getxattr operations to find and
retrieve key/value pairs this is much more efficient than the above.

The real problem is that the linux listxattr() syscall only returns
names. It's a shit API design at best, especially the part where it
cannot iterate over a list larger than a single buffer. If you've
got millions of xattrs, listxattr is fucking useless. The man page
is actively harmful - that's where everyone learns the double
getxattr anit-pattern you've described above.


With that in mind, go look at XFS_IOC_ATTRLIST_BY_HANDLE that I've
mentioned in previous discussions this topic.  Each attr entry
returned is one of these:

struct xfs_attrlist_ent {       /* data from attr_list() */
	__u32   a_valuelen;     /* number bytes in value of attr */
	char    a_name[1];      /* attr name (NULL terminated) */
};

It really needs a namelen to make parsing of the output buffer
simpler, but this is the model we should be following - listing
xattrs is no different from listing directory entries. Indeed,
a directory entries is basically just a name/value pair - the name
of the dirent and the inode number it points to is the value.

Further to that point, the control structure for
XFS_IOC_ATTRLIST_BY_HANDLE has a cursor value that works the same
way as readdir cookies do. Hence when iterating over xattrs that
require multiple syscalls to retrieve, the cursor allows the next
list syscall to start off listing exactly where the previous syscall
finished.

IOWs, what Linux really needs is a listxattr2() syscall that works
the same way that getdents/XFS_IOC_ATTRLIST_BY_HANDLE work. With the
list function returning value sizes and being able to iterate
effectively, every problem that listxattr() causes goes away.

And while we are at it, we need to consider a xattr2() syscall to
replace getxattr/setxattr. The model for that is
XFS_IOC_ATTRMULTI_BY_HANDLE, which allows operations to be performed
on mulitple xattrs in a single syscall. e.g. we can do a bulk get,
set and remove operation across multiple xattrs - we can even mix
and match get/set/remove operations on different xattrs in a single
call.

> * Would it be possible to support binary output with this interface?

The xattr API already supports binary names and values. The only
exception is you can't put NULLs in names because APIs use that as a
name terminator. if listxattr2() returns a namelen in it's
structure, then we could allow fully binary names, too (XFS already
fully supports this internally!). In the current API, values are
always determined by length, not null termiantions, so they are
also already fully binary capable.

>   I really think users would love to have an interfact where they can
>   get a struct with binary info back.

No. Not for kernel informational interfaces. We have ioctls and
syscalls for defining structured binary interfaces that pass
non-trivial objects. xattrs are no the place for this - they are
key/value object stores like sysfs is supposed to be, so I really
don't think we should support encoded binary data in xattrs under
this special mount namespace...

Solving the "retreive multiple values per syscall" problem is what
the bulk get interface (XFS_IOC_ATTRMULTI_BY_HANDLE) is for.

>   Callers like systemd will have to parse text and will end up
>   converting everything from text into binary anyway; especially for
>   mount information. So giving them an option for this out of the box
>   would be quite good.

That boat sailed years ago. You're making the same arguments about
binary extraction interfaces being way more efficient than ascii
based value-per-file interfaces like proc/sysfs that were made back
in the early 2000s. That boat has long sailed - while the current
method is somewhat inefficient, it certainly hasn't had all the
problems that maintaining binary interfaces over decades has had....

Much as it pains me to say it (because I came to Linux from an OS
that exported huge amounts of custom binary structures from the
kernel), having all the export interfaces dump human readable
information has proved far more flexible and usable than interfaces
that required binary parsers to dump information before it could be
used on the command line or in scripting languages....

> * If we decide to go forward with this interface - and I think I
>   mentioned this in the lsfmm session - could we please at least add a
>   new system call? It really feels wrong to retrieve mount and other
>   information through the xattr interfaces. They aren't really xattrs.

We are trying to expose structured key-value information. That's
exactly what the xattr API was orginally created to cater for...

>   Imho, xattrs are a bit like a wonky version of streams already (One of
>   the reasons I find them quite unpleasant.). Making mount and other
>   information retrievable directly through the getxattr() interface will
>   turn them into a full-on streams implementation imho. I'd prefer not
>   to do that (Which is another reason I'd prefer at least a separate
>   system call.).

And that's a total misunderstanding of what xattrs are.

Alternate data streams are just {file,offset} based data streams
accessed via ithe same read/write() mechanisms as the primary data
stream.

Xattrs provide an *atomic key-value object store API*, not an offset
based data stream API. They are completely different beasts,
intended for completely different purposes. ADS are redundant when you
have directories and files, whilst an atomic key-value store is
something completely different.

You do realise we have an independent, scalable, ACID compliant
key-value object store in every inode in an XFS filesystem, right?

Cheers,

Dave.
Ian Kent May 10, 2022, 3:34 a.m. UTC | #20
On Mon, 2022-05-09 at 17:42 -0400, Vivek Goyal wrote:
> On Mon, May 09, 2022 at 05:08:56PM +0200, Christian Brauner wrote:
> 
> [..]
> > Having "xattr" in the system call name is just confusing. These are
> > fundamentally not "real" xattrs and we shouldn't mix semantics.
> > There
> > should be a clear distinction between traditional xattrs and this
> > vfs
> > and potentially fs information providing interface.
> > 
> > Just thinking about what the manpage would look like. We would need
> > to
> > add a paragraph to xattr(7) explaining that in addition to the
> > system.*,
> > security.*, user.* and other namespaces we now also have a set of
> > namespaces that function as ways to get information about mounts or
> > other things instead of information attached to specific inodes.
> > 
> > That's super random imho. If I were to be presented with this
> > manpage
> > I'd wonder if someone was too lazy to add a proper new system call
> > with
> > it's own semantics for this and just stuffed it into an existing
> > API
> > because it provided matching system call arguments. We can add a
> > new
> > system call. It's not that we're running out of them.
> 
> FWIW, I also felt that using xattr API to get some sort of mount info
> felt
> very non-intutive.

Yeah, people looking for this function simply wouldn't know to
look here ...

Ian
Miklos Szeredi May 10, 2022, 3:49 a.m. UTC | #21
On Mon, 9 May 2022 at 14:48, Christian Brauner <brauner@kernel.org> wrote:

> One comment about this. We really need to have this interface support
> giving us mount options like "relatime" back in numeric form (I assume
> this will be possible.). It is royally annoying having to maintain a
> mapping table in userspace just to do:
>
> relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME
> ro       -> MS_RDONLY/MOUNT_ATTR_RDONLY
>
> A library shouldn't be required to use this interface. Conservative
> low-level software that keeps its shared library dependencies minimal
> will need to be able to use that interface without having to go to an
> external library that transforms text-based output to binary form (Which
> I'm very sure will need to happen if we go with a text-based
> interface.).

Agreed.

>   This pattern of requesting the size first by passing empty arguments,
>   then allocating the buffer and then passing down that buffer to
>   retrieve that value is really annoying to use and error prone (I do
>   of course understand why it exists.).
>
>   For real xattrs it's not that bad because we can assume that these
>   values don't change often and so the race window between
>   getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter. But
>   fwiw, the post > pre check doesn't exist for no reason; we do indeed
>   hit that race.

That code is wrong.  Changing xattr size is explicitly documented in
the man page as a non-error condition:

       If size is specified as zero, these calls return the  current  size  of
       the  named extended attribute (and leave value unchanged).  This can be
       used to determine the size of the buffer that should be supplied  in  a
       subsequent  call.   (But, bear in mind that there is a possibility that
       the attribute value may change between the two calls,  so  that  it  is
       still necessary to check the return status from the second call.)

>
>   In addition, it is costly having to call getxattr() twice. Again, for
>   retrieving xattrs it often doesn't matter because it's not a super
>   common operation but for mount and other info it might matter.

You don't *have* to retrieve the size, it's perfectly valid to e.g.
start with a fixed buffer size and double the size until the result
fits.

> * Would it be possible to support binary output with this interface?
>   I really think users would love to have an interfact where they can
>   get a struct with binary info back.

I think that's bad taste.   fsinfo(2) had the same issue.  As well as
mount(2) which still interprets the last argument as a binary blob in
certain cases (nfs is one I know of).

>   Especially for some information at least. I'd really love to have a
>   way go get a struct mount_info or whatever back that gives me all the
>   details about a mount encompassed in a single struct.

If we want that, then can do a new syscall with that specific struct
as an argument.

>   Callers like systemd will have to parse text and will end up
>   converting everything from text into binary anyway; especially for
>   mount information. So giving them an option for this out of the box
>   would be quite good.

What exactly are the attributes that systemd requires?

>   Interfaces like statx aim to be as fast as possible because we exptect
>   them to be called quite often. Retrieving mount info is quite costly
>   and is done quite often as well. Maybe not for all software but for a
>   lot of low-level software. Especially when starting services in
>   systemd a lot of mount parsing happens similar when starting
>   containers in runtimes.

Was there ever a test patch for systemd using fsinfo(2)?  I think not.

Until systemd people start to reengineer the mount handing to allow
for retrieving a single mount instead of the complete mount table we
will never know where the performance bottleneck lies.

>
> * If we decide to go forward with this interface - and I think I
>   mentioned this in the lsfmm session - could we please at least add a
>   new system call? It really feels wrong to retrieve mount and other
>   information through the xattr interfaces. They aren't really xattrs.

I'd argue with that statement.  These are most definitely attributes.
As for being extended, we'd just extended the xattr interface...

Naming aside... imagine that read(2) has always been used to retrieve
disk data, would you say that reading data from proc feels wrong?
And in hindsight, would a new syscall for the purpose make any sense?

Thanks,
Miklos
Ian Kent May 10, 2022, 4:27 a.m. UTC | #22
On Tue, 2022-05-10 at 05:49 +0200, Miklos Szeredi wrote:
> On Mon, 9 May 2022 at 14:48, Christian Brauner <brauner@kernel.org>
> wrote:
> 
> > One comment about this. We really need to have this interface
> > support
> > giving us mount options like "relatime" back in numeric form (I
> > assume
> > this will be possible.). It is royally annoying having to maintain
> > a
> > mapping table in userspace just to do:
> > 
> > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME
> > ro       -> MS_RDONLY/MOUNT_ATTR_RDONLY
> > 
> > A library shouldn't be required to use this interface. Conservative
> > low-level software that keeps its shared library dependencies
> > minimal
> > will need to be able to use that interface without having to go to
> > an
> > external library that transforms text-based output to binary form
> > (Which
> > I'm very sure will need to happen if we go with a text-based
> > interface.).
> 
> Agreed.
> 
> >   This pattern of requesting the size first by passing empty
> > arguments,
> >   then allocating the buffer and then passing down that buffer to
> >   retrieve that value is really annoying to use and error prone (I
> > do
> >   of course understand why it exists.).
> > 
> >   For real xattrs it's not that bad because we can assume that
> > these
> >   values don't change often and so the race window between
> >   getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter.
> > But
> >   fwiw, the post > pre check doesn't exist for no reason; we do
> > indeed
> >   hit that race.
> 
> That code is wrong.  Changing xattr size is explicitly documented in
> the man page as a non-error condition:
> 
>        If size is specified as zero, these calls return the  current 
> size  of
>        the  named extended attribute (and leave value unchanged). 
> This can be
>        used to determine the size of the buffer that should be
> supplied  in  a
>        subsequent  call.   (But, bear in mind that there is a
> possibility that
>        the attribute value may change between the two calls,  so 
> that  it  is
>        still necessary to check the return status from the second
> call.)
> 
> > 
> >   In addition, it is costly having to call getxattr() twice. Again,
> > for
> >   retrieving xattrs it often doesn't matter because it's not a
> > super
> >   common operation but for mount and other info it might matter.
> 
> You don't *have* to retrieve the size, it's perfectly valid to e.g.
> start with a fixed buffer size and double the size until the result
> fits.
> 
> > * Would it be possible to support binary output with this
> > interface?
> >   I really think users would love to have an interfact where they
> > can
> >   get a struct with binary info back.
> 
> I think that's bad taste.   fsinfo(2) had the same issue.  As well as
> mount(2) which still interprets the last argument as a binary blob in
> certain cases (nfs is one I know of).
> 
> >   Especially for some information at least. I'd really love to have
> > a
> >   way go get a struct mount_info or whatever back that gives me all
> > the
> >   details about a mount encompassed in a single struct.
> 
> If we want that, then can do a new syscall with that specific struct
> as an argument.
> 
> >   Callers like systemd will have to parse text and will end up
> >   converting everything from text into binary anyway; especially
> > for
> >   mount information. So giving them an option for this out of the
> > box
> >   would be quite good.
> 
> What exactly are the attributes that systemd requires?

It's been a while since I worked on this so my response might not
be too accurrate now.

Monitoring the mount table is used primarily to identify a mount
started and mount completion.

Mount table entry identification requires several fields.

But, in reality, once a direct interface is available it should be
possible to work out what is actually needed and that will be a
rather subset of a mountinfo table entry.

> 
> >   Interfaces like statx aim to be as fast as possible because we
> > exptect
> >   them to be called quite often. Retrieving mount info is quite
> > costly
> >   and is done quite often as well. Maybe not for all software but
> > for a
> >   lot of low-level software. Especially when starting services in
> >   systemd a lot of mount parsing happens similar when starting
> >   containers in runtimes.
> 
> Was there ever a test patch for systemd using fsinfo(2)?  I think
> not.

Mmm ... I'm hurt you didn't pay any attention to what I did on this
during the original fsinfo() discussions.

> 
> Until systemd people start to reengineer the mount handing to allow
> for retrieving a single mount instead of the complete mount table we
> will never know where the performance bottleneck lies.

We didn't need the systemd people to do this only review and contribute
to the pr for the change and eventually merge it.

What I did on this showed that using fsinfo() allone about halved the
CPU overhead (from around 4 processes using about 80%) and once the
mount notifications was added too it went down to well under 10% per
process. The problem here was systemd is quite good at servicing events
and reducing event processing overhead meant more events would then be
processed. Utilizing the mount notifications queueing was the key to
improving this and that was what I was about to work on at the end.

But everything stopped before the work was complete.

As I said above it's been a long time since I looked at the systemd
work and it definitely was a WIP so "what you see is what you get"
at https://github.com/raven-au/systemd/commits/. It looks like the
place to look to get some idea of what was being done is branch
notifications-devel or notifications-rfc-pr. Also note that this
uses the libmount fsinfo() infrastrucure that was done by Karal Zak
(and a tiny bit by me) at the time.

> 
> > 
> > * If we decide to go forward with this interface - and I think I
> >   mentioned this in the lsfmm session - could we please at least
> > add a
> >   new system call? It really feels wrong to retrieve mount and
> > other
> >   information through the xattr interfaces. They aren't really
> > xattrs.
> 
> I'd argue with that statement.  These are most definitely attributes.
> As for being extended, we'd just extended the xattr interface...
> 
> Naming aside... imagine that read(2) has always been used to retrieve
> disk data, would you say that reading data from proc feels wrong?
> And in hindsight, would a new syscall for the purpose make any sense?
> 
> Thanks,
> Miklos
Miklos Szeredi May 10, 2022, 8:06 a.m. UTC | #23
On Tue, 10 May 2022 at 06:27, Ian Kent <raven@themaw.net> wrote:

> > Was there ever a test patch for systemd using fsinfo(2)?  I think
> > not.
>
> Mmm ... I'm hurt you didn't pay any attention to what I did on this
> during the original fsinfo() discussions.

I can't find anything related to this in my mailbox.  Maybe you
mentioned it at some point, but I have not been involved with the
actual systemd changes.  So not meant to belittle your work at all.

> > Until systemd people start to reengineer the mount handing to allow
> > for retrieving a single mount instead of the complete mount table we
> > will never know where the performance bottleneck lies.
>
> We didn't need the systemd people to do this only review and contribute
> to the pr for the change and eventually merge it.
>
> What I did on this showed that using fsinfo() allone about halved the
> CPU overhead (from around 4 processes using about 80%) and once the
> mount notifications was added too it went down to well under 10% per
> process. The problem here was systemd is quite good at servicing events
> and reducing event processing overhead meant more events would then be
> processed. Utilizing the mount notifications queueing was the key to
> improving this and that was what I was about to work on at the end.
>
> But everything stopped before the work was complete.
>
> As I said above it's been a long time since I looked at the systemd
> work and it definitely was a WIP so "what you see is what you get"
> at https://github.com/raven-au/systemd/commits/. It looks like the
> place to look to get some idea of what was being done is branch
> notifications-devel or notifications-rfc-pr. Also note that this
> uses the libmount fsinfo() infrastrucure that was done by Karal Zak
> (and a tiny bit by me) at the time.

Looks great as a first step.

What do you mean by "Utilizing the mount notifications queueing"?

Do you mean batching of notifications?   I think that's a very
important issue: processing each individual notifcation may not make
sense when there are lots of them.  For example, doing ceate
mount+remote mount in a loop a million times will result in two
million notification messages (with high likelyhood of queue
overflow), but in the end the mount table will end up being the
same...

Thanks,
Miklos
Miklos Szeredi May 10, 2022, 8:07 a.m. UTC | #24
On Tue, 10 May 2022 at 10:06, Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Tue, 10 May 2022 at 06:27, Ian Kent <raven@themaw.net> wrote:
>
> > > Was there ever a test patch for systemd using fsinfo(2)?  I think
> > > not.
> >
> > Mmm ... I'm hurt you didn't pay any attention to what I did on this
> > during the original fsinfo() discussions.
>
> I can't find anything related to this in my mailbox.  Maybe you
> mentioned it at some point, but I have not been involved with the
> actual systemd changes.  So not meant to belittle your work at all.
>
> > > Until systemd people start to reengineer the mount handing to allow
> > > for retrieving a single mount instead of the complete mount table we
> > > will never know where the performance bottleneck lies.
> >
> > We didn't need the systemd people to do this only review and contribute
> > to the pr for the change and eventually merge it.
> >
> > What I did on this showed that using fsinfo() allone about halved the
> > CPU overhead (from around 4 processes using about 80%) and once the
> > mount notifications was added too it went down to well under 10% per
> > process. The problem here was systemd is quite good at servicing events
> > and reducing event processing overhead meant more events would then be
> > processed. Utilizing the mount notifications queueing was the key to
> > improving this and that was what I was about to work on at the end.
> >
> > But everything stopped before the work was complete.
> >
> > As I said above it's been a long time since I looked at the systemd
> > work and it definitely was a WIP so "what you see is what you get"
> > at https://github.com/raven-au/systemd/commits/. It looks like the
> > place to look to get some idea of what was being done is branch
> > notifications-devel or notifications-rfc-pr. Also note that this
> > uses the libmount fsinfo() infrastrucure that was done by Karal Zak
> > (and a tiny bit by me) at the time.
>
> Looks great as a first step.
>
> What do you mean by "Utilizing the mount notifications queueing"?
>
> Do you mean batching of notifications?   I think that's a very
> important issue: processing each individual notifcation may not make
> sense when there are lots of them.  For example, doing ceate
> mount+remote mount in a loop a million times will result in two

s/remote/remove/

> million notification messages (with high likelyhood of queue
> overflow), but in the end the mount table will end up being the
> same...
>
> Thanks,
> Miklos
Christian Brauner May 10, 2022, 11:53 a.m. UTC | #25
On Tue, May 10, 2022 at 05:49:04AM +0200, Miklos Szeredi wrote:
> On Mon, 9 May 2022 at 14:48, Christian Brauner <brauner@kernel.org> wrote:
> 
> > One comment about this. We really need to have this interface support
> > giving us mount options like "relatime" back in numeric form (I assume
> > this will be possible.). It is royally annoying having to maintain a
> > mapping table in userspace just to do:
> >
> > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME
> > ro       -> MS_RDONLY/MOUNT_ATTR_RDONLY
> >
> > A library shouldn't be required to use this interface. Conservative
> > low-level software that keeps its shared library dependencies minimal
> > will need to be able to use that interface without having to go to an
> > external library that transforms text-based output to binary form (Which
> > I'm very sure will need to happen if we go with a text-based
> > interface.).
> 
> Agreed.
> 
> >   This pattern of requesting the size first by passing empty arguments,
> >   then allocating the buffer and then passing down that buffer to
> >   retrieve that value is really annoying to use and error prone (I do
> >   of course understand why it exists.).
> >
> >   For real xattrs it's not that bad because we can assume that these
> >   values don't change often and so the race window between
> >   getxattr(GET_SIZE) and getxattr(GET_VALUES) often doesn't matter. But
> >   fwiw, the post > pre check doesn't exist for no reason; we do indeed
> >   hit that race.
> 
> That code is wrong.  Changing xattr size is explicitly documented in

When recursively changing the ownership of a filesystem tree for a
container some xattrs will need to be updated as well.

For example, if I have files with POSIX ACLs set which store {g,u}ids
then the ACL needs to be updated to store the {g,u}id mapped to the
container so the container can interpret them when started.

That is a rather sensitive operation with loads of potentials for bugs.
So if a POSIX ACL changes beneath the chowning daemon they must be
conservative because it means that there's concurrent modfication or
possibly an attack going on.

In general, I feel it's a bit easy to judge the code is wrong without
looking at the concrete scenario.

I'm also unsure how the manpage implies it's not an error condition.
Afaict, it only implies that the caller needs to handle the case where
the xattr changes. Whether or not that's an error is up to the caller to
decide. If the caller expects to be the sole user of a specific
filesystems then a changing xattr in between should probably be an error
condition.

But I think we're starting to go on a detour.

> the man page as a non-error condition:
> 
>        If size is specified as zero, these calls return the  current  size  of
>        the  named extended attribute (and leave value unchanged).  This can be
>        used to determine the size of the buffer that should be supplied  in  a
>        subsequent  call.   (But, bear in mind that there is a possibility that
>        the attribute value may change between the two calls,  so  that  it  is
>        still necessary to check the return status from the second call.)
> 
> >
> >   In addition, it is costly having to call getxattr() twice. Again, for
> >   retrieving xattrs it often doesn't matter because it's not a super
> >   common operation but for mount and other info it might matter.
> 
> You don't *have* to retrieve the size, it's perfectly valid to e.g.
> start with a fixed buffer size and double the size until the result
> fits.

Yes, I understand and accept that. I'm just not fond of such APIs.

> 
> > * Would it be possible to support binary output with this interface?
> >   I really think users would love to have an interfact where they can
> >   get a struct with binary info back.
> 
> I think that's bad taste.   fsinfo(2) had the same issue.  As well as
> mount(2) which still interprets the last argument as a binary blob in
> certain cases (nfs is one I know of).

In the same vein I could argue it's bad taste that everything gets
returned as a string. But I do agree that binary blobs through void
pointers aren't elegant.

I just worry that if we have an interface and there's a legitimate
subset of users that would be well served by a simple struct for e.g.,
mount properties any attempt to get something like this in the form of a
separate system call will be shut down with the argument that we already
have an interface for this.

So I'd compromise if we have your/any other interface return binary
blobs. But of course I'd be equally happy if we'd at least expose basic
mount information in the form of a separate system call.

> 
> >   Especially for some information at least. I'd really love to have a
> >   way go get a struct mount_info or whatever back that gives me all the
> >   details about a mount encompassed in a single struct.
> 
> If we want that, then can do a new syscall with that specific struct
> as an argument.

Ok, that sounds good to me.

> 
> >   Callers like systemd will have to parse text and will end up
> >   converting everything from text into binary anyway; especially for
> >   mount information. So giving them an option for this out of the box
> >   would be quite good.
> 
> What exactly are the attributes that systemd requires?

We keep a repo with ideas for (kernel) extensions - we should probably
publish that somewhere - but the list we used for a prototype roughly
contains:

* mount flags MOUNT_ATTR_RDONLY etc.
* time flags MOUNT_ATTR_RELATIME etc. (could probably be combined with
  mount flags. We missed the opportunity to make them proper enums
  separate from other mount flags imho.)
* propagation "flags" (MS_SHARED)
* peer group
* mnt_id of the mount
* mnt_id of the mount's parent
* owning userns

There's a bit more advanced stuff systemd would really want but which I
think is misplaced in a mountinfo system call including:
* list of primary and auxiliary block device major/minor
* diskseq value of those device nodes (This is a new block device
  feature we added that allows preventing device recycling issues when
  e.g. removing usb devices very quickly and is needed for udev.)
* uuid/fsid
* feature flags (O_TMPFILE, RENAME_EXCHANGE supported etc.)

> 
> >   Interfaces like statx aim to be as fast as possible because we exptect
> >   them to be called quite often. Retrieving mount info is quite costly
> >   and is done quite often as well. Maybe not for all software but for a
> >   lot of low-level software. Especially when starting services in
> >   systemd a lot of mount parsing happens similar when starting
> >   containers in runtimes.
> 
> Was there ever a test patch for systemd using fsinfo(2)?  I think not.
> 
> Until systemd people start to reengineer the mount handing to allow
> for retrieving a single mount instead of the complete mount table we
> will never know where the performance bottleneck lies.

I defer to Ian and Karel to answer that. Both did work to prove that
point triggered by one of your objections to fsinfo() iirc. Karel's
commits at least are here:
https://github.com/util-linux/util-linux/tree/topic/fsinfo

> 
> >
> > * If we decide to go forward with this interface - and I think I
> >   mentioned this in the lsfmm session - could we please at least add a
> >   new system call? It really feels wrong to retrieve mount and other
> >   information through the xattr interfaces. They aren't really xattrs.
> 
> I'd argue with that statement.  These are most definitely attributes.
> As for being extended, we'd just extended the xattr interface...

I just have a really hard time understanding how this belongs into the
(f)getxattr() system call family and why it would be a big deal to just
make this a separate system call.

I saw that Dave has a long mail on the history of all this so maybe
that'll help me. I hope I get around to reading it in detail today.

> 
> Naming aside... imagine that read(2) has always been used to retrieve
> disk data, would you say that reading data from proc feels wrong?
> And in hindsight, would a new syscall for the purpose make any sense?

I think past interface decisions don't need to always inform future
interface decisions.
And fwiw, yes. Imho, there's stuff in proc that should indeed have been
covered by a dedicated system call instead of a read-like interface.
Karel Zak May 10, 2022, 12:35 p.m. UTC | #26
On Mon, May 09, 2022 at 02:48:15PM +0200, Christian Brauner wrote:
> One comment about this. We really need to have this interface support
> giving us mount options like "relatime" back in numeric form (I assume
> this will be possible.). It is royally annoying having to maintain a
> mapping table in userspace just to do:
> 
> relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME
> ro	 -> MS_RDONLY/MOUNT_ATTR_RDONLY
> 
> A library shouldn't be required to use this interface. Conservative
> low-level software that keeps its shared library dependencies minimal
> will need to be able to use that interface without having to go to an
> external library that transforms text-based output to binary form (Which
> I'm very sure will need to happen if we go with a text-based
> interface.).

Sounds like David's fsinfo() :-)

We need an interface where the kernel returns a consistent mount table    
entry (more syscalls to get more key=value could be a way how to get
inconsistent data).                                              

IMHO all the attempts to make a trivial interface will be unsuccessful
because the mount table is complex (tree) and mixes strings, paths,
and flags. We will always end with a complex interface or complex
strings (like the last xatts attempt). There is no 3rd path to go ...

The best would be simplified fsinfo() where userspace defines
a request (wanted "keys"), and the kernel fills a buffer with data
separated by some header metadata struct. In this case, the kernel can
return strings and structs with binary data.  


I'd love something like:

ssize_t sz;
fsinfo_query query[] = {
    { .request = FSINFO_MOUNT_PATH },
    { .request = FSINFO_PROPAGATION },
    { .request = FSINFO_CHILDREN_IDS },
};

sz = fsinfo(dfd, "", AT_EMPTY_PATH,
                &query, ARRAY_SIZE(query),
                buf, sizeof(buf));

for (p = buf; p < buf + sz; ) {
{
    fsinfo_entry *e = (struct fsinfo_entry) p;
    char *data = p + sizeof(struct fsinfo_entry);

    switch(e->request) {
    case FSINFO_MOUNT_PATH:
        printf("mountpoint %s\n", data);
        break;
    case FSINFO_PROPAGATION:
        printf("propagation %x\n", (uintptr_t) data);
        break;
    case FSINFO_CHILDREN_IDS:
        fsinfo_child *x = (fsinfo_child *) data;
        for (i = 0; i < e->count; i++) {
            printf("child: %d\n", x[i].mnt_id);
        }
        break;
    ...
    }

    p += sizeof(struct fsinfo_entry) + e->len;
}



... my two cents :-)

    Karel
Christian Brauner May 10, 2022, 12:40 p.m. UTC | #27
On Tue, May 10, 2022 at 10:55:33AM +1000, Dave Chinner wrote:
> On Mon, May 09, 2022 at 02:48:15PM +0200, Christian Brauner wrote:
> > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote:
> > > This is a simplification of the getvalues(2) prototype and moving it to the
> > > getxattr(2) interface, as suggested by Dave.
> > > 
> > > The patch itself just adds the possibility to retrieve a single line of
> > > /proc/$$/mountinfo (which was the basic requirement from which the fsinfo
> > > patchset grew out of).
> > > 
> > > But this should be able to serve Amir's per-sb iostats, as well as a host of
> > > other cases where some statistic needs to be retrieved from some object.  Note:
> > > a filesystem object often represents other kinds of objects (such as processes
> > > in /proc) so this is not limited to fs attributes.
> > > 
> > > This also opens up the interface to setting attributes via setxattr(2).
> > > 
> > > After some pondering I made the namespace so:
> > > 
> > > : - root
> > > bar - an attribute
> > > foo: - a folder (can contain attributes and/or folders)
> > > 
> > > The contents of a folder is represented by a null separated list of names.
> > > 
> > > Examples:
> > > 
> > > $ getfattr -etext -n ":" .
> > > # file: .
> > > :="mnt:\000mntns:"
> > > 
> > > $ getfattr -etext -n ":mnt:" .
> > > # file: .
> > > :mnt:="info"
> > > 
> > > $ getfattr -etext -n ":mnt:info" .
> > > # file: .
> > > :mnt:info="21 1 254:0 / / rw,relatime - ext4 /dev/root rw\012"
> > 
> > Hey Miklos,
> > 
> > One comment about this. We really need to have this interface support
> > giving us mount options like "relatime" back in numeric form (I assume
> > this will be possible.). It is royally annoying having to maintain a
> > mapping table in userspace just to do:
> > 
> > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME
> > ro	 -> MS_RDONLY/MOUNT_ATTR_RDONLY
> 
> You're asking for a complete change of output information there.
> This has nothing to do with the mechanism for extracting key/value
> information from the kernel.
> 
> i.e. we need to separate demands for "data I want" from "mechanism
> to extract data".
> 
> > > $ getfattr -etext -n ":mntns:" .
> > > # file: .
> > > :mntns:="21:\00022:\00024:\00025:\00023:\00026:\00027:\00028:\00029:\00030:\00031:"
> > > 
> > > $ getfattr -etext -n ":mntns:28:" .
> > > # file: .
> > > :mntns:28:="info"
> > > 
> > > Comments?
> > 
> > I'm not a fan of text-based APIs and I'm particularly not a fan of the
> > xattr APIs. But at this point I'm ready to compromise on a lot as long
> > as it gets us values out of the kernel in some way. :)
> > 
> > I had to use xattrs extensively in various low-level userspace projects
> > and they continue to be a source of races and memory bugs.
> > 
> > A few initial questions:
> > 
> > * The xattr APIs often require the caller to do sm like (copying some go
> >   code quickly as I have that lying around):
> > 
> > 	for _, x := range split {
> > 		xattr := string(x)
> > 		// Call Getxattr() twice: First, to determine the size of the
> > 		// buffer we need to allocate to store the extended attributes,
> > 		// second, to actually store the extended attributes in the
> > 		// buffer. Also, check if the size of the extended attribute
> > 		// hasn't increased between the two calls.
> > 		pre, err = unix.Getxattr(path, xattr, nil)
> > 		if err != nil || pre < 0 {
> > 			return nil, err
> > 		}
> > 
> > 		dest = make([]byte, pre)
> > 		post := 0
> > 		if pre > 0 {
> > 			post, err = unix.Getxattr(path, xattr, dest)
> > 			if err != nil || post < 0 {
> > 				return nil, err
> > 			}
> > 		}
> > 
> > 		if post > pre {
> > 			return nil, fmt.Errorf("Extended attribute '%s' size increased from %d to %d during retrieval", xattr, pre, post)
> > 		}
> > 
> > 		xattrs[xattr] = string(dest)
> > 	}
> > 
> >   This pattern of requesting the size first by passing empty arguments,
> >   then allocating the buffer and then passing down that buffer to
> >   retrieve that value is really annoying to use and error prone (I do
> >   of course understand why it exists.).
> 
> You're doing it wrong.
> 
> When you have an attr extraction loop like this, you allocate a
> single 64kB buffer for the value, adn then call getattr() with the
> buffer and a length of 64kB. Then the call returns both the value
> length and the value in one syscall, and then the application can
> allocate+copy into an exact sized buffer if it needs to.
> 
> Then you use the same 64kB buffer for the next getxattr() call.
> 
> I first saw this pattern in code written in the mid 1990s for
> Irix, and for basic listxattr/getxattr operations to find and
> retrieve key/value pairs this is much more efficient than the above.

Right, but this example is in a garbage collected language where this
level of fine-grained memory management is rarely used or encouraged.
But the point is taken.

> 
> The real problem is that the linux listxattr() syscall only returns
> names. It's a shit API design at best, especially the part where it
> cannot iterate over a list larger than a single buffer. If you've
> got millions of xattrs, listxattr is fucking useless. The man page
> is actively harmful - that's where everyone learns the double
> getxattr anit-pattern you've described above.
> 
> 
> With that in mind, go look at XFS_IOC_ATTRLIST_BY_HANDLE that I've
> mentioned in previous discussions this topic.  Each attr entry
> returned is one of these:
> 
> struct xfs_attrlist_ent {       /* data from attr_list() */
> 	__u32   a_valuelen;     /* number bytes in value of attr */
> 	char    a_name[1];      /* attr name (NULL terminated) */
> };
> 
> It really needs a namelen to make parsing of the output buffer
> simpler, but this is the model we should be following - listing
> xattrs is no different from listing directory entries. Indeed,
> a directory entries is basically just a name/value pair - the name
> of the dirent and the inode number it points to is the value.
> 
> Further to that point, the control structure for
> XFS_IOC_ATTRLIST_BY_HANDLE has a cursor value that works the same
> way as readdir cookies do. Hence when iterating over xattrs that
> require multiple syscalls to retrieve, the cursor allows the next
> list syscall to start off listing exactly where the previous syscall
> finished.
> 
> IOWs, what Linux really needs is a listxattr2() syscall that works
> the same way that getdents/XFS_IOC_ATTRLIST_BY_HANDLE work. With the
> list function returning value sizes and being able to iterate
> effectively, every problem that listxattr() causes goes away.

That's an interesting proposal...

> 
> And while we are at it, we need to consider a xattr2() syscall to
> replace getxattr/setxattr. The model for that is
> XFS_IOC_ATTRMULTI_BY_HANDLE, which allows operations to be performed
> on mulitple xattrs in a single syscall. e.g. we can do a bulk get,
> set and remove operation across multiple xattrs - we can even mix
> and match get/set/remove operations on different xattrs in a single
> call.

Yeah, that would've come in handy in various codepaths I've seen.

> 
> > * Would it be possible to support binary output with this interface?
> 
> The xattr API already supports binary names and values. The only
> exception is you can't put NULLs in names because APIs use that as a
> name terminator. if listxattr2() returns a namelen in it's
> structure, then we could allow fully binary names, too (XFS already
> fully supports this internally!). In the current API, values are
> always determined by length, not null termiantions, so they are
> also already fully binary capable.

Yeah, I'm aware of that. I want to know whether we might be amenable to
support this for retrieving structs from mountinfo.

> 
> >   I really think users would love to have an interfact where they can
> >   get a struct with binary info back.
> 
> No. Not for kernel informational interfaces. We have ioctls and

That feels like semantics. statx is in all sensible readings of the
words a kernel informational interface.

> syscalls for defining structured binary interfaces that pass
> non-trivial objects. xattrs are no the place for this - they are
> key/value object stores like sysfs is supposed to be, so I really
> don't think we should support encoded binary data in xattrs under
> this special mount namespace...

I do agree on that point but as I mentioned in my reply to Miklos (hard
to keep track of all ends in this discussion...) if we are fine with
adding some way - in the form of a system call - to retrieve basic mount
information with some of the properties I listed in the other reply, I'm
content.

I'm really looking at this from the perspective of someone who uses
these interfaces regularly in userspace and a text-based interface for
very basic information such as detailed information about a mount is
cumbersome. I know people like to "counter" with "parsing strings is
easy" but it remains a giant pain for userspace; at least for basic
info.

If it's about really specialist deatiled information then sure; go text
based or any other way to retrieve that info.

> 
> Solving the "retreive multiple values per syscall" problem is what
> the bulk get interface (XFS_IOC_ATTRMULTI_BY_HANDLE) is for.
> 
> >   Callers like systemd will have to parse text and will end up
> >   converting everything from text into binary anyway; especially for
> >   mount information. So giving them an option for this out of the box
> >   would be quite good.
> 
> That boat sailed years ago. You're making the same arguments about
> binary extraction interfaces being way more efficient than ascii
> based value-per-file interfaces like proc/sysfs that were made back
> in the early 2000s. That boat has long sailed - while the current
> method is somewhat inefficient, it certainly hasn't had all the
> problems that maintaining binary interfaces over decades has had....
> 
> Much as it pains me to say it (because I came to Linux from an OS
> that exported huge amounts of custom binary structures from the
> kernel), having all the export interfaces dump human readable
> information has proved far more flexible and usable than interfaces
> that required binary parsers to dump information before it could be
> used on the command line or in scripting languages....

I'm really not trying to make far-reaching historical and future claims
about OS interface design. I'm simply asking for a way to retrieve basic
information about e.g. mounts in the form of a struct; statx for mounts.

I feel this is quickly turning into a discussion about how we should in
general design OS interfaces while that is really not at stake here.
Linux is a mix of so many different interfaces - for better or worse -
that any decision we take has no claim on codifying how we should do new
interfaces. That is indeed a ship that has sailed long ago.

> 
> > * If we decide to go forward with this interface - and I think I
> >   mentioned this in the lsfmm session - could we please at least add a
> >   new system call? It really feels wrong to retrieve mount and other
> >   information through the xattr interfaces. They aren't really xattrs.
> 
> We are trying to expose structured key-value information. That's
> exactly what the xattr API was orginally created to cater for...
> 
> >   Imho, xattrs are a bit like a wonky version of streams already (One of
> >   the reasons I find them quite unpleasant.). Making mount and other
> >   information retrievable directly through the getxattr() interface will
> >   turn them into a full-on streams implementation imho. I'd prefer not
> >   to do that (Which is another reason I'd prefer at least a separate
> >   system call.).
> 
> And that's a total misunderstanding of what xattrs are.
> 
> Alternate data streams are just {file,offset} based data streams
> accessed via ithe same read/write() mechanisms as the primary data
> stream.

That's why I said "wonky". But I'm not going to argue this point. I
think you by necessity have wider historical context on these things
that I lack. But I don't find it unreasonable to also see them as an
additional information channel.

Sure, they are a generic key=value store for anything _in principle_. In
practice however xattrs are very much perceived and used as information
storage on files, a metadata side-channel if you will.

All I'm claiming here is that it will confuse the living hell out of
users if the getxattr() api suddenly is used not to just set and get
information associated with inodes but to also provides filesystem or
mount information.

That's a totally a totally differnet type of information. Sure, it may
fit well in the key=value scheme because the xattr key=value _interface_
is generic but that's a very technical argument.

I'm looking at this from the experience of a user of the API for a
moment and in code they'd do in one place:

getxattr('/super/special/binary', "security.capability", ...);

and then in another place they do:

getxattr('/path/to/some/mount', "mntns:info", ...);

that is just flatout confusing.

> 
> Xattrs provide an *atomic key-value object store API*, not an offset
> based data stream API. They are completely different beasts,
> intended for completely different purposes. ADS are redundant when you
> have directories and files, whilst an atomic key-value store is
> something completely different.
> 
> You do realise we have an independent, scalable, ACID compliant
> key-value object store in every inode in an XFS filesystem, right?

So far this was a really mail with good background information but I'm
struggling to make sense of what that last sentence is trying to tell
me. :)
Florian Weimer May 10, 2022, 12:45 p.m. UTC | #28
* Dave Chinner:

> IOWs, what Linux really needs is a listxattr2() syscall that works
> the same way that getdents/XFS_IOC_ATTRLIST_BY_HANDLE work. With the
> list function returning value sizes and being able to iterate
> effectively, every problem that listxattr() causes goes away.

getdents has issues of its own because it's unspecified what happens if
the list of entries is modified during iteration.  Few file systems add
another tree just to guarantee stable iteration.

Maybe that's different for xattrs because they are supposed to be small
and can just be snapshotted with a full copy?

Thanks,
Florian
Miklos Szeredi May 10, 2022, 1:15 p.m. UTC | #29
On Tue, 10 May 2022 at 13:53, Christian Brauner <brauner@kernel.org> wrote:

> > What exactly are the attributes that systemd requires?
>
> We keep a repo with ideas for (kernel) extensions - we should probably
> publish that somewhere - but the list we used for a prototype roughly
> contains:
>
> * mount flags MOUNT_ATTR_RDONLY etc.
> * time flags MOUNT_ATTR_RELATIME etc. (could probably be combined with
>   mount flags. We missed the opportunity to make them proper enums
>   separate from other mount flags imho.)
> * propagation "flags" (MS_SHARED)
> * peer group
> * mnt_id of the mount
> * mnt_id of the mount's parent
> * owning userns

Sounds good thus far.   And hey, we don't even need a new syscall:
statx(2) could handle these fine.

> There's a bit more advanced stuff systemd would really want but which I
> think is misplaced in a mountinfo system call including:
> * list of primary and auxiliary block device major/minor

It's when you need to return variable size arrays or list of strings
that the statx kind of interface falls down.

For that a hierarchical namespace is a much better choice, as it can
represent arbitrary levels of arrays, while doing that with a
specialized syscall is going to be cumbersome.

> I just have a really hard time understanding how this belongs into the
> (f)getxattr() system call family and why it would be a big deal to just
> make this a separate system call.

Fragmenting syntactically equivalent interfaces is bad, unifying them
is good.  Dave's example of adding a new syscall for retrieving
multiple xattrs  is a prime example.

Thanks,
Miklos
Miklos Szeredi May 10, 2022, 1:18 p.m. UTC | #30
On Tue, 10 May 2022 at 15:15, Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Tue, 10 May 2022 at 13:53, Christian Brauner <brauner@kernel.org> wrote:
>
> > > What exactly are the attributes that systemd requires?
> >
> > We keep a repo with ideas for (kernel) extensions - we should probably
> > publish that somewhere - but the list we used for a prototype roughly
> > contains:
> >
> > * mount flags MOUNT_ATTR_RDONLY etc.
> > * time flags MOUNT_ATTR_RELATIME etc. (could probably be combined with
> >   mount flags. We missed the opportunity to make them proper enums
> >   separate from other mount flags imho.)
> > * propagation "flags" (MS_SHARED)
> > * peer group
> > * mnt_id of the mount
> > * mnt_id of the mount's parent
> > * owning userns
>
> Sounds good thus far.   And hey, we don't even need a new syscall:
> statx(2) could handle these fine.

Oh, we need this indexed with a mount id, which statx can't do.  So
indeed, a new syscall may be the best choice.

Thanks,
Miklos
Christian Brauner May 10, 2022, 2:19 p.m. UTC | #31
On Tue, May 10, 2022 at 03:15:05PM +0200, Miklos Szeredi wrote:
> On Tue, 10 May 2022 at 13:53, Christian Brauner <brauner@kernel.org> wrote:
> 
> > > What exactly are the attributes that systemd requires?
> >
> > We keep a repo with ideas for (kernel) extensions - we should probably
> > publish that somewhere - but the list we used for a prototype roughly
> > contains:
> >
> > * mount flags MOUNT_ATTR_RDONLY etc.
> > * time flags MOUNT_ATTR_RELATIME etc. (could probably be combined with
> >   mount flags. We missed the opportunity to make them proper enums
> >   separate from other mount flags imho.)
> > * propagation "flags" (MS_SHARED)
> > * peer group
> > * mnt_id of the mount
> > * mnt_id of the mount's parent
> > * owning userns
> 
> Sounds good thus far.   And hey, we don't even need a new syscall:
> statx(2) could handle these fine.
> 
> > There's a bit more advanced stuff systemd would really want but which I
> > think is misplaced in a mountinfo system call including:
> > * list of primary and auxiliary block device major/minor
> 
> It's when you need to return variable size arrays or list of strings
> that the statx kind of interface falls down.
> 
> For that a hierarchical namespace is a much better choice, as it can
> represent arbitrary levels of arrays, while doing that with a
> specialized syscall is going to be cumbersome.
> 
> > I just have a really hard time understanding how this belongs into the
> > (f)getxattr() system call family and why it would be a big deal to just
> > make this a separate system call.
> 
> Fragmenting syntactically equivalent interfaces is bad, unifying them

Fwiw, turning this around: unifying semantically distinct interfaces
because of syntactical similarities is bad. Moving them into a
syntactically equivalent system call that expresses the difference in
semantics in its name is good.
Miklos Szeredi May 10, 2022, 2:41 p.m. UTC | #32
On Tue, 10 May 2022 at 16:19, Christian Brauner <brauner@kernel.org> wrote:

> Fwiw, turning this around: unifying semantically distinct interfaces
> because of syntactical similarities is bad. Moving them into a
> syntactically equivalent system call that expresses the difference in
> semantics in its name is good.

You are ignoring the arguments against fragmentation.

You are also ignoring the fact that semantically the current xattr
interface is already fragmented.   Grep for "strncmp(name, XATTR_" in
fs/xattr.c.

We don't have getsecurityxattr(), getuserxattr(), gettrustedxattr()
and getsystemxattr().  It would be crazy.   Adding getfsxattr()  would
be equally crazy.  getxattr() pretty much describes the semantics of
all of these things.

Thanks,
Miklos
Christian Brauner May 10, 2022, 3:30 p.m. UTC | #33
On Tue, May 10, 2022 at 04:41:35PM +0200, Miklos Szeredi wrote:
> On Tue, 10 May 2022 at 16:19, Christian Brauner <brauner@kernel.org> wrote:
> 
> > Fwiw, turning this around: unifying semantically distinct interfaces
> > because of syntactical similarities is bad. Moving them into a
> > syntactically equivalent system call that expresses the difference in
> > semantics in its name is good.
> 
> You are ignoring the arguments against fragmentation.

No, I'm not ignoring it and really wasn't trying to. What I tried to say
by this, is that the inverse of the argument against fragmentation is
simply equally worth supporting. Meaning the argument against
fragmentation isn't stronger than the argument against aggressive
unification.

(Fwiw, I think that basing the argument on syntactical similarities is
problematic. Stretching this argument for a second, all multiplexers are
almost by necessity syntactically similar (e.g. ptrace() and seccomp())
but we don't use that as an argument to make them a single system call.)

> 
> You are also ignoring the fact that semantically the current xattr
> interface is already fragmented.   Grep for "strncmp(name, XATTR_" in
> fs/xattr.c.
> 
> We don't have getsecurityxattr(), getuserxattr(), gettrustedxattr()
> and getsystemxattr().  It would be crazy.   Adding getfsxattr()  would
> be equally crazy.  getxattr() pretty much describes the semantics of
> all of these things.

getxattr() describes the syntax of all of these things and barely that.
It describes the method of retrieval. And the method of retrieval is
super generic to the point where strings _or binary data_ can be
returned (e.g. POSIX ACLs or fscaps) depending on the xattr namespace.
But wight now, everything we currently get from getxattr() is attributes
associated with inodes.

So getsecurityxattr(), getuserxattr(), gettrustedxattr() etc. would
arguably be fragmentation because all of these things are associated
with inodes.

But now we're in the process of extending the *xattr() calls to operate
on mounts and filesystems so an additional getfsattr() (or another name)
is not fragmentation imho. And I definitely don't think this would
qualify as "crazy".
Miklos Szeredi May 10, 2022, 3:47 p.m. UTC | #34
On Tue, 10 May 2022 at 17:30, Christian Brauner <brauner@kernel.org> wrote:

> But now we're in the process of extending the *xattr() calls to operate
> on mounts and filesystems so an additional getfsattr() (or another name)
> is not fragmentation imho. And I definitely don't think this would
> qualify as "crazy".

In that spirit st_dev does not belong in struct stat, because that is
the property of the block device, not the inode.

But I feel we are going round in circles, lets please not get hung up
on this issue.  Linus will have the final word on which variant (if
either) is going to go in.

Thanks,
Miklos
Christian Brauner May 10, 2022, 3:53 p.m. UTC | #35
On Tue, May 10, 2022 at 05:47:13PM +0200, Miklos Szeredi wrote:
> On Tue, 10 May 2022 at 17:30, Christian Brauner <brauner@kernel.org> wrote:
> 
> > But now we're in the process of extending the *xattr() calls to operate
> > on mounts and filesystems so an additional getfsattr() (or another name)
> > is not fragmentation imho. And I definitely don't think this would
> > qualify as "crazy".
> 
> In that spirit st_dev does not belong in struct stat, because that is
> the property of the block device, not the inode.
> 
> But I feel we are going round in circles, lets please not get hung up
> on this issue.  Linus will have the final word on which variant (if
> either) is going to go in.

Well yes, I'm obviously not going to be d*ck about it and go around
NAKing it just because I didn't get my favorite name but I at least
want to register my strong opposition to the current "unification"
approach loud and clear. :)
Dave Chinner May 10, 2022, 11:04 p.m. UTC | #36
On Tue, May 10, 2022 at 02:45:39PM +0200, Florian Weimer wrote:
> * Dave Chinner:
> 
> > IOWs, what Linux really needs is a listxattr2() syscall that works
> > the same way that getdents/XFS_IOC_ATTRLIST_BY_HANDLE work. With the
> > list function returning value sizes and being able to iterate
> > effectively, every problem that listxattr() causes goes away.
> 
> getdents has issues of its own because it's unspecified what happens if
> the list of entries is modified during iteration.  Few file systems add
> another tree just to guarantee stable iteration.

The filesystem I care about (XFS) guarantees stable iteration and
stable seekdir/telldir cookies. It's not that hard to do, but it
requires the filesystem designer to understand that this is a
necessary feature before they start designing the on-disk directory
format and lookup algorithms....

> Maybe that's different for xattrs because they are supposed to be small
> and can just be snapshotted with a full copy?

It's different for xattrs because we directly control the API
specification for XFS_IOC_ATTRLIST_BY_HANDLE, not POSIX. We can
define the behaviour however we want. Stable iteration is what
listing keys needs.

The cursor is defined as 16 bytes of opaque data, enabling us to
encoded exactly where in the hashed name btree index we have
traversed to:

/*
 * Kernel-internal version of the attrlist cursor.
 */
struct xfs_attrlist_cursor_kern {
        __u32   hashval;        /* hash value of next entry to add */
        __u32   blkno;          /* block containing entry (suggestion) */
        __u32   offset;         /* offset in list of equal-hashvals */
        __u16   pad1;           /* padding to match user-level */
        __u8    pad2;           /* padding to match user-level */
        __u8    initted;        /* T/F: cursor has been initialized */
};

Hence we have all the information in the cursor we need to reset the
btree traversal index to the exact entry we finished at (even in the
presence of hash collisions in the index). Hence removal of the
entry the cursor points to isn't a problem for us, we just move to
the next highest sequential hash index in the btree and start again
from there.

Of course, if this is how we define listxattr2() behaviour (or maybe
we should call it "list_keys()" to make it clear we are treating
this as a key/value store instead of xattrs) then each filesystem
can put what it needs in that cursor to guarantee it can restart key
iteration correctly if the entry the cursor points to has been
removed.  We can also make the cursor larger if necessary for other
filesystems to store the information they need.

Cheers,

Dave.
Dave Chinner May 10, 2022, 11:25 p.m. UTC | #37
On Tue, May 10, 2022 at 02:35:12PM +0200, Karel Zak wrote:
> On Mon, May 09, 2022 at 02:48:15PM +0200, Christian Brauner wrote:
> > One comment about this. We really need to have this interface support
> > giving us mount options like "relatime" back in numeric form (I assume
> > this will be possible.). It is royally annoying having to maintain a
> > mapping table in userspace just to do:
> > 
> > relatime -> MS_RELATIME/MOUNT_ATTR_RELATIME
> > ro	 -> MS_RDONLY/MOUNT_ATTR_RDONLY
> > 
> > A library shouldn't be required to use this interface. Conservative
> > low-level software that keeps its shared library dependencies minimal
> > will need to be able to use that interface without having to go to an
> > external library that transforms text-based output to binary form (Which
> > I'm very sure will need to happen if we go with a text-based
> > interface.).
> 
> Sounds like David's fsinfo() :-)
> 
> We need an interface where the kernel returns a consistent mount table    
> entry (more syscalls to get more key=value could be a way how to get
> inconsistent data).                                              
> 
> IMHO all the attempts to make a trivial interface will be unsuccessful
> because the mount table is complex (tree) and mixes strings, paths,
> and flags. We will always end with a complex interface or complex
> strings (like the last xatts attempt). There is no 3rd path to go ...
> 
> The best would be simplified fsinfo() where userspace defines
> a request (wanted "keys"), and the kernel fills a buffer with data
> separated by some header metadata struct. In this case, the kernel can
> return strings and structs with binary data.  
> 
> 
> I'd love something like:
> 
> ssize_t sz;
> fsinfo_query query[] = {
>     { .request = FSINFO_MOUNT_PATH },
>     { .request = FSINFO_PROPAGATION },
>     { .request = FSINFO_CHILDREN_IDS },
> };
> 
> sz = fsinfo(dfd, "", AT_EMPTY_PATH,
>                 &query, ARRAY_SIZE(query),
>                 buf, sizeof(buf));
> 
> for (p = buf; p < buf + sz; ) {
> {
>     fsinfo_entry *e = (struct fsinfo_entry) p;
>     char *data = p + sizeof(struct fsinfo_entry);
> 
>     switch(e->request) {
>     case FSINFO_MOUNT_PATH:
>         printf("mountpoint %s\n", data);
>         break;
>     case FSINFO_PROPAGATION:
>         printf("propagation %x\n", (uintptr_t) data);
>         break;
>     case FSINFO_CHILDREN_IDS:
>         fsinfo_child *x = (fsinfo_child *) data;
>         for (i = 0; i < e->count; i++) {
>             printf("child: %d\n", x[i].mnt_id);
>         }
>         break;
>     ...
>     }
> 
>     p += sizeof(struct fsinfo_entry) + e->len;
> }

That's pretty much what a multi-xattr get operation looks like.
It's a bit more more intricate in the setup of the request/return
buffer, but otherwise the structure of the code is the same.

I just don't see why we need special purpose interfaces like this
for key/value information when small tweaks to the existing
generic key/value interfaces can provide exactly the same
functionality....

Cheers,

Dave.
Dave Chinner May 11, 2022, 12:42 a.m. UTC | #38
On Tue, May 10, 2022 at 02:40:33PM +0200, Christian Brauner wrote:
> On Tue, May 10, 2022 at 10:55:33AM +1000, Dave Chinner wrote:
> > On Mon, May 09, 2022 at 02:48:15PM +0200, Christian Brauner wrote:
> > > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote:
> > >   I really think users would love to have an interfact where they can
> > >   get a struct with binary info back.
> > 
> > No. Not for kernel informational interfaces. We have ioctls and
> 
> That feels like semantics. statx is in all sensible readings of the
> words a kernel informational interface.

statx is an special purpose binary syscall interface for returning
inode specific information, it's not an abstract, generic
informational interface.

> I'm really looking at this from the perspective of someone who uses
> these interfaces regularly in userspace and a text-based interface for
> very basic information such as detailed information about a mount is
> cumbersome. I know people like to "counter" with "parsing strings is
> easy" but it remains a giant pain for userspace; at least for basic
> info.

As I said last reply, you are making all the same arguements against
text based information interfaces that were made against proc and
sysfs a long long time again. they weren't convincing a couple of
decades ago, and there aren't really convincing now. Text-based
key/value data is hard to screw up in the long run, binary
interfaces have a habit of biting hard whenever the contents of
the binary structure needs to change...

> > >   Imho, xattrs are a bit like a wonky version of streams already (One of
> > >   the reasons I find them quite unpleasant.). Making mount and other
> > >   information retrievable directly through the getxattr() interface will
> > >   turn them into a full-on streams implementation imho. I'd prefer not
> > >   to do that (Which is another reason I'd prefer at least a separate
> > >   system call.).
> > 
> > And that's a total misunderstanding of what xattrs are.
> > 
> > Alternate data streams are just {file,offset} based data streams
> > accessed via ithe same read/write() mechanisms as the primary data
> > stream.
> 
> That's why I said "wonky". But I'm not going to argue this point. I
> think you by necessity have wider historical context on these things
> that I lack. But I don't find it unreasonable to also see them as an
> additional information channel.
> 
> Sure, they are a generic key=value store for anything _in principle_. In
> practice however xattrs are very much perceived and used as information
> storage on files, a metadata side-channel if you will.

That's how *you* perceive them, not how everyone perceives them.

> All I'm claiming here is that it will confuse the living hell out of
> users if the getxattr() api suddenly is used not to just set and get
> information associated with inodes but to also provides filesystem or
> mount information.

Why would it confuse people? The xattr namespace is already well
known to be heirarchical and context dependent based on the intial
name prefix (user, security, btrfs, trusted, etc). i.e. if you don't
know that the context the xattr acts on is determined by the initial
name prefix, then you need to read the xattr(7) man page again:

Extended attribute namespaces

	Attribute  names  are  null-terminated  strings.   The
	attribute name is always specified in the fully qualified
	namespace.attribute form, for example, user.mime_type,
	trusted.md5sum, system.posix_acl_access, or
	security.selinux.

	The namespace mechanism is used to define different classes
	of extended attributes.  These different classes exist for
	several reasons;  for  example, the permissions and
	capabilities required for manipulating extended attributes
	of one namespace may differ to another.

	Currently,  the  security, system, trusted, and user
	extended attribute classes are defined as described below.
	Additional classes may be added in the future.

> That's a totally a totally differnet type of information. Sure, it may
> fit well in the key=value scheme because the xattr key=value _interface_
> is generic but that's a very technical argument.

Yet adding a new xattr namespace for a new class of information that
is associated the mount that the path/inode/fd is associated with is
exactly what the xattr namespaces are intended to allow. And it is
clearly documented that new classes "may be added in the future".

I just don't see where the confusion would come from...

> 
> I'm looking at this from the experience of a user of the API for a
> moment and in code they'd do in one place:
> 
> getxattr('/super/special/binary', "security.capability", ...);
> 
> and then in another place they do:
> 
> getxattr('/path/to/some/mount', "mntns:info", ...);
> 
> that is just flatout confusing.

Why? Both are getting different classes of key/value information
that is specific to the given path. Just because on is on-disk and
the other is ephemeral doesn't make it in any way confusing. This is
exactly what xattr namesapces are intended to support...

> > Xattrs provide an *atomic key-value object store API*, not an offset
> > based data stream API. They are completely different beasts,
> > intended for completely different purposes. ADS are redundant when you
> > have directories and files, whilst an atomic key-value store is
> > something completely different.
> > 
> > You do realise we have an independent, scalable, ACID compliant
> > key-value object store in every inode in an XFS filesystem, right?
> 
> So far this was a really mail with good background information but I'm
> struggling to make sense of what that last sentence is trying to tell
> me. :)

That people in the past have built large scale data storage
applications that use XFS inodes as key based object stores, not as
a offset based data stream. Who needs atomic write() functionality
when you have ACID set and replace operations for named objects?

The reality is that modern filesystems are really just btree based
object stores with high performance transaction engines overlaid
with a POSIX wrapper. And in the case of xattrs, we effectively
expose that btree based key-value database functionality directly to
userspace....

Stop thinking like xattrs are some useless metadata side channel,
and start thinking of them as an atomic object store that stores and
retreives millions of small (< 1/2 the filesystem block size) named
objects far space effciently than a directory structure full of
small files indexed by object hash.

Cheers,

Dave.
Karel Zak May 11, 2022, 8:58 a.m. UTC | #39
On Wed, May 11, 2022 at 09:25:52AM +1000, Dave Chinner wrote:
> > I'd love something like:
> > 
> > ssize_t sz;
> > fsinfo_query query[] = {
> >     { .request = FSINFO_MOUNT_PATH },
> >     { .request = FSINFO_PROPAGATION },
> >     { .request = FSINFO_CHILDREN_IDS },
> > };
> > 
> > sz = fsinfo(dfd, "", AT_EMPTY_PATH,
> >                 &query, ARRAY_SIZE(query),
> >                 buf, sizeof(buf));
> > 
> > for (p = buf; p < buf + sz; ) {
> > {
> >     fsinfo_entry *e = (struct fsinfo_entry) p;
> >     char *data = p + sizeof(struct fsinfo_entry);
> > 
> >     switch(e->request) {
> >     case FSINFO_MOUNT_PATH:
> >         printf("mountpoint %s\n", data);
> >         break;
> >     case FSINFO_PROPAGATION:
> >         printf("propagation %x\n", (uintptr_t) data);
> >         break;
> >     case FSINFO_CHILDREN_IDS:
> >         fsinfo_child *x = (fsinfo_child *) data;
> >         for (i = 0; i < e->count; i++) {
> >             printf("child: %d\n", x[i].mnt_id);
> >         }
> >         break;
> >     ...
> >     }
> > 
> >     p += sizeof(struct fsinfo_entry) + e->len;
> > }
> 
> That's pretty much what a multi-xattr get operation looks like.
> It's a bit more more intricate in the setup of the request/return
> buffer, but otherwise the structure of the code is the same.
> 
> I just don't see why we need special purpose interfaces like this
> for key/value information when small tweaks to the existing
> generic key/value interfaces can provide exactly the same
> functionality....

I don't say we need a new interface ;-) I'd be happy with whatever as
long as:

  * minimal strings parsing (wish than a requirement)
  * one syscall returns multiple key/value
  * can address mount table entries by ID
  * can ask for list of children (submounts)
  * extensible

if this will be possible with xattr (listxattr2(), or so) when great.

  Karel
Christian Brauner May 11, 2022, 9:16 a.m. UTC | #40
On Wed, May 11, 2022 at 10:42:00AM +1000, Dave Chinner wrote:
> On Tue, May 10, 2022 at 02:40:33PM +0200, Christian Brauner wrote:
> > On Tue, May 10, 2022 at 10:55:33AM +1000, Dave Chinner wrote:
> > > On Mon, May 09, 2022 at 02:48:15PM +0200, Christian Brauner wrote:
> > > > On Tue, May 03, 2022 at 02:23:23PM +0200, Miklos Szeredi wrote:
> > > >   I really think users would love to have an interfact where they can
> > > >   get a struct with binary info back.
> > > 
> > > No. Not for kernel informational interfaces. We have ioctls and
> > 
> > That feels like semantics. statx is in all sensible readings of the
> > words a kernel informational interface.
> 
> statx is an special purpose binary syscall interface for returning
> inode specific information, it's not an abstract, generic
> informational interface.

I want the ability to also have a statx like interface for some basic
mount information is what I'm saying; so a special-purpose interface. I
nowhere argued that statx is a _generic_ information interface.

> 
> > I'm really looking at this from the perspective of someone who uses
> > these interfaces regularly in userspace and a text-based interface for
> > very basic information such as detailed information about a mount is
> > cumbersome. I know people like to "counter" with "parsing strings is
> > easy" but it remains a giant pain for userspace; at least for basic
> > info.
> 
> As I said last reply, you are making all the same arguements against
> text based information interfaces that were made against proc and
> sysfs a long long time again. they weren't convincing a couple of
> decades ago, and there aren't really convincing now. Text-based

I'm not in general opposed to an interface that allows to retrieve
text-based values. I mentioned this earlier in the thread more than
once. Also, we're not talking about pseudo-filesystems here.

I'm saying I want the ability to get some basic information in binary
form. Whether it's as part of this interface or another.

Also, *xattr() interfaces aren't really just text-based as you've
mentioned yourself. They return binary data such as struct
vfs_ns_cap_data or struct posix_acl_xattr_entry.

> key/value data is hard to screw up in the long run, binary
> interfaces have a habit of biting hard whenever the contents of
> the binary structure needs to change...

I partially agree. If we're talking about a netlink like protocol with
complex structs, sure. If we're talking about statx or openat2 then I
don't think so. These are not structs that constantly change and newer
system calls are designed with extensions in mind.

I also think there's a misunderstanding here: I really want to stress
that I'm not arguing for the necessity for a generic binary information
retrieval interface. I also mentioned this earlier. I really just want
the ability to still have some well-defined, special-purpose binary
interface similar to e.g. statx that isn't rejected with the argument
that you can get it text-based from *xattr. And I think others have
agreed in other parts of the thread that is still ok. If that's is
indeed true I'm content.

> 
> > > >   Imho, xattrs are a bit like a wonky version of streams already (One of
> > > >   the reasons I find them quite unpleasant.). Making mount and other
> > > >   information retrievable directly through the getxattr() interface will
> > > >   turn them into a full-on streams implementation imho. I'd prefer not
> > > >   to do that (Which is another reason I'd prefer at least a separate
> > > >   system call.).
> > > 
> > > And that's a total misunderstanding of what xattrs are.
> > > 
> > > Alternate data streams are just {file,offset} based data streams
> > > accessed via ithe same read/write() mechanisms as the primary data
> > > stream.
> > 
> > That's why I said "wonky". But I'm not going to argue this point. I
> > think you by necessity have wider historical context on these things
> > that I lack. But I don't find it unreasonable to also see them as an
> > additional information channel.
> > 
> > Sure, they are a generic key=value store for anything _in principle_. In
> > practice however xattrs are very much perceived and used as information
> > storage on files, a metadata side-channel if you will.
> 
> That's how *you* perceive them, not how everyone perceives them.

I mean, this is as an argument we can't bring to a conclusion or that
will bring us forward in understanding each other. All I could reply to
that is "No, that's how *you* perceive them." in return. I don't know
how this would help.

Let's put it this way, if you go into userspace and ask users of xattrs
what they think xattrs are then they will not come up with a technical
definition for a completely generic key=value interface based on
filesystem btree consideration. They will see them as specific
attributes on files speaking from experience.

> 
> > All I'm claiming here is that it will confuse the living hell out of
> > users if the getxattr() api suddenly is used not to just set and get
> > information associated with inodes but to also provides filesystem or
> > mount information.
> 
> Why would it confuse people? The xattr namespace is already well
> known to be heirarchical and context dependent based on the intial
> name prefix (user, security, btrfs, trusted, etc). i.e. if you don't
> know that the context the xattr acts on is determined by the initial
> name prefix, then you need to read the xattr(7) man page again:

I take it this is a rhetorical device here and you know that I've not
just patched these codepaths multiple times in the kernel but also
extensively programmed with xattrs in userspace for various container
projects.

The reasons why I think it confuses people I already explained in
various parts earlier in the thread. And there are at least two replies
to this thread that find it confusing as well. If you can't relate to
any of these reasons then me reiterating them won't bring this forward.

> 
> Extended attribute namespaces
> 
> 	Attribute  names  are  null-terminated  strings.   The
> 	attribute name is always specified in the fully qualified
> 	namespace.attribute form, for example, user.mime_type,
> 	trusted.md5sum, system.posix_acl_access, or
> 	security.selinux.
> 
> 	The namespace mechanism is used to define different classes
> 	of extended attributes.  These different classes exist for
> 	several reasons;  for  example, the permissions and
> 	capabilities required for manipulating extended attributes
> 	of one namespace may differ to another.
> 
> 	Currently,  the  security, system, trusted, and user
> 	extended attribute classes are defined as described below.
> 	Additional classes may be added in the future.
> 
> > That's a totally a totally differnet type of information. Sure, it may
> > fit well in the key=value scheme because the xattr key=value _interface_
> > is generic but that's a very technical argument.
> 
> Yet adding a new xattr namespace for a new class of information that
> is associated the mount that the path/inode/fd is associated with is
> exactly what the xattr namespaces are intended to allow. And it is
> clearly documented that new classes "may be added in the future".

Fwiw, I don't think the "Additional classes may be added in the future."
means that this interface in principle can be used to retrieve any
key=value based information. If that was really the case then xattr(7)
should really put your definition below on to their.

> 
> I just don't see where the confusion would come from...

Again, there's people who can relate to the arguments even on this
thread but if you can't relate to any of them at all and can't see where
we're coming from than me reiterating all of it once more won't help.

> 
> > 
> > I'm looking at this from the experience of a user of the API for a
> > moment and in code they'd do in one place:
> > 
> > getxattr('/super/special/binary', "security.capability", ...);
> > 
> > and then in another place they do:
> > 
> > getxattr('/path/to/some/mount', "mntns:info", ...);
> > 
> > that is just flatout confusing.
> 
> Why? Both are getting different classes of key/value information
> that is specific to the given path. Just because on is on-disk and
> the other is ephemeral doesn't make it in any way confusing. This is
> exactly what xattr namesapces are intended to support...
> 
> > > Xattrs provide an *atomic key-value object store API*, not an offset
> > > based data stream API. They are completely different beasts,
> > > intended for completely different purposes. ADS are redundant when you
> > > have directories and files, whilst an atomic key-value store is
> > > something completely different.
> > > 
> > > You do realise we have an independent, scalable, ACID compliant
> > > key-value object store in every inode in an XFS filesystem, right?
> > 
> > So far this was a really mail with good background information but I'm
> > struggling to make sense of what that last sentence is trying to tell
> > me. :)
> 
> That people in the past have built large scale data storage
> applications that use XFS inodes as key based object stores, not as
> a offset based data stream. Who needs atomic write() functionality
> when you have ACID set and replace operations for named objects?
> 
> The reality is that modern filesystems are really just btree based
> object stores with high performance transaction engines overlaid
> with a POSIX wrapper. And in the case of xattrs, we effectively
> expose that btree based key-value database functionality directly to
> userspace....
> 
> Stop thinking like xattrs are some useless metadata side channel,

I did say they are a "metadata side channel"; I very much never called
them "useless".
(I don't like it a lot that a subset of them - posix acls - store
additional {g,u}id information on disk but that's a totally different
topic.)

> and start thinking of them as an atomic object store that stores and
> retreives millions of small (< 1/2 the filesystem block size) named
> objects far space effciently than a directory structure full of
> small files indexed by object hash.

What you outline is the perspective of a probably +20 years kernel and
very low-level system software developer. I appreciate that perspective
and I can very much appreciate that this is how we can conceptualize
xattrs in the kernel.

But on the other side of the equation is userspace and it can't be the
expectation that this is how they conceptualize the *xattr() kernel
interfaces based on the manpage or system call. This is so generic that
it becomes meaningless.

By this definition we can just start retrieving any type of information
via this interface all across the kernel. And you'd probably argue that
in principle we could. But then we can also just go back to calling it
getvalues().

I think that's another part of the disconnect between our viewpoints:
you perceive the *xattr() interfaces as a super generic retrieval method
for an abstract key=value storage. And so adding anything in there is
fine as long as it has key=value semantics. I find that to be a shift in
how the *xattr() interfaces are currently used and even how they are
defined.

I think we've pretty much tried to explain our viewpoints in detail and
we don't seem to be getting to common ground for some of it and as I
said somewhere else in the thread I have no intention of blocking this
if people want this interface including making this a direct part of
getxattr() and Linus accepts it.
diff mbox series

Patch

--- a/fs/Makefile
+++ b/fs/Makefile
@@ -16,7 +16,7 @@  obj-y :=	open.o read_write.o file_table.
 		pnode.o splice.o sync.o utimes.o d_path.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
 		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
-		kernel_read_file.o remap_range.o
+		kernel_read_file.o remap_range.o values.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o direct-io.o mpage.o
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -148,3 +148,11 @@  static inline bool is_anon_ns(struct mnt
 }
 
 extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
+
+struct mount *mnt_list_next(struct mnt_namespace *ns, struct list_head *p);
+extern void namespace_lock_read(void);
+extern void namespace_unlock_read(void);
+extern int show_mountinfo_root(struct seq_file *m, struct vfsmount *mnt,
+			       struct path *root);
+extern bool is_path_reachable(struct mount *, struct dentry *,
+			      const struct path *root);
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1332,9 +1332,7 @@  struct vfsmount *mnt_clone_internal(cons
 	return &p->mnt;
 }
 
-#ifdef CONFIG_PROC_FS
-static struct mount *mnt_list_next(struct mnt_namespace *ns,
-				   struct list_head *p)
+struct mount *mnt_list_next(struct mnt_namespace *ns, struct list_head *p)
 {
 	struct mount *mnt, *ret = NULL;
 
@@ -1351,6 +1349,7 @@  static struct mount *mnt_list_next(struc
 	return ret;
 }
 
+#ifdef CONFIG_PROC_FS
 /* iterator; we want it to have access to namespace_sem, thus here... */
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
@@ -1507,6 +1506,16 @@  static inline void namespace_lock(void)
 	down_write(&namespace_sem);
 }
 
+void namespace_lock_read(void)
+{
+	down_read(&namespace_sem);
+}
+
+void namespace_unlock_read(void)
+{
+	up_read(&namespace_sem);
+}
+
 enum umount_tree_flags {
 	UMOUNT_SYNC = 1,
 	UMOUNT_PROPAGATE = 2,
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -50,7 +50,5 @@  void mnt_set_mountpoint(struct mount *,
 void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
 			   struct mount *mnt);
 struct mount *copy_tree(struct mount *, struct dentry *, int);
-bool is_path_reachable(struct mount *, struct dentry *,
-			 const struct path *root);
 int count_mounts(struct mnt_namespace *ns, struct mount *mnt);
 #endif /* _LINUX_PNODE_H */
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -132,9 +132,9 @@  static int show_vfsmnt(struct seq_file *
 	return err;
 }
 
-static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
+int show_mountinfo_root(struct seq_file *m, struct vfsmount *mnt,
+			struct path *root)
 {
-	struct proc_mounts *p = m->private;
 	struct mount *r = real_mount(mnt);
 	struct super_block *sb = mnt->mnt_sb;
 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
@@ -152,7 +152,7 @@  static int show_mountinfo(struct seq_fil
 	seq_putc(m, ' ');
 
 	/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
-	err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\");
+	err = seq_path_root(m, &mnt_path, root, " \t\n\\");
 	if (err)
 		goto out;
 
@@ -164,7 +164,7 @@  static int show_mountinfo(struct seq_fil
 		seq_printf(m, " shared:%i", r->mnt_group_id);
 	if (IS_MNT_SLAVE(r)) {
 		int master = r->mnt_master->mnt_group_id;
-		int dom = get_dominating_id(r, &p->root);
+		int dom = get_dominating_id(r, root);
 		seq_printf(m, " master:%i", master);
 		if (dom && dom != master)
 			seq_printf(m, " propagate_from:%i", dom);
@@ -194,6 +194,13 @@  static int show_mountinfo(struct seq_fil
 	return err;
 }
 
+static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct proc_mounts *p = m->private;
+
+	return show_mountinfo_root(m, mnt, &p->root);
+}
+
 static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
 {
 	struct proc_mounts *p = m->private;
--- /dev/null
+++ b/fs/values.c
@@ -0,0 +1,242 @@ 
+#include <linux/values.h>
+#include <linux/fs_struct.h>
+#include <linux/seq_file.h>
+#include <linux/nsproxy.h>
+#include "../lib/kstrtox.h"
+#include "mount.h"
+
+struct val_string {
+	const char *str;
+	size_t len;
+};
+
+struct val_iter {
+	struct val_string name;
+	struct seq_file seq;
+	int error;
+};
+
+struct val_desc {
+	struct val_string name;
+	union {
+		u64 idx;
+		int (*get)(struct val_iter *vi, const struct path *path);
+	};
+};
+
+#define VAL_STRING(x) { .str = x, .len = sizeof(x) - 1 }
+#define VD_NAME(x) .name = VAL_STRING(x)
+
+static int val_err(struct val_iter *vi, int err)
+{
+	vi->error = err;
+	return 0;
+}
+
+static int val_end_seq(struct val_iter *vi)
+{
+	if (vi->seq.count == vi->seq.size)
+		return -EOVERFLOW;
+
+	return 0;
+}
+
+static inline void val_string_skip(struct val_string *s, size_t count)
+{
+	WARN_ON(s->len < count);
+	s->str += count;
+	s->len -= count;
+}
+
+static bool val_string_prefix(const struct val_string *p,
+			      const struct val_string *s)
+{
+	return s->len >= p->len && !memcmp(s->str, p->str, p->len);
+}
+
+static struct val_desc *val_lookup(struct val_iter *vi, struct val_desc *vd)
+{
+	for (; vd->name.len; vd++) {
+		if (val_string_prefix(&vd->name, &vi->name)) {
+			val_string_skip(&vi->name, vd->name.len);
+			break;
+		}
+	}
+	return vd;
+}
+
+static int val_get_group(struct val_iter *vi, struct val_desc *vd)
+{
+	for (; vd->name.len; vd++)
+		seq_write(&vi->seq, vd->name.str, vd->name.len + 1);
+
+	return val_end_seq(vi);
+}
+
+enum {
+	VAL_MNT_INFO,
+};
+
+static struct val_desc val_mnt_group[] = {
+	{ VD_NAME("info"),		.idx = VAL_MNT_INFO		},
+	{ }
+};
+
+static int val_mnt_show(struct val_iter *vi, struct vfsmount *mnt)
+{
+	struct val_desc *vd = val_lookup(vi, val_mnt_group);
+	struct path root;
+
+	if (!vd->name.str)
+		return val_err(vi, -ENOENT);
+
+	switch(vd->idx) {
+	case VAL_MNT_INFO:
+		get_fs_root(current->fs, &root);
+		show_mountinfo_root(&vi->seq, mnt, &root);
+		path_put(&root);
+		break;
+	}
+
+	return 0;
+}
+
+static int val_mnt_get(struct val_iter *vi, const struct path *path)
+{
+	int err;
+
+	if (!vi->name.len)
+		return val_get_group(vi, val_mnt_group);
+
+	namespace_lock_read();
+	err = val_mnt_show(vi, path->mnt);
+	namespace_unlock_read();
+
+	return err;
+}
+
+/* called with namespace_sem held for read */
+static struct vfsmount *mnt_lookup_by_id(struct mnt_namespace *ns,
+					 struct path *root, int id)
+{
+	struct mount *m;
+
+	for (m = mnt_list_next(ns, &ns->list); m; m = mnt_list_next(ns, &m->mnt_list)) {
+		if (m->mnt_id == id) {
+			if (is_path_reachable(m, m->mnt.mnt_root, root))
+				return mntget(&m->mnt);
+			else
+				return NULL;
+		}
+	}
+	return NULL;
+}
+
+static void seq_mnt_list(struct seq_file *seq, struct mnt_namespace *ns,
+			 struct path *root)
+{
+	struct mount *m;
+
+	namespace_lock_read();
+	for (m = mnt_list_next(ns, &ns->list); m; m = mnt_list_next(ns, &m->mnt_list)) {
+		if (is_path_reachable(m, m->mnt.mnt_root, root)) {
+			seq_printf(seq, "%i:", m->mnt_id);
+			seq_putc(seq, '\0');
+		}
+	}
+	namespace_unlock_read();
+}
+
+static int val_mntns_get(struct val_iter *vi, const struct path *path)
+{
+	struct mnt_namespace *mnt_ns = current->nsproxy->mnt_ns;
+	struct vfsmount *mnt;
+	struct path root;
+	unsigned long long mnt_id;
+	unsigned int end;
+	int err;
+
+	if (!vi->name.len) {
+		get_fs_root(current->fs, &root);
+		seq_mnt_list(&vi->seq, mnt_ns, &root);
+		path_put(&root);
+		return val_end_seq(vi);
+	}
+
+	end = _parse_integer(vi->name.str, 10, &mnt_id);
+	if (end & KSTRTOX_OVERFLOW)
+		return val_err(vi, -ENOENT);
+	if (vi->name.str[end] != VAL_SEP)
+		return val_err(vi, -ENOENT);
+	val_string_skip(&vi->name, end + 1);
+
+	namespace_lock_read();
+	get_fs_root(current->fs, &root);
+	mnt = mnt_lookup_by_id(mnt_ns, &root, mnt_id);
+	path_put(&root);
+	if (!mnt) {
+		namespace_unlock_read();
+		return val_err(vi, -ENOENT);
+	}
+	if (vi->name.len)
+		err = val_mnt_show(vi, mnt);
+	else
+		err = val_get_group(vi, val_mnt_group);
+
+	namespace_unlock_read();
+	mntput(mnt);
+
+	return err;
+}
+
+
+
+static struct val_desc val_toplevel_group[] = {
+	{ VD_NAME("mnt:"),	.get = val_mnt_get,	},
+	{ VD_NAME("mntns:"),	.get = val_mntns_get,	},
+	{ },
+};
+
+static int getvalues(struct val_iter *vi, const struct path *path)
+{
+	struct val_desc *vd;
+	int err;
+
+	if (!vi->name.len)
+		return val_get_group(vi, val_toplevel_group);
+
+	vd = val_lookup(vi, val_toplevel_group);
+	if (!vd->name.len)
+		err = val_err(vi, -ENOENT);
+	else
+		err = vd->get(vi, path);
+
+	return err ?: vi->error;
+}
+
+ssize_t val_getxattr(struct path *path, const char *name, size_t namelen,
+		     void __user *value, size_t size)
+{
+	int err;
+	char val[1024];
+	struct val_iter vi = {
+		.name = { .str = name, .len = namelen },
+		.seq = { .buf = val, .size = min(sizeof(val), size) },
+	};
+
+	if (!size)
+		return sizeof(val);
+
+	val_string_skip(&vi.name, 1);
+
+	err = getvalues(&vi, path);
+	if (err < 0)
+		return err;
+
+	WARN_ON(vi.seq.count > size);
+	if (copy_to_user(value, vi.seq.buf, vi.seq.count))
+		return -EFAULT;
+
+	return vi.seq.count;
+}
+
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -22,6 +22,7 @@ 
 #include <linux/audit.h>
 #include <linux/vmalloc.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/values.h>
 
 #include <linux/uaccess.h>
 
@@ -643,12 +644,13 @@  SYSCALL_DEFINE5(fsetxattr, int, fd, cons
  * Extended attribute GET operations
  */
 static ssize_t
-getxattr(struct user_namespace *mnt_userns, struct dentry *d,
-	 const char __user *name, void __user *value, size_t size)
+getxattr(struct path *path, const char __user *name,
+	 void __user *value, size_t size)
 {
 	ssize_t error;
 	void *kvalue = NULL;
 	char kname[XATTR_NAME_MAX + 1];
+	struct user_namespace *mnt_userns = mnt_user_ns(path->mnt);
 
 	error = strncpy_from_user(kname, name, sizeof(kname));
 	if (error == 0 || error == sizeof(kname))
@@ -656,6 +658,9 @@  getxattr(struct user_namespace *mnt_user
 	if (error < 0)
 		return error;
 
+	if (kname[0] == VAL_SEP)
+		return val_getxattr(path, kname, error, value, size);
+
 	if (size) {
 		if (size > XATTR_SIZE_MAX)
 			size = XATTR_SIZE_MAX;
@@ -664,7 +669,7 @@  getxattr(struct user_namespace *mnt_user
 			return -ENOMEM;
 	}
 
-	error = vfs_getxattr(mnt_userns, d, kname, kvalue, size);
+	error = vfs_getxattr(mnt_userns, path->dentry, kname, kvalue, size);
 	if (error > 0) {
 		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
@@ -693,7 +698,7 @@  static ssize_t path_getxattr(const char
 	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
 	if (error)
 		return error;
-	error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size);
+	error = getxattr(&path, name, value, size);
 	path_put(&path);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -723,8 +728,7 @@  SYSCALL_DEFINE4(fgetxattr, int, fd, cons
 	if (!f.file)
 		return error;
 	audit_file(f.file);
-	error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry,
-			 name, value, size);
+	error = getxattr(&f.file->f_path, name, value, size);
 	fdput(f);
 	return error;
 }
--- /dev/null
+++ b/include/linux/values.h
@@ -0,0 +1,11 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/types.h>
+
+#define VAL_SEP ':'
+
+struct path;
+
+ssize_t val_getxattr(struct path *path, const char *name, size_t namelen,
+		     void __user *value, size_t size);
+